Ejemplo n.º 1
0
    def lstm(self,
             data,
             markers,
             lstm_static=None,
             lstm_hidden=1000,
             weight_filler=None,
             bias_filler=None,
             learning_param_lstm=None):
        """Create an ``L.LSTM`` layer fed by ``data`` and ``markers``.

        Args:
            data: first bottom passed to ``L.LSTM``.
            markers: second bottom passed to ``L.LSTM``.
            lstm_static: optional third bottom; only passed when truthy.
            lstm_hidden: ``num_output`` of the recurrent parameter.
            weight_filler: weight filler; a falsy value selects
                ``self.uniform_weight_filler(-.08, .08)``.
            bias_filler: bias filler; a falsy value selects
                ``self.constant_filler(0)``.
            learning_param_lstm: value for the layer's ``param`` argument; a
                falsy value selects
                ``self.init_params([[1, 1], [1, 1], [1, 1]])``.

        Returns:
            Whatever ``L.LSTM`` returns for the assembled arguments.
        """
        # Fall back to this builder's defaults for anything not supplied.
        weight_filler = weight_filler or self.uniform_weight_filler(-.08, .08)
        bias_filler = bias_filler or self.constant_filler(0)
        learning_param_lstm = (learning_param_lstm
                               or self.init_params([[1, 1], [1, 1], [1, 1]]))

        # The static input, when given, is simply an extra bottom.
        bottoms = [data, markers]
        if lstm_static:
            bottoms.append(lstm_static)

        recurrent_param = dict(num_output=lstm_hidden,
                               weight_filler=weight_filler,
                               bias_filler=bias_filler)
        return L.LSTM(*bottoms,
                      param=learning_param_lstm,
                      recurrent_param=recurrent_param)
Ejemplo n.º 2
0
def PlateNetBody(net, data_layer, time_step, num_classes, batch_size=512,
                 lstm_hidden=100):
    """Attach a two-layer LSTM body plus a final InnerProduct to ``net``.

    Layers are added to ``net`` in this order: ``indicator``
    (``L.ContinuationIndicator``), ``permuted_data`` (``L.Permute`` of
    ``data_layer`` with order ``[3, 0, 1, 2]``), ``lstm1`` and ``lstm2``
    (both sharing ``indicator`` as their second bottom) and ``fc1``
    (``L.InnerProduct`` over ``axis=2``).

    Args:
        net: object the layers are assigned onto as attributes.
        data_layer: bottom fed to the Permute layer.
        time_step: forwarded to ``L.ContinuationIndicator``.
        num_classes: ``fc1`` gets ``num_classes + 1`` outputs
            (NOTE(review): the extra output is presumably a blank/background
            class -- confirm against the loss used by the caller).
        batch_size: forwarded to ``L.ContinuationIndicator``. Defaults to
            512, the value that used to be hard-coded.
        lstm_hidden: ``num_output`` of both LSTM layers. Defaults to 100,
            the value that used to be hard-coded.

    Returns:
        The same ``net`` object, with the layers above attached.
    """
    fc_kwargs = {
        'param': [
            dict(lr_mult=1, decay_mult=1),
            dict(lr_mult=2, decay_mult=0),
        ],
        'weight_filler': dict(type='xavier'),
        'bias_filler': dict(type='constant', value=0),
    }

    # Both LSTM layers use the same recurrent configuration.
    recurrent_param = {
        'num_output': lstm_hidden,
        'weight_filler': dict(type='xavier'),
        'bias_filler': dict(type='constant', value=0),
    }

    net.indicator = L.ContinuationIndicator(time_step=time_step,
                                            batch_size=batch_size)
    net.permuted_data = L.Permute(data_layer, order=[3, 0, 1, 2])
    net.lstm1 = L.LSTM(net.permuted_data,
                       net.indicator,
                       recurrent_param=recurrent_param)
    net.lstm2 = L.LSTM(net.lstm1,
                       net.indicator,
                       recurrent_param=recurrent_param)
    net.fc1 = L.InnerProduct(net.lstm2,
                             num_output=num_classes + 1,
                             axis=2,
                             **fc_kwargs)

    return net
Ejemplo n.º 3
0
def exp_proto(mode, batchsize, T, exp_T, question_vocab_size, exp_vocab_size):
    """Build the explanation network and return it as a protobuf message.

    The net consists of a Python data layer (``ExpDataProviderLayer``) with
    five tops, an embedding + TanH, a first LSTM (2048 outputs) whose
    dropped-out output is multiplied element-wise with the tiled attention
    feature, signed-sqrt / L2 normalisation, a second LSTM (1024 outputs)
    and an InnerProduct producing ``exp_vocab_size`` outputs along axis 2.
    The prediction is finally fed to a Silence layer.

    Args:
        mode: stored under ``'mode'`` in the data layer's JSON ``param_str``.
        batchsize: stored under ``'batchsize'`` in the same JSON string.
        T: not used by this function.
        exp_T: number of tiles along axis 0 for the attention feature.
        question_vocab_size: not used by this function.
        exp_vocab_size: input dimension of the embedding and output size of
            the prediction layer.

    Returns:
        The result of ``NetSpec.to_proto()``.
    """

    def uniform_filler():
        # Fresh dict per layer, same values everywhere.
        return {'type': 'uniform', 'min': -0.08, 'max': 0.08}

    def recurrent(num_output):
        return {
            'num_output': num_output,
            'weight_filler': uniform_filler(),
            'bias_filler': {'type': 'constant', 'value': 0},
        }

    def dropped(bottom):
        return L.Dropout(bottom, dropout_param={'dropout_ratio': 0.3})

    net = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    (net.exp_att_feature, net.exp, net.exp_out,
     net.exp_cont_1, net.exp_cont_2) = L.Python(
         module='exp_data_provider_layer',
         layer='ExpDataProviderLayer',
         param_str=mode_str,
         ntop=5)

    net.exp_embed_ba = L.Embed(net.exp,
                               input_dim=exp_vocab_size,
                               num_output=300,
                               weight_filler=uniform_filler())
    net.exp_embed = L.TanH(net.exp_embed_ba)

    # First LSTM over the embedded explanation tokens.
    net.exp_lstm1 = L.LSTM(net.exp_embed,
                           net.exp_cont_1,
                           recurrent_param=recurrent(2048))
    net.exp_lstm1_dropped = dropped(net.exp_lstm1)

    # Fuse the attention feature with the first LSTM's output.
    net.exp_att_resh = L.Reshape(
        net.exp_att_feature,
        reshape_param={'shape': {'dim': [1, -1, 2048]}})
    net.exp_att_tiled = L.Tile(net.exp_att_resh, axis=0, tiles=exp_T)
    net.exp_eltwise_all = L.Eltwise(
        net.exp_lstm1_dropped,
        net.exp_att_tiled,
        eltwise_param={'operation': P.Eltwise.PROD})
    net.exp_eltwise_all_sqrt = L.SignedSqrt(net.exp_eltwise_all)
    net.exp_eltwise_all_l2 = L.L2Normalize(net.exp_eltwise_all_sqrt)
    net.exp_eltwise_all_drop = dropped(net.exp_eltwise_all_l2)

    # Second LSTM over the fused representation.
    net.exp_lstm2 = L.LSTM(net.exp_eltwise_all_drop,
                           net.exp_cont_2,
                           recurrent_param=recurrent(1024))
    net.exp_lstm2_dropped = dropped(net.exp_lstm2)

    net.exp_prediction = L.InnerProduct(net.exp_lstm2_dropped,
                                        num_output=exp_vocab_size,
                                        weight_filler={'type': 'xavier'},
                                        axis=2)

    net.silence_exp_prediction = L.Silence(net.exp_prediction, ntop=0)

    return net.to_proto()
def lstm(bottom, clip, nout, param_name=''):
    """Return an ``L.LSTM`` layer with named parameter blobs.

    Args:
        bottom: first bottom of the LSTM layer.
        clip: second bottom of the LSTM layer.
        nout: ``num_output`` of the recurrent parameter.
        param_name: prefix for the three parameter names, which become
            ``<prefix>-w``, ``<prefix>-u`` and ``<prefix>-b``.

    Returns:
        Whatever ``L.LSTM`` returns. Weights use a uniform filler in
        [-0.08, 0.08]; the ``-b`` parameter has ``decay_mult=0``.
    """
    uniform_init = {'type': 'uniform', 'min': -0.08, 'max': 0.08}
    named_params = [
        {'name': param_name + '-w'},
        {'name': param_name + '-u'},
        {'name': param_name + '-b', 'decay_mult': 0},
    ]
    return L.LSTM(bottom,
                  clip,
                  recurrent_param={
                      'num_output': nout,
                      'weight_filler': uniform_init,
                  },
                  param=named_params)
Ejemplo n.º 5
0
def generate_scores(split, config):
    """Build the scoring network for ``split`` and return it as a protobuf.

    The net reads five tops from the Python data layer ``TossLayer``
    (language, cont, img_feature, spatial, label), embeds the language
    input, runs an LSTM, keeps only the last of ``config.T`` slices taken
    along axis 0, L2-normalises the LSTM and image features, concatenates
    them with the spatial input along axis 1 and scores the result with the
    ``fc_relu`` / ``fc`` classifier, trained with a sigmoid cross-entropy
    loss.

    Args:
        split: stored under ``'split'`` in the data layer's ``param_str``.
        config: object providing ``N``, ``data_provider``, ``vocab_size``,
            ``embed_dim``, ``lstm_dim``, ``T``, ``D_im``, ``D_text``,
            ``mlp_hidden_dims`` and ``mlp_dropout``.

    Returns:
        The result of ``NetSpec.to_proto()``.
    """

    def uniform_filler():
        return {'type': 'uniform', 'min': -0.08, 'max': 0.08}

    def flatten_to(bottom, width):
        # Reshape to (-1, width).
        return L.Reshape(bottom,
                         reshape_param={'shape': {'dim': [-1, width]}})

    net = caffe.NetSpec()
    mode_str = str(dict(split=split, batch_size=config.N))
    (net.language, net.cont, net.img_feature,
     net.spatial, net.label) = L.Python(module=config.data_provider,
                                        layer='TossLayer',
                                        param_str=mode_str,
                                        ntop=5)

    # Word embedding.
    net.embed = L.Embed(net.language,
                        input_dim=config.vocab_size,
                        num_output=config.embed_dim,
                        weight_filler=uniform_filler())

    # LSTM over the embedded sequence.
    net.lstm = L.LSTM(net.embed,
                      net.cont,
                      recurrent_param={
                          'num_output': config.lstm_dim,
                          'weight_filler': uniform_filler(),
                          'bias_filler': {'type': 'constant', 'value': 0},
                      })

    # Every slice except the last is registered and then silenced.
    tops = L.Slice(net.lstm, ntop=config.T, slice_param={'axis': 0})
    for step in range(config.T - 1):
        unused = tops[step]
        setattr(net, 'slice' + str(step), unused)
        setattr(net, 'silence' + str(step), L.Silence(unused, ntop=0))
    net.lstm_out = tops[-1]
    net.lstm_feat = flatten_to(net.lstm_out, config.lstm_dim)

    # L2-normalise image and language features.
    net.img_l2norm = L.L2Normalize(net.img_feature)
    net.lstm_l2norm = L.L2Normalize(net.lstm_feat)
    net.img_l2norm_resh = flatten_to(net.img_l2norm, config.D_im)
    net.lstm_l2norm_resh = flatten_to(net.lstm_l2norm, config.D_text)

    # Concatenate language, image and spatial features.
    net.feat_all = L.Concat(net.lstm_l2norm_resh,
                            net.img_l2norm_resh,
                            net.spatial,
                            concat_param={'axis': 1})

    # MLP classifier over the concatenated feature.
    net.mlp_l1, net.mlp_relu1 = fc_relu(net.feat_all, config.mlp_hidden_dims)
    classifier_input = net.mlp_relu1
    if config.mlp_dropout:
        net.mlp_drop1 = L.Dropout(net.mlp_relu1,
                                  dropout_ratio=0.5,
                                  in_place=True)
        classifier_input = net.mlp_drop1
    net.scores = fc(classifier_input, 1)

    # Loss layer.
    net.loss = L.SigmoidCrossEntropyLoss(net.scores, net.label)

    return net.to_proto()
Ejemplo n.º 6
0
    def language_model_lstm_no_embed(self,
                                     sent_bottom,
                                     cont_bottom,
                                     text_name='embedding_text',
                                     tag=''):
        """Add an LSTM + InnerProduct text branch to ``self.n``.

        An ``L.LSTM`` over ``sent_bottom`` / ``cont_bottom`` is sliced along
        axis 0 at ``self.params['sentence_length'] - 1``. The first slice is
        sent to a Silence layer (registered in ``self.n.tops`` under
        ``silence_cell_<self.silence_count>``, after which the counter is
        incremented). The second slice is reshaped to
        ``(-1, self.language_embedding_dim[0])`` and projected by an
        InnerProduct with ``self.language_embedding_dim[1]`` outputs.

        Args:
            sent_bottom: first bottom of the LSTM.
            cont_bottom: second bottom of the LSTM.
            text_name: attribute name under which the final top is stored
                on ``self.n``.
            tag: suffix appended to every shared parameter name.

        Returns:
            The InnerProduct top (also stored as ``self.n.<text_name>``).
        """
        lstm_lr = self.args.lstm_lr
        embedding_lr = self.args.language_embedding_lr
        hidden_dim = self.language_embedding_dim[0]

        recurrent_param = dict(
            num_output=hidden_dim,
            weight_filler=self.uniform_weight_filler(-0.08, 0.08),
            bias_filler=self.constant_filler(0))
        lstm_param = self.learning_params(
            [[lstm_lr, lstm_lr] for _ in range(3)],
            ['lstm1' + tag, 'lstm2' + tag, 'lstm3' + tag])
        lstm_top = L.LSTM(sent_bottom,
                          cont_bottom,
                          recurrent_param=recurrent_param,
                          param=lstm_param)

        # Two-way split along axis 0; only the second part is used further.
        discarded, kept = L.Slice(
            lstm_top,
            slice_point=self.params['sentence_length'] - 1,
            axis=0,
            ntop=2)

        silence_key = 'silence_cell_' + str(self.silence_count)
        self.n.tops[silence_key] = L.Silence(discarded, ntop=0)
        self.silence_count += 1

        flat_lstm = L.Reshape(kept, shape=dict(dim=[-1, hidden_dim]))
        top_text = L.InnerProduct(
            flat_lstm,
            num_output=self.language_embedding_dim[1],
            weight_filler=self.uniform_weight_filler(-0.08, .08),
            bias_filler=self.constant_filler(0),
            param=self.learning_params(
                [[embedding_lr, embedding_lr], [embedding_lr * 2, 0]],
                ['lstm_embed1' + tag, 'lstm_embed_1b' + tag]))

        setattr(self.n, text_name, top_text)
        return top_text
Ejemplo n.º 7
0
def _make_module(model_path, in1_shape, time_step, batch_size, num_output):
    """Write a small LSTM test model (prototxt + caffemodel) to ``model_path``.

    The net has an Input layer ``data1`` of shape ``in1_shape``, a
    ContinuationIndicator ``data2`` and an LSTM ``lstm`` over both. It is
    saved as ``test.prototxt``, loaded in TEST phase, and every parameter
    blob that is entirely zero is overwritten with standard-normal random
    values before the weights are saved as ``test.caffemodel``.

    Args:
        model_path: directory receiving both output files.
        in1_shape: ``dim`` list for the Input layer's shape.
        time_step: ``time_step`` of the continuation indicator.
        batch_size: ``batch_size`` of the continuation indicator.
        num_output: ``num_output`` of the LSTM's recurrent parameter.
    """
    spec = caffe.NetSpec()
    spec.data1 = L.Input(name="data1",
                         input_param={"shape": {"dim": in1_shape}})
    spec.data2 = L.ContinuationIndicator(
        name="data2",
        continuation_indicator_param={
            "time_step": time_step,
            "batch_size": batch_size,
        })
    spec.lstm = L.LSTM(spec.data1,
                       spec.data2,
                       name="lstm",
                       recurrent_param={"num_output": num_output})

    prototxt_path = os.path.join(model_path, 'test.prototxt')
    with open(prototxt_path, 'w') as proto_file:
        proto_file.write(str(spec.to_proto()))

    net = caffe.Net(prototxt_path, caffe.TEST)
    for layer in net.layers:
        for blob in layer.blobs:
            # Leave blobs that already hold some non-zero value untouched.
            if np.count_nonzero(blob.data) != 0:
                continue
            blob.data[...] = np.random.randn(*blob.data.shape)

    net.save(os.path.join(model_path, 'test.caffemodel'))
Ejemplo n.º 8
0
def jsonToPrototxt(net, net_name):
    """Convert a JSON-style network description into Caffe prototxt text.

    ``net`` maps each layer id to a dict from which this function reads:

    * ``info``: ``type`` (Caffe layer type name) and ``phase`` (``None``,
      or a value whose ``int()`` is 0 for train / 1 for test),
    * ``connection``: ``input`` and ``output`` lists of layer ids,
    * ``params``: layer-type specific values; an unset value is ``''``.

    Layers are ordered starting from the Data / Input / HDF5Data layers so
    that every layer comes after all of its inputs. Each layer gets a single
    top blob named ``blob<N>``. A train and a test ``NetSpec`` are built in
    parallel and their text forms are merged into one prototxt.

    Side effect: a Data/Input layer without a ``dim`` param gets
    ``params['dim'] = '10,3,224,224'`` written into ``net``.

    Layer types not handled below are skipped silently.

    Args:
        net: the network description (see above).
        net_name: written as the ``name:`` field of the prototxt.

    Returns:
        ``(prototxt, input_dim)`` where ``prototxt`` is the merged text and
        ``input_dim`` is the ``dim`` string of the last Data/Input layer
        processed (``None`` if there was none).

    Raises:
        IndexError: if layers are still pending but none of them has all of
            its inputs processed (e.g. a cycle, or an input that is not
            reachable from a data layer).
    """
    # ``collections.Iterable`` was removed in Python 3.10; fall back to the
    # old location only on interpreters that lack ``collections.abc``.
    try:
        from collections.abc import Iterable
    except ImportError:
        from collections import Iterable

    input_dim = None

    def get_iterable(x):
        # A layer call yields either one top or a tuple of tops.
        if isinstance(x, Iterable):
            return x
        return (x, )

    # ------------------------------------------------------------------
    # Pass 1: custom DFS that fixes the processing order and blob names.
    # ------------------------------------------------------------------
    stack = []
    layersProcessed = {}
    processOrder = []
    blobNames = {}
    for layerId in net:
        layersProcessed[layerId] = False
        blobNames[layerId] = {
            'bottom': [],
            'top': [],
        }
    blobId = 0

    def isProcessPossible(layerId):
        # A layer is ready once every one of its inputs has been processed.
        return all(layersProcessed[inputId]
                   for inputId in net[layerId]['connection']['input'])

    # The traversal starts from the data layers.
    for layerId in net:
        if net[layerId]['info']['type'] in ('Data', 'Input', 'HDF5Data'):
            stack.append(layerId)

    def changeTopBlobName(layerId, newName):
        blobNames[layerId]['top'] = newName

    while len(stack):
        # Take the most recently pushed layer whose inputs are all done.
        for i in range(len(stack) - 1, -1, -1):
            if isProcessPossible(stack[i]):
                break
        else:
            # The original code ran off the front of the list here and
            # failed with an unexplained IndexError; keep the exception type
            # but say what went wrong.
            raise IndexError(
                'no pending layer has all of its inputs processed: %r' %
                (stack, ))
        layerId = stack.pop(i)

        inputs = net[layerId]['connection']['input']
        if len(inputs) > 0:
            # NOTE(review): the second test is a truthiness check, so a
            # phase of 0 on inputs[1] does not take this branch; it may
            # have been meant as ``is not None`` -- behaviour kept as is.
            if len(inputs) == 2 and (net[inputs[0]]['info']['phase'] is not None) \
                    and (net[inputs[1]]['info']['phase']):
                # Two phase-specific inputs share one blob name.
                commonBlobName = blobNames[inputs[0]]['top']
                changeTopBlobName(inputs[1], commonBlobName)
                blobNames[layerId]['bottom'] = commonBlobName
            else:
                inputBlobNames = []
                for inputId in inputs:
                    inputBlobNames.extend(blobNames[inputId]['top'])
                blobNames[layerId]['bottom'] = inputBlobNames

        blobNames[layerId]['top'] = ['blob' + str(blobId)]
        blobId = blobId + 1

        for outputId in net[layerId]['connection']['output']:
            if outputId not in stack:
                stack.append(outputId)

        layersProcessed[layerId] = True
        processOrder.append(layerId)

    # ------------------------------------------------------------------
    # Pass 2: emit every layer into the train and/or test NetSpec.
    # ------------------------------------------------------------------
    ns_train = caffe.NetSpec()
    ns_test = caffe.NetSpec()
    bothSpecs = (ns_train, ns_test)

    def specsForPhase(phase):
        # Which specs a phase-tagged layer belongs to.
        if phase is None:
            return bothSpecs
        if int(phase) == 0:
            return (ns_train, )
        if int(phase) == 1:
            return (ns_test, )
        return ()

    def assignTops(ns, layerId, layer):
        # Register the layer's tops in ``ns`` under their blob names.
        for key, value in zip(blobNames[layerId]['top'],
                              get_iterable(layer)):
            ns[key] = value

    def addLayer(layerId, layerFn, specs, **kwargs):
        # Build the layer once per spec, wired to that spec's own bottoms.
        for ns in specs:
            bottoms = [ns[x] for x in blobNames[layerId]['bottom']]
            assignTops(ns, layerId, layerFn(*bottoms, **kwargs))

    def intParams(params, keys):
        # Copy the non-empty entries as ints (values may look like '3.0').
        converted = {}
        for key in keys:
            if params[key] != '':
                converted[key] = int(float(params[key]))
        return converted

    def fillerParams(params):
        # Weight / bias fillers are given by type name only.
        fillers = {}
        for key in ('weight_filler', 'bias_filler'):
            if params[key] != '':
                fillers[key] = {'type': params[key]}
        return fillers

    def parseDims(dimString):
        # A real list is needed: on Python 3 ``map`` returns an iterator.
        return list(map(int, dimString.split(',')))

    learnedParam = [{'lr_mult': 1}, {'lr_mult': 2}]
    poolMethods = {'MAX': 0, 'AVE': 1, 'STOCHASTIC': 2}
    eltwiseOps = {'PROD': 0, 'SUM': 1, 'MAX': 2}

    for layerId in processOrder:

        layer = net[layerId]
        layerParams = layer['params']
        layerType = layer['info']['type']
        layerPhase = layer['info']['phase']

        if layerType in ('Data', 'Input'):
            # Temporary simplification kept from the original: a Data layer
            # is exported as an Input layer with default dimensions.
            if 'dim' not in layerParams:
                layerParams['dim'] = '10,3,224,224'

            input_dim = layerParams['dim']

            inputKwargs = {
                'input_param': {
                    'shape': {
                        'dim': parseDims(input_dim)
                    }
                }
            }
            if layerPhase is not None:
                inputKwargs['include'] = {'phase': int(layerPhase)}
            # Input layers are created without bottoms.
            for ns in specsForPhase(layerPhase):
                assignTops(ns, layerId, L.Input(**inputKwargs))

        elif layerType == 'Crop':
            addLayer(layerId, L.Crop, bothSpecs,
                     crop_param=intParams(layerParams, ('axis', 'offset')))

        elif layerType in ('Convolution', 'Deconvolution'):
            convolution_param = intParams(
                layerParams,
                ('kernel_h', 'kernel_w', 'stride_h', 'stride_w',
                 'num_output', 'pad_h', 'pad_w'))
            convolution_param.update(fillerParams(layerParams))
            addLayer(layerId, getattr(L, layerType), bothSpecs,
                     convolution_param=convolution_param,
                     param=learnedParam)

        elif layerType in ('ReLU', 'Dropout'):
            addLayer(layerId, getattr(L, layerType), bothSpecs,
                     in_place=layerParams['inplace'])

        elif layerType == 'Pooling':
            pooling_param = intParams(
                layerParams,
                ('kernel_h', 'kernel_w', 'stride_h', 'stride_w',
                 'pad_h', 'pad_w'))
            if layerParams['pool'] != '':
                pool = layerParams['pool']
                # Unknown values are passed through unchanged.
                pooling_param['pool'] = poolMethods.get(pool, pool)
            addLayer(layerId, L.Pooling, bothSpecs,
                     pooling_param=pooling_param)

        elif layerType == 'InnerProduct':
            inner_product_param = intParams(layerParams, ('num_output', ))
            inner_product_param.update(fillerParams(layerParams))
            addLayer(layerId, L.InnerProduct, bothSpecs,
                     inner_product_param=inner_product_param,
                     param=learnedParam)

        elif layerType in ('SoftmaxWithLoss', 'Softmax', 'LRN'):
            addLayer(layerId, getattr(L, layerType), bothSpecs)

        elif layerType == 'Accuracy':
            accuracyKwargs = {}
            if layerPhase is not None:
                accuracyKwargs['include'] = {'phase': int(layerPhase)}
            # Bottoms now come from the spec the layer is added to; the
            # original read them from whichever spec an earlier layer's
            # loop happened to leave in ``ns``.
            addLayer(layerId, L.Accuracy, specsForPhase(layerPhase),
                     **accuracyKwargs)

        elif layerType == 'Concat':
            addLayer(layerId, L.Concat, bothSpecs,
                     ntop=len(blobNames[layerId]['top']))

        elif layerType == 'Eltwise':
            if layerParams['operation'] != '':
                elt = layerParams['operation']
                # Unknown values are passed through unchanged.
                elt = eltwiseOps.get(elt, elt)
            else:
                elt = 1  # Default is sum
            addLayer(layerId, L.Eltwise, bothSpecs,
                     eltwise_param={'operation': elt})

        elif layerType == 'Embed':
            addLayer(layerId, L.Embed, bothSpecs,
                     param=[{
                         'lr_mult': 1,
                         'decay_mult': 1
                     }, {
                         'lr_mult': 2,
                         'decay_mult': 0
                     }])

        elif layerType == 'LSTM':
            recurrent_param = intParams(layerParams, ('num_output', ))
            recurrent_param.update(fillerParams(layerParams))
            addLayer(layerId, L.LSTM, bothSpecs,
                     recurrent_param=recurrent_param)

        elif layerType == 'Reshape':
            addLayer(layerId, L.Reshape, bothSpecs,
                     reshape_param={
                         'shape': {
                             'dim': parseDims(layerParams['dim'])
                         }
                     })

        elif layerType == 'HDF5Data':
            hdf5_data_param = {}
            if layerParams['source'] != '':
                hdf5_data_param['source'] = layerParams['source']
            # batch_size is converted like every other numeric parameter.
            hdf5_data_param.update(intParams(layerParams, ('batch_size', )))
            hdf5Kwargs = {'hdf5_data_param': hdf5_data_param}
            if layerPhase is not None:
                hdf5Kwargs['include'] = {'phase': int(layerPhase)}
            # NOTE(review): unlike Input, a phase-tagged HDF5Data layer is
            # still added to both specs (original behaviour) -- confirm
            # whether it should follow the phase instead.
            addLayer(layerId, L.HDF5Data, bothSpecs, **hdf5Kwargs)

        elif layerType == 'BatchNorm':
            batch_norm_param = {}
            if layerParams['use_global_stats'] != '':
                batch_norm_param['use_global_stats'] = layerParams[
                    'use_global_stats']
            addLayer(layerId, L.BatchNorm, bothSpecs,
                     batch_norm_param=batch_norm_param)

        elif layerType == 'Scale':
            scale_param = {}
            if layerParams['bias_term'] != '':
                scale_param['bias_term'] = layerParams['bias_term']
            addLayer(layerId, L.Scale, bothSpecs, scale_param=scale_param)

    # ------------------------------------------------------------------
    # Pass 3: merge the train and test prototxt into one train_test text.
    # ------------------------------------------------------------------
    train = 'name: "' + net_name + '"\n' + str(ns_train.to_proto())
    test = str(ns_test.to_proto())

    # Offsets of every occurrence of the word 'layer' in the test text;
    # consecutive offsets delimit one candidate layer chunk.
    testIndex = [m.start() for m in re.finditer('layer', test)]

    previousIndex = -1
    for i in range(len(testIndex)):
        if i < len(testIndex) - 1:
            layer = test[testIndex[i]:testIndex[i + 1]]
        else:
            layer = test[testIndex[i]:]
        a = train.find(layer)
        if a != -1:
            # This chunk exists in train too: insert the test-only chunks
            # seen since the last common one right before it.
            pending = test[testIndex[previousIndex + 1]:testIndex[i]]
            train = train[0:a] + pending + train[a:]
            previousIndex = i
    if previousIndex < len(testIndex) - 1:
        # Trailing test-only chunks go to the end.
        pending = test[testIndex[previousIndex + 1]:]
        train = train + pending

    prototxt = train

    return prototxt, input_dim
Ejemplo n.º 9
0
def generator_proto(mode, batchsize, T, exp_T, question_vocab_size, exp_vocab_size, use_gt=True):
    """Assemble the VQA-plus-explanation attention network and return its proto.

    The graph has three stages: a two-layer LSTM question encoder, a VQA
    branch (question/image fusion, spatial attention, answer prediction) and
    an explanation branch that attends over the image a second time,
    conditioned on an embedded answer.

    Args:
        mode: Forwarded to the Python data layer inside ``param_str``.
        batchsize: Forwarded in ``param_str`` as well; also sizes the constant
            DummyData blobs handed to the SoftAttention layers.
        T: Number of tops each LSTM output is sliced into along axis 0. Only
            the last slice (index ``T - 1``) is kept; the others are silenced.
        exp_T: Not used by this function.
        question_vocab_size: ``input_dim`` of the question Embed layer.
        exp_vocab_size: Not used by this function.
        use_gt: When true the answer embedding is looked up from ``label``;
            otherwise from the ArgMax of ``prediction``.

    Returns:
        Whatever ``NetSpec.to_proto()`` returns for the assembled net.

    Relies on the module-level names ``fixed_weights`` and
    ``fixed_weights_lstm`` as the ``param`` of the VQA-branch layers.
    """
    # ---- small layer factories; every call builds fresh parameter dicts ----
    def uniform_filler():
        return dict(type='uniform', min=-0.08, max=0.08)

    def xavier():
        return dict(type='xavier')

    def reshape_to(bottom, dims):
        return L.Reshape(bottom, reshape_param=dict(shape=dict(dim=dims)))

    def drop(bottom):
        return L.Dropout(bottom, dropout_param={'dropout_ratio': 0.3})

    def multiply(lhs, rhs):
        return L.Eltwise(lhs, rhs, eltwise_param={'operation': P.Eltwise.PROD})

    def conv1x1(bottom, channels, **extra):
        return L.Convolution(bottom, kernel_size=1, stride=1, num_output=channels,
                             pad=0, weight_filler=xavier(), **extra)

    def dense(bottom, width, **extra):
        return L.InnerProduct(bottom, num_output=width, weight_filler=xavier(), **extra)

    def ones_column():
        return L.DummyData(shape=dict(dim=[batchsize, 1]),
                           data_filler=dict(type='constant', value=1), ntop=1)

    net = caffe.NetSpec()

    def recurrent(bottom):
        return L.LSTM(bottom, net.cont,
                      recurrent_param=dict(num_output=1024,
                                           weight_filler=uniform_filler(),
                                           bias_filler=dict(type='constant', value=0)),
                      param=fixed_weights_lstm)

    def last_timestep(sequence, ordinal):
        # Slice the sequence into T tops, register and silence all but the
        # final one, and hand the final one back to the caller.
        steps = L.Slice(sequence, ntop=T, slice_param={'axis': 0})
        for t in range(T - 1):
            setattr(net, 'slice_%s%d' % (ordinal, t), steps[t])
            setattr(net, 'silence_data_%s%d' % (ordinal, t),
                    L.Silence(steps[t], ntop=0))
        return steps[T - 1]

    # ---- data ---------------------------------------------------------------
    provider_args = json.dumps({'mode': mode, 'batchsize': batchsize})
    (net.data, net.cont, net.img_feature, net.label,
     net.exp, net.exp_out, net.exp_cont_1, net.exp_cont_2) = L.Python(
        module='vqa_data_provider_layer', layer='VQADataProviderLayer',
        param_str=provider_args, ntop=8)

    # ---- question encoder: embedding followed by two stacked LSTMs ----------
    net.embed_ba = L.Embed(net.data, input_dim=question_vocab_size, num_output=300,
                           weight_filler=uniform_filler(), param=fixed_weights)
    net.embed = L.TanH(net.embed_ba)

    net.lstm1 = recurrent(net.embed)
    net.lstm1_out = last_timestep(net.lstm1, 'first')
    net.lstm1_reshaped = reshape_to(net.lstm1_out, [-1, 1024])
    net.lstm1_reshaped_droped = drop(net.lstm1_reshaped)
    net.lstm1_droped = drop(net.lstm1)

    net.lstm2 = recurrent(net.lstm1_droped)
    net.lstm2_out = last_timestep(net.lstm2, 'second')
    net.lstm2_reshaped = reshape_to(net.lstm2_out, [-1, 1024])
    net.lstm2_reshaped_droped = drop(net.lstm2_reshaped)
    net.lstm_12 = L.Concat(net.lstm1_reshaped_droped, net.lstm2_reshaped_droped)

    # ---- tile the question feature over a 14 x 14 grid ----------------------
    net.q_emb_resh = reshape_to(net.lstm_12, [-1, 2048, 1, 1])
    net.q_emb_tiled_1 = L.Tile(net.q_emb_resh, axis=2, tiles=14)
    net.q_emb_resh_tiled = L.Tile(net.q_emb_tiled_1, axis=3, tiles=14)

    # ---- embed the image feature --------------------------------------------
    net.i_emb = conv1x1(net.img_feature, 2048, param=fixed_weights)

    # ---- element-wise product and normalisation -----------------------------
    net.eltwise = multiply(net.q_emb_resh_tiled, net.i_emb)
    net.eltwise_sqrt = L.SignedSqrt(net.eltwise)
    net.eltwise_l2 = L.L2Normalize(net.eltwise_sqrt)
    net.eltwise_drop = drop(net.eltwise_l2)

    # ---- attention for VQA ---------------------------------------------------
    net.att_conv1 = conv1x1(net.eltwise_drop, 512, param=fixed_weights)
    net.att_conv1_relu = L.ReLU(net.att_conv1)
    net.att_conv2 = conv1x1(net.att_conv1_relu, 1, param=fixed_weights)
    net.att_reshaped = reshape_to(net.att_conv2, [-1, 1, 14 * 14])
    net.att_softmax = L.Softmax(net.att_reshaped, axis=2)
    net.att_map = reshape_to(net.att_softmax, [-1, 1, 14, 14])

    net.att_feature = L.SoftAttention(net.img_feature, net.att_map, ones_column())
    net.att_feature_resh = reshape_to(net.att_feature, [-1, 2048])

    # ---- second product + normalisation, then the answer logits -------------
    net.i_emb2 = dense(net.att_feature_resh, 2048, param=fixed_weights)
    net.eltwise2 = multiply(net.lstm_12, net.i_emb2)
    net.eltwise2_sqrt = L.SignedSqrt(net.eltwise2)
    net.eltwise2_l2 = L.L2Normalize(net.eltwise2_sqrt)
    net.eltwise2_drop = drop(net.eltwise2_l2)

    net.prediction = dense(net.eltwise2_drop, 3000, param=fixed_weights)

    # ---- answer to embed: ground truth, or the model's own argmax -----------
    if use_gt:
        answer_ids = net.label
    else:
        net.vqa_ans = L.ArgMax(net.prediction, axis=1)
        answer_ids = net.vqa_ans
    net.exp_emb_ans = L.Embed(answer_ids, input_dim=3000, num_output=300,
                              weight_filler=uniform_filler())
    net.exp_emb_ans_tanh = L.TanH(net.exp_emb_ans)
    net.exp_emb_ans2 = dense(net.exp_emb_ans_tanh, 2048)

    # ---- merge the answer with the visual + textual feature -----------------
    net.exp_emb_resh = reshape_to(net.exp_emb_ans2, [-1, 2048, 1, 1])
    net.exp_emb_tiled_1 = L.Tile(net.exp_emb_resh, axis=2, tiles=14)
    net.exp_emb_tiled = L.Tile(net.exp_emb_tiled_1, axis=3, tiles=14)

    # The explanation branch starts from the un-normalised `eltwise` product
    # passed through a 1x1 convolution (not from `eltwise_drop`).
    net.eltwise_emb = conv1x1(net.eltwise, 2048)
    net.exp_eltwise = multiply(net.eltwise_emb, net.exp_emb_tiled)
    net.exp_eltwise_sqrt = L.SignedSqrt(net.exp_eltwise)
    net.exp_eltwise_l2 = L.L2Normalize(net.exp_eltwise_sqrt)
    net.exp_eltwise_drop = drop(net.exp_eltwise_l2)

    # ---- attention for the explanation ---------------------------------------
    net.exp_att_conv1 = conv1x1(net.exp_eltwise_drop, 512)
    net.exp_att_conv1_relu = L.ReLU(net.exp_att_conv1)
    net.exp_att_conv2 = conv1x1(net.exp_att_conv1_relu, 1)
    net.exp_att_reshaped = reshape_to(net.exp_att_conv2, [-1, 1, 14 * 14])
    net.exp_att_softmax = L.Softmax(net.exp_att_reshaped, axis=2)
    net.exp_att_map = reshape_to(net.exp_att_softmax, [-1, 1, 14, 14])

    net.exp_att_feature_prev = L.SoftAttention(net.img_feature, net.exp_att_map,
                                               ones_column())
    net.exp_att_feature_resh = reshape_to(net.exp_att_feature_prev, [-1, 2048])
    net.exp_att_feature_embed = dense(net.exp_att_feature_resh, 2048)
    net.exp_lstm12_embed = dense(net.lstm_12, 2048)
    net.exp_eltwise2 = multiply(net.exp_lstm12_embed, net.exp_att_feature_embed)
    net.exp_att_feature = multiply(net.exp_emb_ans2, net.exp_eltwise2)

    net.silence_exp_att = L.Silence(net.exp_att_feature, ntop=0)

    return net.to_proto()
Ejemplo n.º 10
0
def qlstm(mode, batchsize, T, question_vocab_size):
    """Build the two-LSTM / compact-bilinear attention VQA net and return its proto.

    Pipeline, as assembled below: word embedding concatenated with GloVe
    vectors -> two stacked LSTMs whose last time steps are concatenated ->
    compact bilinear fusion with the image feature -> two soft attention maps
    -> second compact bilinear fusion with the question -> 3000-way
    InnerProduct -> SoftmaxWithLoss.

    Args:
        mode: Forwarded to the Python data layer inside ``param_str``.
        batchsize: Forwarded in ``param_str``; also sizes the constant
            DummyData blob passed to the SoftAttention layers.
        T: Number of tops each LSTM output is sliced into along axis 0. Only
            the last slice (index ``T - 1``) is used; the rest are silenced.
        question_vocab_size: ``input_dim`` of the question Embed layer.

    Returns:
        Whatever ``NetSpec.to_proto()`` returns for the assembled net.
    """
    # NetSpec builds the network in Python, without a hand-written prototxt.
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})

    # Python data layer
    # (https://stackoverflow.com/questions/41344168/what-is-a-python-layer-in-caffe):
    #   module    = name of the Python file
    #   layer     = Python class implementing the layer interface
    #   param_str = JSON parameters for data loading; presumably exposed to the
    #               class as self.param_str -- confirm against the layer code
    #   ntop      = number of top blobs the layer produces
    # The actual batch loading happens inside that class.
    #
    # n.glove: pretrained GloVe vectors (Global Vectors for Word
    # Representation, https://www.aclweb.org/anthology/D14-1162), concatenated
    # with the learned embedding below.
    # n.img_feature: per the original author's note, an already preprocessed
    # (ResNet + L2-normalised) feature vector -- not verifiable from here.
    n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(
        module='vqa_data_provider_layer', layer='VQADataProviderLayer',
        param_str=mode_str, ntop=5)

    # Learned word embedding: each vocabulary index becomes a 300-d vector.
    n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size, num_output=300,
                         weight_filler=dict(type='uniform', min=-0.08, max=0.08))
    n.embed = L.TanH(n.embed_ba)
    # Concatenate with the GloVe vectors along axis 2.
    concat_word_embed = [n.embed, n.glove]
    n.concat_embed = L.Concat(*concat_word_embed,
                              concat_param={'axis': 2})  # T x N x 600

    # LSTM1
    n.lstm1 = L.LSTM(
        n.concat_embed, n.cont,
        recurrent_param=dict(
            num_output=1024,
            weight_filler=dict(type='uniform', min=-0.08, max=0.08),
            bias_filler=dict(type='constant', value=0)))
    tops1 = L.Slice(n.lstm1, ntop=T, slice_param={'axis': 0})
    # `range` (not the Python-2-only `xrange`) so this runs on Python 2 and 3.
    for i in range(T - 1):
        setattr(n, 'slice_first' + str(i), tops1[i])
        setattr(n, 'silence_data_first' + str(i), L.Silence(tops1[i], ntop=0))
    n.lstm1_out = tops1[T - 1]
    n.lstm1_reshaped = L.Reshape(
        n.lstm1_out, reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.lstm1_reshaped_droped = L.Dropout(n.lstm1_reshaped,
                                        dropout_param={'dropout_ratio': 0.3})
    n.lstm1_droped = L.Dropout(n.lstm1, dropout_param={'dropout_ratio': 0.3})

    # LSTM2
    n.lstm2 = L.LSTM(
        n.lstm1_droped, n.cont,
        recurrent_param=dict(
            num_output=1024,
            weight_filler=dict(type='uniform', min=-0.08, max=0.08),
            bias_filler=dict(type='constant', value=0)))
    tops2 = L.Slice(n.lstm2, ntop=T, slice_param={'axis': 0})

    # Assigning a top to an attribute of `n` gives it that name in the net
    # (see https://www.programcreek.com/python/example/107865/caffe.NetSpec).
    for i in range(T - 1):
        setattr(n, 'slice_second' + str(i), tops2[i])
        setattr(n, 'silence_data_second' + str(i), L.Silence(tops2[i], ntop=0))

    # Only the last LSTM output is used.
    n.lstm2_out = tops2[T - 1]
    n.lstm2_reshaped = L.Reshape(
        n.lstm2_out, reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.lstm2_reshaped_droped = L.Dropout(n.lstm2_reshaped,
                                        dropout_param={'dropout_ratio': 0.3})
    # Question feature = concat of both (reshaped to 1024, dropped-out) LSTM outputs.
    concat_botom = [n.lstm1_reshaped_droped, n.lstm2_reshaped_droped]
    n.lstm_12 = L.Concat(*concat_botom)

    n.q_emb_tanh_droped_resh = L.Reshape(
        n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    # Dimensions are not matched automatically, so the question feature is
    # tiled explicitly: 14 copies along axis 2, then 14 along axis 3.
    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.q_emb_tanh_droped_resh,
                                              axis=2,
                                              tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1,
                                            axis=3,
                                            tiles=14)

    n.i_emb_tanh_droped_resh = L.Reshape(
        n.img_feature, reshape_param=dict(shape=dict(dim=[-1, 2048, 14, 14])))

    n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh_tiled,
                               n.i_emb_tanh_droped_resh,
                               compact_bilinear_param=dict(num_output=16000,
                                                           sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    # Author's note: this dropout is not in the paper's figure.
    n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,
                              dropout_param={'dropout_ratio': 0.1})

    # multi-channel attention
    n.att_conv1 = L.Convolution(n.blcf_droped,
                                kernel_size=1,
                                stride=1,
                                num_output=512,
                                pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    # Author's note: unlike the paper's figure, the output has 2 channels.
    n.att_conv2 = L.Convolution(n.att_conv1_relu,
                                kernel_size=1,
                                stride=1,
                                num_output=2,
                                pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_reshaped = L.Reshape(
        n.att_conv2, reshape_param=dict(shape=dict(dim=[-1, 2, 14 * 14])))
    # Softmax over the 14*14 positions yields two attention maps.
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)

    n.att = L.Reshape(n.att_softmax,
                      reshape_param=dict(shape=dict(dim=[-1, 2, 14, 14])))
    # Split the two attention maps apart.
    att_maps = L.Slice(n.att, ntop=2, slice_param={'axis': 1})
    n.att_map0 = att_maps[0]
    n.att_map1 = att_maps[1]

    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1),
                        ntop=1)
    n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0,
                                     dummy)
    n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1,
                                     dummy)
    n.att_feature0_resh = L.Reshape(
        n.att_feature0, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature1_resh = L.Reshape(
        n.att_feature1, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    # Concatenate the two attended features.
    n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)

    # merge attention and lstm with compact bilinear pooling
    n.att_feature_resh = L.Reshape(
        n.att_feature, reshape_param=dict(shape=dict(dim=[-1, 4096, 1, 1])))

    n.lstm_12_resh = L.Reshape(
        n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))

    # Author's note: unlike the paper (2048 x 2048), the two inputs here have
    # different sizes (4096 and 2048).
    n.bc_att_lstm = L.CompactBilinear(n.att_feature_resh,
                                      n.lstm_12_resh,
                                      compact_bilinear_param=dict(
                                          num_output=16000, sum_pool=False))
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)

    n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2,
                             dropout_param={'dropout_ratio': 0.1})
    n.bc_dropped_resh = L.Reshape(
        n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000])))

    # Fully connected classifier over the 3000 answers.
    n.prediction = L.InnerProduct(n.bc_dropped_resh,
                                  num_output=3000,
                                  weight_filler=dict(type='xavier'))

    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
Ejemplo n.º 11
0
def mfh_baseline(mode, batchsize, T, question_vocab_size, folder):
    """Build the MFH baseline VQA net (no attention) and return its proto.

    A two-layer LSTM encodes the question; the question feature and the image
    feature are then fused by two cascaded factorised-bilinear stages (``o2``
    and ``o3``) whose normalised outputs are concatenated and classified.

    Args:
        mode: Forwarded to the Python data layer in ``param_str``. ``'val'``
            selects the ``vqa_data_layer`` module and a SoftmaxWithLoss; any
            other value selects ``vqa_data_layer_kld`` and a SoftmaxKLDLoss.
        batchsize: Forwarded in ``param_str``.
        T: Not used by this function; the sequence length comes from
            ``config.MAX_WORDS_IN_QUESTION``.
        question_vocab_size: ``input_dim`` of the question Embed layer.
        folder: Forwarded in ``param_str``.

    Returns:
        Whatever ``NetSpec.to_proto()`` returns for the assembled net.

    Reads these attributes of the module-level ``config``: LSTM_UNIT_NUM,
    MAX_WORDS_IN_QUESTION, LSTM_DROPOUT_RATIO, JOINT_EMB_SIZE,
    MFB_DROPOUT_RATIO, MFB_OUT_DIM, MFB_FACTOR_NUM, NUM_OUTPUT_UNITS.
    """
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode':mode, 'batchsize':batchsize,'folder':folder})

    # Both modes expose the same four tops; only the provider module differs.
    data_module = 'vqa_data_layer' if mode == 'val' else 'vqa_data_layer_kld'
    n.data, n.cont, n.img_feature, n.label = L.Python(
        module=data_module, layer='VQADataProviderLayer',
        param_str=mode_str, ntop=4)

    n.embed = L.Embed(n.data, input_dim=question_vocab_size, num_output=300,
                      weight_filler=dict(type='xavier'))
    n.embed_tanh = L.TanH(n.embed)

    # LSTM1
    # NOTE(review): the LSTM consumes n.embed, so n.embed_tanh above has no
    # consumer in this net. An earlier, disabled revision of this block fed
    # n.embed_tanh instead -- confirm which one is intended.
    n.lstm1 = L.LSTM(
        n.embed, n.cont,
        recurrent_param=dict(
            num_output=config.LSTM_UNIT_NUM,
            weight_filler=dict(type='xavier')))
    tops1 = L.Slice(n.lstm1, ntop=config.MAX_WORDS_IN_QUESTION, slice_param={'axis':0})
    # `range` (not the Python-2-only `xrange`) so this runs on Python 2 and 3.
    for i in range(config.MAX_WORDS_IN_QUESTION - 1):
        setattr(n, 'slice_first' + str(i), tops1[i])
        setattr(n, 'silence_data_first' + str(i), L.Silence(tops1[i], ntop=0))
    n.lstm1_out = tops1[config.MAX_WORDS_IN_QUESTION - 1]
    # NOTE(review): 1024 is hard-coded here although the LSTM width is
    # config.LSTM_UNIT_NUM -- the two presumably must agree.
    n.lstm1_reshaped = L.Reshape(
        n.lstm1_out, reshape_param=dict(shape=dict(dim=[-1,1024])))
    n.lstm1_reshaped_droped = L.Dropout(
        n.lstm1_reshaped, dropout_param={'dropout_ratio':config.LSTM_DROPOUT_RATIO})
    n.lstm1_droped = L.Dropout(
        n.lstm1, dropout_param={'dropout_ratio':config.LSTM_DROPOUT_RATIO})

    # LSTM2
    n.lstm2 = L.LSTM(
        n.lstm1_droped, n.cont,
        recurrent_param=dict(
            num_output=config.LSTM_UNIT_NUM,
            weight_filler=dict(type='xavier')))
    tops2 = L.Slice(n.lstm2, ntop=config.MAX_WORDS_IN_QUESTION, slice_param={'axis':0})
    for i in range(config.MAX_WORDS_IN_QUESTION - 1):
        setattr(n, 'slice_second' + str(i), tops2[i])
        setattr(n, 'silence_data_second' + str(i), L.Silence(tops2[i], ntop=0))
    n.lstm2_out = tops2[config.MAX_WORDS_IN_QUESTION - 1]
    n.lstm2_reshaped = L.Reshape(
        n.lstm2_out, reshape_param=dict(shape=dict(dim=[-1,1024])))
    n.lstm2_reshaped_droped = L.Dropout(
        n.lstm2_reshaped, dropout_param={'dropout_ratio':config.LSTM_DROPOUT_RATIO})
    concat_botom = [n.lstm1_reshaped_droped, n.lstm2_reshaped_droped]
    n.q_feat = L.Concat(*concat_botom)

    # ---- Coarse Image-Question MFH fusion -----------------------------------
    # Stage o2: project both modalities, combine element-wise, sum-pool over
    # the factor dimension, then signed-sqrt + L2 normalise.
    # operation=0 is PROD in caffe.proto's EltwiseOp enum (verify against the
    # Caffe build in use).
    n.mfb_q_o2_proj = L.InnerProduct(n.q_feat, num_output=config.JOINT_EMB_SIZE,
                                     weight_filler=dict(type='xavier'))
    n.mfb_i_o2_proj = L.InnerProduct(n.img_feature, num_output=config.JOINT_EMB_SIZE,
                                     weight_filler=dict(type='xavier'))
    n.mfb_iq_o2_eltwise = L.Eltwise(n.mfb_q_o2_proj, n.mfb_i_o2_proj,
                                    eltwise_param=dict(operation=0))
    n.mfb_iq_o2_drop = L.Dropout(
        n.mfb_iq_o2_eltwise, dropout_param={'dropout_ratio':config.MFB_DROPOUT_RATIO})
    n.mfb_iq_o2_resh = L.Reshape(
        n.mfb_iq_o2_drop,
        reshape_param=dict(shape=dict(dim=[-1,1,config.MFB_OUT_DIM,config.MFB_FACTOR_NUM])))
    n.mfb_iq_o2_sumpool = L.Pooling(
        n.mfb_iq_o2_resh, pool=P.Pooling.SUM,
        pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1))
    n.mfb_o2_out = L.Reshape(
        n.mfb_iq_o2_sumpool,
        reshape_param=dict(shape=dict(dim=[-1,config.MFB_OUT_DIM])))
    n.mfb_o2_sign_sqrt = L.SignedSqrt(n.mfb_o2_out)
    n.mfb_o2_l2 = L.L2Normalize(n.mfb_o2_sign_sqrt)

    # Stage o3: same shape as o2, but the element-wise combination also takes
    # the dropped-out o2 product as a third input (the cascade).
    n.mfb_q_o3_proj = L.InnerProduct(n.q_feat, num_output=config.JOINT_EMB_SIZE,
                                     weight_filler=dict(type='xavier'))
    n.mfb_i_o3_proj = L.InnerProduct(n.img_feature, num_output=config.JOINT_EMB_SIZE,
                                     weight_filler=dict(type='xavier'))
    n.mfb_iq_o3_eltwise = L.Eltwise(n.mfb_q_o3_proj, n.mfb_i_o3_proj, n.mfb_iq_o2_drop,
                                    eltwise_param=dict(operation=0))
    n.mfb_iq_o3_drop = L.Dropout(
        n.mfb_iq_o3_eltwise, dropout_param={'dropout_ratio':config.MFB_DROPOUT_RATIO})
    n.mfb_iq_o3_resh = L.Reshape(
        n.mfb_iq_o3_drop,
        reshape_param=dict(shape=dict(dim=[-1,1,config.MFB_OUT_DIM,config.MFB_FACTOR_NUM])))
    n.mfb_iq_o3_sumpool = L.Pooling(
        n.mfb_iq_o3_resh, pool=P.Pooling.SUM,
        pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1))
    n.mfb_o3_out = L.Reshape(
        n.mfb_iq_o3_sumpool,
        reshape_param=dict(shape=dict(dim=[-1,config.MFB_OUT_DIM])))
    n.mfb_o3_sign_sqrt = L.SignedSqrt(n.mfb_o3_out)
    n.mfb_o3_l2 = L.L2Normalize(n.mfb_o3_sign_sqrt)

    n.mfb_o23_l2 = L.Concat(n.mfb_o2_l2, n.mfb_o3_l2)

    n.prediction = L.InnerProduct(n.mfb_o23_l2, num_output=config.NUM_OUTPUT_UNITS,
                                  weight_filler=dict(type='xavier'))

    if mode == 'val':
        n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    else:
        n.loss = L.SoftmaxKLDLoss(n.prediction, n.label)
    return n.to_proto()
def creatNet(data_path,
             batch_size,
             DB_NAME_SAMPLES,
             DB_NAME_LABELS,
             DB_NAME_CLIP_MARKERS,
             DB_NAME_LOGICAL_LABELS,
             DB_NAME_SAMPLE_INDEX,
             image_height,
             image_width,
             channel_num,
             image_num_per_sequence):
    """Build a BatchNorm/conv front end feeding an LSTM and return the NetSpec.

    Current structure::

        samples        labels     clip_markers
        BatchNorm      Reshape    Reshape
        Scale, ReLU
        Convolution
        Pooling
        BatchNorm, Scale, ReLU
        InnerProduct
        ReLU
        Dropout
        Reshape
                       LSTM
                       InnerProduct

    Args:
        data_path: Not used by this function.
        batch_size: Batch size of every LMDB Data layer.
        DB_NAME_SAMPLES: LMDB source of the samples.
        DB_NAME_LABELS: LMDB source of the labels.
        DB_NAME_CLIP_MARKERS: LMDB source of the LSTM clip markers.
        DB_NAME_LOGICAL_LABELS: Not used by this function.
        DB_NAME_SAMPLE_INDEX: LMDB source of the sample indexes.
        image_height: Not used by this function.
        image_width: Not used by this function.
        channel_num: Not used by this function.
        image_num_per_sequence: Images per sequence; first dimension of the
            reshapes that feed the LSTM.

    Returns:
        The ``caffe.NetSpec`` itself (the caller must call ``to_proto()``).
    """
    # Floor division: the result is written into integer reshape dims, and a
    # plain `/` would produce a float on Python 3.
    sequence_num_per_batch = batch_size // image_num_per_sequence

    # DEFINE THE NETWORK ARCHITECTURE
    net = caffe.NetSpec()

    # DATA LAYERS: samples, labels, clip markers, sample indexes
    net.samples = L.Data(batch_size=batch_size,
                         backend=P.Data.LMDB,
                         source=DB_NAME_SAMPLES,
                         transform_param={'scale': 0.00390625})

    net.labels = L.Data(batch_size=batch_size,
                        backend=P.Data.LMDB,
                        source=DB_NAME_LABELS)

    net.clip_markers = L.Data(batch_size=batch_size,
                              backend=P.Data.LMDB,
                              source=DB_NAME_CLIP_MARKERS)

    net.sample_indexes = L.Data(batch_size=batch_size,
                                backend=P.Data.LMDB,
                                source=DB_NAME_SAMPLE_INDEX)

    # Batch normalization on the raw samples (layers added 2016-08-09)
    net.batch_normalization_1 = L.BatchNorm(net.samples)

    net.scale_1 = L.Scale(net.batch_normalization_1,
                          scale_param={'bias_term': True})

    net.relu_3 = L.ReLU(net.scale_1)

    # Convolution layer (added 2016-08-08)
    net.convolution_1 = L.Convolution(net.relu_3,
                                      param=[{'lr_mult': 1, 'decay_mult': 1},
                                             {'lr_mult': 2, 'decay_mult': 0}],
                                      convolution_param={'num_output': 96,
                                                         'kernel_size': 7,
                                                         'stride': 2,
                                                         'weight_filler': {'type': 'gaussian', 'std': 0.01},
                                                         'bias_filler': {'type': 'constant', 'value': 0.1}})

    # Pooling
    net.pooling_1 = L.Pooling(net.convolution_1,
                              pooling_param={'pool': P.Pooling.MAX,
                                             'kernel_size': 3,
                                             'stride': 2})

    net.batch_normalization_2 = L.BatchNorm(net.pooling_1)

    net.scale_2 = L.Scale(net.batch_normalization_2,
                          scale_param={'bias_term': True})

    # ReLU
    net.relu_1 = L.ReLU(net.scale_2)

    # Inner product
    net.inner_product_1 = L.InnerProduct(net.relu_1,
                                         param=[{'lr_mult': 1, 'decay_mult': 1},
                                                {'lr_mult': 2, 'decay_mult': 0}],
                                         inner_product_param={'num_output': 4096,
                                                              'weight_filler': {'type': 'gaussian', 'std': 0.01},
                                                              'bias_filler': {'type': 'constant', 'value': 0.1}})

    # ReLU
    net.relu_2 = L.ReLU(net.inner_product_1)

    net.dropout_1 = L.Dropout(net.relu_2,
                              dropout_param={'dropout_ratio': 0.9})

    # Reshape samples, labels and clip markers so that the first axis has
    # image_num_per_sequence entries and the second sequence_num_per_batch.
    net.reshape_sample_1 = L.Reshape(net.dropout_1,
                                     reshape_param={'shape': {'dim': [image_num_per_sequence, sequence_num_per_batch, 4096]}})

    net.reshape_label_1 = L.Reshape(net.labels,
                                    reshape_param={'shape': {'dim': [image_num_per_sequence, sequence_num_per_batch]}})

    net.reshape_clip_markers_1 = L.Reshape(net.clip_markers,
                                           reshape_param={'shape': {'dim': [image_num_per_sequence, sequence_num_per_batch]}})

    # LSTM network
    net.lstm_1 = L.LSTM(net.reshape_sample_1,
                        net.reshape_clip_markers_1,
                        recurrent_param={'num_output': 256,
                                         'weight_filler': {'type': 'uniform', 'min': -0.01, 'max': 0.01},
                                         'bias_filler': {'type': 'constant', 'value': 0}})

    # NOTE(review): dropout_2 has no consumer -- inner_product_2 below reads
    # net.lstm_1 directly. Confirm whether the dropout was meant to be used.
    net.dropout_2 = L.Dropout(net.lstm_1,
                              dropout_param={'dropout_ratio': 0.5})

    # INNERPRODUCT LAYER
    net.inner_product_2 = L.InnerProduct(net.lstm_1,
                                         param=[{'lr_mult': 1, 'decay_mult': 1},
                                                {'lr_mult': 2, 'decay_mult': 0}],
                                         inner_product_param={'num_output': 10,
                                                              'weight_filler': {'type': 'gaussian', 'std': 0.01},
                                                              'bias_filler': {'type': 'constant', 'value': 0},
                                                              'axis': -2})

    # NOTE(review): this Slice has no bottom and no parameters; it looks like
    # an unfinished placeholder. Kept as-is so the returned spec is unchanged.
    net.slice_ip12_1 = L.Slice()

    # Disabled work in progress, kept for reference:
    #
    #     net.inner_product_3 = L.InnerProduct(net.inner_product_2,
    #                                          inner_product_param={'num_output': 1,
    #                                                               'weight_filler': {'type': 'gaussian', 'std': 0.01},
    #                                                               'bias_filler': {'type': 'constant', 'value': 0},
    #                                                               'axis': 0})
    #
    #     [net.slice_label_1, net.slice_label_2] = L.Slice(net.reshape_label_1,
    #                                                      ntop = 2,
    #                                                      slice_param = {'axis': 0,
    #                                                                     'slice_point': 1})
    #
    #     net.reshape_slice_label_1 = L.Reshape(net.slice_label_1,
    #                                  reshape_param={ 'shape': {'dim': [1, sequence_num_per_batch]}})
    #
    #     # LOSS LAYER
    #     net.loss = L.SoftmaxWithLoss(net.inner_product_3,
    #                                  net.reshape_slice_label_1)
    #
    #     # Accuracy layer
    #     net.accuracy = L.Accuracy(net.inner_product_3,
    #                               net.reshape_slice_label_1)

    # Author's design note on the reshapes (not fully implemented above):
    #
    # Data from the database arrives sequence by sequence:
    #     sample (sequence = 1, time = 1)
    #     sample (sequence = 1, time = 2)
    #                  ...
    #     sample (sequence = 1, time = T)
    #     sample (sequence = 2, time = 1)
    #
    # whereas the LSTM wants it time-major:
    #     sample (s = 1, t = 1), sample (s = 2, t = 1), ... sample (s = N, t = 1)
    #     sample (s = 1, t = 2), sample (s = 2, t = 2), ... sample (s = N, t = 2)
    #                                  ...
    #     sample (s = 1, t = T), sample (s = 2, t = T), ... sample (s = N, t = T)
    #
    # The note states that the reshape should be done twice because Caffe's
    # built-in Reshape fills in row-major order; the raw sample blob is
    # (T * N) * h * w. The desired output shape was left unspecified.
    return net
Ejemplo n.º 13
0
def dis_net(mode, batchsize, T, exp_T, question_vocab_size, exp_vocab_size):
    """Build the explanation discriminator net and return its NetParameter.

    The explanation tokens and the answer label are embedded, the
    explanation embedding is run through two stacked 1024-unit LSTMs, and
    the last time slice of both LSTM outputs is concatenated with the
    answer embedding and scored by a single-output InnerProduct layer that
    feeds a SoftmaxWithLoss against ``dis_label``.

    Args:
        mode: Forwarded to the Python data layer inside the JSON
            ``param_str``.
        batchsize: Forwarded to the data layer and used as the second
            dimension when the projected explanation embedding is reshaped.
        T: Unused. Kept only so existing callers keep working; slicing of
            the LSTM outputs is driven by ``exp_T``.
        exp_T: Number of tops each LSTM output is sliced into along axis 0.
        question_vocab_size: Unused by this builder.
        exp_vocab_size: ``input_dim`` and ``num_output`` of the explanation
            Embed layer.

    Returns:
        The result of ``caffe.NetSpec.to_proto()`` for the assembled net.

    Note:
        ``fixed_weights_lstm`` is not defined in this function; it has to
        be available at module level.
    """
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    (n.data, n.cont, n.img_feature, n.label, n.exp, n.exp_out, n.exp_cont_1,
     n.exp_cont_2, n.dis_label) = L.Python(
         module='vqa_data_provider_layer',
         layer='VQADataProviderLayer',
         param_str=mode_str,
         ntop=9)

    # Explanation tokens: Embed -> TanH -> 300-d projection -> TanH.
    # Original author's note: n.vqa_exp = batch x seq x exp_vocab_size.
    n.vqa_exp_emb = L.Embed(
        n.exp,
        input_dim=exp_vocab_size,
        num_output=exp_vocab_size,
        weight_filler=dict(type='uniform', min=-0.08, max=0.08))
    n.vqa_exp = L.TanH(n.vqa_exp_emb)
    n.vqa_exp_reshape = L.Reshape(
        n.vqa_exp, reshape_param=dict(shape=dict(dim=[-1, exp_vocab_size])))
    n.exp_embed_ba = L.InnerProduct(n.vqa_exp_reshape,
                                    num_output=300,
                                    weight_filler=dict(type='xavier'))
    n.exp_embed_ba_reshape = L.Reshape(
        n.exp_embed_ba,
        reshape_param=dict(shape=dict(dim=[-1, batchsize, 300])))
    n.exp_embed = L.TanH(n.exp_embed_ba_reshape)

    # Embed VQA GT answer during training.
    # Original author's note: n.vqa_ans = batch x seq x 3000.
    n.vqa_ans = L.Embed(
        n.label,
        input_dim=3000,
        num_output=3000,
        weight_filler=dict(type='uniform', min=-0.08, max=0.08))
    n.exp_emb_ans = L.InnerProduct(n.vqa_ans,
                                   num_output=300,
                                   weight_filler=dict(type='xavier'))
    n.exp_emb_ans_tanh = L.TanH(n.exp_emb_ans)
    n.exp_emb_ans2 = L.InnerProduct(n.exp_emb_ans_tanh,
                                    num_output=2048,
                                    weight_filler=dict(type='xavier'))

    # Exp LSTM1
    n.exp_lstm1 = L.LSTM(n.exp_embed,
                         n.exp_cont_1,
                         recurrent_param=dict(num_output=1024,
                                              weight_filler=dict(
                                                  type='uniform',
                                                  min=-0.08,
                                                  max=0.08),
                                              bias_filler=dict(type='constant',
                                                               value=0)),
                         param=fixed_weights_lstm)
    exp_tops1 = L.Slice(n.exp_lstm1, ntop=exp_T, slice_param={'axis': 0})
    # The output is cut into exp_T slices, so the bookkeeping must also run
    # over exp_T (the original used T here, which only matched when
    # T == exp_T). Every slice but the last is named and silenced.
    for i in range(exp_T - 1):
        setattr(n, 'slice_first' + str(i), exp_tops1[i])
        setattr(n, 'silence_data_first' + str(i),
                L.Silence(exp_tops1[i], ntop=0))
    n.exp_lstm1_out = exp_tops1[exp_T - 1]
    n.exp_lstm1_reshaped = L.Reshape(
        n.exp_lstm1_out, reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.exp_lstm1_reshaped_droped = L.Dropout(
        n.exp_lstm1_reshaped, dropout_param={'dropout_ratio': 0.3})
    # The full (unsliced) LSTM1 output, after dropout, feeds LSTM2.
    n.exp_lstm1_droped = L.Dropout(n.exp_lstm1,
                                   dropout_param={'dropout_ratio': 0.3})

    # Exp LSTM2
    n.exp_lstm2 = L.LSTM(n.exp_lstm1_droped,
                         n.exp_cont_2,
                         recurrent_param=dict(num_output=1024,
                                              weight_filler=dict(
                                                  type='uniform',
                                                  min=-0.08,
                                                  max=0.08),
                                              bias_filler=dict(type='constant',
                                                               value=0)),
                         param=fixed_weights_lstm)
    exp_tops2 = L.Slice(n.exp_lstm2, ntop=exp_T, slice_param={'axis': 0})
    for i in range(exp_T - 1):
        setattr(n, 'slice_second' + str(i), exp_tops2[i])
        setattr(n, 'silence_data_second' + str(i),
                L.Silence(exp_tops2[i], ntop=0))
    n.exp_lstm2_out = exp_tops2[exp_T - 1]
    n.exp_lstm2_reshaped = L.Reshape(
        n.exp_lstm2_out, reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.exp_lstm2_reshaped_droped = L.Dropout(
        n.exp_lstm2_reshaped, dropout_param={'dropout_ratio': 0.3})
    concat_botom = [n.exp_lstm1_reshaped_droped, n.exp_lstm2_reshaped_droped]
    n.exp_lstm_12 = L.Concat(*concat_botom)

    # Concatenate the answer embedding with the explanation encoding.
    concat_ans_exp = [n.exp_emb_ans2, n.exp_lstm_12]
    n.concat_ans_exp_layer = L.Concat(*concat_ans_exp)
    # NOTE(review): this scores with num_output=1 and then applies
    # SoftmaxWithLoss; a softmax over a single output is constant. Confirm
    # whether a 2-way output or a sigmoid-style loss was intended.
    n.concat_ans_exp_layer_dis = L.InnerProduct(
        n.concat_ans_exp_layer,
        num_output=1,
        weight_filler=dict(type='xavier'))
    n.discr_loss = L.SoftmaxWithLoss(n.concat_ans_exp_layer_dis,
                                     n.dis_label,
                                     loss_param=dict(ignore_label=-1))

    return n.to_proto()
Ejemplo n.º 14
0
def pj_x(mode, batchsize, exp_T, exp_vocab_size):
    """Assemble the activity prediction + explanation net as a NetParameter.

    The net has two heads built on the same image feature blob:

    * an attention-pooled classifier (``prediction`` / ``loss`` /
      ``accuracy``), and
    * an explanation generator: the answer label is embedded, merged with
      the image features to drive a second attention map, and the attended
      feature is multiplied into the output of a first LSTM before a
      second LSTM predicts explanation tokens (``exp_prediction`` /
      ``exp_loss`` / ``exp_accuracy``).

    Args:
        mode: Forwarded to the Python data layer inside the JSON
            ``param_str``.
        batchsize: Forwarded to the data layer; also the first dimension of
            the constant DummyData blobs passed to SoftAttention.
        exp_T: Tile count along axis 0 for the attended explanation feature.
        exp_vocab_size: ``input_dim`` of the explanation Embed layer and
            ``num_output`` of the final InnerProduct.

    Returns:
        The result of ``caffe.NetSpec.to_proto()``.
    """

    # --- small layer factories; each call builds fresh parameter dicts ---
    def xavier():
        return dict(type='xavier')

    def uniform_filler():
        return dict(type='uniform', min=-0.08, max=0.08)

    def reshape_to(blob, dims):
        return L.Reshape(blob, reshape_param=dict(shape=dict(dim=dims)))

    def pointwise_conv(blob, channels):
        return L.Convolution(blob,
                             kernel_size=1,
                             stride=1,
                             num_output=channels,
                             pad=0,
                             weight_filler=xavier())

    def fully_connected(blob, width, **extra):
        return L.InnerProduct(blob,
                              num_output=width,
                              weight_filler=xavier(),
                              **extra)

    def drop(blob):
        return L.Dropout(blob, dropout_param={'dropout_ratio': 0.3})

    def multiply(lhs, rhs):
        return L.Eltwise(lhs,
                         rhs,
                         eltwise_param={'operation': P.Eltwise.PROD})

    def ones_blob():
        return L.DummyData(shape=dict(dim=[batchsize, 1]),
                           data_filler=dict(type='constant', value=1),
                           ntop=1)

    def lstm(blob, cont, hidden):
        return L.LSTM(blob,
                      cont,
                      recurrent_param=dict(
                          num_output=hidden,
                          weight_filler=uniform_filler(),
                          bias_filler=dict(type='constant', value=0)))

    net = caffe.NetSpec()
    provider_args = json.dumps({'mode': mode, 'batchsize': batchsize})
    data_tops = L.Python(module='activity_data_provider_layer',
                         layer='ActivityDataProviderLayer',
                         param_str=provider_args,
                         ntop=6)
    (net.img_feature, net.label, net.exp, net.exp_out, net.exp_cont_1,
     net.exp_cont_2) = data_tops

    # Attention over the image feature map.
    net.att_conv1 = pointwise_conv(net.img_feature, 512)
    net.att_conv1_relu = L.ReLU(net.att_conv1)
    net.att_conv2 = pointwise_conv(net.att_conv1_relu, 1)
    net.att_reshaped = reshape_to(net.att_conv2, [-1, 1, 14 * 14])
    net.att_softmax = L.Softmax(net.att_reshaped, axis=2)
    net.att_map = reshape_to(net.att_softmax, [-1, 1, 14, 14])

    net.att_feature = L.SoftAttention(net.img_feature, net.att_map,
                                      ones_blob())
    net.att_feature_resh = reshape_to(net.att_feature, [-1, 2048])

    # Classification head.
    net.prediction = fully_connected(net.att_feature_resh,
                                     config.NUM_OUTPUT_UNITS)
    net.loss = L.SoftmaxWithLoss(net.prediction, net.label)
    net.accuracy = L.Accuracy(net.prediction, net.label)

    # Embedding of the ground-truth activity answer (used during training).
    net.exp_emb_ans = L.Embed(net.label,
                              input_dim=config.NUM_OUTPUT_UNITS,
                              num_output=300,
                              weight_filler=uniform_filler())
    net.exp_emb_ans_tanh = L.TanH(net.exp_emb_ans)
    net.exp_emb_ans2 = fully_connected(net.exp_emb_ans_tanh, 2048)

    # Tile the answer embedding over a 14x14 grid and fuse with the image.
    net.exp_emb_resh = reshape_to(net.exp_emb_ans2, [-1, 2048, 1, 1])
    net.exp_emb_tiled_1 = L.Tile(net.exp_emb_resh, axis=2, tiles=14)
    net.exp_emb_tiled = L.Tile(net.exp_emb_tiled_1, axis=3, tiles=14)

    net.img_embed = pointwise_conv(net.img_feature, 2048)
    net.exp_eltwise = multiply(net.img_embed, net.exp_emb_tiled)
    net.exp_eltwise_sqrt = L.SignedSqrt(net.exp_eltwise)
    net.exp_eltwise_l2 = L.L2Normalize(net.exp_eltwise_sqrt)
    net.exp_eltwise_drop = drop(net.exp_eltwise_l2)

    # Attention map for the explanation branch.
    net.exp_att_conv1 = pointwise_conv(net.exp_eltwise_drop, 512)
    net.exp_att_conv1_relu = L.ReLU(net.exp_att_conv1)
    net.exp_att_conv2 = pointwise_conv(net.exp_att_conv1_relu, 1)
    net.exp_att_reshaped = reshape_to(net.exp_att_conv2, [-1, 1, 14 * 14])
    net.exp_att_softmax = L.Softmax(net.exp_att_reshaped, axis=2)
    net.exp_att_map = reshape_to(net.exp_att_softmax, [-1, 1, 14, 14])

    net.exp_att_feature_prev = L.SoftAttention(net.img_feature,
                                               net.exp_att_map, ones_blob())
    net.exp_att_feature_resh = reshape_to(net.exp_att_feature_prev,
                                          [-1, 2048])
    net.exp_att_feature_embed = fully_connected(net.exp_att_feature_resh,
                                                2048)
    net.exp_att_feature = multiply(net.exp_emb_ans2,
                                   net.exp_att_feature_embed)

    # Explanation token embedding.
    net.exp_embed_ba = L.Embed(net.exp,
                               input_dim=exp_vocab_size,
                               num_output=300,
                               weight_filler=uniform_filler())
    net.exp_embed = L.TanH(net.exp_embed_ba)

    # First explanation LSTM.
    net.exp_lstm1 = lstm(net.exp_embed, net.exp_cont_1, 2048)
    net.exp_lstm1_dropped = drop(net.exp_lstm1)

    # Tile the attended feature exp_T times along axis 0 and multiply it
    # into the LSTM1 output.
    net.exp_att_resh = reshape_to(net.exp_att_feature, [1, -1, 2048])
    net.exp_att_tiled = L.Tile(net.exp_att_resh, axis=0, tiles=exp_T)
    net.exp_eltwise_all = multiply(net.exp_lstm1_dropped, net.exp_att_tiled)
    net.exp_eltwise_all_l2 = L.L2Normalize(net.exp_eltwise_all)
    net.exp_eltwise_all_drop = drop(net.exp_eltwise_all_l2)

    # Second explanation LSTM and the token prediction head.
    net.exp_lstm2 = lstm(net.exp_eltwise_all_drop, net.exp_cont_2, 1024)
    net.exp_lstm2_dropped = drop(net.exp_lstm2)

    net.exp_prediction = fully_connected(net.exp_lstm2_dropped,
                                         exp_vocab_size,
                                         axis=2)

    net.exp_loss = L.SoftmaxWithLoss(net.exp_prediction,
                                     net.exp_out,
                                     loss_param=dict(ignore_label=-1),
                                     softmax_param=dict(axis=2))
    net.exp_accuracy = L.Accuracy(net.exp_prediction,
                                  net.exp_out,
                                  axis=2,
                                  ignore_label=-1)

    return net.to_proto()
Ejemplo n.º 15
0
    def unpack_item(self,
                    layer,
                    previous_image_size,
                    layer_number,
                    bottom,
                    label=None):
        """Append the pycaffe layer(s) described by ``layer`` onto ``bottom``.

        Args:
            layer: Layer description. ``terminate`` and ``layer_type`` select
                the branch; depending on the branch ``filter_depth``,
                ``filter_size``, ``stride``, ``fc_size`` or ``seq_lengh``
                are read as well.
            previous_image_size: Used as the pooling kernel size of the
                'gap' branch.
            layer_number: Unused.
            bottom: Top that the new layer(s) are stacked on.
            label: Unused.

        Returns:
            The top of the last layer created, or ``None`` when ``layer`` is
            not terminal and its ``layer_type`` matches no known branch.
        """

        def xavier():
            # Fresh filler dict for every layer that asks for one.
            return dict(type='xavier')

        # A terminal state becomes the classifier; the softmax/loss is not
        # attached here.
        if layer.terminate == 1:
            return cl.InnerProduct(bottom,
                                   num_output=self.hp.NUM_CLASSES,
                                   weight_filler=xavier())

        kind = layer.layer_type

        if kind == 'conv':
            channels = layer.filter_depth
            window = layer.filter_size
            step = layer.stride
            padding = self.get_pad(window)
            top = cl.Convolution(bottom,
                                 kernel_size=window,
                                 num_output=channels,
                                 stride=step,
                                 pad=padding,
                                 weight_filler=xavier())
            if self.ssp.batch_norm:
                top = self.add_batchnorm(top)
            return self.add_activate(top)

        elif kind == 'nin':
            # Two activated 1x1 convolutions of the same width.
            channels = layer.filter_depth
            top = bottom
            for _ in (0, 1):
                top = cl.Convolution(top,
                                     kernel_size=1,
                                     num_output=channels,
                                     weight_filler=xavier())
                top = self.add_activate(top)
            return top

        elif kind == 'gap':
            # 1x1 convolution down to NUM_CLASSES channels, then average
            # pooling with a kernel of previous_image_size.
            channels = self.hp.NUM_CLASSES
            top = cl.Convolution(bottom,
                                 kernel_size=1,
                                 num_output=channels,
                                 weight_filler=xavier())
            top = self.add_activate(top)
            return cl.Pooling(top,
                              kernel_size=previous_image_size,
                              pool=P.Pooling.AVE)

        elif kind == 'fc':
            width = layer.fc_size
            top = cl.InnerProduct(bottom,
                                  num_output=width,
                                  weight_filler=xavier())
            return self.add_activate(top)

        elif kind == 'dropout':
            ratio = 0.5 * float(layer.filter_depth) / layer.fc_size
            return cl.Dropout(bottom, dropout_ratio=ratio)

        elif kind == 'pool':
            window = layer.filter_size
            step = layer.stride
            top = bottom
            if self.ssp.batch_norm:
                top = self.add_batchnorm(top)
            return cl.Pooling(top,
                              kernel_size=window,
                              stride=step,
                              pool=P.Pooling.MAX)

        elif kind == 'lstm':
            return cl.LSTM(bottom,
                           weight_filler=xavier(),
                           num_output=layer.seq_lengh)

        elif kind == 'flatten':
            return cl.Flatten(bottom)

        # Unknown layer type: nothing is built.
        return None
Ejemplo n.º 16
0
def net(split, vocab_size, opts):
    """Build the query grounding net and return it as a NetParameter.

    The query is embedded and encoded by an LSTM; the last time slice of
    the LSTM output is dropped-out, L2-normalised and combined (through the
    module-level ``proc_img`` and ``concat`` helpers) with the image and
    spatial features. Two InnerProduct heads then predict a score and a
    4-value bbox regression.

    Args:
        split: Forwarded to the Python data layer inside the JSON
            ``param_str``.
        vocab_size: ``input_dim`` of the query Embed layer.
        opts: Unused by this builder.

    Returns:
        The result of ``caffe.NetSpec.to_proto()``.

    Note:
        Reads ``cfg.BATCHSIZE``, ``cfg.WORD_EMB_SIZE``, ``cfg.RNN_DIM``,
        ``cfg.QUERY_MAXLEN``, ``cfg.DROPOUT_RATIO``, ``cfg.RPN_TOPN``,
        ``cfg.USE_KLD`` and ``cfg.USE_REG``.
    """
    n = caffe.NetSpec()
    param_str = json.dumps({'split': split, 'batchsize': cfg.BATCHSIZE})
    (n.qvec, n.cvec, n.img_feat, n.spt_feat, n.query_label,
     n.query_label_mask, n.query_bbox_targets, n.query_bbox_inside_weights,
     n.query_bbox_outside_weights) = L.Python(
         name='data',
         module='networks.data_layer',
         layer='DataProviderLayer',
         param_str=param_str,
         ntop=9)

    n.embed_ba = L.Embed(n.qvec,
                         input_dim=vocab_size,
                         num_output=cfg.WORD_EMB_SIZE,
                         weight_filler=dict(type='xavier'))
    n.embed = L.TanH(n.embed_ba)
    word_emb = n.embed

    # LSTM1
    n.lstm1 = L.LSTM(word_emb,
                     n.cvec,
                     recurrent_param=dict(num_output=cfg.RNN_DIM,
                                          weight_filler=dict(type='xavier')))

    # Slice the LSTM output into QUERY_MAXLEN tops along axis 0; every
    # slice but the last is named and silenced, the last one is kept.
    tops1 = L.Slice(n.lstm1, ntop=cfg.QUERY_MAXLEN, slice_param={'axis': 0})
    # range() instead of xrange(): xrange does not exist on Python 3.
    for i in range(cfg.QUERY_MAXLEN - 1):
        setattr(n, 'slice_first' + str(i), tops1[i])
        setattr(n, 'silence_data_first' + str(i),
                L.Silence(tops1[i], ntop=0))
    n.lstm1_out = tops1[cfg.QUERY_MAXLEN - 1]
    n.lstm1_reshaped = L.Reshape(
        n.lstm1_out, reshape_param=dict(shape=dict(dim=[-1, cfg.RNN_DIM])))
    n.lstm1_droped = L.Dropout(
        n.lstm1_reshaped, dropout_param={'dropout_ratio': cfg.DROPOUT_RATIO})
    n.lstm_l2norm = L.L2Normalize(n.lstm1_droped)
    n.q_emb = L.Reshape(n.lstm_l2norm,
                        reshape_param=dict(shape=dict(dim=[0, -1])))
    q_layer = n.q_emb  # (N, 1024)

    v_layer = proc_img(n, n.img_feat, n.spt_feat)  #out: (N, 100, 2053)
    out_layer = concat(n, q_layer, v_layer)

    # predict score
    n.query_score_fc = L.InnerProduct(out_layer,
                                      num_output=1,
                                      weight_filler=dict(type='xavier'))
    n.query_score_pred = L.Reshape(
        n.query_score_fc,
        reshape_param=dict(shape=dict(dim=[-1, cfg.RPN_TOPN])))
    if cfg.USE_KLD:
        n.loss_query_score = L.SoftmaxKLDLoss(n.query_score_pred,
                                              n.query_label,
                                              n.query_label_mask,
                                              propagate_down=[1, 0, 0],
                                              loss_weight=1.0)
    else:
        n.loss_query_score = L.SoftmaxWithLoss(n.query_score_pred,
                                               n.query_label,
                                               n.query_label_mask,
                                               propagate_down=[1, 0, 0],
                                               loss_weight=1.0)

    # predict bbox
    n.query_bbox_pred = L.InnerProduct(out_layer,
                                       num_output=4,
                                       weight_filler=dict(type='xavier'))
    if cfg.USE_REG:
        n.loss_query_bbox = L.SmoothL1Loss(n.query_bbox_pred,
                                           n.query_bbox_targets,
                                           n.query_bbox_inside_weights,
                                           n.query_bbox_outside_weights,
                                           loss_weight=1.0)
    else:
        # Without the regression loss these blobs have no consumer, so
        # each one is fed into a Silence layer.
        for blob_name in ('query_bbox_pred',
                          'query_bbox_targets',
                          'query_bbox_inside_weights',
                          'query_bbox_outside_weights'):
            setattr(n, 'silence_' + blob_name,
                    L.Silence(getattr(n, blob_name), ntop=0))
    return n.to_proto()
Ejemplo n.º 17
0
def qlstm(mode, batchsize, max_words_in_question, question_vocab_size):
    """Build the two-LSTM question encoder + compact bilinear VQA net.

    The question tokens are embedded, concatenated with the ``glove`` blob
    and run through two stacked 1024-unit LSTMs. The last time slice of
    both LSTM outputs is concatenated, fused with the image feature by a
    CompactBilinear layer and classified with an InnerProduct trained
    through SoftmaxWithLoss.

    Args:
        mode: Forwarded to the Python data layer inside the JSON
            ``param_str``.
        batchsize: Forwarded to the data layer in the same way.
        max_words_in_question: Number of tops each LSTM output is sliced
            into along axis 0.
        question_vocab_size: ``input_dim`` of the question Embed layer.

    Returns:
        The result of ``caffe.NetSpec.to_proto()``.
    """
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(
        module='vqa_data_provider_layer',
        layer='VQADataProviderLayer',
        param_str=mode_str,
        ntop=5)

    n.embed_ba = L.Embed(
        n.data,
        input_dim=question_vocab_size,
        num_output=300,
        weight_filler=dict(type='uniform', min=-0.08,
                           max=0.08))  # T x N -> T x N x 300
    n.embed = L.TanH(n.embed_ba)
    concat_word_embed = [n.embed, n.glove]
    n.concat_embed = L.Concat(*concat_word_embed,
                              concat_param={'axis': 2})  # T x N x 600

    # LSTM1
    n.lstm1 = L.LSTM(n.concat_embed,
                     n.cont,
                     recurrent_param=dict(
                         num_output=1024,
                         weight_filler=dict(type='uniform',
                                            min=-0.08,
                                            max=0.08),
                         bias_filler=dict(type='constant', value=0)))
    tops1 = L.Slice(n.lstm1,
                    ntop=max_words_in_question,
                    slice_param={'axis': 0})
    # Name and silence every slice but the last; the last one is kept.
    # range() instead of xrange(): xrange does not exist on Python 3.
    for i in range(max_words_in_question - 1):
        setattr(n, 'slice_first' + str(i), tops1[i])
        setattr(n, 'silence_data_first' + str(i),
                L.Silence(tops1[i], ntop=0))
    n.lstm1_out = tops1[max_words_in_question - 1]
    n.lstm1_reshaped = L.Reshape(
        n.lstm1_out, reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.lstm1_reshaped_drop = L.Dropout(n.lstm1_reshaped,
                                      dropout_param={'dropout_ratio': 0.3})
    # The full (unsliced) LSTM1 output, after dropout, feeds LSTM2.
    n.lstm1_drop = L.Dropout(n.lstm1, dropout_param={'dropout_ratio': 0.3})

    # LSTM2
    n.lstm2 = L.LSTM(n.lstm1_drop,
                     n.cont,
                     recurrent_param=dict(
                         num_output=1024,
                         weight_filler=dict(type='uniform',
                                            min=-0.08,
                                            max=0.08),
                         bias_filler=dict(type='constant', value=0)))
    tops2 = L.Slice(n.lstm2,
                    ntop=max_words_in_question,
                    slice_param={'axis': 0})
    for i in range(max_words_in_question - 1):
        setattr(n, 'slice_second' + str(i), tops2[i])
        setattr(n, 'silence_data_second' + str(i),
                L.Silence(tops2[i], ntop=0))
    n.lstm2_out = tops2[max_words_in_question - 1]
    n.lstm2_reshaped = L.Reshape(
        n.lstm2_out, reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.lstm2_reshaped_drop = L.Dropout(n.lstm2_reshaped,
                                      dropout_param={'dropout_ratio': 0.3})
    concat_lstms = [n.lstm1_reshaped_drop, n.lstm2_reshaped_drop]
    n.lstm_12 = L.Concat(*concat_lstms)

    # Fuse the question encoding with the image feature.
    n.q_emb_tanh_droped_resh = L.Reshape(
        n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    n.i_emb_tanh_droped_resh = L.Reshape(
        n.img_feature, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    n.blcf = L.CompactBilinear(
        n.q_emb_tanh_droped_resh,
        n.i_emb_tanh_droped_resh,
        compact_bilinear_param=dict(num_output=16000, sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)

    n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,
                              dropout_param={'dropout_ratio': 0.1})
    n.blcf_droped_resh = L.Reshape(
        n.blcf_droped, reshape_param=dict(shape=dict(dim=[-1, 16000])))

    n.prediction = L.InnerProduct(n.blcf_droped_resh,
                                  num_output=config.NUM_OUTPUT_UNITS,
                                  weight_filler=dict(type='xavier'))
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
Ejemplo n.º 18
0
def cnn_lstm(**args):
    """Build a conv -> fc -> LSTM classifier net and return its NetParameter.

    Keyword Args:
        n_rows: Default 28. Only used for the default ``frame_shape``.
        n_cols: Default 28. Used for the default ``frame_shape`` and as the
            default of ``n_output``.
        kernel_size: Default 5. Width of the ``[1, kernel_size]``
            convolution kernels.
        n_output: Default ``n_cols``. ``num_output`` of both convolutions,
            of ``fc1`` and of the LSTM.
        batch_size: Default 300 (``N`` below).
        recur_steps: Default 4 (``T`` below).
        n_classes: Default 10. ``num_output`` of the final InnerProduct.
        frame_shape: Default ``(1, n_rows // recur_steps, n_cols)``. Shape
            of one frame; the data input shape is ``[N * T] + frame_shape``.

    Returns:
        The result of ``caffe.NetSpec.to_proto()``.
    """
    n_rows = args.get('n_rows', 28)
    n_cols = args.get('n_cols', 28)
    kernel_size = args.get('kernel_size', 5)
    n_output = args.get('n_output', n_cols)
    batch_size = args.get('batch_size', 300)
    recur_steps = args.get('recur_steps', 4)
    n_classes = args.get('n_classes', 10)
    # Floor division: blob dimensions must be integers. Plain `/` gives a
    # float on Python 3 (28 / 4 == 7.0); `//` gives the same int result the
    # original got on Python 2.
    frame_shape = args.get('frame_shape', (1, n_rows // recur_steps, n_cols))

    n = caffe.NetSpec()

    weight_param = dict(lr_mult=1, decay_mult=1)
    bias_param = dict(lr_mult=1, decay_mult=0)
    param = [weight_param, bias_param]

    N = batch_size
    T = recur_steps
    input_dim = list(frame_shape)
    input_dim.insert(0, N * T)

    # Three inputs: frames (N*T leading dim), a T x N clip blob passed to
    # the LSTM as its second bottom, and N x 1 labels.
    n.data, n.clip, n.label = L.Input(
        shape=[dict(dim=input_dim),
               dict(dim=[T, N]),
               dict(dim=[N, 1])],
        ntop=3)

    n.conv1 = L.Convolution(n.data,
                            kernel_size=[1, kernel_size],
                            stride_h=1,
                            stride_w=1,
                            num_output=n_output,
                            pad=0,
                            param=param,
                            weight_filler=dict(type='msra'),
                            bias_filler=dict(type='constant'))
    n.relu1 = L.ReLU(n.conv1, in_place=True)

    n.conv2 = L.Convolution(n.relu1,
                            kernel_size=[1, kernel_size],
                            stride_h=1,
                            stride_w=1,
                            num_output=n_output,
                            pad=0,
                            param=param,
                            weight_filler=dict(type='msra'),
                            bias_filler=dict(type='constant'))
    n.relu2 = L.ReLU(n.conv2, in_place=True)

    n.pool1 = L.Pooling(n.relu2,
                        kernel_h=1,
                        kernel_w=2,
                        stride_h=1,
                        stride_w=2,
                        pool=P.Pooling.MAX)

    n.fc1 = L.InnerProduct(n.pool1,
                           num_output=n_output,
                           param=param,
                           weight_filler=dict(type='msra'),
                           bias_filler=dict(type='constant'))

    n.fc1_relu = L.ReLU(n.fc1, in_place=True)

    # Regroup the N*T feature rows as T x N x n_output for the LSTM.
    n.fc1_reshape = L.Reshape(n.fc1_relu, shape=dict(dim=[T, N, n_output]))

    n.lstm = L.LSTM(n.fc1_reshape,
                    n.clip,
                    recurrent_param=dict(num_output=n_output))

    # Split at T - 1 along axis 0 and keep the second top (the last step).
    n.lstm_last_step = L.Slice(n.lstm,
                               slice_param=dict(axis=0, slice_point=T - 1),
                               ntop=2)[-1]

    n.lstm_reshape = L.Reshape(n.lstm_last_step, shape=dict(dim=[N, n_output]))

    n.attrs = L.InnerProduct(n.lstm_reshape,
                             num_output=n_classes,
                             param=param,
                             weight_filler=dict(type='msra'),
                             bias_filler=dict(type='constant'))

    n.loss = L.SoftmaxWithLoss(n.attrs, n.label)

    n.class_prob = L.Softmax(n.attrs, in_place=False)

    return n.to_proto()
Ejemplo n.º 19
0
def mfb_coatt(mode, batchsize, T, question_vocab_size, folder):
    """Build the MFB co-attention VQA net and return its NetParameter.

    Stages, in order: question embedding + LSTM, question attention,
    image attention computed from an element-wise fusion of the projected
    question and image features, a second fusion of the attended question
    and image features, and the final classifier.

    Args:
        mode: Forwarded to the data layer inside the JSON ``param_str``.
            ``'val'`` selects the ``vqa_data_layer_hdf5`` data module and
            SoftmaxWithLoss; any other value selects
            ``vqa_data_layer_kld_hdf5`` and SoftmaxKLDLoss.
        batchsize: Forwarded to the data layer; also the first dimension of
            the constant DummyData blobs passed to SoftAttention.
        T: Unused by this builder.
        question_vocab_size: ``input_dim`` of the question Embed layer.
        folder: Forwarded to the data layer inside ``param_str``.

    Returns:
        The result of ``caffe.NetSpec.to_proto()``.

    Note:
        Layer sizes are read from the module-level ``config`` object.
    """
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize,
                           'folder': folder})
    if mode == 'val':
        n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(
            module='vqa_data_layer_hdf5', layer='VQADataProviderLayer',
            param_str=mode_str, ntop=5)
    else:
        n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(
            module='vqa_data_layer_kld_hdf5', layer='VQADataProviderLayer',
            param_str=mode_str, ntop=5)
    n.embed = L.Embed(n.data, input_dim=question_vocab_size, num_output=300,
                      weight_filler=dict(type='xavier'))
    n.embed_tanh = L.TanH(n.embed)
    concat_word_embed = [n.embed_tanh, n.glove]
    n.concat_embed = L.Concat(*concat_word_embed,
                              concat_param={'axis': 2})  # T x N x 600

    # LSTM
    n.lstm1 = L.LSTM(n.concat_embed, n.cont,
                     recurrent_param=dict(
                         num_output=config.LSTM_UNIT_NUM,
                         weight_filler=dict(type='xavier')))
    n.lstm1_droped = L.Dropout(
        n.lstm1, dropout_param={'dropout_ratio': config.LSTM_DROPOUT_RATIO})
    n.lstm1_resh = L.Permute(n.lstm1_droped,
                             permute_param=dict(order=[1, 2, 0]))
    n.lstm1_resh2 = L.Reshape(
        n.lstm1_resh, reshape_param=dict(shape=dict(dim=[0, 0, 0, 1])))

    # ------------------------------------------------------------------
    # Question Attention
    # ------------------------------------------------------------------
    n.qatt_conv1 = L.Convolution(n.lstm1_resh2, kernel_size=1, stride=1,
                                 num_output=512, pad=0,
                                 weight_filler=dict(type='xavier'))
    n.qatt_relu = L.ReLU(n.qatt_conv1)
    n.qatt_conv2 = L.Convolution(n.qatt_relu, kernel_size=1, stride=1,
                                 num_output=config.NUM_QUESTION_GLIMPSE,
                                 pad=0,
                                 weight_filler=dict(type='xavier'))
    # Original author's note: N*NUM_QUESTION_GLIMPSE*15
    n.qatt_reshape = L.Reshape(
        n.qatt_conv2,
        reshape_param=dict(shape=dict(dim=[-1, config.NUM_QUESTION_GLIMPSE,
                                           config.MAX_WORDS_IN_QUESTION, 1])))
    n.qatt_softmax = L.Softmax(n.qatt_reshape, axis=2)

    qatt_maps = L.Slice(n.qatt_softmax, ntop=config.NUM_QUESTION_GLIMPSE,
                        slice_param={'axis': 1})
    dummy_lstm = L.DummyData(shape=dict(dim=[batchsize, 1]),
                             data_filler=dict(type='constant', value=1),
                             ntop=1)
    qatt_feature_list = []
    # range() instead of xrange(): xrange does not exist on Python 3.
    for i in range(config.NUM_QUESTION_GLIMPSE):
        # With a single glimpse L.Slice hands back one top rather than a
        # sequence, so it must not be indexed.
        if config.NUM_QUESTION_GLIMPSE == 1:
            glimpse_map = qatt_maps
        else:
            glimpse_map = qatt_maps[i]
        glimpse_feat = L.SoftAttention(n.lstm1_resh2, glimpse_map, dummy_lstm)
        setattr(n, 'qatt_feat%d' % i, glimpse_feat)
        qatt_feature_list.append(glimpse_feat)
    n.qatt_feat_concat = L.Concat(*qatt_feature_list)

    # ------------------------------------------------------------------
    # Image Attention with MFB
    # ------------------------------------------------------------------
    n.q_feat_resh = L.Reshape(
        n.qatt_feat_concat,
        reshape_param=dict(shape=dict(dim=[0, -1, 1, 1])))
    n.i_feat_resh = L.Reshape(
        n.img_feature,
        reshape_param=dict(shape=dict(dim=[0, -1, config.IMG_FEAT_WIDTH,
                                           config.IMG_FEAT_WIDTH])))

    n.iatt_q_proj = L.InnerProduct(n.q_feat_resh,
                                   num_output=config.JOINT_EMB_SIZE,
                                   weight_filler=dict(type='xavier'))
    n.iatt_q_resh = L.Reshape(
        n.iatt_q_proj,
        reshape_param=dict(shape=dict(dim=[-1, config.JOINT_EMB_SIZE, 1, 1])))
    n.iatt_q_tile1 = L.Tile(n.iatt_q_resh, axis=2,
                            tiles=config.IMG_FEAT_WIDTH)
    n.iatt_q_tile2 = L.Tile(n.iatt_q_tile1, axis=3,
                            tiles=config.IMG_FEAT_WIDTH)

    n.iatt_i_conv = L.Convolution(n.i_feat_resh, kernel_size=1, stride=1,
                                  num_output=config.JOINT_EMB_SIZE, pad=0,
                                  weight_filler=dict(type='xavier'))
    n.iatt_i_resh1 = L.Reshape(
        n.iatt_i_conv,
        reshape_param=dict(shape=dict(dim=[-1, config.JOINT_EMB_SIZE,
                                           config.IMG_FEAT_WIDTH,
                                           config.IMG_FEAT_WIDTH])))
    # operation=0 is PROD in caffe.proto's EltwiseOp enum.
    n.iatt_iq_eltwise = L.Eltwise(n.iatt_q_tile2, n.iatt_i_resh1,
                                  eltwise_param=dict(operation=0))
    n.iatt_iq_droped = L.Dropout(
        n.iatt_iq_eltwise,
        dropout_param={'dropout_ratio': config.MFB_DROPOUT_RATIO})
    # This top used to be assigned to n.iatt_iq_resh2, the same name as the
    # reshape two lines below. The second assignment replaced the first, so
    # this layer lost its name and was emitted under an auto-generated one.
    # It now has its own name.
    n.iatt_iq_resh1 = L.Reshape(
        n.iatt_iq_droped,
        reshape_param=dict(shape=dict(dim=[-1, config.JOINT_EMB_SIZE,
                                           config.IMG_FEAT_SIZE, 1])))
    n.iatt_iq_permute1 = L.Permute(n.iatt_iq_resh1,
                                   permute_param=dict(order=[0, 2, 1, 3]))
    n.iatt_iq_resh2 = L.Reshape(
        n.iatt_iq_permute1,
        reshape_param=dict(shape=dict(dim=[-1, config.IMG_FEAT_SIZE,
                                           config.MFB_OUT_DIM,
                                           config.MFB_FACTOR_NUM])))
    n.iatt_iq_sumpool = L.Pooling(
        n.iatt_iq_resh2, pool=P.Pooling.SUM,
        pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1))
    n.iatt_iq_permute2 = L.Permute(n.iatt_iq_sumpool,
                                   permute_param=dict(order=[0, 2, 1, 3]))

    n.iatt_iq_sqrt = L.SignedSqrt(n.iatt_iq_permute2)
    n.iatt_iq_l2 = L.L2Normalize(n.iatt_iq_sqrt)

    ## 2 conv layers 1000 -> 512 -> 2
    n.iatt_conv1 = L.Convolution(n.iatt_iq_l2, kernel_size=1, stride=1,
                                 num_output=512, pad=0,
                                 weight_filler=dict(type='xavier'))
    n.iatt_relu = L.ReLU(n.iatt_conv1)
    n.iatt_conv2 = L.Convolution(n.iatt_relu, kernel_size=1, stride=1,
                                 num_output=config.NUM_IMG_GLIMPSE, pad=0,
                                 weight_filler=dict(type='xavier'))
    n.iatt_resh = L.Reshape(
        n.iatt_conv2,
        reshape_param=dict(shape=dict(dim=[-1, config.NUM_IMG_GLIMPSE,
                                           config.IMG_FEAT_SIZE])))
    n.iatt_softmax = L.Softmax(n.iatt_resh, axis=2)
    n.iatt_softmax_resh = L.Reshape(
        n.iatt_softmax,
        reshape_param=dict(shape=dict(dim=[-1, config.NUM_IMG_GLIMPSE,
                                           config.IMG_FEAT_WIDTH,
                                           config.IMG_FEAT_WIDTH])))
    iatt_maps = L.Slice(n.iatt_softmax_resh, ntop=config.NUM_IMG_GLIMPSE,
                        slice_param={'axis': 1})
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1),
                        ntop=1)
    iatt_feature_list = []
    for i in range(config.NUM_IMG_GLIMPSE):
        # Same single-top special case as for the question glimpses.
        if config.NUM_IMG_GLIMPSE == 1:
            glimpse_map = iatt_maps
        else:
            glimpse_map = iatt_maps[i]
        glimpse_feat = L.SoftAttention(n.i_feat_resh, glimpse_map, dummy)
        setattr(n, 'iatt_feat%d' % i, glimpse_feat)
        glimpse_feat_resh = L.Reshape(
            glimpse_feat, reshape_param=dict(shape=dict(dim=[0, -1])))
        setattr(n, 'iatt_feat%d_resh' % i, glimpse_feat_resh)
        iatt_feature_list.append(glimpse_feat_resh)
    n.iatt_feat_concat = L.Concat(*iatt_feature_list)
    n.iatt_feat_concat_resh = L.Reshape(
        n.iatt_feat_concat,
        reshape_param=dict(shape=dict(dim=[0, -1, 1, 1])))

    # ------------------------------------------------------------------
    # Fine-grained Image-Question MFB fusion
    # ------------------------------------------------------------------
    n.mfb_q_proj = L.InnerProduct(n.q_feat_resh,
                                  num_output=config.JOINT_EMB_SIZE,
                                  weight_filler=dict(type='xavier'))
    n.mfb_i_proj = L.InnerProduct(n.iatt_feat_concat_resh,
                                  num_output=config.JOINT_EMB_SIZE,
                                  weight_filler=dict(type='xavier'))
    n.mfb_iq_eltwise = L.Eltwise(n.mfb_q_proj, n.mfb_i_proj,
                                 eltwise_param=dict(operation=0))
    n.mfb_iq_drop = L.Dropout(
        n.mfb_iq_eltwise,
        dropout_param={'dropout_ratio': config.MFB_DROPOUT_RATIO})
    n.mfb_iq_resh = L.Reshape(
        n.mfb_iq_drop,
        reshape_param=dict(shape=dict(dim=[-1, 1, config.MFB_OUT_DIM,
                                           config.MFB_FACTOR_NUM])))
    n.mfb_iq_sumpool = L.Pooling(
        n.mfb_iq_resh, pool=P.Pooling.SUM,
        pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1))
    n.mfb_out = L.Reshape(
        n.mfb_iq_sumpool,
        reshape_param=dict(shape=dict(dim=[-1, config.MFB_OUT_DIM])))
    n.mfb_sign_sqrt = L.SignedSqrt(n.mfb_out)
    n.mfb_l2 = L.L2Normalize(n.mfb_sign_sqrt)

    n.prediction = L.InnerProduct(n.mfb_l2,
                                  num_output=config.NUM_OUTPUT_UNITS,
                                  weight_filler=dict(type='xavier'))
    if mode == 'val':
        n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    else:
        n.loss = L.SoftmaxKLDLoss(n.prediction, n.label)
    return n.to_proto()
Ejemplo n.º 20
0
def qlstm(mode, batchsize, T, question_vocab_size):
    """Build the two-layer-LSTM / attention VQA network and return its proto.

    The net is assembled on a ``caffe.NetSpec``: a Python data layer feeds
    word indices, continuation markers, image features, labels and a second
    word-embedding input; the question goes through two stacked LSTMs, is
    fused with the image feature map by compact bilinear pooling, drives a
    two-map soft attention over the image, and the attended feature is fused
    with the question once more before the final classifier.

    Args:
        mode: Passed to the data layer inside the JSON ``param_str``. It is
            not used anywhere else in this builder.
        batchsize: Passed to the data layer inside the JSON ``param_str`` and
            used as the first dimension of the constant ``DummyData`` blob.
        T: Number of tops the LSTM outputs are sliced into along axis 0; the
            top at index ``T - 1`` is kept, the others are silenced.
        question_vocab_size: ``input_dim`` of the learned word embedding.

    Returns:
        The result of ``n.to_proto()`` for the assembled net.
    """
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(
        module='vqa_data_provider_layer',
        layer='VQADataProviderLayer',
        param_str=mode_str,
        ntop=5)

    # Learned embedding (tanh-squashed) concatenated with the second
    # embedding input coming from the data layer.
    n.embed_ba = L.Embed(
        n.data,
        input_dim=question_vocab_size,
        num_output=300,
        weight_filler=dict(type='uniform', min=-0.08, max=0.08))
    n.embed = L.TanH(n.embed_ba)
    concat_word_embed = [n.embed, n.glove]
    n.concat_embed = L.Concat(*concat_word_embed,
                              concat_param={'axis': 2})  # T x N x 600

    # LSTM1
    n.lstm1 = L.LSTM(
        n.concat_embed, n.cont,
        recurrent_param=dict(
            num_output=1024,
            weight_filler=dict(type='uniform', min=-0.08, max=0.08),
            bias_filler=dict(type='constant', value=0)))
    tops1 = L.Slice(n.lstm1, ntop=T, slice_param={'axis': 0})
    # `range` rather than the Python 2-only `xrange`, so the builder also
    # runs on Python 3. Every slice except the last is registered under a
    # name and routed into a Silence layer; only the last one is used.
    for i in range(T - 1):
        setattr(n, 'slice_first' + str(i), tops1[i])
        setattr(n, 'silence_data_first' + str(i),
                L.Silence(tops1[i], ntop=0))
    n.lstm1_out = tops1[T - 1]
    n.lstm1_reshaped = L.Reshape(
        n.lstm1_out,
        reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.lstm1_reshaped_droped = L.Dropout(n.lstm1_reshaped,
                                        dropout_param={'dropout_ratio': 0.3})
    # The full (unsliced) LSTM1 output, with dropout, is the input of LSTM2.
    n.lstm1_droped = L.Dropout(n.lstm1, dropout_param={'dropout_ratio': 0.3})

    # LSTM2
    n.lstm2 = L.LSTM(
        n.lstm1_droped, n.cont,
        recurrent_param=dict(
            num_output=1024,
            weight_filler=dict(type='uniform', min=-0.08, max=0.08),
            bias_filler=dict(type='constant', value=0)))
    tops2 = L.Slice(n.lstm2, ntop=T, slice_param={'axis': 0})
    for i in range(T - 1):
        setattr(n, 'slice_second' + str(i), tops2[i])
        setattr(n, 'silence_data_second' + str(i),
                L.Silence(tops2[i], ntop=0))
    n.lstm2_out = tops2[T - 1]
    n.lstm2_reshaped = L.Reshape(
        n.lstm2_out,
        reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.lstm2_reshaped_droped = L.Dropout(n.lstm2_reshaped,
                                        dropout_param={'dropout_ratio': 0.3})

    # Question feature: last outputs of both LSTMs side by side (2 x 1024).
    concat_botom = [n.lstm1_reshaped_droped, n.lstm2_reshaped_droped]
    n.lstm_12 = L.Concat(*concat_botom)

    # Tile the question feature over a 14 x 14 grid so it lines up with the
    # reshaped image feature map before the first compact bilinear pooling.
    n.q_emb_tanh_droped_resh = L.Reshape(
        n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.q_emb_tanh_droped_resh,
                                              axis=2,
                                              tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1,
                                            axis=3,
                                            tiles=14)
    n.i_emb_tanh_droped_resh = L.Reshape(
        n.img_feature, reshape_param=dict(shape=dict(dim=[-1, 2048, 14, 14])))
    # NOTE(review): CompactBilinear, SignedSqrt, L2Normalize and SoftAttention
    # are presumably custom layers of the Caffe build in use - confirm.
    n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh_tiled,
                               n.i_emb_tanh_droped_resh,
                               compact_bilinear_param=dict(num_output=16000,
                                                           sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,
                              dropout_param={'dropout_ratio': 0.1})

    # multi-channel attention: two 1x1 convolutions produce 2 maps, each
    # normalised by a softmax over its 14 * 14 positions.
    n.att_conv1 = L.Convolution(n.blcf_droped,
                                kernel_size=1,
                                stride=1,
                                num_output=512,
                                pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    n.att_conv2 = L.Convolution(n.att_conv1_relu,
                                kernel_size=1,
                                stride=1,
                                num_output=2,
                                pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_reshaped = L.Reshape(
        n.att_conv2, reshape_param=dict(shape=dict(dim=[-1, 2, 14 * 14])))
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    n.att = L.Reshape(n.att_softmax,
                      reshape_param=dict(shape=dict(dim=[-1, 2, 14, 14])))
    att_maps = L.Slice(n.att, ntop=2, slice_param={'axis': 1})
    n.att_map0 = att_maps[0]
    n.att_map1 = att_maps[1]
    # Constant blob of ones, passed as the third bottom of both
    # SoftAttention layers.
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1),
                        ntop=1)
    n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0,
                                     dummy)
    n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1,
                                     dummy)
    n.att_feature0_resh = L.Reshape(
        n.att_feature0, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature1_resh = L.Reshape(
        n.att_feature1, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)

    # merge attention and lstm with compact bilinear pooling
    n.att_feature_resh = L.Reshape(
        n.att_feature, reshape_param=dict(shape=dict(dim=[-1, 4096, 1, 1])))
    n.lstm_12_resh = L.Reshape(
        n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    n.bc_att_lstm = L.CompactBilinear(n.att_feature_resh,
                                      n.lstm_12_resh,
                                      compact_bilinear_param=dict(
                                          num_output=16000, sum_pool=False))
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)

    n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2,
                             dropout_param={'dropout_ratio': 0.1})
    n.bc_dropped_resh = L.Reshape(
        n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000])))

    # Classifier over 3000 output units with a softmax loss.
    n.prediction = L.InnerProduct(n.bc_dropped_resh,
                                  num_output=3000,
                                  weight_filler=dict(type='xavier'))
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
Ejemplo n.º 21
0
def generate_model(split, config):
    """Assemble the image + language + spatial scoring net and return its proto.

    A Python data layer provides language tokens, continuation markers, the
    image, a spatial feature map and the label. The image runs through a
    VGG-16 style convolutional stack followed by fully convolutional
    fc6/fc7/fc8 layers; the sentence is embedded and encoded by an LSTM whose
    last output is tiled over the feature map. Both features are
    L2-normalised, concatenated with the spatial input along axis 1 and
    scored by a 1x1-convolution MLP trained with a sigmoid cross-entropy loss.

    Args:
        split: Forwarded to the data layer inside ``param_str``.
        config: Settings object. The attributes read here are ``N``,
            ``data_provider``, ``data_provider_layer``, ``fix_vgg``,
            ``vgg_dropout``, ``vocab_size``, ``embed_dim``, ``lstm_dim``,
            ``T``, ``featmap_H``, ``featmap_W``, ``mlp_hidden_dims`` and
            ``mlp_dropout``.

    Returns:
        The result of ``n.to_proto()`` for the assembled net.
    """
    n = caffe.NetSpec()

    # Data layer: its parameters travel as the str() of a dict.
    provider_args = str(dict(split=split, batch_size=config.N))
    (n.language, n.cont, n.image,
     n.spatial, n.label) = L.Python(module=config.data_provider,
                                    layer=config.data_provider_layer,
                                    param_str=provider_args,
                                    ntop=5)

    # the base net (VGG-16): (output channels, number of conv layers) for
    # each stage; every stage is closed by a max-pool.
    vgg_stages = ((64, 2), (128, 2), (256, 3), (512, 3), (512, 3))
    vgg_kwargs = dict(fix_param=config.fix_vgg,
                      finetune=(not config.fix_vgg))
    bottom = n.image
    for stage, (channels, depth) in enumerate(vgg_stages, 1):
        for layer in range(1, depth + 1):
            conv_top, relu_top = conv_relu(bottom, channels, **vgg_kwargs)
            # Register the conv before its relu, as a tuple assignment would.
            setattr(n, 'conv%d_%d' % (stage, layer), conv_top)
            setattr(n, 'relu%d_%d' % (stage, layer), relu_top)
            bottom = relu_top
        pooled = max_pool(bottom)
        setattr(n, 'pool%d' % stage, pooled)
        bottom = pooled

    # fully conv: fc6 -> fc7 -> fc8, with optional in-place dropout after
    # the fc6 and fc7 activations.
    use_vgg_dropout = config.vgg_dropout
    n.fcn_fc6, n.fcn_relu6 = conv_relu(n.pool5, 4096, ks=7, pad=3)
    bottom = n.fcn_relu6
    if use_vgg_dropout:
        n.fcn_drop6 = L.Dropout(bottom, dropout_ratio=0.5, in_place=True)
        bottom = n.fcn_drop6
    n.fcn_fc7, n.fcn_relu7 = conv_relu(bottom, 4096, ks=1, pad=0)
    bottom = n.fcn_relu7
    if use_vgg_dropout:
        n.fcn_drop7 = L.Dropout(bottom, dropout_ratio=0.5, in_place=True)
        bottom = n.fcn_drop7
    n.fcn_fc8 = conv(bottom, 1000, ks=1, pad=0)

    # embedding
    n.embed = L.Embed(
        n.language,
        input_dim=config.vocab_size,
        num_output=config.embed_dim,
        weight_filler={'type': 'uniform', 'min': -0.08, 'max': 0.08})

    # LSTM
    lstm_dim = config.lstm_dim
    n.lstm = L.LSTM(
        n.embed, n.cont,
        recurrent_param={
            'num_output': lstm_dim,
            'weight_filler': {'type': 'uniform', 'min': -0.08, 'max': 0.08},
            'bias_filler': {'type': 'constant', 'value': 0},
        })
    # Slice the LSTM output along axis 0; all slices but the last are named
    # and sent to a Silence layer, the last one is the sentence feature.
    lstm_slices = L.Slice(n.lstm, ntop=config.T, slice_param={'axis': 0})
    for step in range(config.T - 1):
        unused_top = lstm_slices[step]
        setattr(n, 'slice%d' % step, unused_top)
        setattr(n, 'silence%d' % step, L.Silence(unused_top, ntop=0))
    n.lstm_out = lstm_slices[-1]
    n.lstm_feat = L.Reshape(
        n.lstm_out,
        reshape_param={'shape': {'dim': [-1, lstm_dim]}})

    # Tile LSTM feature over the featmap_H x featmap_W grid
    n.lstm_resh = L.Reshape(
        n.lstm_feat,
        reshape_param={'shape': {'dim': [-1, lstm_dim, 1, 1]}})
    n.lstm_tile_1 = L.Tile(n.lstm_resh, axis=2, tiles=config.featmap_H)
    n.lstm_tile_2 = L.Tile(n.lstm_tile_1, axis=3, tiles=config.featmap_W)

    # L2 Normalize image and language features
    n.img_l2norm = L.L2Normalize(n.fcn_fc8)
    n.lstm_l2norm = L.L2Normalize(n.lstm_tile_2)

    # Concatenate language, image and spatial features along axis 1
    n.feat_all = L.Concat(n.lstm_l2norm, n.img_l2norm, n.spatial,
                          concat_param={'axis': 1})

    # MLP Classifier over concatenated feature, with optional in-place
    # dropout on the hidden activation.
    n.fcn_l1, n.fcn_relu1 = conv_relu(n.feat_all, config.mlp_hidden_dims,
                                      ks=1, pad=0)
    bottom = n.fcn_relu1
    if config.mlp_dropout:
        n.fcn_drop1 = L.Dropout(bottom, dropout_ratio=0.5, in_place=True)
        bottom = n.fcn_drop1
    n.fcn_scores = conv(bottom, 1, ks=1, pad=0)

    # Loss Layer
    n.loss = L.SigmoidCrossEntropyLoss(n.fcn_scores, n.label)

    return n.to_proto()