Example 1

# Shared imports assumed by every example in this section; the layer classes
# (MLP, Embedding, BasicLSTM, Attention) and the split_state helper come from
# this project's own modules and are not shown here.
import os.path as osp

import h5py
import numpy as np
import theano
import theano.tensor as T

class SceneMlp(object):
    """
    Multi-layer perceptron used to predict the scene-specific context.
    """
    def __init__(self, name='scene_mlp', layer_sizes=(2048, 1024, 1024, 80), model_file=None):
        self.name = name
        if model_file is not None:
            with h5py.File(model_file, 'r') as f:
                layer_sizes = f.attrs['layer_sizes']
        self.config = {'layer_sizes': layer_sizes}

        # define inputs
        x = T.matrix('x')
        y = T.matrix('y')
        self.inputs = [x, y]

        # define computation graph
        self.mlp = MLP(layer_sizes=layer_sizes, name='mlp', output_type='softmax')
        self.proba = self.mlp.compute(x)
        self.log_proba = T.log(self.proba)

        # define costs
        def kl_divergence(p, q):
            # symmetric KL: D_KL(p||q) + D_KL(q||p), smoothed by 1e-30 to avoid log(0)
            kl = T.mean(T.sum(p * T.log((p+1e-30)/(q+1e-30)), axis=1))
            kl += T.mean(T.sum(q * T.log((q+1e-30)/(p+1e-30)), axis=1))
            return kl
        kl = kl_divergence(self.proba, y)
        acc = T.mean(T.eq(self.proba.argmax(axis=1), y.argmax(axis=1)))
        self.costs = [kl, acc]

        # layers and parameters
        self.layers = [self.mlp]
        self.params = sum([l.params for l in self.layers], [])

        # load weights from file, if model_file is not None
        if model_file is not None:
            self.load_weights(model_file)

    def save_to_dir(self, save_dir, idx='0'):
        save_file = osp.join(save_dir, self.name+'.h5.' + str(idx))
        for l in self.layers:
            l.save_weights(save_file)
        with h5py.File(save_file, 'a') as f:  # 'a': append attrs to the file the layers just wrote
            for k, v in self.config.items():
                f.attrs[k] = v

    def load_weights(self, model_file):
        for l in self.layers:
            l.load_weights(model_file)
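
A minimal training sketch for SceneMlp, assuming plain SGD; the learning rate and the x_batch / y_batch arrays (shaped to the default layer_sizes) are hypothetical choices, not part of the original code:

mlp = SceneMlp()
kl, acc = mlp.costs
grads = T.grad(kl, mlp.params)                     # gradient of the symmetric KL cost
updates = [(p, p - 0.01 * g) for p, g in zip(mlp.params, grads)]
train_step = theano.function(mlp.inputs, [kl, acc], updates=updates)
# kl_value, acc_value = train_step(x_batch, y_batch)  # x: (mb, 2048), y: (mb, 80)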
Example 2
class Model(object):
    """
    Region Attention model
    """
    def __init__(self, name='ra', nimg=2048, nnh=512, na=512, nh=512, nw=512, nout=8843, npatch=30, model_file=None):
        self.name = name
        if model_file is not None:
            with h5py.File(model_file, 'r') as f:
                nimg = f.attrs['nimg']
                nnh = f.attrs['nnh']
                na = f.attrs['na']
                nh = f.attrs['nh']
                nw = f.attrs['nw']
                nout = f.attrs['nout']
                # npatch = f.attrs['npatch']
        self.config = {'nimg': nimg, 'nnh': nnh, 'na': na, 'nh': nh, 'nw': nw, 'nout': nout, 'npatch': npatch}

        # word embedding layer
        self.embedding = Embedding(n_emb=nout, dim_emb=nw, name=self.name+'@embedding')

        # initialization mlp layer
        self.init_mlp = MLP(layer_sizes=[na, 2*nh], output_type='tanh', name=self.name+'@init_mlp')
        self.proj_mlp = MLP(layer_sizes=[nimg, na], output_type='tanh', name=self.name+'@proj_mlp')

        # lstm
        self.lstm = BasicLSTM(dim_x=na+nw, dim_h=nh, name=self.name+'@lstm')

        # prediction mlp
        self.pred_mlp = MLP(layer_sizes=[na+nh+nw, nout], output_type='softmax', name=self.name+'@pred_mlp')

        # attention layer
        self.attention = Attention(dim_item=na, dim_context=na+nw+nh, hsize=nnh, name=self.name+'@attention')

        # inputs
        cap = T.imatrix('cap')
        img = T.tensor3('img')
        self.inputs = [cap, img]

        # go through sequence
        feat = self.proj_mlp.compute(img)
        init_e = feat.mean(axis=1)
        init_state = T.concatenate([init_e, self.init_mlp.compute(init_e)], axis=-1)
        (state, self.p, loss, self.alpha), _ = theano.scan(fn=self.scan_func,
                                                           sequences=[cap[0:-1, :], cap[1:, :]],
                                                           outputs_info=[init_state, None, None, None],
                                                           non_sequences=[feat])

        # loss function
        loss = T.mean(loss)
        self.costs = [loss]

        # layers and parameters
        self.layers = [self.embedding, self.init_mlp, self.proj_mlp, self.attention, self.lstm, self.pred_mlp]
        self.params = sum([l.params for l in self.layers], [])

        # load weights from file, if model_file is not None
        if model_file is not None:
            self.load_weights(model_file)

        # these functions and variables are used in test stage
        self._init_func = None
        self._step_func = None
        self._proj_func = None
        self._feat_shared = theano.shared(np.zeros((1, npatch, na)).astype(theano.config.floatX))  # holds proj_mlp output, so the last dim is na, not nimg

    def compute(self, state, w_idx, feat):
        # word embedding
        word_vec = self.embedding.compute(w_idx)
        # split states
        e_tm1, c_tm1, h_tm1 = split_state(state, scheme=[(1, self.config['na']), (2, self.config['nh'])])
        # attention
        e_t, alpha = self.attention.compute(feat, T.concatenate([e_tm1, h_tm1, word_vec], axis=1))
        # lstm step
        e_w = T.concatenate([e_t, word_vec], axis=-1)
        c_t, h_t = self.lstm.compute(e_w, c_tm1, h_tm1)  # (mb,nh)
        # merge state
        new_state = T.concatenate([e_t, c_t, h_t], axis=-1)
        # predict word probability
        p = self.pred_mlp.compute(T.concatenate([e_t, h_t, word_vec], axis=-1))
        return new_state, p, alpha

    def scan_func(self, w_tm1, w_t, state, feat):
        # update state
        new_state, p, alpha = self.compute(state, w_tm1, feat)
        # cross-entropy loss
        loss = T.nnet.categorical_crossentropy(p, w_t)
        return new_state, p, loss, alpha

    def init_func(self, img_value):
        if self._proj_func is None:
            img = T.tensor3()
            self._proj_func = theano.function([img], self.proj_mlp.compute(img))
        if self._init_func is None:
            init_e = self._feat_shared.mean(axis=1)
            init_state = T.concatenate([init_e, self.init_mlp.compute(init_e)], axis=-1)
            self._init_func = theano.function([], init_state)
        self._feat_shared.set_value(self._proj_func(img_value))
        return self._init_func()

    def step_func(self, state_value, w_value):
        if self._step_func is None:
            w = T.ivector()
            state = T.matrix()
            new_state, p, _ = self.compute(state, w, self._feat_shared)
            self._step_func = theano.function([state, w], [new_state, T.log(p)])
        return self._step_func(state_value, w_value)

    def save_to_dir(self, save_dir, idx):
        save_file = osp.join(save_dir, self.name+'.h5.'+str(idx))
        for l in self.layers:
            l.save_weights(save_file)
        with h5py.File(save_file, 'a') as f:
            for k, v in self.config.items():
                f.attrs[k] = v

    def load_weights(self, model_file):
        for l in self.layers:
            l.load_weights(model_file)
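
A sketch of how the test-stage API above is meant to be driven: init_func caches the projected image features in _feat_shared and returns the initial state, and step_func then advances the decoder one word per call. The BOS/EOS vocabulary indices, the caption length cap, and the feature shape are assumptions for illustration:

model = Model()
BOS, EOS = 0, 1                                   # hypothetical begin/end-of-sentence ids
img_value = np.zeros((1, 30, 2048), dtype=theano.config.floatX)  # (1, npatch, nimg)
state = model.init_func(img_value)
word = np.array([BOS], dtype='int32')
caption = []
for _ in range(20):                               # arbitrary maximum length
    state, log_p = model.step_func(state, word)
    word = log_p.argmax(axis=1).astype('int32')   # greedy choice per step
    if word[0] == EOS:
        break
    caption.append(int(word[0]))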
Example 3
class Model(object):
    """
    Caption model conditioned on scene-specific contexts.
    """
    def __init__(self, name='ss', nimg=2048, nh=512, nw=512, nout=8843, ns=80, model_file=None):
        self.name = name
        if model_file is not None:
            with h5py.File(model_file, 'r') as f:
                nimg = f.attrs['nimg']
                nh = f.attrs['nh']
                nw = f.attrs['nw']
                ns = f.attrs['ns']
                nout = f.attrs['nout']
        self.config = {'nimg': nimg, 'nh': nh, 'nw': nw, 'nout': nout, 'ns': ns}

        # word embedding layer
        self.embedding = Embedding(n_emb=nout, dim_emb=nw, name=self.name+'@embedding')

        # initialization mlp layer
        self.proj_mlp = MLP(layer_sizes=[nimg, 2*nh], output_type='tanh', name=self.name+'@proj_mlp')

        # lstm
        self.lstm = BasicLSTM(dim_x=nw+ns, dim_h=nh, name=self.name+'@lstm')

        # prediction mlp
        self.pred_mlp = MLP(layer_sizes=[nh+nw, nout], output_type='softmax', name=self.name+'@pred_mlp')

        # inputs
        cap = T.imatrix('cap')
        img = T.matrix('img')
        scene = T.matrix('scene')
        self.inputs = [cap, img, scene]

        # go through sequence
        init_state = self.proj_mlp.compute(img)
        (state, self.p, loss), _ = theano.scan(fn=self.scan_func,
                                               sequences=[cap[0:-1, :], cap[1:, :]],
                                               outputs_info=[init_state, None, None],
                                               non_sequences=[scene])

        # loss function
        loss = T.mean(loss)
        self.costs = [loss]

        # layers and parameters
        self.layers = [self.embedding, self.proj_mlp, self.lstm, self.pred_mlp]
        self.params = sum([l.params for l in self.layers], [])

        # load weights from file, if model_file is not None
        if model_file is not None:
            self.load_weights(model_file)

        # initialization for test stage
        self._init_func = None
        self._step_func = None
        self._scene_shared = theano.shared(np.zeros((1, ns)).astype(theano.config.floatX))

    def compute(self, state, w_idx, scene):
        # word embedding
        word_vec = self.embedding.compute(w_idx)
        # split states
        c_tm1, h_tm1 = split_state(state, scheme=[(2, self.config['nh'])])
        # lstm step
        w_s = T.concatenate([word_vec, scene], axis=1)
        c_t, h_t = self.lstm.compute(w_s, c_tm1, h_tm1)
        # merge state
        new_state = T.concatenate([c_t, h_t], axis=-1)
        # add w_{t-1} as feature
        h_and_w = T.concatenate([h_t, word_vec], axis=-1)
        # predict probability
        p = self.pred_mlp.compute(h_and_w)
        return new_state, p

    def scan_func(self, w_tm1, w_t, state, scene):
        # update state
        new_state, p = self.compute(state, w_tm1, scene)
        # cross-entropy loss
        loss = T.nnet.categorical_crossentropy(p, w_t)
        return new_state, p, loss

    def init_func(self, img_value, scene_value):
        if self._init_func is None:
            img = T.matrix()
            init_state = self.proj_mlp.compute(img)
            self._init_func = theano.function([img], init_state)
        self._scene_shared.set_value(scene_value)
        return self._init_func(img_value)

    def step_func(self, state_value, w_value):
        if self._step_func is None:
            w = T.ivector()
            state = T.matrix()
            new_state, p = self.compute(state, w, self._scene_shared)
            self._step_func = theano.function([state, w], [new_state, T.log(p)])
        return self._step_func(state_value, w_value)

    def save_to_dir(self, save_dir, idx):
        save_file = osp.join(save_dir, self.name+'.h5.'+str(idx))
        for l in self.layers:
            l.save_weights(save_file)
        with h5py.File(save_file, 'a') as f:
            for k, v in self.config.items():
                f.attrs[k] = v

    def load_weights(self, model_file):
        for l in self.layers:
            l.load_weights(model_file)
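
Decoding works the same way as in the region-attention model, except that init_func also takes the ns-dimensional scene vector, which it caches in _scene_shared for later step_func calls. A minimal sketch with assumed shapes:

model = Model()
img_value = np.zeros((1, 2048), dtype=theano.config.floatX)   # (1, nimg)
scene_value = np.zeros((1, 80), dtype=theano.config.floatX)   # (1, ns)
state = model.init_func(img_value, scene_value)
word = np.array([0], dtype='int32')                           # hypothetical BOS id
state, log_p = model.step_func(state, word)                   # one decode step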
Example 4
class Model(object):
    """
    a re-implementation of the Google NIC system, used as the baseline in our paper
    """
    def __init__(self,
                 name='gnic',
                 nimg=2048,
                 nh=512,
                 nw=512,
                 nout=8843,
                 model_file=None):
        self.name = name
        if model_file is not None:
            with h5py.File(model_file, 'r') as f:
                nimg = f.attrs['nimg']
                nh = f.attrs['nh']
                nw = f.attrs['nw']
                nout = f.attrs['nout']
        self.config = {'nimg': nimg, 'nh': nh, 'nw': nw, 'nout': nout}

        # word embedding layer
        self.embedding = Embedding(n_emb=nout,
                                   dim_emb=nw,
                                   name=self.name + '@embedding')

        # initialization mlp layer
        self.proj_mlp = MLP(layer_sizes=[nimg, 2 * nh],
                            output_type='tanh',
                            name=self.name + '@proj_mlp')

        # lstm
        self.lstm = BasicLSTM(dim_x=nw, dim_h=nh, name=self.name + '@lstm')

        # prediction mlp
        self.pred_mlp = MLP(layer_sizes=[nh + nw, nout],
                            output_type='softmax',
                            name=self.name + '@pred_mlp')

        # inputs
        cap = T.imatrix('cap')
        img = T.matrix('img')
        self.inputs = [cap, img]

        # go through sequence
        init_state = self.proj_mlp.compute(img)
        (state, self.p,
         loss), _ = theano.scan(fn=self.scan_func,
                                sequences=[cap[0:-1, :], cap[1:, :]],
                                outputs_info=[init_state, None, None])

        # loss function
        loss = T.mean(loss)
        self.costs = [loss]

        # layers and parameters
        self.layers = [self.embedding, self.proj_mlp, self.lstm, self.pred_mlp]
        self.params = sum([l.params for l in self.layers], [])

        # load weights from file, if model_file is not None
        if model_file is not None:
            self.load_weights(model_file)

        # these functions are used in test stage
        self._init_func = None
        self._step_func = None

    def compute(self, state, w_idx):
        # word embedding
        word_vec = self.embedding.compute(w_idx)
        # split states
        c_tm1, h_tm1 = split_state(state, scheme=[(2, self.config['nh'])])
        # lstm step
        c_t, h_t = self.lstm.compute(word_vec, c_tm1, h_tm1)
        # merge state
        new_state = T.concatenate([c_t, h_t], axis=-1)
        # add w_{t-1} as feature
        h_and_w = T.concatenate([h_t, word_vec], axis=-1)
        # predict probability
        p = self.pred_mlp.compute(h_and_w)
        return new_state, p

    def scan_func(self, w_tm1, w_t, state):
        # update state
        new_state, p = self.compute(state, w_tm1)
        # cross-entropy loss
        loss = T.nnet.categorical_crossentropy(p, w_t)
        return new_state, p, loss

    def init_func(self, img_value):
        if self._init_func is None:
            img = T.matrix()
            init_state = self.proj_mlp.compute(img)
            self._init_func = theano.function([img], init_state)
        return self._init_func(img_value)

    def step_func(self, state_value, w_value):
        if self._step_func is None:
            w = T.ivector()
            state = T.matrix()
            new_state, p = self.compute(state, w)
            self._step_func = theano.function([state, w],
                                              [new_state, T.log(p)])
        return self._step_func(state_value, w_value)

    def save_to_dir(self, save_dir, idx):
        save_file = osp.join(save_dir, self.name + '.h5.' + str(idx))
        for l in self.layers:
            l.save_weights(save_file)
        with h5py.File(save_file, 'a') as f:
            for k, v in self.config.items():
                f.attrs[k] = v

    def load_weights(self, model_file):
        for l in self.layers:
            l.load_weights(model_file)
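
split_state is imported from the project's utilities and is not shown in these examples. From its call sites (scheme=[(2, nh)] yields c and h; scheme=[(1, na), (2, nh)] yields e, c, h), its contract appears to be slicing a concatenated state matrix column-wise. A plausible reconstruction, offered only as an illustration of that contract:

def split_state(state, scheme):
    # each (count, size) entry produces `count` consecutive slices of width `size`
    outputs, offset = [], 0
    for count, size in scheme:
        for _ in range(count):
            outputs.append(state[:, offset:offset + size])
            offset += size
    return outputs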
Example 5
class Model(object):
    """
    Region Attention model
    """
    def __init__(self, name='ra', nimg=2048, na=512, nh=512, nw=512, nout=8843, npatch=30, model_file=None):
        self.name = name
        if model_file is not None:
            with h5py.File(model_file, 'r') as f:
                nimg = f.attrs['nimg']
                na = f.attrs['na']
                nh = f.attrs['nh']
                nw = f.attrs['nw']
                nout = f.attrs['nout']
                # npatch = f.attrs['npatch']
        self.config = {'nimg': nimg, 'na': na, 'nh': nh, 'nw': nw, 'nout': nout, 'npatch': npatch}

        # word embedding layer
        self.embedding = Embedding(n_emb=nout, dim_emb=nw, name=self.name+'@embedding')

        # initialization mlp layer
        self.init_mlp = MLP(layer_sizes=[na, 2*nh], output_type='tanh', name=self.name+'@init_mlp')
        self.proj_mlp = MLP(layer_sizes=[nimg, na], output_type='tanh', name=self.name+'@proj_mlp')

        # lstm
        self.lstm = BasicLSTM(dim_x=na+nw, dim_h=nh, name=self.name+'@lstm')

        # prediction mlp
        self.pred_mlp = MLP(layer_sizes=[na+nh+nw, nout], output_type='softmax', name=self.name+'@pred_mlp')

        # attention layer
        self.attention = Attention(dim_item=na, dim_context=na+nw+nh, hsize=nh, name=self.name+'@attention')

        # inputs
        cap = T.imatrix('cap')
        img = T.tensor3('img')
        self.inputs = [cap, img]

        # go through sequence
        feat = self.proj_mlp.compute(img)
        init_e = feat.mean(axis=1)
        init_state = T.concatenate([init_e, self.init_mlp.compute(init_e)], axis=-1)
        (state, self.p, loss, self.alpha), _ = theano.scan(fn=self.scan_func,
                                                           sequences=[cap[0:-1, :], cap[1:, :]],
                                                           outputs_info=[init_state, None, None, None],
                                                           non_sequences=[feat])

        # loss function
        loss = T.mean(loss)
        self.costs = [loss]

        # layers and parameters
        self.layers = [self.embedding, self.init_mlp, self.proj_mlp, self.attention, self.lstm, self.pred_mlp]
        self.params = sum([l.params for l in self.layers], [])

        # load weights from file, if model_file is not None
        if model_file is not None:
            self.load_weights(model_file)

        # these functions and variables are used in test stage
        self._init_func = None
        self._step_func = None
        self._proj_func = None
        self._feat_shared = theano.shared(np.zeros((1, npatch, na)).astype(theano.config.floatX))

    def compute(self, state, w_idx, feat):
        # word embedding
        word_vec = self.embedding.compute(w_idx)
        # split states
        e_tm1, c_tm1, h_tm1 = split_state(state, scheme=[(1, self.config['na']), (2, self.config['nh'])])
        # attention
        e_t, alpha = self.attention.compute(feat, T.concatenate([e_tm1, h_tm1, word_vec], axis=1))
        # lstm step
        e_w = T.concatenate([e_t, word_vec], axis=-1)
        c_t, h_t = self.lstm.compute(e_w, c_tm1, h_tm1)  # (mb,nh)
        # merge state
        new_state = T.concatenate([e_t, c_t, h_t], axis=-1)
        # predict word probability
        p = self.pred_mlp.compute(T.concatenate([e_t, h_t, word_vec], axis=-1))
        return new_state, p, alpha

    def scan_func(self, w_tm1, w_t, state, feat):
        # update state
        new_state, p, alpha = self.compute(state, w_tm1, feat)
        # cross-entropy loss
        loss = T.nnet.categorical_crossentropy(p, w_t)
        return new_state, p, loss, alpha

    def init_func(self, img_value):
        if self._proj_func is None:
            img = T.tensor3()
            self._proj_func = theano.function([img], self.proj_mlp.compute(img))
        if self._init_func is None:
            init_e = self._feat_shared.mean(axis=1)
            init_state = T.concatenate([init_e, self.init_mlp.compute(init_e)], axis=-1)
            self._init_func = theano.function([], init_state)
        self._feat_shared.set_value(self._proj_func(img_value))
        return self._init_func()

    def step_func(self, state_value, w_value):
        if self._step_func is None:
            w = T.ivector()
            state = T.matrix()
            new_state, p, _ = self.compute(state, w, self._feat_shared)
            self._step_func = theano.function([state, w], [new_state, T.log(p)])
        return self._step_func(state_value, w_value)

    def save_to_dir(self, save_dir, idx):
        save_file = osp.join(save_dir, self.name+'.h5.'+str(idx))
        for l in self.layers:
            l.save_weights(save_file)
        with h5py.File(save_file, 'a') as f:
            for k, v in self.config.items():
                f.attrs[k] = v

    def load_weights(self, model_file):
        for l in self.layers:
            l.load_weights(model_file)
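
All of the models above share one persistence convention: save_to_dir writes each layer's weights plus the config attributes into a single <name>.h5.<idx> file, and passing that file back as model_file rebuilds the model with the stored sizes before load_weights restores the parameters. A round-trip sketch (the directory and idx are arbitrary):

model = Model()
model.save_to_dir('/tmp', idx='best')             # writes /tmp/ra.h5.best
restored = Model(model_file='/tmp/ra.h5.best')    # dims read from f.attrs, weights from the layers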