Example #1
    def fprop(self,
              state_before,
              mem_before,
              cell_before,
              forget_below,
              input_below,
              output_below,
              state_below):
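        # LSTM-style step: the gates combine the inputs from the layer below with
        # forked projections of the previous hidden state and the previous memory.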

        state_fork_outs = self.state_before_fork_layer.fprop(state_before)
        mem_fork_outs = self.mem_before_fork_layer.fprop(mem_before)

        inp = Sigmoid(input_below + mem_fork_outs[self.mbf_names[1]] + \
                state_fork_outs[self.sbf_names[1]])

        output = Sigmoid(output_below + mem_fork_outs[self.mbf_names[2]] + \
                state_fork_outs[self.sbf_names[2]])

        forget = Sigmoid(forget_below + mem_fork_outs[self.mbf_names[0]] + \
                state_fork_outs[self.sbf_names[0]])

        cell = Tanh(state_below + mem_fork_outs[self.mbf_names[3]] +
                state_fork_outs[self.sbf_names[3]])

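        # Cell and hidden-state updates.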
        c_t = inp * cell + forget * cell_before
        h_t = output * self.activ(c_t)

        return h_t, c_t
Example #2
def build_attention(tparams,
                    options,
                    desc,
                    desc_mask,
                    dlen,
                    q,
                    q_mask=None,
                    sfx=None,
                    name=None):

    if desc.ndim != desc_mask.ndim:
        desc_mask_ = desc_mask.dimshuffle(0, 1, 'x')
    else:
        desc_mask_ = desc_mask

    assert desc.ndim == desc_mask_.ndim

    if q_mask is not None:
        assert q.ndim == q_mask.ndim
        q *= q_mask

    masked_desc = desc * desc_mask_

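    # Flatten the (time, batch) axes so the feed-forward projection applies to every position.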
    desc_in = desc.reshape((-1, desc.shape[-1]))
    projd = get_layer('ff')[1](tparams=tparams,
                               state_below=desc_in,
                               options=options,
                               prefix='ff_att_ctx',
                               activ='Linear')

    projq = get_layer('ff')[1](tparams, q,
                               options,
                               prefix='ff_att_q',
                               use_bias=False,
                               activ='Linear')

    """
    Unnormalized dist metric between the rep of desc and q.
    """
    sim_vals = 0
    if options['use_dq_sims']:
        q_proj = dot(q, tparams['ff_att_bi_dq'])
        desc_proj = dot(masked_desc,
                        tparams['ff_att_bi_dq']).reshape((masked_desc.shape[0],
                        masked_desc.shape[1], -1))
        sim_vals = (desc_proj * q_proj.dimshuffle('x', 0, 1)).sum(-1)
        sim_vals = sim_vals.dimshuffle(0, 1, 'x')

    projd = projd.reshape((masked_desc.shape[0], masked_desc.shape[1], -1))

    # Intermediate layer for the annotation values.
    proj_att = Tanh(projd + projq.dimshuffle('x', 0, 1) + sim_vals)
    W_proj = tparams['ff_att_proj'].dimshuffle('x', 'x', 0)
    dot_proj = (W_proj * proj_att).sum(-1)
    pre_softmax = dot_proj
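    # Normalize the scores over description positions; the context is the attention-weighted sum.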
    alphas = Masked_Softmax(pre_softmax, mask=desc_mask, ax=0).dimshuffle(0, 1, 'x')
    ctx = (masked_desc * alphas).sum(0)

    return ctx, alphas
Example #3
    def _step_slice(mask,
                    sbelow,
                    sbelowx,
                    xc_, sbefore,
                    ctx_, alpha_,
                    pctx_, cc_,
                    U, Wc,
                    Wd_att, U_att,
                    c_tt, Ux, Wcx):
        # attention
        pstate_ = dot(sbefore, Wd_att)
        pctx__ = pctx_ + pstate_[None, :, :]
        pctx__ += xc_
        pctx__ = Tanh(pctx__)
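        # Alignment scores: exponentiate, mask out padded source positions, and normalize.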
        alpha = dot(pctx__, U_att)+c_tt
        alpha = alpha.reshape([alpha.shape[0], alpha.shape[1]])
        alpha = tensor.exp(alpha)
        if context_mask:
            alpha = alpha * context_mask

        alpha = alpha / alpha.sum(0, keepdims=True)
        # current context
        ctx_ = (cc_ * alpha[:, :, None]).sum(0)

        preact = dot(sbefore, U)
        preact += sbelow
        preact += dot(ctx_, Wc)
        preact = Sigmoid(preact)

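        # GRU reset (r) and update (u) gates.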
        r = _slice(preact, 0, dim)
        u = _slice(preact, 1, dim)

        preactx = dot(sbefore, Ux)
        preactx *= r
        preactx += sbelowx
        preactx += dot(ctx_, Wcx)

        h = Tanh(preactx)

        h = u * sbefore + (1. - u) * h
        h = mask[:, None] * h + (1. - mask)[:, None] * sbefore

        return h, ctx_, alpha.T
Example #4
    def _step(mask, sbelow, sbefore, cell_before):
        preact = dot(sbefore, param('U'))
        preact += sbelow
        preact += tparams[prfx(prefix, 'b')]

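        # Coupled-gate LSTM variant: the input gate is tied to (1 - f).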
        f = Sigmoid(_slice(preact, 0, dim))
        o = Sigmoid(_slice(preact, 1, dim))
        c = Tanh(_slice(preact, 2, dim))

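        # Cell and hidden updates; the mask keeps the previous state at padded timesteps.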
        c = f * cell_before + (1 - f) * c
        c = mask * c + (1. - mask) * cell_before
        h = o * tensor.tanh(c)
        h = mask * h + (1. - mask) * sbefore

        return h, c
Example #5
    def _step(mask, sbelow, sbefore, cell_before, *args):
        preact = dot(sbefore, param('U'))
        preact += sbelow
        preact += param('b')

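        # Standard LSTM gates: input (i), forget (f), output (o) and candidate cell (c).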
        i = Sigmoid(_slice(preact, 0, dim))
        f = Sigmoid(_slice(preact, 1, dim))
        o = Sigmoid(_slice(preact, 2, dim))
        c = Tanh(_slice(preact, 3, dim))

        c = f * cell_before + i * c
        c = mask * c + (1. - mask) * cell_before
        h = o * tensor.tanh(c)
        h = mask * h + (1. - mask) * sbefore

        return h, c
Example #6
    def _step_slice(mask, sbelow, sbelowx, sbefore, U, Ux):
        preact = dot(sbefore, U)
        preact += sbelow

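        # Reset (r) and update (u) gates of the GRU.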
        r = Sigmoid(_slice(preact, 0, dim))
        u = Sigmoid(_slice(preact, 1, dim))

        preactx = dot(r * sbefore, Ux)

        preactx = preactx + sbelowx

        h = Tanh(preactx)

        h = u * sbefore + (1. - u) * h
        h = mask[:, None] * h + (1. - mask)[:, None] * sbefore

        return h
Example #7
def build_model(tparams,
                options,
                prepare_data_fn,
                valid=None,
                cost_mask=None):

    opt_ret = dict()

    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # description: #words x #samples
    if options['use_sent_reps']:
        x = tensor.tensor3('desc', dtype='uint32')
        word_mask = tensor.tensor3('desc_mask', dtype='float32')
        sent_mask = tensor.cast(word_mask.sum(0) > 0, "float32")
        slen = tensor.scalar('slen', dtype='uint32')
    else:
        x = tensor.matrix('desc', dtype="uint32")
        word_mask = tensor.matrix('desc_mask', dtype='float32')

    q = tensor.matrix('q', dtype="uint32")
    q_mask = tensor.matrix('q_mask', dtype="float32")
    y = tensor.vector('ans', dtype='uint32')
    em = tensor.matrix('entity_mask', dtype="float32")

    wlen = tensor.scalar('wlen', dtype='uint32')
    qlen = tensor.scalar('qlen', dtype='uint32')
    if options['debug']:
        if valid.done:
            valid.reset()

        valid_d = next(valid)
        d_, q_, a_, em_ = valid_d[0], valid_d[1], valid_d[2], valid_d[3]

        if options['use_sent_reps']:
            d_, d_mask_, q_, q_mask_, wlen_, slen_, qlen_ = prepare_data_fn(d_, q_)
        else:
            d_, d_mask_, q_, q_mask_, wlen_, qlen_ = prepare_data_fn(d_, q_)

        print "Debugging is enabled."

        theano.config.compute_test_value = 'warn'
        x.tag.test_value = numpy.array(d_).astype("uint32")
        word_mask.tag.test_value = numpy.array(d_mask_).astype("float32")
        q.tag.test_value = numpy.array(q_).astype("uint32")
        q_mask.tag.test_value = numpy.array(q_mask_).astype("float32")
        y.tag.test_value = numpy.array(a_).astype("uint32")
        em.tag.test_value = numpy.array(em_).astype("float32")
        wlen.tag.test_value = numpy.array(wlen_).astype("uint32")
        qlen.tag.test_value = numpy.array(qlen_).astype("uint32")

        if options['use_sent_reps']:
            slen.tag.test_value = numpy.array(slen_).astype("uint32")
            sent_mask.tag.test_value = numpy.array(d_mask_.sum(0) > 0, dtype="float32")

    if x.ndim == 3:
        x_rshp = x.reshape((x.shape[0], x.shape[1]*x.shape[2]))
    else:
        x_rshp = x

    """
        Bidirectional for the description.
    """
    if options['use_bidir']:
        proj_wx, proj_wxr = build_bidir_model(x_rshp,
                                              word_mask,
                                              tparams,
                                              options,
                                              sfx="word",
                                              nsteps=wlen,
                                              truncate=options['truncate'],
                                              use_dropout=options['use_dropout'],
                                              use_noise=use_noise,
                                              name="encoder_desc_word")

        desc_wrep = concatenate([proj_wx[0],
                                proj_wxr[0][::-1]],
                                axis=-1)
    else:
        proj_wx = build_nonbidir_model(x_rshp,
                                       word_mask,
                                       tparams,
                                       options,
                                       sfx="word",
                                       nsteps=wlen,
                                       truncate=options['truncate'],
                                       use_dropout=options['use_dropout'],
                                       use_noise=use_noise,
                                       name="encoder_desc_word")
        desc_wrep = proj_wx

    if options['use_bidir']:
        if options['use_sent_reps']:
            desc_wrep = desc_wrep.reshape((x.shape[0],
                                           x.shape[1],
                                           x.shape[2],
                                           -1))

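            # Masked mean over the words within each sentence gives the sentence-level inputs.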
            mean_desc_wrep = ((desc_wrep * word_mask.dimshuffle(0, 1, 2, 'x')).sum(0) /
                (word_mask.sum(0).dimshuffle(0, 1, 'x') + 1e-8))

            proj_sx, proj_sxr = build_bidir_model(mean_desc_wrep,
                                                  sent_mask,
                                                  tparams,
                                                  options,
                                                  sfx="sent",
                                                  nsteps=slen,
                                                  truncate=options['truncate'],
                                                  name="encoder_desc_sent")

            proj_x, proj_xr = proj_sx, proj_sxr
            desc_mask = sent_mask.dimshuffle(0, 1, 'x')
        else:
            proj_x, proj_xr = proj_wx, proj_wxr
            desc_mask = word_mask.dimshuffle(0, 1, 'x')

        """
        Build question bidir RNN
        """
        proj_q, proj_qr = build_bidir_model(q, q_mask,
                                            tparams,
                                            options, sfx="word",
                                            nsteps=qlen,
                                            truncate=options['truncate'],
                                            use_dropout=options['use_dropout'],
                                            use_noise=use_noise,
                                            name="encoder_q")

        desc_rep = concatenate([proj_x[0],
                                proj_xr[0][::-1]],
                                axis=-1)

        q_rep = concatenate([proj_q[0][-1],
                            proj_qr[0][::-1][0]],
                            axis=-1)

    else:
        if options['use_sent_reps']:
            desc_wrep = desc_wrep.reshape((x.shape[0],
                                           x.shape[1],
                                           x.shape[2],
                                           -1))

            mean_desc_wrep = ((desc_wrep * word_mask.dimshuffle(0, 1, 2, 'x')).sum(0) /
                (word_mask.sum(0).dimshuffle(0, 1, 'x') + 1e-8))

            proj_sx = build_nonbidir_model(mean_desc_wrep,
                                           sent_mask,
                                           tparams,
                                           options,
                                           sfx="sent",
                                           nsteps=slen,
                                           truncate=options['truncate'],
                                           name="encoder_desc_sent")
            proj_x = proj_sx
            desc_mask = sent_mask.dimshuffle(0, 1, 'x')
        else:
            proj_x = proj_wx
            desc_mask = word_mask.dimshuffle(0, 1, 'x')
        """
        Build question bidir RNN
        """
        proj_q = build_nonbidir_model(q, q_mask,
                                      tparams,
                                      options, sfx="word",
                                      nsteps=qlen,
                                      truncate=options['truncate'],
                                      use_dropout=options['use_dropout'],
                                      use_noise=use_noise,
                                      name="encoder_q")

        desc_rep = proj_x
        q_rep = proj_q[-1]

    g_desc_ave = 0.

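    # Optional skip connection: project the masked mean of the description
    # representation directly into the output space.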
    if options['use_desc_skip_c_g']:
        desc_mean = (desc_rep * desc_mask).sum(0) / \
                tensor.cast(desc_mask.sum(0), 'float32')

        g_desc_ave = get_layer('ff')[1](tparams,
                                        desc_mean,
                                        options,
                                        prefix='ff_out_mean_d',
                                        use_bias=False,
                                        activ='Linear')

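    # Attend over the description (sentences or words) conditioned on the question representation.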
    desc_ctx, alphas = build_attention(tparams,
                                       options,
                                       desc_rep,
                                       sent_mask if options['use_sent_reps'] else word_mask,
                                       slen if options['use_sent_reps'] else wlen,
                                       q=q_rep)

    opt_ret['dec_alphas'] = alphas
    opt_ret['desc_ctx'] = desc_ctx

    g_ctx = get_layer('ff')[1](tparams,
                               desc_ctx,
                               options,
                               prefix='ff_out_ctx',
                               use_bias=False,
                               activ='Linear')

    g_q = get_layer('ff')[1](tparams,
                             q_rep,
                             options,
                             prefix='ff_out_q',
                             activ='Linear')

    if options['use_elu_g']:
        g_out = ELU(g_ctx + g_q + g_desc_ave)
    else:
        g_out = Tanh(g_ctx + g_q + g_desc_ave)

    if options['use_dropout']:
        g_out = dropout_layer(g_out, use_noise,
                              p=options['dropout_rate'])

    logit = get_layer('ff')[1](tparams,
                               g_out,
                               options,
                               prefix='ff_logit',
                               activ='Linear')

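    # Answer distribution over the candidates and the training cost.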
    probs = Softmax(logit)
    hinge_cost = multiclass_hinge_loss(probs, y)

    # compute the cost
    cost, errors, ent_errors, ent_derrors = nll_simple(y,
                                                       probs,
                                                       cost_ent_mask=cost_mask,
                                                       cost_ent_desc_mask=em)
    # cost = cost + 1e-2 * hinge_cost
    # cost = hinge_cost
    vals = OrderedDict({'desc': x,
                        'word_mask': word_mask,
                        'q': q,
                        'q_mask': q_mask,
                        'ans': y,
                        'wlen': wlen,
                        'ent_mask': em,
                        'qlen': qlen})

    if options['use_sent_reps']:
        vals['slen'] = slen

    return trng, use_noise, vals, opt_ret, \
            cost, errors, ent_errors, ent_derrors, \
            probs
Example #8
    def fprop(self,
              state_below,
              memory,
              w_t_before,
              w_t_pre_before=None,
              time_idxs=None):

        if time_idxs is None:
            logger.info("Time indices are empty!")
            time_idxs = self.time_idxs

        fork_outs = self.state_fork_layer.fprop(state_below)
        idx = 0
        # First things first, content based addressing:
        if not self.use_local_att:
            beta_pre = fork_outs[self.names[0]]
            beta = TT.nnet.softplus(beta_pre).reshape((beta_pre.shape[0],))

            if (state_below.ndim != beta.ndim and beta.ndim == 2
                    and state_below.ndim == 3):
                beta = beta.reshape((state_below.shape[0], state_below.shape[1]))
            elif (state_below.ndim != beta.ndim and beta.ndim == 1
                    and state_below.ndim == 2):
                beta = beta.reshape((state_below.shape[0],))
            else:
                raise ValueError("Unknown shape for beta!")
            beta = TT.shape_padright(beta)
            idx = 1

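        # Key used for content-based similarity against the memory.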
        key_pre = fork_outs[self.names[idx]]
        idx += 1
        key_t = key_pre
        sim_vals = self.mem_similarity(key_t, memory)

        weights = sim_vals
        new_pre_weights = None

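        # Smoothed differential weights: subtract a scaled copy of the previous
        # (exponentially averaged) weights from the current similarities.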
        if self.smoothed_diff_weights:
            dw_scaler = fork_outs[self.names[idx]]
            dw_scaler = TT.addbroadcast(dw_scaler, 1)
            weights = sim_vals - Sigmoid(dw_scaler) * w_t_pre_before
            new_pre_weights = self.mem_weight_decay * sim_vals + (1 - \
                    self.mem_weight_decay) * w_t_pre_before
            idx += 1
        std = 5

        """
        if self.use_local_att:
            mean = as_floatX(self.mem_nel) * Sigmoid(weights*self.mean_pred.fprop(state_below))
            exp_ws = -(time_idxs - mean)**2 / (2.0 * std)
            weights = exp_ws * weights
        """

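        # Content-based addressing weights: softmax over memory slots, sharpened by
        # beta unless local attention is used.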
        if self.use_local_att:
            w_tc = softmax3(weights) if weights.ndim == 3 else TT.nnet.softmax(weights)
        else:
            if weights.ndim == 3 and beta.ndim == 2:
                beta = beta.dimshuffle('x', 0, 1)
                w_tc = softmax3(weights * beta)
            else:
                # Content based weights:
                w_tc = TT.nnet.softmax(weights * beta)

        if self.use_local_att:
            first_loc_layer = Tanh(self.state_below_local.fprop(state_below) +\
                    self.weights_below_local.fprop(weights))
            mean = as_floatX(self.mem_nel) * Sigmoid(self.mean_pred.fprop(first_loc_layer))
            mean = TT.addbroadcast(mean, 1)
            exp_ws = TT.exp(-((time_idxs - mean)**2) / (2.0 * std))
            w_tc = exp_ws * w_tc
            w_tc = w_tc / w_tc.sum(axis=1, keepdims=True)

        if self.use_loc_based_addressing:
            # Location based addressing:
            g_t_pre = fork_outs[self.names[idx]]
            g_t = Sigmoid(g_t_pre).reshape((g_t_pre.shape[0],))

            if (state_below.ndim != g_t.ndim and g_t.ndim == 2
                    and state_below.ndim == 3):
                g_t = g_t.reshape((state_below.shape[0], state_below.shape[1]))
            elif (state_below.ndim != g_t.ndim and g_t.ndim == 1
                    and state_below.ndim == 2):
                g_t = g_t.reshape((state_below.shape[0],))
            else:
                raise ValueError("Unknown shape for g_t!")

            g_t = TT.shape_padright(g_t)
            w_tg = g_t * w_tc + (1 - g_t) * w_t_before
            shifts_pre = fork_outs[self.names[idx + 1]]

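            # Build the shift distribution, optionally combining shifts at several scales.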
            if shifts_pre.ndim == 2:
                if self.use_multiscale_shifts:

                    if self.use_scale_layer:
                        scales = TT.exp(self.scale_layer.fprop(state_below))
                        scales = scales.dimshuffle(0, 'x', 1)
                    else:
                        scales = TT.exp(TT.arange(self.scale_size).dimshuffle('x', 'x', 0))

                    shifts_pre = shifts_pre.reshape((state_below.shape[0],
                                                     -1,
                                                     self.scale_size))

                    shifts_pre = (shifts_pre * scales).sum(-1)

                    if self.shift_width >= 0:
                        shifts_pre = shifts_pre.reshape((-1, self.shift_width, 1))

                elif self.shift_width >= 0:
                    shifts_pre = shifts_pre.reshape((-1, self.shift_width, 1))
                else:
                    shifts_pre = shifts_pre.reshape(
                        (state_below.shape[0], self.mem_nel))

                if state_below.ndim == 3:
                    shifts_pre = shifts_pre.dimshuffle(0, 1, 'x')
                    shifts_pre = shifts_pre - shifts_pre.max(1, keepdims=True).dimshuffle(0, 'x', 'x')
                else:
                    shifts_pre = shifts_pre.dimshuffle(0, 1)
                    shifts_pre = shifts_pre - shifts_pre.max(1, keepdims=True)
                    shifts_pre = shifts_pre.dimshuffle(0, 1, 'x')
            elif shifts_pre.ndim == 1:
                if self.use_multiscale_shifts:
                    if self.use_scale_layer:
                        scales = TT.exp(self.scale_layer.fprop(state_below))
                    else:
                        scales = TT.exp(TT.arange(self.scale_size))

                    shifts_pre = shifts_pre.reshape((-1, self.scale_size))
                    shifts_pre = (shifts_pre * scales).sum(-1)
                    if self.shift_width >= 0:
                        shifts_pre = shifts_pre.reshape((-1, self.shift_width, 1))
                    if self.shift_width >= 0:
                        shifts_pre = shifts_pre.reshape((-1, 1))
                elif self.shift_width >= 0:
                    shifts_pre = shifts_pre.reshape((-1, 1))
                else:
                    shifts_pre = shifts_pre.reshape((self.mem_nel,))

                if state_below.ndim == 2:
                    shifts_pre = TT.shape_padright(shifts_pre)
                    shifts_pre = shifts_pre - shifts_pre.max(0, keepdims=True)

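            # Normalize the shift scores into a distribution.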
            shifts = TT.exp(shifts_pre)
            if shifts.ndim == 2:
                shifts = shifts / shifts.sum(axis=0, keepdims=True)
            elif shifts.ndim == 3:
                shifts = shifts / shifts.sum(axis=1, keepdims=True)

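            # Rotate the gated weights by the shift distribution (circular convolution).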
            CC = CircularConvolveAdvIndexing if self.use_adv_indexing else\
                    CircularConvolve

            w_t_hat = CC()(weights=w_tg, shifts=shifts,
                           mem_size=self.mem_nel,
                           shift_width=self.shift_width)

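            # Sharpen the shifted weights with gamma (or renormalize with a softmax
            # when REINFORCE is used).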
            if self.use_reinforce:
                if w_t_hat.ndim == 2:
                    w_t = TT.nnet.softmax(w_t_hat)
                elif w_t_hat.ndim == 3:
                    w_t = softmax3(w_t_hat)
            else:
                gamma_pre = fork_outs[self.names[4]]
                assert w_t_hat.ndim == gamma_pre.ndim, ("The number of dimensions of "
                                                        "w_t_hat and gamma_pre should "
                                                        "be the same.")

                if gamma_pre.ndim != 1:
                    gamma_pre = gamma_pre.reshape((gamma_pre.shape[0],))

                gamma_pre = TT.shape_padright(gamma_pre)
                gamma = TT.nnet.softplus(gamma_pre) + const(1)

                w_t = (abs(w_t_hat + const(1e-16))**gamma) + const(1e-42)
                if (state_below.ndim != w_t.ndim and w_t.ndim == 2
                        and state_below.ndim == 3):
                    w_t = w_t.reshape((state_below.shape[0], state_below.shape[1]))
                    w_t = w_t.dimshuffle(0, 1, 'x')
                elif (state_below.ndim != w_t.ndim and w_t.ndim == 1
                        and state_below.ndim == 2):
                    w_t = w_t.reshape((state_below.shape[0],))
                    w_t = w_t.dimshuffle(0, 'x')

                if w_t.ndim in (2, 3):
                    w_t = w_t / (w_t.sum(axis=-1, keepdims=True) + const(1e-6))
        else:
            w_t = w_tc

        return [w_t], [new_pre_weights]