Example #1
# These snippets assume the usual Theano-era imports below, plus shared
# helpers defined elsewhere in the same codebase (get_layer, dot, Tanh, ELU,
# Softmax, Masked_Softmax, dropout_layer, concatenate, norm_weight, norm_vec,
# multiclass_hinge_loss, nll_simple, build_nonbidir_model, logger).
from collections import OrderedDict

import numpy
import theano
from theano import tensor
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
def build_attention(tparams,
                    options,
                    desc,
                    desc_mask,
                    dlen,
                    q,
                    q_mask=None,
                    sfx=None,
                    name=None):

    if desc.ndim != desc_mask.ndim:
        # Broadcast the 2D mask over the feature dimension.
        desc_mask_ = desc_mask.dimshuffle(0, 1, 'x')
    else:
        desc_mask_ = desc_mask

    assert desc.ndim == desc_mask_.ndim

    if q_mask is not None:
        assert q.ndim == q_mask.ndim
        q *= q_mask

    masked_desc = desc * desc_mask_

    # Flatten the (time, batch) axes so the feedforward layer sees a matrix.
    desc_in = desc.reshape((-1, desc.shape[-1]))
    projd = get_layer('ff')[1](tparams=tparams,
                               state_below=desc_in,
                               options=options,
                               prefix='ff_att_ctx',
                               activ='Linear')

    projq = get_layer('ff')[1](tparams, q,
                               options,
                               prefix='ff_att_q',
                               use_bias=False,
                               activ='Linear')

    """
    Unnormalized dist metric between the rep of desc and q.
    """
    sim_vals = 0
    if options['use_dq_sims']:
        q_proj = dot(q, tparams['ff_att_bi_dq'])
        desc_proj = dot(masked_desc, tparams['ff_att_bi_dq'])
        desc_proj = desc_proj.reshape((masked_desc.shape[0],
                                       masked_desc.shape[1], -1))
        sim_vals = (desc_proj * q_proj.dimshuffle('x', 0, 1)).sum(-1)
        sim_vals = sim_vals.dimshuffle(0, 1, 'x')

    projd = projd.reshape((masked_desc.shape[0], masked_desc.shape[1], -1))

    # Intermediate layer for the annotation values.
    proj_att = Tanh(projd + projq.dimshuffle('x', 0, 1) + sim_vals)
    W_proj = tparams['ff_att_proj'].dimshuffle('x', 'x', 0)
    dot_proj = (W_proj * proj_att).sum(-1)
    pre_softmax = dot_proj
    # Normalize the scores over the time axis, then pool the description.
    alphas = Masked_Softmax(pre_softmax, mask=desc_mask, ax=0).dimshuffle(0, 1, 'x')
    ctx = (masked_desc * alphas).sum(0)

    return ctx, alphas
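To make the shape bookkeeping concrete, here is a minimal NumPy sketch (not from the source) of the masked-softmax pooling that Masked_Softmax and the final ctx computation perform; the toy dimensions are assumed:

import numpy as np

def masked_softmax(scores, mask, axis=0):
    # Exponentiate with max-subtraction for stability, zero out padding,
    # then renormalize over `axis`.
    e = np.exp(scores - scores.max(axis=axis, keepdims=True)) * mask
    return e / (e.sum(axis=axis, keepdims=True) + 1e-8)

T, B, D = 5, 2, 4                                # time steps, batch, hidden dim
desc = np.random.randn(T, B, D).astype('float32')
mask = np.ones((T, B), dtype='float32')
mask[3:, 0] = 0.                                 # first sample is 3 steps long

scores = np.random.randn(T, B).astype('float32') # pre-softmax energies
alphas = masked_softmax(scores, mask)            # (T, B); sums to 1 over T
ctx = (desc * alphas[:, :, None]).sum(0)         # (B, D) attention-pooled context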
Example #2
def build_bidir_model(inp,
                      inp_mask,
                      tparams,
                      options,
                      sfx=None,
                      nsteps=None,
                      use_dropout=False,
                      use_noise=None,
                      truncate=None,
                      name=None):

    if use_dropout:
        assert use_noise is not None

    assert name is not None
    assert sfx is not None

    # The embeddings are reversed below for the backward pass, so only the
    # mask needs to be flipped here.
    inpr_mask = inp_mask[::-1]

    n_timesteps = inp.shape[0]
    n_samples = inp.shape[1]

    emb = dot(inp, tparams['Wemb_%s' % sfx])
    emb = emb.reshape([n_timesteps, n_samples, -1])

    if use_dropout:
        emb = dropout_layer(emb, use_noise,
                            p=options['dropout_rate'])

    """
    Forward RNN
    """
    proj = get_layer(options[name])[1](tparams=tparams,
                                       state_below=emb,
                                       options=options,
                                       prefix=name,
                                       nsteps=nsteps,
                                       truncate=truncate,
                                       mask=inp_mask)

    """
    Reverse RNN.
    """
    #embr = dot(inpr, tparams['Wemb_%s' % sfx])
    embr = emb[::-1]#embr.reshape([n_timesteps, n_samples, -1])
    projr = get_layer(options[name])[1](tparams=tparams,
                                        state_below=embr,
                                        options=options,
                                        prefix=name + "_r",
                                        nsteps=nsteps,
                                        truncate=truncate,
                                        mask=inpr_mask)
    return proj, projr
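Downstream (see build_model in Example #3), the two directions are merged by flipping the backward states back into forward time order and concatenating along the feature axis. A minimal NumPy sketch of that merge, with assumed toy shapes:

import numpy as np

T, B, H = 6, 3, 8                     # time, batch, hidden dim (assumed)
h_fwd = np.random.randn(T, B, H)      # forward RNN states
h_bwd = np.random.randn(T, B, H)      # backward RNN states, reversed in time

# Flip the backward states so step t of both tensors describes the same
# token, then concatenate into a 2H-dimensional bidirectional representation.
h_bidir = np.concatenate([h_fwd, h_bwd[::-1]], axis=-1)
assert h_bidir.shape == (T, B, 2 * H)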
Example #3
def build_model(tparams,
                options,
                prepare_data_fn,
                valid=None,
                cost_mask=None):

    opt_ret = dict()

    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # Description input: #words x #samples, or #words x #sents x #samples
    # when sentence-level representations are used.
    if options['use_sent_reps']:
        x = tensor.tensor3('desc', dtype='uint32')
        word_mask = tensor.tensor3('desc_mask', dtype='float32')
        sent_mask = tensor.cast(word_mask.sum(0) > 0, "float32")
        slen = tensor.scalar('slen', dtype='uint32')
    else:
        x = tensor.matrix('desc', dtype="uint32")
        word_mask = tensor.matrix('desc_mask', dtype='float32')

    q = tensor.matrix('q', dtype="uint32")
    q_mask = tensor.matrix('q_mask', dtype="float32")
    y = tensor.vector('ans', dtype='uint32')
    em = tensor.matrix('entity_mask', dtype="float32")

    wlen = tensor.scalar('wlen', dtype='uint32')
    qlen = tensor.scalar('qlen', dtype='uint32')
    if options['debug']:
        if valid.done:
            valid.reset()

        valid_d = next(valid)
        d_, q_, a_, em_ = valid_d[0], valid_d[1], valid_d[2], valid_d[3]

        if options['use_sent_reps']:
            d_, d_mask_, q_, q_mask_, wlen_, slen_, qlen_ = prepare_data_fn(d_, q_)
        else:
            d_, d_mask_, q_, q_mask_, wlen_, qlen_ = prepare_data_fn(d_, q_)

        print "Debugging is enabled."

        theano.config.compute_test_value = 'warn'
        x.tag.test_value = numpy.array(d_).astype("uint32")
        word_mask.tag.test_value = numpy.array(d_mask_).astype("float32")
        q.tag.test_value = numpy.array(q_).astype("uint32")
        q_mask.tag.test_value = numpy.array(q_mask_).astype("float32")
        y.tag.test_value = numpy.array(a_).astype("uint32")
        em.tag.test_value = numpy.array(em_).astype("float32")
        wlen.tag.test_value = numpy.array(wlen_).astype("uint32")
        qlen.tag.test_value = numpy.array(qlen_).astype("uint32")

        if options['use_sent_reps']:
            slen.tag.test_value = numpy.array(slen_).astype("uint32")
            sent_mask.tag.test_value = numpy.array(d_mask_.sum(0) > 0, dtype="float32")

    if x.ndim == 3:
        x_rshp = x.reshape((x.shape[0], x.shape[1]*x.shape[2]))
    else:
        x_rshp = x

    """
        Bidirectional for the description.
    """
    if options['use_bidir']:
        proj_wx, proj_wxr = build_bidir_model(x_rshp,
                                              word_mask,
                                              tparams,
                                              options,
                                              sfx="word",
                                              nsteps=wlen,
                                              truncate=options['truncate'],
                                              use_dropout=options['use_dropout'],
                                              use_noise=use_noise,
                                              name="encoder_desc_word")

        desc_wrep = concatenate([proj_wx[0],
                                proj_wxr[0][::-1]],
                                axis=-1)
    else:
        proj_wx = build_nonbidir_model(x_rshp,
                                       word_mask,
                                       tparams,
                                       options,
                                       sfx="word",
                                       nsteps=wlen,
                                       truncate=options['truncate'],
                                       use_dropout=options['use_dropout'],
                                       use_noise=use_noise,
                                       name="encoder_desc_word")
        desc_wrep = proj_wx

    if options['use_bidir']:
        if options['use_sent_reps']:
            desc_wrep = desc_wrep.reshape((x.shape[0],
                                           x.shape[1],
                                           x.shape[2],
                                           -1))

            mean_desc_wrep = ((desc_wrep * word_mask.dimshuffle(0, 1, 2, 'x')).sum(0) /
                              (word_mask.sum(0).dimshuffle(0, 1, 'x') + 1e-8))

            proj_sx, proj_sxr = build_bidir_model(mean_desc_wrep,
                                                  sent_mask,
                                                  tparams,
                                                  options,
                                                  sfx="sent",
                                                  nsteps=slen,
                                                  truncate=options['truncate'],
                                                  name="encoder_desc_sent")

            proj_x, proj_xr = proj_sx, proj_sxr
            desc_mask = sent_mask.dimshuffle(0, 1, 'x')
        else:
            proj_x, proj_xr = proj_wx, proj_wxr
            desc_mask = word_mask.dimshuffle(0, 1, 'x')

        """
        Build question bidir RNN
        """
        proj_q, proj_qr = build_bidir_model(q, q_mask,
                                            tparams,
                                            options, sfx="word",
                                            nsteps=qlen,
                                            truncate=options['truncate'],
                                            use_dropout=options['use_dropout'],
                                            use_noise=use_noise,
                                            name="encoder_q")

        desc_rep = concatenate([proj_x[0],
                                proj_xr[0][::-1]],
                                axis=-1)

        # Question representation: the final state of each direction,
        # concatenated.
        q_rep = concatenate([proj_q[0][-1],
                             proj_qr[0][::-1][0]],
                            axis=-1)

    else:
        if options['use_sent_reps']:
            desc_wrep = desc_wrep.reshape((x.shape[0],
                                           x.shape[1],
                                           x.shape[2],
                                           -1))

            mean_desc_wrep = ((desc_wrep * word_mask.dimshuffle(0, 1, 2, 'x')).sum(0) /
                              (word_mask.sum(0).dimshuffle(0, 1, 'x') + 1e-8))

            proj_sx = build_nonbidir_model(mean_desc_wrep,
                                           sent_mask,
                                           tparams,
                                           options,
                                           sfx="sent",
                                           nsteps=slen,
                                           truncate=options['truncate'],
                                           name="encoder_desc_sent")
            proj_x = proj_sx
            desc_mask = sent_mask.dimshuffle(0, 1, 'x')
        else:
            proj_x = proj_wx
            desc_mask = word_mask.dimshuffle(0, 1, 'x')
        """
        Build question bidir RNN
        """
        proj_q = build_nonbidir_model(q, q_mask,
                                            tparams,
                                            options, sfx="word",
                                            nsteps=qlen,
                                            truncate=options['truncate'],
                                            use_dropout=options['use_dropout'],
                                            use_noise=use_noise,
                                            name="encoder_q")

        desc_rep = proj_x
        q_rep = proj_q[-1]

    g_desc_ave = 0.

    if options['use_desc_skip_c_g']:
        # Mean-pooled description representation for the skip connection.
        desc_mean = (desc_rep * desc_mask).sum(0) / \
                tensor.cast(desc_mask.sum(0), 'float32')

        g_desc_ave = get_layer('ff')[1](tparams,
                                        desc_mean,
                                        options,
                                        prefix='ff_out_mean_d',
                                        use_bias=False,
                                        activ='Linear')

    desc_ctx, alphas = build_attention(tparams,
                                       options,
                                       desc_rep,
                                       sent_mask if options['use_sent_reps'] else word_mask,
                                       slen if options['use_sent_reps'] else wlen,
                                       q=q_rep)

    opt_ret['dec_alphas'] = alphas
    opt_ret['desc_ctx'] = desc_ctx

    g_ctx = get_layer('ff')[1](tparams,
                               desc_ctx,
                               options,
                               prefix='ff_out_ctx',
                               use_bias=False,
                               activ='Linear')

    g_q = get_layer('ff')[1](tparams,
                             q_rep,
                             options,
                             prefix='ff_out_q',
                             activ='Linear')

    if options['use_elu_g']:
        g_out = ELU(g_ctx + g_q + g_desc_ave)
    else:
        g_out = Tanh(g_ctx + g_q + g_desc_ave)

    if options['use_dropout']:
        g_out = dropout_layer(g_out, use_noise,
                              p=options['dropout_rate'])

    logit = get_layer('ff')[1](tparams,
                               g_out,
                               options,
                               prefix='ff_logit',
                               activ='Linear')

    probs = Softmax(logit)
    hinge_cost = multiclass_hinge_loss(probs, y)

    # compute the cost
    cost, errors, ent_errors, ent_derrors = nll_simple(y,
                                                       probs,
                                                       cost_ent_mask=cost_mask,
                                                       cost_ent_desc_mask=em)
    # Optionally mix in the hinge loss, e.g. cost += 1e-2 * hinge_cost.
    vals = OrderedDict({'desc': x,
                        'word_mask': word_mask,
                        'q': q,
                        'q_mask': q_mask,
                        'ans': y,
                        'wlen': wlen,
                        'ent_mask': em,
                        'qlen': qlen})

    if options['use_sent_reps']:
        vals['slen'] = slen

    return trng, use_noise, vals, opt_ret, \
            cost, errors, ent_errors, ent_derrors, \
            probs
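A hedged sketch (not from the source) of how the graph returned by build_model is typically compiled; tparams, options, prepare_data_fn, valid, and cost_mask are assumed to be set up as elsewhere in these examples:

import theano
from theano import tensor

rvals = build_model(tparams, options, prepare_data_fn,
                    valid=valid, cost_mask=cost_mask)
trng, use_noise, vals, opt_ret, cost, errors, ent_errs, ent_derrs, probs = rvals

# Compile cost/probability functions over the symbolic inputs collected
# in `vals` (illustrative wiring only).
inps = list(vals.values())
f_cost = theano.function(inps, cost, name='f_cost')
f_probs = theano.function(inps, probs, name='f_probs')

# Gradients w.r.t. the shared parameters, for an optimizer such as adadelta.
grads = tensor.grad(cost, wrt=list(tparams.values()))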
Example #4
def init_params(options):
    params = OrderedDict()

    # embedding
    params['Wemb_word'] = norm_weight(options['n_words_q'],
                                      options['dim_word_desc'])

    mult = 2
    if options['ms_nlayers'] > 1 and \
            options['encoder_desc'] in ('lstm_ms', 'lstm_max_ms'):

        mult = options['ms_nlayers']
        if options['use_bidir']:
            mult *= 2

    if options['use_dq_sims']:
        params['ff_att_bi_dq'] = \
                norm_weight(mult * options['dim'],
                            mult * options['dim'])

    params['ff_att_proj'] = norm_vec(options['dim'])

    # encoder: bidirectional RNN
    params = get_layer(options['encoder_desc_word'])[0](options,
                                                        params,
                                                        prefix='encoder_desc_word',
                                                        nin=options['dim_word_desc'],
                                                        dim=options['dim'])
    params = get_layer(options['encoder_q'])[0](options,
                                                params,
                                                prefix='encoder_q',
                                                nin=options['dim_word_q'],
                                                dim=options['dim'])

    if options['use_bidir']:
        params = get_layer(options['encoder_desc_word'])[0](options,
                                                            params,
                                                            prefix='encoder_desc_word_r',
                                                            nin=options['dim_word_desc'],
                                                            dim=options['dim'])
        params = get_layer(options['encoder_q'])[0](options,
                                                    params,
                                                    prefix='encoder_q_r',
                                                    nin=options['dim_word_q'],
                                                    dim=options['dim'])

    if options['use_sent_reps']:
        params['Wemb_sent'] = norm_weight(mult * options['dim'],
                                          mult * options['dim'])
        # encoder: bidirectional RNN
        params = get_layer(options['encoder_desc_sent'])[0](options,
                                                            params,
                                                            prefix='encoder_desc_sent',
                                                            nin=mult * options['dim'],
                                                            dim=options['dim'])
        if options['use_bidir']:
            params = get_layer(options['encoder_desc_sent'])[0](options,
                                                                params,
                                                                prefix='encoder_desc_sent_r',
                                                                nin=mult * options['dim'],
                                                                dim=options['dim'])
    ctxdim = mult * options['dim']
    logger.info("context dimensions is %d" % ctxdim)
    params = get_layer('ff')[0](options, params,
                                prefix='ff_att_ctx',
                                nin=ctxdim,
                                nout=options['dim'])

    # readout
    params = get_layer('ff')[0](options, params,
                                prefix='ff_att_q',
                                nin=ctxdim,
                                nout=options['dim'],
                                use_bias=False,
                                ortho=False)

    if options['use_desc_skip_c_g']:
        # readout for mean pooled desc
        params = get_layer('ff')[0](options, params,
                                    prefix='ff_out_mean_d',
                                    nin=ctxdim,
                                    nout=options['dim_word_ans'],
                                    use_bias=False,
                                    ortho=False)

    params = get_layer('ff')[0](options, params,
                                prefix='ff_out_q',
                                nin=ctxdim,
                                nout=options['dim_word_ans'],
                                ortho=False)

    params = get_layer('ff')[0](options, params,
                                prefix='ff_out_ctx',
                                nin=ctxdim,
                                nout=options['dim_word_ans'],
                                use_bias=False,
                                ortho=False)

    params = get_layer('ff')[0](options, params,
                                prefix='ff_logit',
                                nin=options['dim_word_ans'],
                                nout=options['n_words_ans'],
                                ortho=False)
    return params
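The helpers assumed throughout Example #4 follow the usual pattern in Theano codebases of this era; a hedged, illustrative sketch (not the source definitions) of norm_weight and of the companion that wraps the result of init_params into shared variables:

from collections import OrderedDict

import numpy
import theano

def norm_weight(nin, nout=None, scale=0.01):
    # Gaussian-initialized weight matrix (assumed behavior).
    if nout is None:
        nout = nin
    return (scale * numpy.random.randn(nin, nout)).astype('float32')

def init_tparams(params):
    # Wrap each numpy array in a named theano shared variable.
    tparams = OrderedDict()
    for name, value in params.items():
        tparams[name] = theano.shared(value, name=name)
    return tparams

# Usage: params = init_params(options); tparams = init_tparams(params)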