def fprop(self,
          state_before,
          mem_before,
          cell_before,
          forget_below,
          input_below,
          output_below,
          state_below):

    state_fork_outs = self.state_before_fork_layer.fprop(state_before)
    mem_fork_outs = self.mem_before_fork_layer.fprop(mem_before)

    # Input, output and forget gates, each conditioned on the layer below,
    # the previous hidden state and the previous memory read.
    inp = Sigmoid(input_below + mem_fork_outs[self.mbf_names[1]] +
                  state_fork_outs[self.sbf_names[1]])
    output = Sigmoid(output_below + mem_fork_outs[self.mbf_names[2]] +
                     state_fork_outs[self.sbf_names[2]])
    forget = Sigmoid(forget_below + mem_fork_outs[self.mbf_names[0]] +
                     state_fork_outs[self.sbf_names[0]])

    # Candidate cell state.
    cell = Tanh(state_below + mem_fork_outs[self.mbf_names[3]] +
                state_fork_outs[self.sbf_names[3]])

    # LSTM cell and hidden-state updates.
    c_t = inp * cell + forget * cell_before
    h_t = output * self.activ(c_t)

    return h_t, c_t
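
# A minimal NumPy sketch of the gate arithmetic in the fprop above, for
# reference only. The fork layers are replaced by plain recurrent matrices
# (the dict `U` below is a made-up stand-in for the sbf_/mbf_ fork outputs),
# biases are folded into the x_* inputs, and self.activ is assumed to be tanh.
import numpy as np

def _sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def lstm_step_sketch(x_i, x_f, x_o, x_c, h_prev, c_prev, U):
    i = _sigmoid(x_i + np.dot(h_prev, U['i']))       # input gate
    f = _sigmoid(x_f + np.dot(h_prev, U['f']))       # forget gate
    o = _sigmoid(x_o + np.dot(h_prev, U['o']))       # output gate
    c_tilde = np.tanh(x_c + np.dot(h_prev, U['c']))  # candidate cell
    c_t = i * c_tilde + f * c_prev                   # same update as fprop
    h_t = o * np.tanh(c_t)
    return h_t, c_t

# Tiny usage example with random inputs (illustrative shapes only).
_dim = 4
_rng = np.random.RandomState(0)
_U = dict((k, 0.1 * _rng.randn(_dim, _dim)) for k in "ifoc")
_x = [0.1 * _rng.randn(_dim) for _ in range(4)]
_h1, _c1 = lstm_step_sketch(_x[0], _x[1], _x[2], _x[3],
                            np.zeros(_dim), np.zeros(_dim), _U)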
def build_attention(tparams,
                    options,
                    desc,
                    desc_mask,
                    dlen,
                    q,
                    q_mask=None,
                    sfx=None,
                    name=None):

    if desc.ndim != desc_mask.ndim:
        desc_mask_ = desc_mask.dimshuffle(0, 1, 'x')
    else:
        # Mask is already broadcastable over the description.
        desc_mask_ = desc_mask

    assert desc.ndim == desc_mask_.ndim

    if q_mask is not None:
        assert q.ndim == q_mask.ndim
        q *= q_mask

    masked_desc = desc * desc_mask_
    desc_in = desc.reshape((-1, desc.shape[-1]))

    projd = get_layer('ff')[1](tparams=tparams,
                               state_below=desc_in,
                               options=options,
                               prefix='ff_att_ctx',
                               activ='Linear')
    projq = get_layer('ff')[1](tparams,
                               q,
                               options,
                               prefix='ff_att_q',
                               use_bias=False,
                               activ='Linear')

    """
    Unnormalized dist metric between the rep of desc and q.
    """
    sim_vals = 0
    if options['use_dq_sims']:
        q_proj = dot(q, tparams['ff_att_bi_dq'])
        desc_proj = dot(masked_desc, tparams['ff_att_bi_dq']).reshape(
            (masked_desc.shape[0], masked_desc.shape[1], -1))
        sim_vals = (desc_proj * q_proj.dimshuffle('x', 0, 1)).sum(-1)
        sim_vals = sim_vals.dimshuffle(0, 1, 'x')

    projd = projd.reshape((masked_desc.shape[0], masked_desc.shape[1], -1))

    # Intermediate layer for the annotation values.
    proj_att = Tanh(projd + projq.dimshuffle('x', 0, 1) + sim_vals)
    W_proj = tparams['ff_att_proj'].dimshuffle('x', 'x', 0)
    dot_proj = (W_proj * proj_att).sum(-1)
    pre_softmax = dot_proj

    alphas = Masked_Softmax(pre_softmax, mask=desc_mask, ax=0).dimshuffle(0, 1, 'x')
    ctx = (masked_desc * alphas).sum(0)

    return ctx, alphas
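
# A minimal NumPy sketch of the masked-softmax pooling that build_attention
# performs once the unnormalized scores are available. The names `scores` and
# `mask` and the shapes are assumptions for illustration; the real graph works
# on Theano tensors and uses the learned ff_att_* projections to produce the
# scores.
import numpy as np

def masked_softmax_np(scores, mask, axis=0):
    # scores, mask: (n_positions, n_samples); padded positions have mask == 0.
    scores = scores - scores.max(axis=axis, keepdims=True)  # for stability
    e = np.exp(scores) * mask
    return e / (e.sum(axis=axis, keepdims=True) + 1e-8)

def attention_pool_np(desc, mask, scores):
    # desc: (n_positions, n_samples, dim) -> context of shape (n_samples, dim).
    alphas = masked_softmax_np(scores, mask)[:, :, None]
    ctx = (desc * mask[:, :, None] * alphas).sum(0)
    return ctx, alphas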
def _step_slice(mask, sbelow, sbelowx, xc_, sbefore, ctx_, alpha_,
                pctx_, cc_, U, Wc, Wd_att, U_att, c_tt, Ux, Wcx):
    # Attention over the annotations, conditioned on the previous state.
    pstate_ = dot(sbefore, Wd_att)
    pctx__ = pctx_ + pstate_[None, :, :]
    pctx__ += xc_
    pctx__ = Tanh(pctx__)

    alpha = dot(pctx__, U_att) + c_tt
    alpha = alpha.reshape([alpha.shape[0], alpha.shape[1]])
    alpha = tensor.exp(alpha)
    if context_mask:
        alpha = alpha * context_mask
    alpha = alpha / alpha.sum(0, keepdims=True)
    ctx_ = (cc_ * alpha[:, :, None]).sum(0)  # current context

    # GRU gates conditioned on the fresh context.
    preact = dot(sbefore, U)
    preact += sbelow
    preact += dot(ctx_, Wc)
    preact = Sigmoid(preact)

    r = _slice(preact, 0, dim)
    u = _slice(preact, 1, dim)

    preactx = dot(sbefore, Ux)
    preactx *= r
    preactx += sbelowx
    preactx += dot(ctx_, Wcx)

    h = Tanh(preactx)
    h = u * sbefore + (1. - u) * h
    h = mask[:, None] * h + (1. - mask)[:, None] * sbefore

    return h, ctx_, alpha.T
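
# A minimal NumPy sketch of one step of the conditional GRU above: attention
# weights are recomputed from the previous hidden state, and the resulting
# context enters both the gates and the candidate state. Weight names mirror
# the arguments of _step_slice, but the shapes, the missing biases and the
# dropped step mask are simplifying assumptions.
import numpy as np

def _sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def cond_gru_step_sketch(h_prev, pctx, cc, ctx_mask, params, x_gates, x_cand, dim):
    # Attention over the source annotations cc: (n_pos, n_samples, ctx_dim).
    pstate = np.dot(h_prev, params['Wd_att'])              # (n_samples, att_dim)
    e = np.tanh(pctx + pstate[None, :, :])                 # (n_pos, n_samples, att_dim)
    alpha = np.dot(e, params['U_att'])[:, :, 0]            # (n_pos, n_samples)
    alpha = np.exp(alpha - alpha.max(0, keepdims=True)) * ctx_mask
    alpha = alpha / alpha.sum(0, keepdims=True)
    ctx = (cc * alpha[:, :, None]).sum(0)                  # (n_samples, ctx_dim)

    # GRU gates conditioned on the fresh context.
    preact = _sigmoid(np.dot(h_prev, params['U']) + x_gates + np.dot(ctx, params['Wc']))
    r, u = preact[:, :dim], preact[:, dim:]
    h_tilde = np.tanh(np.dot(h_prev, params['Ux']) * r + x_cand + np.dot(ctx, params['Wcx']))
    h = u * h_prev + (1.0 - u) * h_tilde
    return h, ctx, alpha.T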
def _step(mask, sbelow, sbefore, cell_before):
    preact = dot(sbefore, param('U'))
    preact += sbelow
    preact += tparams[prfx(prefix, 'b')]

    f = Sigmoid(_slice(preact, 0, dim))
    o = Sigmoid(_slice(preact, 1, dim))
    c = Tanh(_slice(preact, 2, dim))

    # Coupled gates: the input gate is tied to (1 - f).
    c = f * cell_before + (1 - f) * c
    c = mask * c + (1. - mask) * cell_before

    h = o * tensor.tanh(c)
    h = mask * h + (1. - mask) * sbefore

    return h, c
def _step(mask, sbelow, sbefore, cell_before, *args):
    preact = dot(sbefore, param('U'))
    preact += sbelow
    preact += param('b')

    i = Sigmoid(_slice(preact, 0, dim))
    f = Sigmoid(_slice(preact, 1, dim))
    o = Sigmoid(_slice(preact, 2, dim))
    c = Tanh(_slice(preact, 3, dim))

    c = f * cell_before + i * c
    c = mask * c + (1. - mask) * cell_before

    h = o * tensor.tanh(c)
    h = mask * h + (1. - mask) * sbefore

    return h, c
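
# A minimal NumPy sketch contrasting the two LSTM _step variants above: the
# first couples the input gate to the forget gate (input = 1 - f, three
# slices), the second uses an independent input gate (four slices). `preact`
# stands for the already-summed recurrent + input + bias pre-activations;
# the step mask is omitted and shapes are illustrative assumptions.
import numpy as np

def _sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def coupled_gate_step(preact, c_prev, dim):
    f = _sigmoid(preact[:, 0 * dim:1 * dim])
    o = _sigmoid(preact[:, 1 * dim:2 * dim])
    c_tilde = np.tanh(preact[:, 2 * dim:3 * dim])
    c = f * c_prev + (1.0 - f) * c_tilde      # input gate tied to 1 - f
    return o * np.tanh(c), c

def independent_gate_step(preact, c_prev, dim):
    i = _sigmoid(preact[:, 0 * dim:1 * dim])
    f = _sigmoid(preact[:, 1 * dim:2 * dim])
    o = _sigmoid(preact[:, 2 * dim:3 * dim])
    c_tilde = np.tanh(preact[:, 3 * dim:4 * dim])
    c = f * c_prev + i * c_tilde              # separate input gate
    return o * np.tanh(c), c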
def _step_slice(mask, sbelow, sbelowx, sbefore, U, Ux):
    preact = dot(sbefore, U)
    preact += sbelow

    r = Sigmoid(_slice(preact, 0, dim))
    u = Sigmoid(_slice(preact, 1, dim))

    preactx = dot(r * sbefore, Ux)
    preactx = preactx + sbelowx

    h = Tanh(preactx)
    h = u * sbefore + (1. - u) * h
    h = mask[:, None] * h + (1. - mask)[:, None] * sbefore

    return h
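
# A minimal NumPy sketch of the plain GRU _step_slice above, with the two
# gate pre-activations concatenated in `x_gates` and the candidate
# pre-activation kept separate, mirroring sbelow / sbelowx. The step mask is
# omitted and shapes are illustrative assumptions.
import numpy as np

def _sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def gru_step_sketch(x_gates, x_cand, h_prev, U, Ux, dim):
    preact = _sigmoid(np.dot(h_prev, U) + x_gates)
    r = preact[:, :dim]                        # reset gate
    u = preact[:, dim:]                        # update gate
    h_tilde = np.tanh(np.dot(r * h_prev, Ux) + x_cand)
    return u * h_prev + (1.0 - u) * h_tilde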
def build_model(tparams,
                options,
                prepare_data_fn,
                valid=None,
                cost_mask=None):

    opt_ret = dict()
    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # description: #words x #samples
    if options['use_sent_reps']:
        x = tensor.tensor3('desc', dtype='uint32')
        word_mask = tensor.tensor3('desc_mask', dtype='float32')
        sent_mask = tensor.cast(word_mask.sum(0) > 0, "float32")
        slen = tensor.scalar('slen', dtype='uint32')
    else:
        x = tensor.matrix('desc', dtype="uint32")
        word_mask = tensor.matrix('desc_mask', dtype='float32')

    q = tensor.matrix('q', dtype="uint32")
    q_mask = tensor.matrix('q_mask', dtype="float32")
    y = tensor.vector('ans', dtype='uint32')
    em = tensor.matrix('entity_mask', dtype="float32")

    wlen = tensor.scalar('wlen', dtype='uint32')
    qlen = tensor.scalar('qlen', dtype='uint32')

    if options['debug']:
        if valid.done:
            valid.reset()

        valid_d = next(valid)
        d_, q_, a_, em_ = valid_d[0], valid_d[1], valid_d[2], valid_d[3]

        if options['use_sent_reps']:
            d_, d_mask_, q_, q_mask_, wlen_, slen_, qlen_ = prepare_data_fn(d_, q_)
        else:
            d_, d_mask_, q_, q_mask_, wlen_, qlen_ = prepare_data_fn(d_, q_)

        print "Debugging is enabled."
        theano.config.compute_test_value = 'warn'

        x.tag.test_value = numpy.array(d_).astype("uint32")
        word_mask.tag.test_value = numpy.array(d_mask_).astype("float32")
        q.tag.test_value = numpy.array(q_).astype("uint32")
        q_mask.tag.test_value = numpy.array(q_mask_).astype("float32")
        y.tag.test_value = numpy.array(a_).astype("uint32")
        em.tag.test_value = numpy.array(em_).astype("float32")
        wlen.tag.test_value = numpy.array(wlen_).astype("uint32")
        qlen.tag.test_value = numpy.array(qlen_).astype("uint32")

        if options['use_sent_reps']:
            slen.tag.test_value = numpy.array(slen_).astype("uint32")
            sent_mask.tag.test_value = numpy.array(d_mask_.sum(0) > 0,
                                                   dtype="float32")

    if x.ndim == 3:
        x_rshp = x.reshape((x.shape[0], x.shape[1] * x.shape[2]))
    else:
        x_rshp = x

    """
    Encode the description with a word-level RNN
    (bidirectional or unidirectional, depending on use_bidir).
    """
""" if options['use_bidir']: proj_wx, proj_wxr = build_bidir_model(x_rshp, word_mask, tparams, options, sfx="word", nsteps=wlen, truncate=options['truncate'], use_dropout=options['use_dropout'], use_noise=use_noise, name="encoder_desc_word") desc_wrep = concatenate([proj_wx[0], proj_wxr[0][::-1]], axis=-1) else: proj_wx = build_nonbidir_model(x_rshp, word_mask, tparams, options, sfx="word", nsteps=wlen, truncate=options['truncate'], use_dropout=options['use_dropout'], use_noise=use_noise, name="encoder_desc_word") desc_wrep = proj_wx if options['use_bidir']: if options['use_sent_reps']: desc_wrep = desc_wrep.reshape((x.shape[0], x.shape[1], x.shape[2], -1)) mean_desc_wrep = ((desc_wrep * word_mask.dimshuffle(0, 1, 2, 'x')).sum(0) / (word_mask.sum(0).dimshuffle(0, 1, 'x') + 1e-8)) proj_sx, proj_sxr = build_bidir_model(mean_desc_wrep, sent_mask, tparams, options, sfx="sent", nsteps=slen, truncate=options['truncate'], name="encoder_desc_sent") proj_x, proj_xr = proj_sx, proj_sxr desc_mask = sent_mask.dimshuffle(0, 1, 'x') else: proj_x, proj_xr = proj_wx, proj_wxr desc_mask = word_mask.dimshuffle(0, 1, 'x') """ Build question bidir RNN """ proj_q, proj_qr = build_bidir_model(q, q_mask, tparams, options, sfx="word", nsteps=qlen, truncate=options['truncate'], use_dropout=options['use_dropout'], use_noise=use_noise, name="encoder_q") desc_rep = concatenate([proj_x[0], proj_xr[0][::-1]], axis=-1) q_rep = concatenate([proj_q[0][-1], proj_qr[0][::-1][0]], axis=-1) else: if options['use_sent_reps']: desc_wrep = desc_wrep.reshape((x.shape[0], x.shape[1], x.shape[2], -1)) mean_desc_wrep = ((desc_wrep * word_mask.dimshuffle(0, 1, 2, 'x')).sum(0) / (word_mask.sum(0).dimshuffle(0, 1, 'x') + 1e-8)) proj_sx = build_nonbidir_model(mean_desc_wrep, sent_mask, tparams, options, sfx="sent", nsteps=slen, truncate=options['truncate'], name="encoder_desc_sent") proj_x = proj_sx desc_mask = sent_mask.dimshuffle(0, 1, 'x') else: proj_x = proj_wx desc_mask = word_mask.dimshuffle(0, 1, 'x') """ Build question bidir RNN """ proj_q = build_nonbidir_model(q, q_mask, tparams, options, sfx="word", nsteps=qlen, truncate=options['truncate'], use_dropout=options['use_dropout'], use_noise=use_noise, name="encoder_q") desc_rep = proj_x q_rep = proj_q[-1] g_desc_ave = 0. 
    if options['use_desc_skip_c_g']:
        desc_mean = (desc_rep * desc_mask).sum(0) / \
            tensor.cast(desc_mask.sum(0), 'float32')
        g_desc_ave = get_layer('ff')[1](tparams,
                                        desc_mean,
                                        options,
                                        prefix='ff_out_mean_d',
                                        use_bias=False,
                                        activ='Linear')

    desc_ctx, alphas = build_attention(tparams,
                                       options,
                                       desc_rep,
                                       sent_mask if options['use_sent_reps'] else word_mask,
                                       slen if options['use_sent_reps'] else wlen,
                                       q=q_rep)

    opt_ret['dec_alphas'] = alphas
    opt_ret['desc_ctx'] = desc_ctx

    g_ctx = get_layer('ff')[1](tparams,
                               desc_ctx,
                               options,
                               prefix='ff_out_ctx',
                               use_bias=False,
                               activ='Linear')
    g_q = get_layer('ff')[1](tparams,
                             q_rep,
                             options,
                             prefix='ff_out_q',
                             activ='Linear')

    if options['use_elu_g']:
        g_out = ELU(g_ctx + g_q + g_desc_ave)
    else:
        g_out = Tanh(g_ctx + g_q + g_desc_ave)

    if options['use_dropout']:
        g_out = dropout_layer(g_out, use_noise, p=options['dropout_rate'])

    logit = get_layer('ff')[1](tparams, g_out, options,
                               prefix='ff_logit', activ='Linear')
    probs = Softmax(logit)
    hinge_cost = multiclass_hinge_loss(probs, y)

    # Compute the cost.
    cost, errors, ent_errors, ent_derrors = nll_simple(y,
                                                       probs,
                                                       cost_ent_mask=cost_mask,
                                                       cost_ent_desc_mask=em)
    # cost += 1e-2 * hinge_cost
    # cost = hinge_cost

    vals = OrderedDict({'desc': x,
                        'word_mask': word_mask,
                        'q': q,
                        'q_mask': q_mask,
                        'ans': y,
                        'wlen': wlen,
                        'ent_mask': em,
                        'qlen': qlen})

    if options['use_sent_reps']:
        vals['slen'] = slen

    return trng, use_noise, vals, opt_ret, \
        cost, errors, ent_errors, ent_derrors, \
        probs
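
# A minimal NumPy sketch of the answer layer assembled at the end of
# build_model: the attended description context, the question representation
# and (optionally) the mean-description skip connection are projected, summed,
# squashed and turned into answer probabilities. The projection matrices below
# are made-up stand-ins for the ff_out_* / ff_logit layers; ELU replaces tanh
# when use_elu_g is set in the real model.
import numpy as np

def softmax_np(z):
    e = np.exp(z - z.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

def answer_layer_sketch(desc_ctx, q_rep, desc_mean, W_ctx, W_q, W_mean, W_logit):
    g_out = np.tanh(np.dot(desc_ctx, W_ctx) +
                    np.dot(q_rep, W_q) +
                    np.dot(desc_mean, W_mean))   # skip term is optional (use_desc_skip_c_g)
    return softmax_np(np.dot(g_out, W_logit))    # probabilities over answer candidates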
def fprop(self,
          state_below,
          memory,
          w_t_before,
          w_t_pre_before=None,
          time_idxs=None):

    if time_idxs is None:
        logger.info("Time indices are empty!")
        time_idxs = self.time_idxs

    fork_outs = self.state_fork_layer.fprop(state_below)
    idx = 0

    # First things first, content-based addressing:
    if not self.use_local_att:
        beta_pre = fork_outs[self.names[0]]
        beta = TT.nnet.softplus(beta_pre).reshape((beta_pre.shape[0],))

        if (state_below.ndim != beta.ndim and beta.ndim == 2 and
                state_below.ndim == 3):
            beta = beta.reshape((state_below.shape[0], state_below.shape[1]))
        elif (state_below.ndim != beta.ndim and beta.ndim == 1 and
                state_below.ndim == 2):
            beta = beta.reshape((state_below.shape[0],))
        else:
            raise ValueError("Unknown shape for beta!")

        beta = TT.shape_padright(beta)
        idx = 1

    key_pre = fork_outs[self.names[idx]]
    idx += 1
    key_t = key_pre

    sim_vals = self.mem_similarity(key_t, memory)
    weights = sim_vals
    new_pre_weights = None

    if self.smoothed_diff_weights:
        dw_scaler = fork_outs[self.names[idx]]
        dw_scaler = TT.addbroadcast(dw_scaler, 1)
        weights = sim_vals - Sigmoid(dw_scaler) * w_t_pre_before
        new_pre_weights = self.mem_weight_decay * sim_vals + \
            (1 - self.mem_weight_decay) * w_t_pre_before
        idx += 1

    std = 5
    """
    if self.use_local_att:
        mean = as_floatX(self.mem_nel) * Sigmoid(weights * self.mean_pred.fprop(state_below))
        exp_ws = -(time_idxs - mean)**2 / (2.0 * std)
        weights = exp_ws * weights
    """

    if self.use_local_att:
        w_tc = softmax3(weights) if weights.ndim == 3 else TT.nnet.softmax(weights)
    else:
        if weights.ndim == 3 and beta.ndim == 2:
            beta = beta.dimshuffle('x', 0, 1)
            w_tc = softmax3(weights * beta)
        else:
            # Content-based weights:
            w_tc = TT.nnet.softmax(weights * beta)

    if self.use_local_att:
        first_loc_layer = Tanh(self.state_below_local.fprop(state_below) +
                               self.weights_below_local.fprop(weights))
        mean = as_floatX(self.mem_nel) * Sigmoid(self.mean_pred.fprop(first_loc_layer))
        mean = TT.addbroadcast(mean, 1)
        exp_ws = TT.exp(-((time_idxs - mean)**2) / (2.0 * std))
        w_tc = exp_ws * w_tc
        w_tc = w_tc / w_tc.sum(axis=1, keepdims=True)

    if self.use_loc_based_addressing:
        # Location-based addressing:
        g_t_pre = fork_outs[self.names[idx]]
        g_t = Sigmoid(g_t_pre).reshape((g_t_pre.shape[0],))

        if (state_below.ndim != g_t.ndim and g_t.ndim == 2 and
                state_below.ndim == 3):
            g_t = g_t.reshape((state_below.shape[0], state_below.shape[1]))
        elif (state_below.ndim != g_t.ndim and g_t.ndim == 1 and
                state_below.ndim == 2):
            g_t = g_t.reshape((state_below.shape[0],))
        else:
            raise ValueError("Unknown shape for g_t!")

        g_t = TT.shape_padright(g_t)

        # Gated interpolation with the previous weights.
        w_tg = g_t * w_tc + (1 - g_t) * w_t_before

        shifts_pre = fork_outs[self.names[idx + 1]]
        if shifts_pre.ndim == 2:
            if self.use_multiscale_shifts:
                if self.use_scale_layer:
                    scales = TT.exp(self.scale_layer.fprop(state_below))
                    scales = scales.dimshuffle(0, 'x', 1)
                else:
                    scales = TT.exp(TT.arange(self.scale_size).dimshuffle('x', 'x', 0))

                shifts_pre = shifts_pre.reshape((state_below.shape[0],
                                                 -1,
                                                 self.scale_size))
                shifts_pre = (shifts_pre * scales).sum(-1)

                if self.shift_width >= 0:
                    shifts_pre = shifts_pre.reshape((-1, self.shift_width, 1))
            elif self.shift_width >= 0:
                shifts_pre = shifts_pre.reshape((-1, self.shift_width, 1))
            else:
                shifts_pre = shifts_pre.reshape((state_below.shape[0],
                                                 self.mem_nel))

            if state_below.ndim == 3:
                shifts_pre = shifts_pre.dimshuffle(0, 1, 'x')
                shifts_pre = shifts_pre - shifts_pre.max(1, keepdims=True).dimshuffle(0, 'x', 'x')
            else:
                shifts_pre = shifts_pre.dimshuffle(0, 1)
                shifts_pre = shifts_pre - shifts_pre.max(1, keepdims=True)
                shifts_pre = shifts_pre.dimshuffle(0, 1, 'x')
        elif shifts_pre.ndim == 1:
            if self.use_multiscale_shifts:
                if self.use_scale_layer:
                    scales = TT.exp(self.scale_layer.fprop(state_below))
                else:
                    scales = TT.exp(TT.arange(self.scale_size))

                shifts_pre = shifts_pre.reshape((-1, self.scale_size))
                shifts_pre = (shifts_pre * scales).sum(-1)

                if self.shift_width >= 0:
                    shifts_pre = shifts_pre.reshape((-1, self.shift_width, 1))
                if self.shift_width >= 0:
                    shifts_pre = shifts_pre.reshape((-1, 1))
            elif self.shift_width >= 0:
                shifts_pre = shifts_pre.reshape((-1, 1))
            else:
                shifts_pre = shifts_pre.reshape((self.mem_nel,))

            if state_below.ndim == 2:
                shifts_pre = TT.shape_padright(shifts_pre)
            shifts_pre = shifts_pre - shifts_pre.max(0, keepdims=True)

        shifts = TT.exp(shifts_pre)
        if shifts.ndim == 2:
            shifts = shifts / shifts.sum(axis=0, keepdims=True)
        elif shifts.ndim == 3:
            shifts = shifts / shifts.sum(axis=1, keepdims=True)

        CC = CircularConvolveAdvIndexing if self.use_adv_indexing else \
            CircularConvolve

        # Shift the interpolated weights with the normalized shift weights.
        w_t_hat = CC()(weights=w_tg,
                       shifts=shifts,
                       mem_size=self.mem_nel,
                       shift_width=self.shift_width)

        if self.use_reinforce:
            if w_t_hat.ndim == 2:
                w_t = TT.nnet.softmax(w_t_hat)
            elif w_t_hat.ndim == 3:
                w_t = softmax3(w_t_hat)
        else:
            gamma_pre = fork_outs[self.names[4]]
            assert w_t_hat.ndim == gamma_pre.ndim, (
                "The number of dimensions for w_t_hat and gamma_pre should "
                "be the same")

            if gamma_pre.ndim == 1:
                gamma_pre = gamma_pre
            else:
                gamma_pre = gamma_pre.reshape((gamma_pre.shape[0],))
                gamma_pre = TT.shape_padright(gamma_pre)

            # Sharpen the shifted weights.
            gamma = TT.nnet.softplus(gamma_pre) + const(1)
            w_t = (abs(w_t_hat + const(1e-16))**gamma) + const(1e-42)

        if (state_below.ndim != shifts_pre.ndim and w_t.ndim == 2 and
                state_below.ndim == 3):
            w_t = w_t.reshape((state_below.shape[0], state_below.shape[1]))
            w_t = w_t.dimshuffle(0, 1, 'x')
        elif (state_below.ndim != w_t.ndim and w_t.ndim == 1 and
                state_below.ndim == 2):
            w_t = w_t.reshape((state_below.shape[0],))
            w_t = w_t.dimshuffle(0, 'x')

        # Renormalize the weights over the memory locations.
        if w_t.ndim == 2:
            w_t = w_t / (w_t.sum(axis=-1, keepdims=True) + const(1e-6))
        elif w_t.ndim == 3:
            w_t = w_t / (w_t.sum(axis=-1, keepdims=True) + const(1e-6))
    else:
        w_t = w_tc

    return [w_t], [new_pre_weights]
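
# A minimal NumPy sketch of the standard NTM-style addressing pipeline that
# the fprop above roughly follows in its simplest configuration: content-based
# weights sharpened by beta, gated interpolation with the previous weights,
# circular-convolution shift, and gamma sharpening. It ignores the local
# attention, multi-scale shift and smoothed-difference options; all names and
# shapes here are illustrative assumptions, not the layer's exact graph.
import numpy as np

def softmax_np(z):
    e = np.exp(z - z.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

def cosine_similarity(key, memory):
    # key: (dim,), memory: (mem_nel, dim) -> (mem_nel,)
    num = np.dot(memory, key)
    den = np.linalg.norm(memory, axis=1) * np.linalg.norm(key) + 1e-8
    return num / den

def address_sketch(key, beta, g, shifts, gamma, memory, w_prev):
    w_c = softmax_np(beta * cosine_similarity(key, memory))  # content addressing
    w_g = g * w_c + (1.0 - g) * w_prev                       # interpolation gate
    n = w_g.shape[0]
    w_s = np.array([sum(w_g[j] * shifts[(i - j) % n]          # circular convolution
                        for j in range(n)) for i in range(n)])
    w = w_s ** gamma                                          # sharpening
    return w / (w.sum() + 1e-6)

# Tiny usage example with random memory (illustrative only).
_mem_nel, _dim = 8, 5
_rng = np.random.RandomState(0)
_memory = _rng.randn(_mem_nel, _dim)
_w = address_sketch(key=_rng.randn(_dim), beta=2.0, g=0.9,
                    shifts=softmax_np(_rng.randn(_mem_nel)),
                    gamma=1.5, memory=_memory,
                    w_prev=np.ones(_mem_nel) / _mem_nel)
assert np.isclose(_w.sum(), 1.0, atol=1e-3)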