def fprop(self, state_before, mem_before, cell_before,
          forget_below, input_below, output_below, state_below):
    state_fork_outs = self.state_before_fork_layer.fprop(state_before)
    mem_fork_outs = self.mem_before_fork_layer.fprop(mem_before)

    # Input, output and forget gates.
    inp = Sigmoid(input_below + mem_fork_outs[self.mbf_names[1]] +
                  state_fork_outs[self.sbf_names[1]])
    output = Sigmoid(output_below + mem_fork_outs[self.mbf_names[2]] +
                     state_fork_outs[self.sbf_names[2]])
    forget = Sigmoid(forget_below + mem_fork_outs[self.mbf_names[0]] +
                     state_fork_outs[self.sbf_names[0]])

    # Candidate cell state followed by the LSTM state and output updates.
    cell = Tanh(state_below + mem_fork_outs[self.mbf_names[3]] +
                state_fork_outs[self.sbf_names[3]])
    c_t = inp * cell + forget * cell_before
    h_t = output * self.activ(c_t)
    return h_t, c_t
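# A minimal NumPy reference of the gate arithmetic computed by the fprop above
# (a sketch only: it assumes self.activ is tanh and that the fork layers reduce
# to precomputed affine projections; the names below are illustrative, not from
# the original code).
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def lstm_reference_step(gates_x, gates_h, c_prev):
    # gates_x / gates_h: dicts of precomputed input- and state-projections
    # for the 'i', 'o', 'f' and 'c' gates.
    i = sigmoid(gates_x['i'] + gates_h['i'])
    o = sigmoid(gates_x['o'] + gates_h['o'])
    f = sigmoid(gates_x['f'] + gates_h['f'])
    c_tilde = np.tanh(gates_x['c'] + gates_h['c'])
    c_t = i * c_tilde + f * c_prev
    h_t = o * np.tanh(c_t)
    return h_t, c_t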
def _step(mask, sbelow, sbefore, cell_before):
    preact = dot(sbefore, param('U'))
    preact += sbelow
    preact += param('b')

    # Coupled gates: the input gate is tied to (1 - forget gate).
    f = Sigmoid(_slice(preact, 0, dim))
    o = Sigmoid(_slice(preact, 1, dim))
    c = Tanh(_slice(preact, 2, dim))

    c = f * cell_before + (1. - f) * c
    c = mask * c + (1. - mask) * cell_before

    h = o * Tanh(c)
    h = mask * h + (1. - mask) * sbefore
    return h, c
def _step(mask, sbelow, sbefore, cell_before, *args):
    preact = dot(sbefore, param('U'))
    preact += sbelow
    preact += param('b')

    # Input, forget and output gates plus the candidate cell state.
    i = Sigmoid(_slice(preact, 0, dim))
    f = Sigmoid(_slice(preact, 1, dim))
    o = Sigmoid(_slice(preact, 2, dim))
    c = Tanh(_slice(preact, 3, dim))

    c = f * cell_before + i * c
    c = mask * c + (1. - mask) * cell_before

    h = o * Tanh(c)
    h = mask * h + (1. - mask) * sbefore
    return h, c
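# The _step functions above (and the GRU steps below) rely on a _slice helper
# that is not shown in this listing. A plausible definition, assuming the gate
# pre-activations are concatenated along the last axis (a sketch, not
# necessarily the original implementation):
def _slice(x, n, dim):
    if x.ndim == 3:
        return x[:, :, n * dim:(n + 1) * dim]
    return x[:, n * dim:(n + 1) * dim]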
def _step_slice(mask, sbelow, sbelowx, sbefore, U, Ux):
    preact = dot(sbefore, U)
    preact += sbelow

    # Reset and update gates.
    r = Sigmoid(_slice(preact, 0, dim))
    u = Sigmoid(_slice(preact, 1, dim))

    # Candidate hidden state with the reset gate applied to the previous state.
    preactx = dot(r * sbefore, Ux)
    preactx = preactx + sbelowx
    h = Tanh(preactx)

    h = u * sbefore + (1. - u) * h
    h = mask[:, None] * h + (1. - mask)[:, None] * sbefore
    return h
def fprop(self, state_before, mem_before, reset_below, gater_below,
          state_below, context=None, use_noise=None):
    state_reset = self.state_reset_before_proj.fprop(state_before)
    state_gater = self.state_gater_before_proj.fprop(state_before)
    membf_state = self.state_mem_before_proj.fprop(mem_before)

    if self.use_layer_norm:
        state_reset = self.reset_layer_norm_bf.fprop(state_reset)
        state_gater = self.update_layer_norm_bf.fprop(state_gater)
        membf_state = self.mem_layer_norm_bf.fprop(membf_state)
        reset_below = self.reset_layer_norm_inp.fprop(reset_below)
        gater_below = self.update_layer_norm_inp.fprop(gater_below)
        state_below = self.ht_layer_norm_inp.fprop(state_below)

    # Reset gate and the reset-modulated projection of the previous state.
    reset = Sigmoid(reset_below + state_reset)
    state_state = self.state_str_before_proj.fprop(reset * state_before)

    if self.use_layer_norm:
        state_state = self.ht_layer_norm_bf.fprop(state_state)

    # Update gate and the candidate hidden state.
    gater = Sigmoid(gater_below + state_gater)
    if context is not None:
        h = self.activ(state_state + membf_state + state_below + context)
    else:
        h = self.activ(state_state + membf_state + state_below)

    if self.dropout:
        h = self.dropout(h, use_noise=use_noise)

    h_t = (1. - gater) * state_before + gater * h
    return h_t
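# The *_layer_norm_* layers used above are not defined in this listing. As a
# rough sketch of what layer normalization computes on a pre-activation x,
# with a hypothetical gain g and bias b (not the original layer's code):
def layer_norm(x, g, b, eps=1e-5):
    mean = x.mean(axis=-1, keepdims=True)
    std = TT.sqrt(x.var(axis=-1, keepdims=True) + eps)
    return g * (x - mean) / std + b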
def _step_slice(mask, sbelow, sbelowx, xc_, sbefore, ctx_, alpha_, pctx_, cc_,
                U, Wc, Wd_att, U_att, c_tt, Ux, Wcx):
    # Attention: score every position of the annotated source context.
    pstate_ = dot(sbefore, Wd_att)
    pctx__ = pctx_ + pstate_[None, :, :]
    pctx__ += xc_
    pctx__ = Tanh(pctx__)

    alpha = dot(pctx__, U_att) + c_tt
    alpha = alpha.reshape([alpha.shape[0], alpha.shape[1]])
    alpha = tensor.exp(alpha)
    if context_mask is not None:
        alpha = alpha * context_mask
    alpha = alpha / alpha.sum(0, keepdims=True)
    ctx_ = (cc_ * alpha[:, :, None]).sum(0)  # current context

    # GRU gates conditioned on the current context.
    preact = dot(sbefore, U)
    preact += sbelow
    preact += dot(ctx_, Wc)
    preact = Sigmoid(preact)
    r = _slice(preact, 0, dim)
    u = _slice(preact, 1, dim)

    # Candidate hidden state.
    preactx = dot(sbefore, Ux)
    preactx *= r
    preactx += sbelowx
    preactx += dot(ctx_, Wcx)
    h = Tanh(preactx)

    h = u * sbefore + (1. - u) * h
    h = mask[:, None] * h + (1. - mask)[:, None] * sbefore
    return h, ctx_, alpha.T
def NReLU(x, rng=None, use_noise=False):
    assert rng is not None
    if use_noise:
        stds = Sigmoid(x)
        x = x + rng.normal(x.shape, avg=0.0, std=stds, dtype=x.dtype)
    return Trect(x)
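# Example usage of NReLU with a Theano random stream (a sketch; `pre_activation`
# is a hypothetical symbolic pre-activation, and Trect / Sigmoid are the
# rectifier and logistic used throughout this listing):
from theano.sandbox.rng_mrg import MRG_RandomStreams

trng = MRG_RandomStreams(1234)
noisy_h = NReLU(pre_activation, rng=trng, use_noise=True)   # training: noisy units
clean_h = NReLU(pre_activation, rng=trng, use_noise=False)  # evaluation: plain ReLU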
def fprop(self, state_below, memory, w_t_before,
          w_t_pre_before=None, time_idxs=None):
    if time_idxs is None:
        logger.info("Time indices are empty!")
        time_idxs = self.time_idxs

    fork_outs = self.state_fork_layer.fprop(state_below)

    idx = 0
    # First things first, content based addressing:
    if not self.use_local_att:
        beta_pre = fork_outs[self.names[0]]
        beta = TT.nnet.softplus(beta_pre).reshape((beta_pre.shape[0],))
        if (state_below.ndim != beta.ndim and beta.ndim == 2 and
                state_below.ndim == 3):
            beta = beta.reshape((state_below.shape[0], state_below.shape[1]))
        elif (state_below.ndim != beta.ndim and beta.ndim == 1 and
                state_below.ndim == 2):
            beta = beta.reshape((state_below.shape[0],))
        else:
            raise ValueError("Unknown shape for beta!")
        beta = TT.shape_padright(beta)
        idx = 1

    key_pre = fork_outs[self.names[idx]]
    idx += 1
    key_t = key_pre

    sim_vals = self.mem_similarity(key_t, memory)
    weights = sim_vals
    new_pre_weights = None

    if self.smoothed_diff_weights:
        dw_scaler = fork_outs[self.names[idx]]
        dw_scaler = TT.addbroadcast(dw_scaler, 1)
        weights = sim_vals - Sigmoid(dw_scaler) * w_t_pre_before
        new_pre_weights = (self.mem_weight_decay * sim_vals +
                           (1 - self.mem_weight_decay) * w_t_pre_before)
        idx += 1

    std = 5
    """
    if self.use_local_att:
        mean = as_floatX(self.mem_nel) * Sigmoid(weights * self.mean_pred.fprop(state_below))
        exp_ws = -(time_idxs - mean)**2 / (2.0 * std)
        weights = exp_ws * weights
    """

    if self.use_local_att:
        w_tc = softmax3(weights) if weights.ndim == 3 else TT.nnet.softmax(weights)
    else:
        if weights.ndim == 3 and beta.ndim == 2:
            beta = beta.dimshuffle('x', 0, 1)
            w_tc = softmax3(weights * beta)
        else:
            # Content based weights:
            w_tc = TT.nnet.softmax(weights * beta)

    if self.use_local_att:
        first_loc_layer = Tanh(self.state_below_local.fprop(state_below) +
                               self.weights_below_local.fprop(weights))
        mean = as_floatX(self.mem_nel) * Sigmoid(self.mean_pred.fprop(first_loc_layer))
        mean = TT.addbroadcast(mean, 1)
        exp_ws = TT.exp(-((time_idxs - mean)**2) / (2.0 * std))
        w_tc = exp_ws * w_tc
        w_tc = w_tc / w_tc.sum(axis=1, keepdims=True)

    if self.use_loc_based_addressing:
        # Location based addressing:
        g_t_pre = fork_outs[self.names[idx]]
        g_t = Sigmoid(g_t_pre).reshape((g_t_pre.shape[0],))
        if (state_below.ndim != g_t.ndim and g_t.ndim == 2 and
                state_below.ndim == 3):
            g_t = g_t.reshape((state_below.shape[0], state_below.shape[1]))
        elif (state_below.ndim != g_t.ndim and g_t.ndim == 1 and
                state_below.ndim == 2):
            g_t = g_t.reshape((state_below.shape[0],))
        else:
            raise ValueError("Unknown shape for g_t!")
        g_t = TT.shape_padright(g_t)
        w_tg = g_t * w_tc + (1 - g_t) * w_t_before

        shifts_pre = fork_outs[self.names[idx + 1]]
        if shifts_pre.ndim == 2:
            if self.use_multiscale_shifts:
                if self.use_scale_layer:
                    scales = TT.exp(self.scale_layer.fprop(state_below))
                    scales = scales.dimshuffle(0, 'x', 1)
                else:
                    scales = TT.exp(TT.arange(self.scale_size).dimshuffle('x', 'x', 0))
                shifts_pre = shifts_pre.reshape((state_below.shape[0], -1,
                                                 self.scale_size))
                shifts_pre = (shifts_pre * scales).sum(-1)
                if self.shift_width >= 0:
                    shifts_pre = shifts_pre.reshape((-1, self.shift_width, 1))
            elif self.shift_width >= 0:
                shifts_pre = shifts_pre.reshape((-1, self.shift_width, 1))
            else:
                shifts_pre = shifts_pre.reshape(
                    (state_below.shape[0], self.mem_nel))

            if state_below.ndim == 3:
                shifts_pre = shifts_pre.dimshuffle(0, 1, 'x')
                shifts_pre = shifts_pre - shifts_pre.max(1, keepdims=True).dimshuffle(0, 'x', 'x')
            else:
                shifts_pre = shifts_pre.dimshuffle(0, 1)
                shifts_pre = shifts_pre - shifts_pre.max(1, keepdims=True)
                shifts_pre = shifts_pre.dimshuffle(0, 1, 'x')
        elif shifts_pre.ndim == 1:
            if self.use_multiscale_shifts:
                if self.use_scale_layer:
                    scales = TT.exp(self.scale_layer.fprop(state_below))
                else:
                    scales = TT.exp(TT.arange(self.scale_size))
                shifts_pre = shifts_pre.reshape((-1, self.scale_size))
                shifts_pre = (shifts_pre * scales).sum(-1)
                if self.shift_width >= 0:
                    shifts_pre = shifts_pre.reshape((-1, self.shift_width, 1))
                if self.shift_width >= 0:
                    shifts_pre = shifts_pre.reshape((-1, 1))
            elif self.shift_width >= 0:
                shifts_pre = shifts_pre.reshape((-1, 1))
            else:
                shifts_pre = shifts_pre.reshape((self.mem_nel,))

            if state_below.ndim == 2:
                shifts_pre = TT.shape_padright(shifts_pre)
            shifts_pre = shifts_pre - shifts_pre.max(0, keepdims=True)

        shifts = TT.exp(shifts_pre)
        if shifts.ndim == 2:
            shifts = shifts / shifts.sum(axis=0, keepdims=True)
        elif shifts.ndim == 3:
            shifts = shifts / shifts.sum(axis=1, keepdims=True)

        CC = CircularConvolveAdvIndexing if self.use_adv_indexing else \
            CircularConvolve
        w_t_hat = CC()(weights=w_tg,
                       shifts=shifts,
                       mem_size=self.mem_nel,
                       shift_width=self.shift_width)

        if self.use_reinforce:
            if w_t_hat.ndim == 2:
                w_t = TT.nnet.softmax(w_t_hat)
            elif w_t_hat.ndim == 3:
                w_t = softmax3(w_t_hat)
        else:
            gamma_pre = fork_outs[self.names[4]]
            assert w_t_hat.ndim == gamma_pre.ndim, ("The number of dimensions for "
                                                    " w_t_hat and gamma_pre should "
                                                    " be the same")
            if gamma_pre.ndim == 1:
                gamma_pre = gamma_pre
            else:
                gamma_pre = gamma_pre.reshape((gamma_pre.shape[0],))

            gamma_pre = TT.shape_padright(gamma_pre)
            gamma = TT.nnet.softplus(gamma_pre) + const(1)
            w_t = (abs(w_t_hat + const(1e-16))**gamma) + const(1e-42)

            if (state_below.ndim != shifts_pre.ndim and w_t.ndim == 2 and
                    state_below.ndim == 3):
                w_t = w_t.reshape((state_below.shape[0], state_below.shape[1]))
                w_t = w_t.dimshuffle(0, 1, 'x')
            elif (state_below.ndim != w_t.ndim and w_t.ndim == 1 and
                    state_below.ndim == 2):
                w_t = w_t.reshape((state_below.shape[0],))
                w_t = w_t.dimshuffle(0, 'x')

            if w_t.ndim == 2:
                w_t = w_t / (w_t.sum(axis=-1, keepdims=True) + const(1e-6))
            elif w_t.ndim == 3:
                w_t = w_t / (w_t.sum(axis=-1, keepdims=True) + const(1e-6))
    else:
        w_t = w_tc

    return [w_t], [new_pre_weights]
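# CircularConvolve / CircularConvolveAdvIndexing are not shown in this listing.
# For illustration, the NTM-style circular convolution they compute, i.e.
# w_tilde(i) = sum_j w_g(j) * s((i - j) mod N), can be written for a single
# example in plain NumPy as follows (a sketch, not the Theano op itself):
import numpy as np

def circular_convolve(w_g, s):
    # w_g: gated address weights over N memory slots, s: shift distribution.
    n = w_g.shape[0]
    w_tilde = np.zeros_like(w_g)
    for i in range(n):
        for j in range(n):
            w_tilde[i] += w_g[j] * s[(i - j) % n]
    return w_tilde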
def fprop(self, inps=None, leak_rate=0.05, use_noise=False, mdl_name=None):
    self.build_model(use_noise=use_noise, mdl_name=mdl_name)
    self.ntm.evaluation_mode = use_noise

    if not inps:
        inps = self.inps

    # First two are X and targets
    # assert (2 + sum([use_mask, use_cmask])) + 1 >= len(inps), \
    #     "inputs have illegal shape."
    cmask = None
    mask = None

    if isinstance(inps, list):
        X = inps[0]
        y = inps[1]
        if self.use_mask:
            mask = inps[2]
            if self.use_cost_mask:
                cmask = inps[3]
    else:
        X = inps['X']
        y = inps['y']
        if self.use_mask:
            mask = inps['mask']
            if self.use_cost_mask:
                cmask = inps['cmask']

    if self.use_cost_mask:
        if cmask is not None:
            if self.use_bow_cost_mask:
                if mask.ndim == cmask.ndim:
                    m = (mask * TT.eq(cmask, 0)).reshape(
                        (cmask.shape[0] * cmask.shape[1], -1))
                else:
                    m = (mask.dimshuffle(0, 1, 'x') *
                         TT.eq(cmask, 0))[:, :, 0].reshape(
                             (cmask.shape[0] * cmask.shape[1], -1))
            else:
                m = mask
        else:
            raise ValueError("Mask for the answers should not be empty.")

    if X.ndim == 2 and y.ndim == 1:
        # For sequential MNIST.
        if self.permute_order:
            X = X.dimshuffle(1, 0)
            idxs = self.rnd_indxs
            X = X[idxs]
        inp_shp = (X.shape[0], X.shape[1], -1)
    else:
        inp_shp = (X.shape[1], X.shape[2], -1)

    self.ntm_in = None
    if self.use_bow_input and not self.use_gru_inp_rep and not self.use_simple_rnn_inp_rep:
        # Bag-of-words representation of the input facts.
        bow_out = self.bow_layer.fprop(X, amask=m, deterministic=not use_noise)
        bow_out = bow_out.reshape((X.shape[1], X.shape[2], -1))
        self.ntm_in = bow_out
    elif self.use_gru_inp_rep:
        # GRU fact encoder.
        m0 = as_floatX(TT.gt(X, 0))
        if self.use_mask and self.use_cost_mask:
            if cmask is not None:
                m1 = mask * TT.eq(cmask, 0)
            else:
                raise ValueError(
                    "Mask for the answers should not be empty.")

        low_inp_shp = (X.shape[0], X.shape[1] * X.shape[2], -1)
        Xr = X.reshape(low_inp_shp)
        grufact_inps = self.gru_fact_layer_inps.fprop(Xr)
        low_reset_below = grufact_inps.values()[0].reshape(low_inp_shp)
        low_gater_below = grufact_inps.values()[1].reshape(low_inp_shp)
        low_state_below = grufact_inps.values()[2].reshape(low_inp_shp)
        linps = [low_reset_below, low_gater_below, low_state_below]

        m0_part = TT.cast(
            m0.sum(0).reshape(
                (X.shape[1], X.shape[2])).dimshuffle(0, 1, 'x'), 'float32')
        m0_part = TT.switch(TT.eq(m0_part, as_floatX(0)), as_floatX(1), m0_part)

        h0 = self.gru_fact_layer.fprop(inps=linps,
                                       mask=m0,
                                       batch_size=self.batch_size)
        self.ntm_in = m1.dimshuffle(0, 1, 'x') * \
            ((m0.dimshuffle(0, 1, 2, 'x') *
              h0.reshape((X.shape[0], X.shape[1], X.shape[2], -1))).sum(0) /
             m0_part).reshape(inp_shp)
    elif self.use_simple_rnn_inp_rep:
        # Simple RNN fact encoder.
        m0 = as_floatX(TT.gt(X, 0))
        if cmask is not None:
            m1 = mask * TT.eq(cmask, 0)
        else:
            raise ValueError("Mask for the answers should not be empty.")

        low_inp_shp = (X.shape[0], X.shape[1] * X.shape[2], -1)
        Xr = X.reshape(low_inp_shp)
        rnnfact_inps = self.rnn_fact_layer_inps.fprop(Xr).reshape(low_inp_shp)
        m0 = m0.reshape(low_inp_shp)

        h0 = self.rnn_fact_layer.fprop(inps=rnnfact_inps,
                                       mask=m0,
                                       batch_size=self.batch_size)

        m0_part = TT.cast(
            m0.sum(0).reshape(
                (X.shape[1], X.shape[2])).dimshuffle(0, 1, 'x'), 'float32')
        m0_part = TT.switch(TT.eq(m0_part, as_floatX(0)), as_floatX(1), m0_part)

        self.ntm_in = m1.dimshuffle(0, 1, 'x') * \
            (h0.reshape((X.shape[0], X.shape[1], X.shape[2], -1)).sum(0) /
             m0_part).reshape(inp_shp)
    else:
        # Plain embedding projection of the input.
        X_proj = self.inp_proj_layer.fprop(X)
        if not self.learn_embeds:
            X_proj = block_gradient(X_proj)
        if self.use_batch_norm:
            X_proj = self.batch_norm_layer.fprop(X_proj,
                                                 inference=not use_noise)
        self.ntm_in = X_proj

    context = None
    if self.use_context:
        if self.use_qmask:
            context = (self.qmask.dimshuffle(0, 1, 'x') * self.ntm_in).sum(0)
        else:
            m1_part = m1.sum(0).dimshuffle(0, 'x')
            context = self.ntm_in.sum(0) / m1_part

    self.ntm_outs = self.ntm.fprop(self.ntm_in,
                                   mask=mask,
                                   cmask=cmask,
                                   context=context,
                                   batch_size=self.batch_size,
                                   use_mask=self.use_mask,
                                   use_noise=not use_noise)
    h, m_read = self.ntm_outs[0], self.ntm_outs[2]

    if self.use_reinforce:
        self.w_samples, self.r_samples = self.ntm_outs[-2], self.ntm_outs[-1]
        if self.smoothed_diff_weights:
            idx = -6
        else:
            idx = -4
        self.write_weights, self.read_weights = self.ntm_outs[idx], \
            self.ntm_outs[idx + 1]
    else:
        self.write_weights, self.read_weights = self.ntm_outs[3], self.ntm_outs[4]

    if self.anticorrelation:
        acorr = AntiCorrelationConstraint(level=self.anticorrelation)
        rw1 = self.read_weights[:, 0]
        rw2 = self.read_weights[:, 1]
        self.reg += acorr(rw1, rw2, mask=mask)

    if self.correlation_ws:
        logger.info("Applying the correlation constraint.")
        corr_cons = CorrelationConstraint(level=self.correlation_ws)
        self.reg += corr_cons(self.read_weights, self.write_weights,
                              mask, self.qmask)

    if self.use_last_hidden_state:
        h = h.reshape(inp_shp)
        h = h[-1]

    if self.use_deepout:
        merged_out = self.merge_layer.fprop([h, m_read])
        out_layer = Leaky_Rect(merged_out, leak_rate)
        if self.dropout:
            dropOp = Dropout(dropout_prob=self.dropout)
            out_layer = dropOp(out_layer, deterministic=not use_noise)
        out_layer = self.out_layer.fprop(out_layer,
                                         deterministic=not use_noise)
    else:
        if self.use_out_mem:
            if self.dropout:
                dropOp = Dropout(dropout_prob=self.dropout)
                m_read = dropOp(m_read, deterministic=not use_noise)
            mem_out = self.out_mem.fprop(m_read, deterministic=not use_noise)
            mem_scaler = self.out_scaler.fprop(
                h, deterministic=not use_noise).reshape(
                    (mem_out.shape[0],)).dimshuffle(0, 'x')
            h_out = self.out_layer.fprop(h, deterministic=not use_noise)
            out_layer = h_out + mem_out * Sigmoid(mem_scaler)
        else:
            if self.dropout:
                dropOp = Dropout(dropout_prob=self.dropout)
                h = dropOp(h, deterministic=not use_noise)
            out_layer = self.out_layer.fprop(h, deterministic=not use_noise)

    if self.predict_bow_out and self.bow_out_layer:
        logger.info("Using the bow output prediction.")
        self.bow_pred_out = Sigmoid(
            self.bow_out_layer.fprop(h, deterministic=not use_noise))

    if self.softmax:
        self.probs = Softmax(out_layer)
    else:
        self.probs = Sigmoid(out_layer)

    if self.ntm.updates:
        self.updates.update(self.ntm.updates)

    self.str_params(logger)
    self.h = h
    return self.probs, self.ntm_outs