def init_params(self):
    # Extra hidden-to-hidden projections for a multi-layer feed-forward controller.
    self.additional_layers = []
    if self.n_layers > 1:
        for i in xrange(1, self.n_layers):
            self.additional_layers += [
                AffineLayer(n_in=self.n_hids, n_out=self.n_hids,
                            weight_initializer=self.weight_initializer,
                            bias_initializer=self.bias_initializer,
                            name=self.pname("ff_cont_proj_%d" % i))]

    if self.noisy:
        # Learnable per-layer parameters used when the controller runs with
        # noisy activations.
        mpname = self.pname("ff_controller_p_vals")
        self.params[mpname] = np.random.uniform(
            -1.0, 1.0, (self.n_layers, self.n_hids)).astype("float32")
        self.pvals = self.params[mpname]

    # Projection from the memory read vector into the controller space.
    self.mem_before_p = AffineLayer(n_in=self.mem_size, n_out=self.n_hids,
                                    weight_initializer=self.weight_initializer,
                                    bias_initializer=self.bias_initializer,
                                    name=self.pname("mem_before_p"))

    self.children = [self.mem_before_p] + self.additional_layers
    self.merge_params()
    self.str_params()
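# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): init_params above only
# allocates the sub-layers of an n_layers-deep feed-forward controller.  The
# controller's fprop is not included in this listing, so the composition below
# (tanh activations, where the memory read vector is injected) is an
# assumption; ff_controller_step, ff_layers and mem_proj are hypothetical names.
import numpy as np

def ff_controller_step(x, mem_read, ff_layers, mem_proj, activ=np.tanh):
    # ff_layers: list of (W, b) pairs standing in for the extra AffineLayers.
    # mem_proj:  (W, b) pair standing in for mem_before_p.
    W_m, b_m = mem_proj
    h = activ(x + mem_read.dot(W_m) + b_m)   # inject the memory read vector
    for W, b in ff_layers:                   # extra hidden-to-hidden layers
        h = activ(h.dot(W) + b)
    return h

# Hypothetical shapes: batch of 2, n_hids=4, mem_size=3.
rng = np.random.RandomState(0)
x = rng.randn(2, 4)
mem_read = rng.randn(2, 3)
mem_proj = (rng.randn(3, 4) * 0.1, np.zeros(4))
ff_layers = [(rng.randn(4, 4) * 0.1, np.zeros(4)) for _ in range(2)]
print(ff_controller_step(x, mem_read, ff_layers, mem_proj).shape)  # (2, 4)
# ---------------------------------------------------------------------------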
class Addresser(Layer): """ An addressing Layer. """ def __init__(self, n_hids=None, mem_size=None, mem_nel=None, address_size=None, mem_gater_activ=None, n_mid_key_size=None, scale_size=None, use_scale_layer=True, smoothed_diff_weights=False, use_local_att=False, mem_weight_decay=0.96, read_head=False, use_loc_based_addressing=True, shift_width=3, scale_bias_coef=1.0, use_adv_indexing=False, use_multiscale_shifts=True, use_geom_sig_dot=False, use_reinforce=False, weight_initializer=None, bias_initializer=None, name="nmt_addresser"): super(Addresser, self).__init__() self.n_hids = n_hids self.n_mid_key_size = n_mid_key_size self.mem_size = mem_size self.mem_nel = mem_nel self.use_reinforce = use_reinforce self.read_head = read_head self.scale_size = scale_size self.scale_bias_coef = scale_bias_coef self.address_size = address_size self.use_scale_layer = use_scale_layer self.use_adv_indexing = use_adv_indexing self.weight_initializer = weight_initializer self.bias_initializer = bias_initializer self.name = name self.use_loc_based_addressing = use_loc_based_addressing self.use_multiscale_shifts = use_multiscale_shifts self.shift_width = shift_width self.smoothed_diff_weights = smoothed_diff_weights self.mem_weight_decay = mem_weight_decay self.use_local_att = use_local_att if self.use_local_att: self.time_idxs = const(as_floatX(np.arange(self.mem_nel))) self.time_idxs.name = "time_idxs" if self.use_adv_indexing: print "Using the advanced indexing." else: print "Not using the advanced indexing." if mem_gater_activ: self.mem_gater_activ = mem_gater_activ else: self.mem_gater_activ = Sigmoid if use_geom_sig_dot: self.mem_similarity = GeomEuclideanSigmoidDot() else: self.mem_similarity = MemorySimilarity() self.init_params() def init_params(self): if not self.use_local_att: names = ["fork_state_beta_t", "fork_state_key_t"] self.n_outs = [1, self.mem_size + self.address_size] else: names = ["fork_state_key_t"] self.n_outs = [self.mem_size + self.address_size] self.shift_size = self.mem_nel if self.use_multiscale_shifts: logger.info("Using the multiscale shifts.") if self.scale_size is None or self.scale_size < -1: self.scale_size = int(np.floor(np.log(self.mem_nel))) logger.info("Size of the scales is %d" % self.scale_size) self.shift_size = self.shift_width * self.scale_size binit_vals = [None, None] if self.smoothed_diff_weights: names.append("fork_state_diff_gate") self.n_outs += [1] binit_vals += [-0.16] if self.use_loc_based_addressing: names += [ "fork_state_gater_t", "fork_state_shift_hat_t" ] self.n_outs += [1, self.shift_size] binit_vals += [None, None] if not self.use_reinforce: names += [ "fork_state_sharpen_hat_t" ] self.n_outs += [1] binit_vals += [0.001] if self.use_scale_layer: self.scale_layer = AffineLayer(n_in=self.n_hids, n_out=self.scale_size, weight_initializer=self.weight_initializer, bias_initializer=self.bias_initializer, use_bias=True, name=self.pname("scale_layer")) pname = self.scale_layer.params.getparamname("bias") arng = as_floatX(np.arange(self.scale_size)) arng = arng / arng.sum() self.scale_layer.params[pname] = self.scale_bias_coef * arng self.children.extend([self.scale_layer]) if self.use_local_att: bott_size = self.n_hids logger.info("Using the local attention.") self.state_below_local = AffineLayer(n_in=self.n_hids, n_out=bott_size, weight_initializer=self.weight_initializer, bias_initializer=self.bias_initializer, use_bias=True, name=self.pname("state_below_loc_layer")) self.weights_below_local = AffineLayer(n_in=self.mem_nel, n_out=bott_size, 
weight_initializer=self.weight_initializer, bias_initializer=self.bias_initializer, use_bias=False, name=self.pname("weights_loc_layer")) self.mean_pred = AffineLayer(n_in=bott_size, n_out=1, weight_initializer=self.weight_initializer, bias_initializer=self.bias_initializer, use_bias=True, name=self.pname("mean_pred")) self.children.extend([self.state_below_local, self.weights_below_local, self.mean_pred]) names = map(lambda x: self.pname(x), names) self.names = names self.state_fork_layer = ForkLayer(n_in=self.n_hids, n_outs=self.n_outs, weight_initializer=self.weight_initializer, bias_initializer=self.bias_initializer, init_bias_vals = binit_vals, names=names) self.children.extend([self.state_fork_layer]) self.powerup_layer = None self.merge_params() def fprop(self, state_below, memory, w_t_before, w_t_pre_before=None, time_idxs=None): if time_idxs is None: logger.info("Time indices are empty!") time_idxs = self.time_idxs fork_outs = self.state_fork_layer.fprop(state_below) idx = 0 # First things first, content based addressing: if not self.use_local_att: beta_pre = fork_outs[self.names[0]] beta = TT.nnet.softplus(beta_pre).reshape((beta_pre.shape[0],)) if (state_below.ndim != beta.ndim and beta.ndim == 2 and state_below.ndim == 3): beta = beta.reshape((state_below.shape[0], state_below.shape[1])) elif (state_below.ndim != beta.ndim and beta.ndim == 1 and state_below.ndim == 2): beta = beta.reshape((state_below.shape[0],)) else: raise ValueError("Unknown shape for beta!") beta = TT.shape_padright(beta) idx = 1 key_pre = fork_outs[self.names[idx]] idx += 1 key_t = key_pre sim_vals = self.mem_similarity(key_t, memory) weights = sim_vals new_pre_weights = None if self.smoothed_diff_weights: dw_scaler = fork_outs[self.names[idx]] dw_scaler = TT.addbroadcast(dw_scaler, 1) weights = sim_vals - Sigmoid(dw_scaler) * w_t_pre_before new_pre_weights = self.mem_weight_decay * sim_vals + (1 - \ self.mem_weight_decay) * w_t_pre_before idx += 1 std = 5 """ if self.use_local_att: mean = as_floatX(self.mem_nel) * Sigmoid(weights*self.mean_pred.fprop(state_below)) exp_ws = -(time_idxs - mean)**2 / (2.0 * std) weights = exp_ws * weights """ if self.use_local_att: w_tc = softmax3(weights) if weights.ndim == 3 else TT.nnet.softmax(weights) else: if weights.ndim == 3 and beta.ndim == 2: beta = beta.dimshuffle('x', 0, 1) w_tc = softmax3(weights * beta) else: # Content based weights: w_tc = TT.nnet.softmax(weights * beta) if self.use_local_att: first_loc_layer = Tanh(self.state_below_local.fprop(state_below) +\ self.weights_below_local.fprop(weights)) mean = as_floatX(self.mem_nel) * Sigmoid(self.mean_pred.fprop(first_loc_layer)) mean = TT.addbroadcast(mean, 1) exp_ws = TT.exp(-((time_idxs - mean)**2) / (2.0 * std)) w_tc = exp_ws * w_tc w_tc = w_tc / w_tc.sum(axis=1, keepdims=True) if self.use_loc_based_addressing: # Location based addressing: g_t_pre = fork_outs[self.names[idx]] g_t = Sigmoid(g_t_pre).reshape((g_t_pre.shape[0],)) if (state_below.ndim != g_t.ndim and g_t.ndim == 2 and state_below.ndim == 3): g_t = g_t.reshape((state_below.shape[0], state_below.shape[1])) elif (state_below.ndim != g_t.ndim and g_t.ndim == 1 and state_below.ndim == 2): g_t = g_t.reshape((state_below.shape[0],)) else: raise ValueError("Unknown shape for g_t!") g_t = TT.shape_padright(g_t) w_tg = g_t * w_tc + (1 - g_t) * w_t_before shifts_pre = fork_outs[self.names[idx + 1]] if shifts_pre.ndim == 2: if self.use_multiscale_shifts: if self.use_scale_layer: scales = TT.exp(self.scale_layer.fprop(state_below)) scales = 
scales.dimshuffle(0, 'x', 1) else: scales = TT.exp(TT.arange(self.scale_size).dimshuffle('x', 'x', 0)) shifts_pre = shifts_pre.reshape((state_below.shape[0], -1, self.scale_size)) shifts_pre = (shifts_pre * scales).sum(-1) if self.shift_width >= 0: shifts_pre = shifts_pre.reshape((-1, self.shift_width, 1)) elif self.shift_width >= 0: shifts_pre = shifts_pre.reshape((-1, self.shift_width, 1)) else: shifts_pre = shifts_pre.reshape( (state_below.shape[0], self.mem_nel)) if state_below.ndim == 3: shifts_pre = shifts_pre.dimshuffle(0, 1, 'x') shifts_pre = shifts_pre - shifts_pre.max(1, keepdims=True).dimshuffle(0, 'x', 'x') else: shifts_pre = shifts_pre.dimshuffle(0, 1) shifts_pre = shifts_pre - shifts_pre.max(1, keepdims=True) shifts_pre = shifts_pre.dimshuffle(0, 1, 'x') elif shifts_pre.ndim == 1: if self.use_multiscale_shifts: if self.use_scale_layer: scales = TT.exp(self.scale_layer.fprop(state_below)) else: scales = TT.exp(TT.arange(self.scale_size)) shifts_pre = shifts_pre.reshape((-1, self.scale_size)) shifts_pre = (shifts_pre * scales).sum(-1) if self.shift_width >= 0: shifts_pre = shifts_pre.reshape((-1, self.shift_width, 1)) if self.shift_width >= 0: shifts_pre = shifts_pre.reshape((-1, 1)) elif self.shift_width >= 0: shifts_pre = shifts_pre.reshape((-1, 1)) else: shifts_pre = shifts_pre.reshape((self.mem_nel,)) if state_below.ndim == 2: shifts_pre = TT.shape_padright(shifts_pre) shifts_pre = shifts_pre - shifts_pre.max(0, keepdims=True) shifts = TT.exp(shifts_pre) if shifts.ndim == 2: shifts = shifts / shifts.sum(axis=0, keepdims=True) elif shifts.ndim == 3: shifts = shifts / shifts.sum(axis=1, keepdims=True) CC = CircularConvolveAdvIndexing if self.use_adv_indexing else\ CircularConvolve w_t_hat = CC()(weights=w_tg, shifts=shifts, mem_size=self.mem_nel, shift_width=self.shift_width) if self.use_reinforce: if w_t_hat.ndim == 2: w_t = TT.nnet.softmax(w_t_hat) elif w_t_hat.ndim == 3: w_t = softmax3(w_t_hat) else: gamma_pre = fork_outs[self.names[4]] assert w_t_hat.ndim == gamma_pre.ndim, ("The number of dimensions for " " w_t_hat and gamma_pre should " " be the same") if gamma_pre.ndim == 1: gamma_pre = gamma_pre else: gamma_pre = gamma_pre.reshape((gamma_pre.shape[0],)) gamma_pre = TT.shape_padright(gamma_pre) gamma = TT.nnet.softplus(gamma_pre) + const(1) w_t = (abs(w_t_hat + const(1e-16))**gamma) + const(1e-42) if (state_below.ndim != shifts_pre.ndim and w_t.ndim == 2 and state_below.ndim == 3): w_t = w_t.reshape((state_below.shape[0], state_below.shape[1])) w_t = w_t.dimshuffle(0, 1, 'x') elif (state_below.ndim != w_t.ndim and w_t.ndim == 1 and state_below.ndim == 2): w_t = w_t.reshape((state_below.shape[0],)) w_t = w_t.dimshuffle(0, 'x') if w_t.ndim == 2: w_t = w_t / (w_t.sum(axis=-1, keepdims=True) + const(1e-6)) elif w_t.ndim == 3: w_t = w_t / (w_t.sum(axis=-1, keepdims=True) + const(1e-6)) else: w_t = w_tc return [w_t], [new_pre_weights]
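# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): Addresser.fprop above
# follows the usual NTM addressing pipeline -- content-based weights from a
# key/beta similarity softmax, interpolation with the previous weights through
# a gate g, a circular-convolution shift, and a sharpening exponent gamma when
# REINFORCE is disabled.  The helpers below are hypothetical names, cosine
# similarity stands in for the MemorySimilarity op, and the multiscale-shift
# and local-attention branches are omitted.
import numpy as np

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

def content_weights(key, memory, beta):
    # Cosine similarity between the key and every memory row, sharpened by beta.
    sims = memory.dot(key) / (np.linalg.norm(memory, axis=1)
                              * np.linalg.norm(key) + 1e-8)
    return softmax(beta * sims)

def circular_shift(w, shift):
    # Convolve the weights with an (already normalized) shift distribution.
    n = len(w)
    return np.array([sum(w[(i - j) % n] * shift[j] for j in range(n))
                     for i in range(n)])

def address(memory, key, beta, g, shift, gamma, w_prev):
    w_c = content_weights(key, memory, beta)   # content-based addressing
    w_g = g * w_c + (1.0 - g) * w_prev         # interpolate with previous weights
    w_s = circular_shift(w_g, shift)           # location-based shift
    w = w_s ** gamma                           # sharpen
    return w / w.sum()

# Tiny example: 5 memory slots of size 3, querying with the content of slot 2.
rng = np.random.RandomState(0)
memory = rng.randn(5, 3)
w_prev = np.ones(5) / 5.0
shift = np.array([0.8, 0.1, 0.0, 0.0, 0.1])    # mostly "stay", a little +/-1
w = address(memory, key=memory[2], beta=5.0, g=0.9,
            shift=shift, gamma=2.0, w_prev=w_prev)
print(w.round(3))                              # peaked near slot 2
# ---------------------------------------------------------------------------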
class WeaklySupervisedMemoryNetwork(Model):
    """This is a class for the weakly supervised memory network."""

    def __init__(self, n_in, n_hids, low_gru_size, n_out, inps=None,
                 n_layers=None, dropout=None, seq_len=None, learning_rule=None,
                 weight_initializer=None, bias_initializer=None, activ=None,
                 use_cost_mask=True, noise=False, use_hint_layer=False,
                 use_average=False, theano_function_mode=None,
                 use_positional_encoding=False, use_inv_cost_mask=False,
                 batch_size=32, use_noise=False, name=None):
        self.n_in = n_in
        self.n_hids = n_hids
        self.n_out = n_out
        self.low_gru_size = low_gru_size
        self.n_layers = n_layers
        self.inps = inps
        self.noise = noise
        self.seq_len = seq_len
        self.use_cost_mask = use_cost_mask
        self.learning_rule = learning_rule
        self.dropout = dropout
        self.use_average = use_average
        self.batch_size = batch_size
        self.use_noise = use_noise
        self.train_timer = Timer("Training function")
        self.grads_timer = Timer("Computing the grads")
        self.theano_function_mode = theano_function_mode
        self.weight_initializer = weight_initializer
        self.bias_initializer = bias_initializer
        self.use_positional_encoding = use_positional_encoding
        self.use_inv_cost_mask = use_inv_cost_mask
        self.eps = 1e-8
        self.activ = activ
        self.out_layer_in = self.n_hids

        if name is None:
            raise ValueError("name should not be empty.")

        self.reset()
        self.name = name

    def reset(self):
        self.children = []
        self.params = Parameters()
        self.grulow_layer = None
        self.low_gru_layer = None
        self.gruup_layer = None
        self.gru_layer = None
        self.out_layer = None
        self.hint_layer = None
        self.bow_input = None
        self.bow_output = None
        self.updates = OrderedDict({})

    def build_model(self, use_noise=False, mdl_name=None):
        self.bowin_layer = BOWLayer(
            n_in=self.n_in, n_out=self.emb_size, noise=self.noise,
            weight_initializer=self.weight_initializer,
            bias_initializer=self.bias_initializer,
            seq_len=self.seq_len, name=self.pname("bowin_layer"))

        self.bowout_layer = BOWLayer(
            n_in=self.n_in, n_out=self.emb_size, noise=self.noise,
            weight_initializer=self.weight_initializer,
            bias_initializer=self.bias_initializer,
            seq_len=self.seq_len, name=self.pname("bowout_layer"))

        self.qembed_layer = BOWLayer(
            n_in=self.n_in, n_out=self.emb_size, noise=self.noise,
            weight_initializer=self.weight_initializer,
            bias_initializer=self.bias_initializer,
            seq_len=self.seq_len, name=self.pname("qembed_layer"))

        if not self.out_layer:
            self.out_layer = AffineLayer(
                n_in=self.out_layer_in, n_out=self.n_out, noise=self.noise,
                weight_initializer=self.weight_initializer,
                bias_initializer=self.bias_initializer,
                name=self.pname("out_layer"))

        if not self.children:
            self.children.append(self.bowin_layer)
            self.children.append(self.bowout_layer)
            self.children.append(self.qembed_layer)
            self.children.append(self.out_layer)
            self.merge_params()

        if mdl_name:
            logger.info("Reloading the model from %s. " % mdl_name)
            self.params.load(mdl_name)
            [child.use_params(self.params) for child in self.children]

    def get_cost(self, use_noise=False, mdl_name=None):
        probs, _ = self.fprop(use_noise=use_noise, mdl_name=mdl_name)
        y = self.inps[1]
        cmask = None

        if self.use_cost_mask:
            cmask = self.inps[3]

        self.cost, self.errors = nll(y, probs, cost_mask=cmask)
        return self.cost, self.errors

    def get_train_fn(self, lr=None, mdl_name=None):
        if lr is None:
            lr = self.eps

        cost, errors = self.get_cost(use_noise=self.use_noise,
                                     mdl_name=mdl_name)
        params = self.params.values
        logger.info("Computing the gradient graph.")
        self.grads_timer.start()
        grads = safe_grad(cost, params)
        gnorm = sum(grad.norm(2) for _, grad in grads.iteritems())
        updates, norm_up, param_norm = \
            self.learning_rule.get_updates(learning_rate=lr, grads=grads)
        self.grads_timer.stop()
        logger.info(self.grads_timer)

        if not self.updates:
            self.updates = updates
        else:
            self.updates.update(updates)

        outs = [self.cost, gnorm, norm_up, param_norm]
        outs += [self.errors]

        self.train_timer.start()
        train_fn = theano.function(self.inps, outs,
                                   updates=updates,
                                   mode=self.theano_function_mode,
                                   name=self.pname("train_fn"))
        self.train_timer.stop()
        logger.info(self.train_timer)
        return train_fn

    def get_inspect_fn(self, mdl_name=None):
        logger.info("Compiling inspect function.")
        probs, h = self.fprop(use_noise=False, mdl_name=mdl_name)
        inspect_fn = theano.function([self.inps[0], self.inps[2]],
                                     [h, probs],
                                     name=self.pname("inspect_fn"))
        return inspect_fn

    def get_valid_fn(self, mdl_name=None):
        logger.info("Compiling validation function.")
        self.cost, self.errors = self.get_cost(use_noise=False,
                                               mdl_name=mdl_name)
        valid_fn = theano.function(self.inps,
                                   [self.cost, self.errors],
                                   name=self.pname("valid_fn"))
        return valid_fn

    def fprop(self, inps=None, use_mask=True, use_cmask=True,
              use_noise=False, mdl_name=None):
        self.build_model(use_noise=use_noise, mdl_name=mdl_name)

        if not inps:
            inps = self.inps

        X = inps[0]
        cmask = None

        if use_mask:
            mask = inps[2]
            qmask = inps[3]

        if use_cmask:
            cmask = inps[4]

        assert (3 + sum([use_mask, use_cmask])) == len(inps), \
            "inputs have illegal shape."

        m0 = as_floatX(TT.gt(X, 0))

        if cmask is not None:
            m1 = mask * TT.eq(cmask, 0)
        else:
            raise ValueError("Mask for the answers should not be empty.")

        dropOp = None
        low_inp_shp = (X.shape[0], X.shape[1] * X.shape[2], -1)
        Xr = X.reshape(low_inp_shp)
        grulow_inps = self.grulow_layer.fprop(Xr, deterministic=not use_noise)

        # Split the forked projections into the reset/gater/state inputs of the
        # low-level GRU (mirrors the gruup_inps handling below).
        low_reset_below = grulow_inps.values()[0].reshape(low_inp_shp)
        low_gater_below = grulow_inps.values()[1].reshape(low_inp_shp)
        low_state_below = grulow_inps.values()[2].reshape(low_inp_shp)
        linps = [low_reset_below, low_gater_below, low_state_below]

        inp_shp = (X.shape[1], X.shape[2], -1)
        h0 = self.low_gru_layer.fprop(inps=linps, mask=m0,
                                      batch_size=self.batch_size)
        h0 = m1.dimshuffle(0, 1, 'x') * (h0.reshape(
            (X.shape[0], X.shape[1], X.shape[2], -1))[-1]).reshape(inp_shp)

        if self.dropout:
            if dropOp is None:
                dropOp = Dropout(dropout_prob=self.dropout)
            h0 = dropOp(h0, deterministic=not use_noise)

        gruup_inps = self.gruup_layer.fprop(h0, deterministic=not use_noise)
        reset_below = gruup_inps.values()[0].reshape(inp_shp)
        gater_below = gruup_inps.values()[1].reshape(inp_shp)
        state_below = gruup_inps.values()[2].reshape(inp_shp)
        uinps = [reset_below, gater_below, state_below]

        h1, _ = self.gru_layer.fprop(inps=uinps, maskf=m1, maskq=qmask,
                                     batch_size=self.batch_size)

        if self.dropout:
            if dropOp is None:
                dropOp = Dropout(dropout_prob=self.dropout)
            h1 = dropOp(h1, deterministic=not use_noise)

        out_layer = self.out_layer.fprop(h1, deterministic=not use_noise)
        self.probs = Softmax(out_layer)
        return self.probs, h1
class NTMModel(Model): """ NTM model. """ def __init__(self, n_in, n_hids, n_out, mem_size, mem_nel, deep_out_size, bow_size=40, inps=None, dropout=None, predict_bow_out=False, seq_len=None, n_read_heads=1, n_layers=1, n_write_heads=1, train_profile=False, erase_activ=None, content_activ=None, l1_pen=None, l2_pen=None, use_reinforce=False, use_reinforce_baseline=False, n_reading_steps=2, use_gru_inp_rep=False, use_simple_rnn_inp_rep=False, use_nogru_mem2q=False, sub_mb_size=40, lambda1_rein=2e-4, lambda2_rein=2e-5, baseline_reg=1e-2, anticorrelation=None, use_layer_norm=False, recurrent_dropout_prob=-1, correlation_ws=None, hybrid_att=True, max_fact_len=7, use_dice_val=False, use_qmask=False, renormalization_scale=4.8, w2v_embed_scale=0.42, emb_scale=0.32, use_soft_att=False, use_hard_att_eval=False, use_batch_norm=False, learning_rule=None, use_loc_based_addressing=True, smoothed_diff_weights=False, use_multiscale_shifts=True, use_ff_controller=False, use_gate_quad_interactions=False, permute_order=False, wpenalty=None, noise=None, w2v_embed_path=None, glove_embed_path=None, learn_embeds=True, use_last_hidden_state=False, use_adv_indexing=False, use_bow_input=True, use_out_mem=True, use_deepout=True, use_q_mask=False, use_inp_content=True, rnd_indxs=None, address_size=0, learn_h0=False, use_context=False, debug=False, controller_activ=None, mem_gater_activ=None, weight_initializer=None, bias_initializer=None, use_cost_mask=True, use_bow_cost_mask=True, theano_function_mode=None, batch_size=32, use_noise=False, reinforce_decay=0.9, softmax=False, use_mask=False, name="ntm_model", **kwargs): assert deep_out_size is not None, ("Size of the deep output " " should not be None.") if sub_mb_size is None: sub_mb_size = batch_size assert sub_mb_size <= batch_size, "batch_size should be greater than sub_mb_size" self.hybrid_att = hybrid_att self.state = locals() self.use_context = use_context self.eps = 1e-8 self.use_mask = use_mask self.l1_pen = l1_pen self.l2_pen = l2_pen self.l2_penalizer = None self.emb_scale = emb_scale self.w2v_embed_path = w2v_embed_path self.glove_embed_path = glove_embed_path self.learn_embeds = learn_embeds self.exclude_params = {} self.use_gate_quad_interactions = use_gate_quad_interactions self.reinforce_decay = reinforce_decay self.max_fact_len = max_fact_len self.lambda1_reinf = lambda1_rein self.lambda2_reinf = lambda2_rein self.use_reinforce_baseline = use_reinforce_baseline self.use_reinforce = use_reinforce self.use_gru_inp_rep = use_gru_inp_rep self.use_simple_rnn_inp_rep = use_simple_rnn_inp_rep self.use_q_mask = use_q_mask self.use_inp_content = use_inp_content self.rnd_indxs = rnd_indxs self.use_layer_norm = use_layer_norm self.recurrent_dropout_prob = recurrent_dropout_prob self.n_reading_steps = n_reading_steps self.sub_mb_size = sub_mb_size self.predict_bow_out = predict_bow_out self.correlation_ws = correlation_ws self.smoothed_diff_weights = smoothed_diff_weights self.use_soft_att = use_soft_att self.use_hard_att_eval = use_hard_att_eval if anticorrelation and n_read_heads < 2: raise ValueError("Anti-correlation of the attention weight" " do not support the multiple read heads.") self.anticorrelation = anticorrelation if self.predict_bow_out: if len(inps) <= 4: raise ValueError( "The number of inputs should be greater than 4.") if l2_pen: self.l2_penalizer = L2Penalty(self.l2_pen) #assert use_bow_input ^ use_gru_inp_rep ^ self.use_simple_rnn_inp_rep, \ # "You should either use GRU or BOW input." 
self.renormalization_scale = renormalization_scale self.w2v_embed_scale = w2v_embed_scale self.baseline_reg = baseline_reg self.inps = inps self.erase_activ = erase_activ self.use_ff_controller = use_ff_controller self.content_activ = content_activ self.use_bow_cost_mask = use_bow_cost_mask self.ntm_outs = None self.theano_function_mode = theano_function_mode self.n_in = n_in self.dropout = dropout self.wpenalty = wpenalty self.noise = noise self.bow_size = bow_size self.use_last_hidden_state = use_last_hidden_state self.use_loc_based_addressing = use_loc_based_addressing self.train_profile = train_profile self.use_nogru_mem2q = use_nogru_mem2q self.use_qmask = use_qmask self.permute_order = permute_order self.use_batch_norm = use_batch_norm # Use this if you have a ff-controller because otherwise this is not effective: self.n_layers = n_layers if self.use_reinforce: reinforceCls = REINFORCE if not self.use_reinforce_baseline: reinforceCls = REINFORCEBaselineExt self.Reinforce = reinforceCls(lambda1_reg=self.lambda1_reinf, lambda2_reg=self.lambda2_reinf, decay=self.reinforce_decay) self.ReaderReinforce = \ ReinforcePenalty(reinf_level=self.lambda1_reinf, maxent_level=self.lambda2_reinf, use_reinforce_baseline=self.use_reinforce_baseline) self.dice_val = None if use_dice_val: self.dice_val = sharedX(1.) self.use_dice_val = use_dice_val if bow_size is None: raise ValueError("bow_size should be specified.") if name is None: raise ValueError("name should not be empty.") self.n_hids = n_hids self.mem_size = mem_size self.use_deepout = use_deepout self.mem_nel = mem_nel self.n_out = n_out self.use_out_mem = use_out_mem self.use_multiscale_shifts = use_multiscale_shifts self.address_size = address_size self.n_read_heads = n_read_heads self.n_write_heads = n_write_heads self.learn_h0 = learn_h0 self.use_adv_indexing = use_adv_indexing self.softmax = softmax self.use_bow_input = use_bow_input self.use_cost_mask = use_cost_mask self.deep_out_size = deep_out_size self.controller_activ = controller_activ self.mem_gater_activ = mem_gater_activ self.weight_initializer = weight_initializer self.bias_initializer = bias_initializer if batch_size: self.batch_size = batch_size else: self.batch_size = inps[0].shape[1] #assert self.batch_size >= self.sub_mb_size, ("Minibatch size should be " # " greater than the sub minibatch size") self.comp_grad_fn = None self.name = name self.use_noise = use_noise self.train_timer = Timer("Training function") self.gradfn_timer = Timer("Gradient function") self.grads_timer = Timer("Computing the grads") self.reset() self.seq_len = TT.iscalar('seq_len') self.__convert_inps_to_list() if debug: if self.use_gru_inp_rep or self.use_bow_input: self.seq_len.tag.test_value = self.inps[ 0].tag.test_value.shape[1] else: self.seq_len.tag.test_value = self.inps[ 0].tag.test_value.shape[0] self.learning_rule = learning_rule if self.predict_bow_out: self.bow_out_w = TT.fscalar("bow_out_w") if debug: self.bow_out_w.tag.test_value = np.float32(1.0) else: self.bow_out_w = 0 def __convert_inps_to_list(self): if isinstance(self.inps, list): X = self.inps[0] y = self.inps[1] if self.use_mask: mask = self.inps[2] cmask = None inps = [X, y] if self.use_mask: inps += [mask] if self.use_cost_mask: cmask = self.inps[3] inps += [cmask] if self.correlation_ws or self.use_qmask: self.qmask = self.inps[5] inps += [self.qmask] if self.predict_bow_out: bow_out = self.inps[4] inps += [bow_out] self.inps = inps else: X = self.inps['X'] y = self.inps['y'] mask = self.inps['mask'] cmask = None inps = [X, y] 
if self.use_mask: inps += [mask] if self.use_cost_mask: cmask = self.inps['cmask'] inps += [cmask] if self.correlation_ws or self.use_qmask: self.qmask = self.inps['qmask'] inps += [self.qmask] if self.predict_bow_out: bow_out = self.inps['bow_out'] inps += [bow_out] self.inps = inps def reset(self): self.params = Parameters() if self.w2v_embed_path and (self.use_bow_input or self.use_gru_inp_rep): self.w2v_embeds = pkl.load(open(self.w2v_embed_path, "rb")) if self.glove_embed_path: logger.info("Loading the GLOVE embeddings...") self.glove_embeds = pkl.load(open(self.glove_embed_path, "rb")) self.reg = 0 self.ntm = None self.merge_layer = None self.out_layer = None self.bow_layer = None self.baseline_out = None self.bow_pred_out = None self.gru_fact_layer_inps = None self.gru_fact_layer = None self.rnn_fact_layer_inps = None self.rnn_fact_layer = None self.bow_out_layer = None self.inp_proj_layer = None self.batch_norm_layer = None self.children = [] self.trainpartitioner = None self.known_grads = OrderedDict({}) self.updates = OrderedDict({}) def __init_to_embeds(self, layer, params, embeds, scale=0.42): logger.info("Initializing to word2vec embeddings.") if not isinstance(params, list): params = [params] for pp in params: pv = pp.get_value() for i, v in embeds.items(): pv[i] = scale * v layer.params[pp.name] = pv def __init_glove_embeds(self, layer, params, embeds): logger.info("Initializing to GLOVE embeddings.") if not isinstance(params, list): params = [params] glove_embs = self.emb_scale * embeds.astype("float32") mean = glove_embs.mean() std = glove_embs.std() token_embs = np.random.normal(loc=mean, scale=std, size=(2, 300)) token_embs = np.concatenate([token_embs, glove_embs], axis=0) for pp in params: self.exclude_params[pp.name] = 1 layer.params[pp.name] = token_embs.astype( "float32") #, name=pp.name) def build_model(self, use_noise=False, mdl_name=None): if self.use_ff_controller: cls = NTMFFController else: cls = NTM if use_noise: mem_gater_activ = lambda x: self.mem_gater_activ( x, use_noise=use_noise) if self.use_bow_input and not self.bow_layer and not self.use_gru_inp_rep: self.bow_layer = BOWLayer( n_in=self.n_in, n_out=self.bow_size, seq_len=self.max_fact_len, weight_initializer=self.weight_initializer, bias_initializer=self.bias_initializer, use_average=False, name=self.pname("bow_layer")) if self.w2v_embed_path: fparams = self.bow_layer.params.lfilterby("weight") self.__init_to_embeds(self.bow_layer, fparams, self.w2v_embeds, scale=self.w2v_embed_scale) elif self.use_gru_inp_rep: if not self.gru_fact_layer_inps: low_cnames = [ "low_reset_below", "low_gater_below", "low_state_below" ] lnfout = len(low_cnames) self.low_cnames = map(lambda x: self.pname(x), low_cnames) self.gru_fact_layer_inps = ForkLayer( n_in=self.n_in, n_outs=tuple([self.bow_size for i in xrange(lnfout)]), weight_initializer=self.weight_initializer, use_bias=False, names=self.low_cnames) if self.w2v_embed_path: fparams = self.gru_fact_layer_inps.params.lfilterby( "weight") self.__init_to_embeds(self.gru_fact_layer_inps, fparams, self.w2v_embeds) if not self.gru_fact_layer: self.gru_fact_layer = GRULayer( n_in=self.bow_size, n_out=self.bow_size, seq_len=self.max_fact_len, weight_initializer=self.weight_initializer, bias_initializer=self.bias_initializer, activ=Tanh, learn_init_state=self.learn_h0, name=self.pname("gru_fact_layer")) elif self.use_simple_rnn_inp_rep: if not self.rnn_fact_layer_inps: self.rnn_fact_layer_inps = AffineLayer( n_in=self.n_in, n_out=self.bow_size, 
weight_initializer=self.weight_initializer, bias_initializer=self.bias_initializer, name=self.pname("rnn_fact_layer_inps")) if self.w2v_embed_path: fparams = self.rnn_fact_layer_inps.params.lfilterby( "weight") self.__init_to_embeds(self.rnn_fact_layer_inps, fparams, self.w2v_embeds) if not self.rnn_fact_layer: self.rnn_fact_layer = RNNLayer( n_in=self.n_in, n_out=self.bow_size, seq_len=self.max_fact_len, weight_initializer=self.weight_initializer, bias_initializer=self.bias_initializer, activ=Rect, learn_init_state=self.learn_h0, name=self.pname("rnn_fact_layer")) else: if not self.inp_proj_layer: self.inp_proj_layer = AffineLayer( n_in=self.n_in, n_out=self.bow_size, weight_initializer=self.weight_initializer, use_bias=False, bias_initializer=self.bias_initializer, name=self.pname("ntm_inp_proj_layer")) if self.glove_embed_path: fparams = self.inp_proj_layer.params.lfilterby("weight") self.__init_glove_embeds(self.inp_proj_layer, fparams, self.glove_embeds) if self.predict_bow_out and not self.bow_out_layer: self.bow_out_layer = AffineLayer( n_in=self.n_hids, n_out=self.n_out, weight_initializer=self.weight_initializer, noise=self.noise, wpenalty=self.wpenalty, bias_initializer=self.bias_initializer, name=self.pname("bow_out_layer")) if self.use_batch_norm and not self.batch_norm_layer: self.batch_norm_layer = BatchNormLayer( n_in=self.bow_size, n_out=self.bow_size, name=self.pname("batch_norm_inp")) if not self.ntm: inp = self.inps[0] bs = inp.shape[1] if inp.ndim == 4: bs = inp.shape[2] self.ntm = cls( n_in=self.bow_size, n_hids=self.n_hids, l1_pen=self.l1_pen, learn_h0=self.learn_h0, hybrid_att=self.hybrid_att, smoothed_diff_weights=self.smoothed_diff_weights, use_layer_norm=self.use_layer_norm, recurrent_dropout_prob=self.recurrent_dropout_prob, use_bow_input=self.use_bow_input, use_loc_based_addressing=self.use_loc_based_addressing, use_reinforce=self.use_reinforce, erase_activ=self.erase_activ, content_activ=self.content_activ, mem_nel=self.mem_nel, address_size=self.address_size, use_context=self.use_context, n_read_heads=self.n_read_heads, use_soft_att=self.use_soft_att, use_hard_att_eval=self.use_hard_att_eval, use_inp_content=self.use_inp_content, n_write_heads=self.n_write_heads, dice_val=self.dice_val, mem_size=self.mem_size, use_nogru_mem2q=self.use_nogru_mem2q, use_gru_inp_rep=self.use_gru_inp_rep, weight_initializer=self.weight_initializer, use_adv_indexing=self.use_adv_indexing, wpenalty=self.wpenalty, noise=self.noise, n_layers=self.n_layers, bias_initializer=self.bias_initializer, use_quad_interactions=self.use_gate_quad_interactions, controller_activ=self.controller_activ, mem_gater_activ=self.mem_gater_activ, batch_size=self.batch_size if self.batch_size else None, use_multiscale_shifts=self.use_multiscale_shifts, n_reading_steps=self.n_reading_steps, seq_len=self.seq_len, name=self.pname("ntm"), use_noise=use_noise) if not self.merge_layer and self.use_deepout: self.merge_layer = MergeLayer( n_ins=[self.n_hids, self.mem_size], n_out=self.deep_out_size, weight_initializer=self.weight_initializer, bias_initializer=self.bias_initializer, names=[self.pname("deep_controller"), self.pname("deep_mem")]) if self.use_deepout: out_layer_in = self.deep_out_size else: out_layer_in = self.n_hids if self.use_out_mem: self.out_mem = AffineLayer( n_in=self.mem_size + self.address_size, n_out=self.n_out, weight_initializer=self.weight_initializer, wpenalty=self.wpenalty, noise=self.noise, bias_initializer=self.bias_initializer, name=self.pname("out_mem")) self.out_scaler = 
AffineLayer( n_in=self.n_hids, n_out=1, weight_initializer=self.weight_initializer, wpenalty=self.wpenalty, noise=self.noise, bias_initializer=self.bias_initializer, name=self.pname("out_scaler")) if not self.out_layer: self.out_layer = AffineLayer( n_in=out_layer_in, n_out=self.n_out, wpenalty=self.wpenalty, noise=self.noise, weight_initializer=self.weight_initializer, bias_initializer=self.bias_initializer, name=self.pname("out")) if self.ntm.updates: self.updates.update(self.ntm.updates) if not self.use_reinforce_baseline and self.use_reinforce: self.baseline_out = AffineLayer( n_in=self.n_hids, n_out=1, weight_initializer=self.weight_initializer, bias_initializer=self.bias_initializer, init_bias_val=1e-3, name=self.pname("baseline_out")) if not self.children: self.children.append(self.ntm) if self.use_deepout and self.merge_layer: self.children.append(self.merge_layer) self.children.append(self.out_layer) if self.use_out_mem: self.children.extend([self.out_mem, self.out_scaler]) if self.use_bow_input and self.bow_layer and not self.use_gru_inp_rep: self.children.append(self.bow_layer) elif self.use_gru_inp_rep: self.children.extend( [self.gru_fact_layer_inps, self.gru_fact_layer]) elif self.use_simple_rnn_inp_rep: self.children.extend( [self.rnn_fact_layer_inps, self.rnn_fact_layer]) else: self.children.append(self.inp_proj_layer) if self.predict_bow_out and self.bow_out_layer: self.children.append(self.bow_out_layer) if self.use_reinforce and not self.use_reinforce_baseline: self.children.append(self.baseline_out) if self.use_batch_norm: self.children.append(self.batch_norm_layer) self.merge_params() if self.renormalization_scale: self.params.renormalize_params( nscale=self.renormalization_scale, exclude_params=self.exclude_params) if mdl_name: if os.path.exists(mdl_name): logger.info("Reloading model from %s." 
% mdl_name) self.params.load(mdl_name) [child.use_params(self.params) for child in self.children] else: warnings.warn( "The model file does not exist and could not load it.") if self.trainpartitioner is None and self.sub_mb_size: self.trainpartitioner = MinibatchGradPartitioner( self.params, self.sub_mb_size, self.batch_size, seq_len=self.seq_len) def get_cost(self, use_noise=False, valid_only=False, mdl_name=None): probs, _ = self.fprop(use_noise=use_noise, mdl_name=mdl_name) if isinstance(self.inps, list): X = self.inps[0] y = self.inps[1] if self.use_mask: mask = self.inps[2] cmask = None if self.use_cost_mask: cmask = self.inps[3] else: X = self.inps['x'] y = self.inps['y'] mask = self.inps['mask'] cmask = None if self.use_cost_mask: cmask = self.inps['cmask'] if self.l1_pen and self.l1_pen > 0 and not valid_only: self.reg += self.ntm.reg if self.l2_pen and not valid_only: self.l2_penalizer.penalize_layer_weights(self.out_layer) self.l2_penalizer.penalize_params( self.ntm.params.filterby("init_state").values[0]) self.l2_penalizer.penalize_params( self.ntm.controller.params.filterby("weight").values[0]) if not self.use_ff_controller: self.l2_penalizer.penalize_params( self.ntm.controller.params.filterby( "state_before_ht").values[0]) self.reg += self.l2_penalizer.get_penalty_level() if not self.softmax: self.cost = kl(y, probs, cost_mask=cmask) self.errors = 0 else: if not self.use_last_hidden_state: self.cost, self.errors = nll(y, probs, cost_mask=cmask) else: self.cost, self.errors = nll(y, probs) if self.cost.ndim == 2: self.cost_mon = self.cost.sum(0).mean() if valid_only: self.cost = self.cost_mon else: self.cost_mon = self.cost.mean() if valid_only: self.cost = self.cost_mon bow_cost = 0 if not valid_only: bow_cost_shifted = 0 if self.predict_bow_out and self.bow_pred_out and self.bow_out_layer: bow_target = self.inps[-1] bcmask = mask * TT.cast(TT.eq(cmask, 0), "float32") sum_tru_time = False cost_matrix = True if self.use_reinforce and \ not sum_tru_time else False batch_vec = True if self.use_reinforce else False bow_cost = self.bow_out_w * kl(bow_target, self.bow_pred_out, batch_vec=batch_vec, sum_tru_time=sum_tru_time, cost_matrix=cost_matrix, cost_mask=bcmask, normalize_by_outsize=True) if cost_matrix: bow_cost_shifted = TT.zeros_like(bow_cost) bow_cost_shifted = TT.set_subtensor(bow_cost_shifted[1:], \ bow_cost[:-1]) else: bow_cost_shifted = bow_cost self.center = 0 self.cost_std = 1 if self.use_reinforce and self.use_reinforce_baseline: self.cost_mon = self.cost if not self.use_mask: mask = None self.updates, self.known_grads, self.baseline, cost_std, \ self.write_policy, maxent_level = self.Reinforce(probs=self.write_weights, samples=self.w_samples, updates=self.updates, cost=(1 - self.bow_out_w) * self.cost + bow_cost_shifted, mask=mask) maxent_level = self.lambda2_reinf elif self.use_reinforce: if "float" in X.dtype: self.baseline = self.baseline_out.fprop( self.ntm_outs[0]).reshape( (X.shape[0], X.shape[1])).dimshuffle(0, 1, 'x') else: self.baseline = self.baseline_out.fprop( self.ntm_outs[0]).reshape((X.shape[1], X.shape[2], -1)) mask_ = None mask = None if self.use_mask: if mask: mask_ = mask if mask.ndim == 2: mask_ = mask.dimshuffle(0, 1, 'x') self.baseline = mask_ * self.baseline if not self.softmax: self.cost = kl(y, probs, cost_mask=cmask, cost_matrix=True) self.errors = 0 else: self.cost, self.errors = nll(y, probs, cost_mask=cmask, cost_matrix=True) self.updates, self.known_grads, self.center, self.cost_std, \ self.write_policy, maxent_level = \ 
self.Reinforce(probs=self.write_weights, samples=self.w_samples, baseline=self.baseline, updates=self.updates, cost=(1 - self.bow_out_w) * self.cost + \ bow_cost_shifted, mask=mask) if self.cost.ndim == 2: hcost = self.cost.sum(0).dimshuffle('x', 0, 'x') else: hcost = self.cost.dimshuffle(0, 'x', 'x') base_reg = huber_loss(y_hat=self.baseline, target=block_gradient(hcost), center=block_gradient(self.center), std=block_gradient(self.cost_std)) if self.cost.ndim == 2: self.cost_mon = self.cost.sum(0).mean() else: self.cost_mon = self.cost.mean() if mask_: base_reg = mask_ * base_reg self.base_reg = self.baseline_reg * base_reg.sum(0).mean() self.reg += self.base_reg if self.use_reinforce: self.ReaderReinforce.maxent_level = maxent_level self.read_constraint, self.read_policy = \ self.ReaderReinforce(baseline=self.baseline, cost=self.cost + bow_cost, probs=self.read_weights, samples=self.r_samples, mask=mask, center=self.center, cost_std=self.cost_std) if self.cost.ndim == 2: self.cost = self.cost.sum(0).mean() else: self.cost = self.cost.mean() if bow_cost != 0 and bow_cost.ndim >= 1 and bow_cost != 0: bow_cost = bow_cost.sum(0).mean() if self.predict_bow_out and bow_cost: self.cost = (1 - self.bow_out_w) * self.cost + bow_cost if self.use_reinforce and self.read_constraint: self.cost += self.read_constraint if self.reg: self.cost += self.reg return self.cost, self.errors, bow_cost def get_inspect_fn(self, mdl_name=None): logger.info("Compiling inspect function.") probs, ntm_outs = self.fprop(use_noise=False, mdl_name=mdl_name) updates = OrderedDict({}) if self.ntm.updates and self.use_reinforce: updates.update(self.ntm.updates) inspect_fn = theano.function( [self.inps[0], self.inps[2], self.inps[3], self.seq_len], ntm_outs + [probs], updates=self.ntm.updates, name=self.pname("inspect_fn")) return inspect_fn def get_valid_fn(self, mdl_name=None): logger.info("Compiling validation function.") if self.predict_bow_out or self.bow_out_layer: if self.inps[-1].name == "bow_out": inps = self.inps[:-1] else: inps = self.inps if self.softmax: cost, errors, _ = self.get_cost(use_noise=True, valid_only=True, mdl_name=mdl_name) if self.ntm.updates: self.updates.update(self.ntm.updates) valid_fn = theano.function(inps + [self.seq_len], [cost, errors], updates=self.ntm.updates, on_unused_input='warn', name=self.pname("valid_fn")) else: cost, _, _ = self.get_cost(use_noise=False, mdl_name=mdl_name) if self.ntm.updates: self.updates.update(self.ntm.updates) valid_fn = theano.function(inps + [self.seq_len], [cost], updates=self.ntm.updates, on_unused_input='warn', name=self.pname("valid_fn")) return valid_fn def add_noise_to_params(self): for k, v in self.params.__dict__['params'].iteritems(): v_np = v.get_value(borrow=True) noise = global_rng.normal(0, 0.05, v_np.shape) self.params[k] = v_np + noise def get_train_fn(self, lr=None, mdl_name=None): if lr is None: lr = self.eps if self.softmax: cost, errors, bow_cost = self.get_cost(use_noise=True, mdl_name=mdl_name) else: cost, _, _ = self.get_cost(use_noise=True, mdl_name=mdl_name) params = self.params.values logger.info("Computing the gradients.") self.grads_timer.start() inps = self.inps if self.predict_bow_out: inps = self.inps + [self.bow_out_w] if not self.learn_embeds: params.pop(0) grads = safe_grad(cost, params, known_grads=self.known_grads) self.grads_timer.stop() logger.info(self.grads_timer) logger.info("Compiling grad fn.") self.gradfn_timer.start() if self.sub_mb_size: if self.sub_mb_size != self.batch_size: self.comp_grad_fn, grads = 
self.trainpartitioner.get_compute_grad_fn( grads, self.ntm.updates, inps) gnorm = sum(grad.norm(2) for _, grad in grads.iteritems()) updates, norm_up, param_norm = self.learning_rule.get_updates( learning_rate=lr, grads=grads) self.gradfn_timer.stop() logger.info(self.gradfn_timer) if self.updates: self.updates.update(updates) else: self.updates = updates warnings.warn("WARNING: Updates are empty.") logger.info("Compiling the training function.") self.train_timer.start() if hasattr(self, "cost_mon"): outs = [self.cost_mon, gnorm, norm_up, param_norm] else: outs = [cost, gnorm, norm_up, param_norm] if self.softmax: outs += [self.errors] if self.predict_bow_out: outs += [bow_cost] if self.use_reinforce: outs += [self.read_constraint, self.baseline, self.read_policy, \ self.write_policy] if not self.use_reinforce_baseline: outs += [self.center, self.cost_std, self.base_reg] if self.use_batch_norm: self.updates.update(self.batch_norm_layer.updates) train_fn = theano.function(inps + [self.seq_len], outs, updates=self.updates, mode=self.theano_function_mode, name=self.pname("train_fn")) self.train_timer.stop() logger.info(self.train_timer) if self.train_profile: import sys sys.exit(-1) return train_fn def fprop(self, inps=None, leak_rate=0.05, use_noise=False, mdl_name=None): self.build_model(use_noise=use_noise, mdl_name=mdl_name) self.ntm.evaluation_mode = use_noise if not inps: inps = self.inps # First two are X and targets # assert (2 + sum([use_mask, use_cmask])) + 1 >= len(inps), \ # "inputs have illegal shape." cmask = None mask = None if isinstance(inps, list): X = inps[0] y = inps[1] if self.use_mask: mask = inps[2] if self.use_cost_mask: cmask = inps[3] else: X = inps['X'] y = inps['y'] if self.use_mask: mask = inps['mask'] if self.use_cost_mask: cmask = inps['cmask'] if self.use_cost_mask: if cmask is not None: if self.use_bow_cost_mask: if mask.ndim == cmask.ndim: m = (mask * TT.eq(cmask, 0)).reshape( (cmask.shape[0] * cmask.shape[1], -1)) else: m = (mask.dimshuffle(0, 1, 'x') * TT.eq(cmask, 0))[:, :, 0].reshape( (cmask.shape[0] * cmask.shape[1], -1)) else: m = mask else: raise ValueError("Mask for the answers should not be empty.") if X.ndim == 2 and y.ndim == 1: # For sequential MNIST. 
if self.permute_order: X = X.dimshuffle(1, 0) idxs = self.rnd_indxs X = X[idxs] inp_shp = (X.shape[0], X.shape[1], -1) else: inp_shp = (X.shape[1], X.shape[2], -1) #import pdb;pdb.set_trace() self.ntm_in = None if self.use_bow_input and not self.use_gru_inp_rep and not self.use_simple_rnn_inp_rep: bow_out = self.bow_layer.fprop(X, amask=m, deterministic=not use_noise) bow_out = bow_out.reshape((X.shape[1], X.shape[2], -1)) self.ntm_in = bow_out elif self.use_gru_inp_rep: m0 = as_floatX(TT.gt(X, 0)) if self.use_mask and self.use_cost_mask: if cmask is not None: m1 = mask * TT.eq(cmask, 0) else: raise ValueError( "Mask for the answers should not be empty.") low_inp_shp = (X.shape[0], X.shape[1] * X.shape[2], -1) Xr = X.reshape(low_inp_shp) grufact_inps = self.gru_fact_layer_inps.fprop(Xr) low_reset_below = grufact_inps.values()[0].reshape(low_inp_shp) low_gater_below = grufact_inps.values()[1].reshape(low_inp_shp) low_state_below = grufact_inps.values()[2].reshape(low_inp_shp) linps = [low_reset_below, low_gater_below, low_state_below] m0_part = TT.cast( m0.sum(0).reshape( (X.shape[1], X.shape[2])).dimshuffle(0, 1, 'x'), 'float32') m0_part = TT.switch(TT.eq(m0_part, as_floatX(0)), as_floatX(1), m0_part) h0 = self.gru_fact_layer.fprop(inps=linps, mask=m0, batch_size=self.batch_size) self.ntm_in = m1.dimshuffle(0, 1, 'x') * ((m0.dimshuffle(0, 1, 2, 'x') * h0.reshape((X.shape[0], X.shape[1], X.shape[2], -1))).sum(0) \ / m0_part).reshape(inp_shp) elif self.use_simple_rnn_inp_rep: m0 = as_floatX(TT.gt(X, 0)) if cmask is not None: m1 = mask * TT.eq(cmask, 0) else: raise ValueError("Mask for the answers should not be empty.") low_inp_shp = (X.shape[0], X.shape[1] * X.shape[2], -1) Xr = X.reshape(low_inp_shp) rnnfact_inps = self.rnn_fact_layer_inps.fprop(Xr).reshape( low_inp_shp) m0 = m0.reshape(low_inp_shp) h0 = self.rnn_fact_layer.fprop(inps=rnnfact_inps, mask=m0, batch_size=self.batch_size) m0_part = TT.cast( m0.sum(0).reshape( (X.shape[1], X.shape[2])).dimshuffle(0, 1, 'x'), 'float32') m0_part = TT.switch(m0_part == 0, as_floatX(1), m0_part) self.ntm_in = m1.dimshuffle(0, 1, 'x') * (h0.reshape((X.shape[0], X.shape[1], X.shape[2], -1)).sum(0) / \ m0_part).reshape(inp_shp) else: X_proj = self.inp_proj_layer.fprop(X) if not self.learn_embeds: X_proj = block_gradient(X_proj) if self.use_batch_norm: X_proj = self.batch_norm_layer.fprop(X_proj, inference=not use_noise) self.ntm_in = X_proj context = None if self.use_context: if self.use_qmask: context = (self.qmask.dimshuffle(0, 1, 'x') * self.ntm_in).sum(0) else: m1_part = m1.sum(0).dimshuffle(0, 'x') context = self.ntm_in.sum(0) / m1_part self.ntm_outs = self.ntm.fprop(self.ntm_in, mask=mask, cmask=cmask, context=context, batch_size=self.batch_size, use_mask=self.use_mask, use_noise=not use_noise) h, m_read = self.ntm_outs[0], self.ntm_outs[2] if self.use_reinforce: self.w_samples, self.r_samples = self.ntm_outs[-2], self.ntm_outs[ -1] if self.smoothed_diff_weights: idx = -6 else: idx = -4 self.write_weights, self.read_weights = self.ntm_outs[idx], \ self.ntm_outs[idx+1] else: self.write_weights, self.read_weights = self.ntm_outs[ 3], self.ntm_outs[4] if self.anticorrelation: acorr = AntiCorrelationConstraint(level=self.anticorrelation) rw1 = self.read_weights[:, 0] rw2 = self.read_weights[:, 1] self.reg += acorr(rw1, rw2, mask=mask) if self.correlation_ws: logger.info("Applying the correlation constraint.") corr_cons = CorrelationConstraint(level=self.correlation_ws) self.reg += corr_cons(self.read_weights, self.write_weights, mask, self.qmask) if 
        if self.use_last_hidden_state:
            h = h.reshape(inp_shp)
            h = h[-1]

        if self.use_deepout:
            merged_out = self.merge_layer.fprop([h, m_read])
            # The original referenced an undefined `leak_rate` here; 0.01 is
            # the leak used with Leaky_Rect elsewhere in this file.
            out_layer = Leaky_Rect(merged_out, 0.01)
            if self.dropout:
                dropOp = Dropout(dropout_prob=self.dropout)
                out_layer = dropOp(out_layer, deterministic=not use_noise)
            out_layer = self.out_layer.fprop(out_layer,
                                             deterministic=not use_noise)
        else:
            if self.use_out_mem:
                if self.dropout:
                    dropOp = Dropout(dropout_prob=self.dropout)
                    m_read = dropOp(m_read, deterministic=not use_noise)

                mem_out = self.out_mem.fprop(m_read,
                                             deterministic=not use_noise)
                mem_scaler = self.out_scaler.fprop(
                    h, deterministic=not use_noise).reshape(
                        (mem_out.shape[0],)).dimshuffle(0, 'x')
                h_out = self.out_layer.fprop(h, deterministic=not use_noise)
                # Gate the memory read into the output with a learned scaler.
                out_layer = h_out + mem_out * Sigmoid(mem_scaler)
            else:
                if self.dropout:
                    dropOp = Dropout(dropout_prob=self.dropout)
                    h = dropOp(h, deterministic=not use_noise)
                out_layer = self.out_layer.fprop(h,
                                                 deterministic=not use_noise)

        if self.predict_bow_out and self.bow_out_layer:
            logger.info("Using the bow output prediction.")
            self.bow_pred_out = Sigmoid(
                self.bow_out_layer.fprop(h, deterministic=not use_noise))

        if self.softmax:
            self.probs = Softmax(out_layer)
        else:
            self.probs = Sigmoid(out_layer)

        if self.ntm.updates:
            self.updates.update(self.ntm.updates)

        self.str_params(logger)
        self.h = h
        return self.probs, self.ntm_outs

    def __get_state__(self):
        return self.state

    def __set_state__(self, state):
        self.__dict__.update(state)
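# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the model code above). When the GRU or
# simple-RNN input representation is used, fprop() pools the per-word hidden
# states into one vector per fact with a mask-weighted average, clipping the
# token counts of all-padding facts to one so the division never hits zero --
# the role played by m0 and m0_part. The NumPy function below mirrors that
# computation under assumed shapes; all names here are hypothetical.
# ---------------------------------------------------------------------------
import numpy as np


def masked_mean_pool(h, token_mask):
    """h: (n_words, n_facts, batch, dim); token_mask: (n_words, n_facts, batch)."""
    counts = token_mask.sum(axis=0)[..., None]           # words per fact
    counts = np.where(counts == 0, 1.0, counts)          # guard all-padding facts
    return (token_mask[..., None] * h).sum(axis=0) / counts

# Example usage (7 words per fact, 5 facts, batch of 2, 4-dim states):
#   fact_vecs = masked_mean_pool(np.random.randn(7, 5, 2, 4),
#                                (np.random.rand(7, 5, 2) > 0.3).astype("float32"))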
class LSTMModel(Model): def __init__(self, n_in, n_hids, bow_size, n_out, inps=None, dropout=None, seq_len=None, learning_rule=None, weight_initializer=None, bias_initializer=None, learn_h0=False, deepout=None, activ=None, use_cost_mask=True, noise=False, use_hint_layer=False, use_average=False, theano_function_mode=None, use_positional_encoding=False, use_inv_cost_mask=False, batch_size=32, use_noise=False, name=None): self.n_in = n_in self.n_hids = n_hids self.n_out = n_out self.bow_size = bow_size self.inps = inps self.noise = noise self.seq_len = seq_len self.dropout = dropout self.use_cost_mask = use_cost_mask self.learning_rule = learning_rule self.bias_initializer = bias_initializer self.learn_h0 = learn_h0 self.use_average = use_average self.deepout = deepout self.batch_size = batch_size self.use_noise = use_noise self.train_timer = Timer("Training function") self.grads_timer = Timer("Computing the grads") self.theano_function_mode = theano_function_mode self.weight_initializer = weight_initializer self.bias_initializer = bias_initializer self.use_average = use_average self.use_positional_encoding = use_positional_encoding self.use_inv_cost_mask = use_inv_cost_mask self.eps = 1e-8 self.activ = activ self.out_layer_in = self.n_hids if bow_size is None: raise ValueError("bow_size should be specified.") if name is None: raise ValueError("name should not be empty.") self.reset() self.name = name def reset(self): self.children = [] self.params = Parameters() self.bow_layer = None self.lstm_layer = None self.out_layer = None self.bowup_layer = None self.hint_layer = None self.updates = OrderedDict({}) def build_model(self, use_noise=False, mdl_name=None): if not self.bow_layer: self.bow_layer = BOWLayer( n_in=self.n_in, n_out=self.bow_size, seq_len=12, weight_initializer=self.weight_initializer, bias_initializer=self.bias_initializer, use_average=False, name=self.pname("bow_layer")) if self.deepout: self.deepout_layer_qbow = AffineLayer( n_in=self.bow_size, n_out=self.deepout, weight_initializer=self.weight_initializer, bias_initializer=self.bias_initializer, name=self.pname("deepout_qbow")) self.deepout_layer_ht = AffineLayer( n_in=self.n_hids, n_out=self.deepout, weight_initializer=self.weight_initializer, bias_initializer=self.bias_initializer, name=self.pname("deepout_ht")) self.out_layer_in = self.deepout if not self.bowup_layer: cnames = ["forget_below", "input_below", "out_below", "cell_below"] nfout = len(cnames) self.cnames = map(lambda x: self.pname(x), cnames) self.bowup_layer = ForkLayer( n_in=self.bow_size, n_outs=tuple([self.n_hids for i in xrange(nfout)]), weight_initializer=self.weight_initializer, bias_initializer=self.bias_initializer, names=self.cnames) if not self.lstm_layer: self.lstm_layer = LSTMLayer( n_in=self.n_hids, n_out=self.n_hids, seq_len=self.seq_len, weight_initializer=self.weight_initializer, bias_initializer=self.bias_initializer, activ=self.activ, learn_init_state=self.learn_h0, name=self.pname("lstm_layer")) if not self.out_layer: self.out_layer = AffineLayer( n_in=self.out_layer_in, n_out=self.n_out, noise=self.noise, weight_initializer=self.weight_initializer, bias_initializer=self.bias_initializer, name=self.pname("ntm_out")) if not self.children: self.children.append(self.bowup_layer) self.children.append(self.bow_layer) self.children.append(self.lstm_layer) self.children.append(self.out_layer) if self.deepout: self.children.append(self.deepout_layer_qbow) self.children.append(self.deepout_layer_ht) self.merge_params() if mdl_name: 
logger.info("Reloading the model from %s. " % mdl_name) self.params.load(mdl_name) [child.use_params(self.params) for child in self.children] def get_cost(self, use_noise=False, mdl_name=None): probs, _ = self.fprop(use_noise=use_noise, mdl_name=mdl_name) y = self.inps[1] cmask = None if self.use_cost_mask: cmask = self.inps[3] self.cost, self.errors = nll(y, probs, cost_mask=cmask) return self.cost, self.errors def get_train_fn(self, lr=None, mdl_name=None): if lr is None: lr = self.eps cost, errors = self.get_cost(use_noise=self.use_noise, mdl_name=mdl_name) params = self.params.values logger.info("Computing the gradient graph.") self.grads_timer.start() grads = safe_grad(cost, params) gnorm = sum(grad.norm(2) for _, grad in grads.iteritems()) updates, norm_up, param_norm = self.learning_rule.get_updates( learning_rate=lr, grads=grads) self.grads_timer.stop() logger.info(self.grads_timer) if not self.updates: self.updates = self.updates.update(updates) outs = [self.cost, gnorm, norm_up, param_norm] outs += [self.errors] train_fn = theano.function(self.inps, outs, updates=updates, mode=self.theano_function_mode, name=self.pname("train_fn")) self.train_timer.stop() logger.info(self.train_timer) return train_fn def get_inspect_fn(self, mdl_name=None): logger.info("Compiling inspect function.") probs, h = self.fprop(use_noise=False, mdl_name=mdl_name) inspect_fn = theano.function([self.inps[0], self.inps[2]], [h, probs], name=self.pname("inspect_fn")) return inspect_fn def get_valid_fn(self, mdl_name=None): logger.info("Compiling validation function.") self.cost, self.errors = self.get_cost(use_noise=False, mdl_name=mdl_name) valid_fn = theano.function(self.inps, [self.cost, self.errors], name=self.pname("valid_fn")) return valid_fn def fprop(self, inps=None, use_mask=True, use_cmask=True, use_noise=False, mdl_name=None): self.build_model(use_noise=use_noise, mdl_name=mdl_name) if not inps: inps = self.inps X = inps[0] if use_mask: mask = inps[2] if use_cmask: cmask = inps[3] qmask = inps[4] assert (3 + sum([use_mask, use_cmask ])) == len(inps), "inputs have illegal shape." if cmask is not None: m = mask * TT.eq(cmask.reshape( (cmask.shape[0], cmask.shape[1])), 0) else: raise ValueError("Mask for the answers should not be empty.") bow_out = self.bow_layer.fprop(X, amask=m, qmask=qmask, deterministic=not use_noise) new_bow = TT.roll(bow_out, 1, axis=0) new_bow = TT.set_subtensor(new_bow[0], as_floatX(0)) bow_outs = self.bowup_layer.fprop(bow_out, deterministic=not use_noise) forget_below = bow_outs[self.cnames[0]].reshape( (X.shape[1], X.shape[2], -1)) input_below = bow_outs[self.cnames[1]].reshape( (X.shape[1], X.shape[2], -1)) output_below = bow_outs[self.cnames[2]].reshape( (X.shape[1], X.shape[2], -1)) cell_below = bow_outs[self.cnames[3]].reshape( (X.shape[1], X.shape[2], -1)) inps = [forget_below, input_below, output_below, cell_below] h, c = self.lstm_layer.fprop(inps=inps, mask=mask, batch_size=self.batch_size) if self.deepout: h_deepout = self.deepout_layer_ht.fprop(h) emb_deepout = self.deepout_layer_qbow.fprop(new_bow) z = Leaky_Rect(h_deepout + emb_deepout, 0.01) if self.dropout: dropOp = Dropout(dropout_prob=self.dropout) z = dropOp(z, deterministic=not use_noise) else: z = h if self.dropout: dropOp = Dropout(dropout_prob=self.dropout) z = dropOp(z, deterministic=not use_noise) out_layer = self.out_layer.fprop(z, deterministic=not use_noise) self.probs = Softmax(out_layer) return self.probs, h
class Controller(Layer): """ A Writer Layer. """ def __init__(self, n_hids=None, mem_size=None, weight_initializer=None, bias_initializer=None, recurrent_dropout_prob=-1, use_layer_norm=False, activ=None, name="ntm_controller"): if isinstance(activ, str) and activ is not None: self.activ = eval(activ) elif activ is not None: self.activ = activ else: self.activ = Tanh super(Controller, self).__init__() self.n_hids = n_hids self.mem_size = mem_size self.weight_initializer = weight_initializer self.bias_initializer = bias_initializer self.name = name self.recurrent_dropout_prob = recurrent_dropout_prob self.dropout = None if recurrent_dropout_prob > 0. and recurrent_dropout_prob < 1: logger.info("Using the dropout in the recurrent layers...") self.dropout = Dropout(dropout_prob=self.recurrent_dropout_prob) self.use_layer_norm = use_layer_norm self.init_params() def init_params(self): self.state_gater_before_proj = AffineLayer(n_in=self.n_hids, n_out=self.n_hids, weight_initializer=self.weight_initializer, bias_initializer=self.bias_initializer, name=self.pname("statebf_gater")) self.state_reset_before_proj = AffineLayer(n_in=self.n_hids, n_out=self.n_hids, weight_initializer=self.weight_initializer, bias_initializer=self.bias_initializer, name=self.pname("statebf_reset")) self.state_mem_before_proj = AffineLayer(n_in=self.mem_size, n_out=self.n_hids, weight_initializer=self.weight_initializer, bias_initializer=self.bias_initializer, use_bias=True, name=self.pname("membf_ht")) self.state_str_before_proj = AffineLayer(n_in=self.n_hids, n_out=self.n_hids, use_bias=False, weight_initializer=self.weight_initializer, bias_initializer=self.bias_initializer, name=self.pname("state_before_ht")) self.children = [self.state_gater_before_proj, self.state_reset_before_proj, self.state_mem_before_proj, self.state_str_before_proj] if self.use_layer_norm: logger.info("Applying layer norm on the layers...") self.reset_layer_norm_inp = LayerNormLayer(n_out=self.n_hids, name=self.pname("reset_lnorm_inp")) self.update_layer_norm_inp = LayerNormLayer(n_out=self.n_hids, name=self.pname("update_lnorm_inp")) self.ht_layer_norm_inp = LayerNormLayer(n_out=self.n_hids, name=self.pname("ht_lnorm_inp")) self.reset_layer_norm_bf = LayerNormLayer(n_out=self.n_hids, name=self.pname("reset_lnorm_bf")) self.update_layer_norm_bf = LayerNormLayer(n_out=self.n_hids, name=self.pname("update_lnorm_bf")) self.ht_layer_norm_bf = LayerNormLayer(n_out=self.n_hids, name=self.pname("ht_lnorm_bf")) self.mem_layer_norm_bf = LayerNormLayer(n_out=self.n_hids, name=self.pname("mem_lnorm_bf")) self.children += [self.reset_layer_norm_inp, self.update_layer_norm_inp, self.ht_layer_norm_inp, self.reset_layer_norm_bf, self.update_layer_norm_bf, self.ht_layer_norm_bf, self.mem_layer_norm_bf] self.merge_params() self.str_params() def fprop(self, state_before, mem_before, reset_below, gater_below, state_below, context=None, use_noise=None): state_reset = self.state_reset_before_proj.fprop(state_before) state_gater = self.state_gater_before_proj.fprop(state_before) membf_state = self.state_mem_before_proj.fprop(mem_before) if self.use_layer_norm: state_reset = self.reset_layer_norm_bf.fprop(state_reset) state_gater = self.update_layer_norm_bf.fprop(state_gater) membf_state = self.mem_layer_norm_bf.fprop(membf_state) reset_below = self.reset_layer_norm_inp.fprop(reset_below) gater_below = self.update_layer_norm_inp.fprop(gater_below) state_below = self.ht_layer_norm_inp.fprop(state_below) reset = Sigmoid(reset_below + state_reset) state_state = 
self.state_str_before_proj.fprop(reset * state_before)
        if self.use_layer_norm:
            state_state = self.ht_layer_norm_bf.fprop(state_state)

        gater = Sigmoid(gater_below + state_gater)

        # `context` is a symbolic variable; test against None explicitly
        # rather than relying on its truth value.
        if context is not None:
            h = self.activ(state_state + membf_state + state_below + context)
        else:
            h = self.activ(state_state + membf_state + state_below)

        if self.dropout:
            h = self.dropout(h, use_noise=use_noise)

        h_t = (1. - gater) * state_before + gater * h
        return h_t
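# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the model code above). Controller.fprop()
# is a GRU-style update conditioned on the memory read: a reset gate masks the
# previous state before the candidate is formed, and an update gate
# interpolates between the previous state and the candidate. A NumPy sketch of
# one step, with layer norm and dropout omitted and hypothetical parameter
# names:
# ---------------------------------------------------------------------------
import numpy as np


def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))


def controller_step(h_prev, m_read, reset_below, gater_below, state_below, p):
    """p is a dict of (hidden x hidden) and (mem x hidden) weight matrices."""
    r = sigmoid(reset_below + h_prev.dot(p["U_reset"]))
    z = sigmoid(gater_below + h_prev.dot(p["U_gater"]))
    h_tilde = np.tanh((r * h_prev).dot(p["U_state"]) +
                      m_read.dot(p["W_mem"]) + state_below)
    return (1.0 - z) * h_prev + z * h_tilde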
class FFController(Layer):
    """
    A feed-forward controller layer.
    """
    def __init__(self,
                 n_hids=None,
                 mem_size=None,
                 weight_initializer=None,
                 bias_initializer=None,
                 activ=None,
                 noisy=False,
                 n_layers=2,
                 name="ntm_controller"):

        if isinstance(activ, str) and activ is not None:
            self.activ = eval(activ)
        elif activ is not None:
            self.activ = activ
        else:
            self.activ = Tanh

        super(FFController, self).__init__()
        print "Number of layers is ", n_layers

        self.n_layers = n_layers
        self.n_hids = n_hids
        self.mem_size = mem_size
        self.weight_initializer = weight_initializer
        self.bias_initializer = bias_initializer
        self.name = name
        self.use_noise = 1.0
        self.noisy = noisy
        self.init_params()

    def init_params(self):
        self.additional_layers = []
        if self.n_layers > 1:
            for i in xrange(1, self.n_layers):
                self.additional_layers += [
                    AffineLayer(n_in=self.n_hids,
                                n_out=self.n_hids,
                                weight_initializer=self.weight_initializer,
                                bias_initializer=self.bias_initializer,
                                name=self.pname("ff_cont_proj_%d" % i))]

        if self.noisy:
            mpname = self.pname("ff_controller_p_vals")
            self.params[mpname] = np.random.uniform(
                -1.0, 1.0, (self.n_layers, self.n_hids)).astype("float32")
            self.pvals = self.params[mpname]

        self.mem_before_p = AffineLayer(n_in=self.mem_size,
                                        n_out=self.n_hids,
                                        weight_initializer=self.weight_initializer,
                                        bias_initializer=self.bias_initializer,
                                        name=self.pname("mem_before_p"))

        self.children = [self.mem_before_p] + self.additional_layers
        self.merge_params()
        self.str_params()

    def fprop(self, state_below, mem_before=None, context=None):
        mem_before_p = 0.
        # Both arguments are symbolic variables; test against None explicitly.
        if mem_before is not None:
            mem_before_p = self.mem_before_p.fprop(mem_before)

        if context is not None:
            z_t = state_below + mem_before_p + context
        else:
            z_t = state_below + mem_before_p

        if self.n_layers > 1:
            for i in xrange(1, self.n_layers):
                if self.noisy:
                    z_t = NTanhP(z_t, self.pvals[i - 1],
                                 use_noise=self.use_noise)
                else:
                    z_t = self.activ(z_t)
                z_t = self.additional_layers[i - 1].fprop(z_t)

        if self.noisy:
            h_t = NTanhP(z_t, self.pvals[self.n_layers - 1],
                         use_noise=self.use_noise)
        else:
            h_t = self.activ(z_t)

        return h_t
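# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the model code above). FFController.fprop()
# sums the projected input, the projected memory read, and an optional context
# vector, then pushes the result through (n_layers - 1) activation + affine
# stages and a final activation. The noisy NTanhP branch is left out here; the
# weight list below is a made-up stand-in for additional_layers.
# ---------------------------------------------------------------------------
import numpy as np


def ff_controller(state_below, mem_read, W_mem, hidden_weights,
                  activ=np.tanh, context=None):
    z = state_below + mem_read.dot(W_mem)
    if context is not None:
        z = z + context
    for W in hidden_weights:       # the extra projection layers, if n_layers > 1
        z = activ(z).dot(W)
    return activ(z)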
class WeaklySupervisedMemoryNet(Layer): """ An implementation of weakly supervised memory network paper. """ def __init__(self, n_in, n_out, bow_size, weight_initializer=None, use_index_jittering=False, bias_initializer=None, max_fact_len=12, max_seq_len=250, dropout=None, batch_size=None, learning_rule=None, share_inp_out_weights=False, n_steps=1, inps=None, use_noise=False, theano_function_mode=None, rng=None, name=None): self.n_in = n_in self.n_out = n_out self.bow_size = bow_size self.use_index_jittering = use_index_jittering self.weight_initializer = weight_initializer self.bias_initializer = bias_initializer self.share_inp_out_weights = share_inp_out_weights self.rng = rng self.inps = inps self.dropout = dropout self.batch_size = batch_size self.learning_rule = learning_rule self.theano_function_mode = theano_function_mode self.eps = 1e-7 self.max_fact_len = max_fact_len self.max_seq_len = max_seq_len self.n_steps = n_steps self.use_noise = use_noise self.name = name assert n_steps > 0, "Illegal value has been provided for n_steps." self.train_timer = Timer("Training function") self.grads_timer = Timer("Computing the grads") self.updates = {} def init_params(self, use_noise=False, mdl_name=None): if not hasattr(self, "children") or not self.children: self.children = [] self.inp_bow_layer = BOWLayer(n_in=self.n_in, n_out=self.bow_size, seq_len=self.max_fact_len, use_inv_cost_mask=False, weight_initializer=self.weight_initializer, bias_initializer=self.bias_initializer, use_average=False, name=self.pname("bow_layer")) self.inp_bow_layers = [self.inp_bow_layer] self.out_bow_layer = BOWLayer(n_in=self.n_in, n_out=self.bow_size, seq_len=self.max_fact_len, use_inv_cost_mask=False, weight_initializer=self.weight_initializer, bias_initializer=self.bias_initializer, use_average=False, name=self.pname("out_bow_layer")) self.out_bow_layers = [self.out_bow_layer] if not self.share_inp_out_weights: for i in xrange(1, self.n_steps): self.inp_bow_layers += [BOWLayer(n_in=self.n_in, n_out=self.bow_size, seq_len=self.max_fact_len, use_inv_cost_mask=False, weight_initializer=self.weight_initializer, bias_initializer=self.bias_initializer, use_average=False, name=self.pname("bow_layer_" + str(i)))] self.out_bow_layers += [BOWLayer(n_in=self.n_in, n_out=self.bow_size, use_inv_cost_mask=False, seq_len=self.max_fact_len, weight_initializer=self.weight_initializer, bias_initializer=self.bias_initializer, use_average=False, name=self.pname("out_bow_layer_" + str(i)))] self.q_embed = BOWLayer(n_in=self.n_in, n_out=self.bow_size, use_inv_cost_mask=False, seq_len=self.max_fact_len, weight_initializer=self.weight_initializer, bias_initializer=self.bias_initializer, use_average=False, name=self.pname("q_embed")) self.out_layer = AffineLayer(n_in=self.bow_size, n_out=self.n_out, weight_initializer=self.weight_initializer, bias_initializer=self.bias_initializer, name=self.pname("out_layer")) self.children.extend(self.inp_bow_layers) self.children.extend(self.out_bow_layers) self.children.append(self.out_layer) self.children.append(self.q_embed) self.merge_params() # These are the parameters for the temporal encoding thing: self.T_ins = [] self.T_outs = [] nsteps = 1 if self.share_inp_out_weights else self.n_steps #""" for i in xrange(nsteps): T_in = self.weight_initializer(self.max_seq_len, self.bow_size) self.params[self.pname("TE_in_%d" % i)] = T_in self.T_ins.append(self.params[self.pname("TE_in_%d" % i)]) T_out = self.weight_initializer(self.max_seq_len, self.bow_size) self.params[self.pname("TE_out_%d" % i)] 
= T_out self.T_outs.append(self.params[self.pname("TE_out_%d" % i)]) #""" if mdl_name: logger.info("Reloading model from %s." % mdl_name) self.params.load(mdl_name) [child.use_params(self.params) for child in self.children] def get_cost(self, use_noise=False, mdl_name=None): X = self.inps[0] q = self.inps[1] y = self.inps[2] mask = self.inps[3] cmask = None probs = self.fprop(X, q, cmask=cmask, mask=mask, use_noise=use_noise, mdl_name=mdl_name) self.cost, self.errors = nll(y, probs) return self.cost, self.errors def get_inspect_fn(self, mdl_name=None): logger.info("Compiling inspect function.") probs, ntm_outs = self.fprop(use_noise=False, mdl_name=mdl_name) inspect_fn = theano.function([self.inps[0], self.inps[1], self.inps[2], self.inps[3]], ntm_outs + [probs], on_unused_input='ignore', name=self.pname("inspect_fn")) return inspect_fn def get_valid_fn(self, mdl_name=None): logger.info("Compiling validation function.") self.cost, self.errors = self.get_cost(use_noise=False, mdl_name=mdl_name) valid_fn = theano.function(self.inps, [self.cost, self.errors], on_unused_input='ignore', name=self.pname("valid_fn")) return valid_fn def add_noise_to_params(self): for k, v in self.params.__dict__['params'].iteritems(): v_np = v.get_value(borrow=True) noise = global_rng.normal(0, 0.05, v_np.shape) self.params[k] = v_np + noise def get_train_fn(self, lr=None, mdl_name=None): if lr is None: lr = self.eps cost, errors = self.get_cost(use_noise=self.use_noise, mdl_name=mdl_name) params = self.params.values logger.info("Computing the gradients.") self.grads_timer.start() grads = safe_grad(cost, params) gnorm = sum(grad.norm(2) for _, grad in grads.iteritems()) updates, norm_up, param_norm = self.learning_rule.get_updates(learning_rate=lr, grads=grads) self.grads_timer.stop() logger.info(self.grads_timer) if not self.updates: self.updates = self.updates.update(updates) logger.info("Compiling the training function.") self.train_timer.start() self.updates = updates outs = [self.cost, gnorm, norm_up, param_norm] outs += [self.errors] train_fn = theano.function(self.inps, outs, updates=updates, mode=self.theano_function_mode, on_unused_input='ignore', name=self.pname("train_fn")) self.train_timer.stop() logger.info(self.train_timer) return train_fn def __get_bow_inps(self, x, q, mask=None, use_noise=False): inp_bow_outs, out_bow_outs = [], [] nsteps = 1 if self.share_inp_out_weights else self.n_steps for i in xrange(nsteps): inp_bow_outs.append(self.inp_bow_layers[i].fprop(x, amask=mask, deterministic=not use_noise)) out_bow_outs.append(self.out_bow_layers[i].fprop(x, amask=mask, deterministic=not use_noise)) return inp_bow_outs, out_bow_outs def dot_componentwise(self, x, u_t): if x.ndim == 3: u_t = u_t.dimshuffle('x', 0, 1) res = (x * u_t).sum(-1) return res def fprop(self, x, q, mask=None, qmask=None, cmask=None, use_noise=False, mdl_name=None): self.init_params(use_noise=use_noise, mdl_name=mdl_name) q_emb = self.q_embed.fprop(q, deterministic=not use_noise) amask = None if mask is not None and cmask is not None: amask = mask * TT.eq(cmask, 0) inp_bow_outs, out_bow_outs = self.__get_bow_inps(x, q, mask=amask, use_noise=use_noise) u_t = q_emb v_t = None if mask.ndim == 2 and \ inp_bow_outs[0].ndim == 3: mask = mask.dimshuffle(0, 1, 'x') for i in xrange(self.n_steps): if not self.share_inp_out_weights: inp_bow = mask * (inp_bow_outs[i] + self.T_ins[i].dimshuffle(0, 'x', 1)) out_bow = mask * (out_bow_outs[i] + self.T_outs[i].dimshuffle(0, 'x', 1)) else: inp_bow = mask * (inp_bow_outs[0] + 
self.T_ins[0].dimshuffle(0, 'x', 1))
                out_bow = mask * (out_bow_outs[0] +
                                  self.T_outs[0].dimshuffle(0, 'x', 1))

            if u_t.ndim == 2:
                u_t = u_t.dimshuffle(0, 1, 'x')

            sims = self.dot_componentwise(inp_bow, u_t)
            pre_soft = mask.dimshuffle(0, 1) * TT.exp(sims - sims.max(0))
            ps = pre_soft / pre_soft.sum(axis=0, keepdims=True)
            ps = ps.dimshuffle(0, 1, 'x')
            v_t = (out_bow * ps).sum(0)
            u_t = u_t.dimshuffle(0, 1) + v_t

        new_out = u_t
        pre_logit = self.out_layer.fprop(new_out)
        probs = Softmax(pre_logit)
        return probs
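# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the model code above). Each step of
# WeaklySupervisedMemoryNet.fprop() is one memory-network hop: dot the query
# state with every input memory, take a masked softmax over the facts, read
# the output memories with those weights, and add the read vector back into
# the query. A NumPy version of a single hop, assuming the temporal-encoding
# vectors have already been added to the memories; names are hypothetical.
# ---------------------------------------------------------------------------
import numpy as np


def memory_hop(u, M_in, M_out, fact_mask):
    """u: (batch, dim); M_in, M_out: (n_facts, batch, dim); fact_mask: (n_facts, batch)."""
    sims = (M_in * u[None, :, :]).sum(axis=-1)           # (n_facts, batch)
    sims -= sims.max(axis=0, keepdims=True)              # numerical stability
    p = fact_mask * np.exp(sims)
    p /= p.sum(axis=0, keepdims=True)                    # attention over facts
    o = (M_out * p[:, :, None]).sum(axis=0)              # read from output memory
    return u + o                                         # query for the next hop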