def call(self, x, state):
    """Advance the cell by one time step.

    Args:
        x: input for this step; its second dimension is read as the
            input width.
        state: pair ``(h, c)`` holding the previous hidden and cell
            values.

    Returns:
        ``(new_h, (new_h, new_c))`` -- the step output followed by the
        updated state pair.
    """
    with tf.variable_scope(type(self).__name__):
        prev_h, prev_c = state

        num_units = self.num_units
        input_width = x.get_shape().as_list()[1]
        gates_width = 4 * num_units

        input_kernel_init = aux.orthogonal_initializer(1.0)
        recurrent_kernel_init = aux.orthogonal_initializer(1.0)
        bias_init = tf.constant_initializer(0.0)

        W_xh = tf.get_variable('W_xh', [input_width, gates_width],
                               initializer=input_kernel_init,
                               dtype=tf.float32)
        W_hh = tf.get_variable('W_hh', [num_units, gates_width],
                               initializer=recurrent_kernel_init,
                               dtype=tf.float32)
        bias = tf.get_variable('bias', [gates_width],
                               initializer=bias_init,
                               dtype=tf.float32)

        # Stack inputs and kernels so one matmul covers both projections
        # (the original notes this is done for speed).
        stacked_inputs = tf.concat(axis=1, values=[x, prev_h])
        stacked_kernel = tf.concat(axis=0, values=[W_xh, W_hh])
        gates = tf.matmul(stacked_inputs, stacked_kernel) + bias
        gates = aux.layer_norm_all(gates, 4, num_units, 'ln')

        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
        i, j, f, o = tf.split(axis=1, num_or_size_splits=4, value=gates)

        kept_memory = prev_c * tf.sigmoid(f + self.f_bias)
        written_memory = tf.sigmoid(i) * tf.tanh(j)
        new_c = kept_memory + written_memory
        new_h = tf.tanh(aux.layer_norm(new_c, 'ln_c')) * tf.sigmoid(o)

        if self.use_zoneout:
            new_h, new_c = aux.zoneout(new_h, new_c, prev_h, prev_c,
                                       self.zoneout_keep_h,
                                       self.zoneout_keep_c,
                                       self.is_training)

    return new_h, (new_h, new_c)
def __init__(self,
             hidden_size,
             activation=None,
             reuse=None,
             kernel_initializer=None,
             bias_initializer=None,
             T_norm=None,
             eps=1e-12,
             use_zoneout=False,
             zoneout_keep_h=0.9,
             use_layer_norm=False,
             is_training=False,
             lambda_pow=0):
    """Record the hyper-parameters of the Associative RUM cell.

    Args:
        hidden_size: number of neurons in the hidden state.
        activation: activation kept on the cell; ``relu`` is substituted
            when a falsy value is passed.
        reuse: forwarded to the parent constructor as ``_reuse``.
        kernel_initializer: kernel initializer kept on the cell; falls
            back to ``aux.orthogonal_initializer(1.0)`` when falsy.
        bias_initializer: bias initializer, stored unchanged (``None``
            stays ``None``).
        T_norm: norm for time normalization, `eta` in the paper.
        eps: the cutoff for the normalizations.
        use_zoneout: zoneout, True or False.
        zoneout_keep_h: zoneout setting for the hidden state --
            presumably a keep probability; confirm where it is consumed.
        use_layer_norm: normalization switch, True or False.
            NOTE(review): the previous docstring called this "batch
            normalization" although the name says layer norm -- confirm.
        is_training: marker for the zoneout.
        lambda_pow: the power for the associative memory (an integer).
    """
    super(ARUMCell, self).__init__(_reuse=reuse)

    # Sizes and non-linearity.
    self._hidden_size = hidden_size
    self._activation = activation if activation else relu
    self._T_norm = T_norm

    # Initializers: only the kernel one has a built-in fallback.
    if kernel_initializer:
        self._kernel_initializer = kernel_initializer
    else:
        self._kernel_initializer = aux.orthogonal_initializer(1.0)
    self._bias_initializer = bias_initializer

    # Normalization / regularization switches.
    self._eps = eps
    self._use_zoneout = use_zoneout
    self._zoneout_keep_h = zoneout_keep_h
    self._use_layer_norm = use_layer_norm
    self._is_training = is_training
    self._lambda_pow = lambda_pow
def __init__(self, is_training, config, input_):
    """Build the model graph: embedding, recurrent core, softmax and loss.

    The recurrent core is a ``MultiRNNCell`` of ``RUM.RUMCell`` layers when
    ``config.cell == "rum"``; otherwise it is an ``FSRNN.FSRNNCell`` whose
    slow cell is picked by ``config.cell`` ("fs-lstm", "fs-rum" or
    "fs-goru"). The network is unrolled step by step for
    ``input_.num_steps`` steps.

    Sets ``_input``, ``_initial_state``, ``_cost`` and ``_final_state``;
    when ``is_training`` is true it also sets ``_lr``, ``_train_op``,
    ``_new_lr`` and ``_lr_update``.

    Args:
        is_training: when false, construction stops after the loss and no
            optimizer ops are created; the flag is also forwarded to the
            cells.
        config: hyper-parameter object. Attributes read here: embed_size,
            vocab_size, cell_size, cell, hyper_size, fast_layers,
            zoneout_h, zoneout_c, T_norm, use_zoneout, use_layer_norm,
            keep_prob, num_layers, max_grad_norm (which ones are read
            depends on ``config.cell``).
        input_: input object exposing batch_size, num_steps, input_data
            and targets.

    Raises:
        ValueError: if ``config.cell`` is not "rum", "fs-lstm", "fs-rum"
            or "fs-goru".
    """
    self._input = input_

    batch_size = input_.batch_size
    num_steps = input_.num_steps
    emb_size = config.embed_size
    vocab_size = config.vocab_size
    F_size = config.cell_size
    use_fast_slow = config.cell != "rum"
    if use_fast_slow:
        S_size = config.hyper_size

    emb_init = aux.orthogonal_initializer(1.0)
    with tf.device("/cpu:0"):
        embedding = tf.get_variable("embedding", [vocab_size, emb_size],
                                    initializer=emb_init,
                                    dtype=tf.float32)
        inputs = tf.nn.embedding_lookup(embedding, input_.input_data)

    if use_fast_slow:
        F_cells = [
            LNLSTM.LN_LSTMCell(F_size,
                               use_zoneout=True,
                               is_training=is_training,
                               zoneout_keep_h=config.zoneout_h,
                               zoneout_keep_c=config.zoneout_c)
            for _ in range(config.fast_layers)
        ]
        if config.cell == "fs-lstm":
            S_cell = LNLSTM.LN_LSTMCell(S_size,
                                        use_zoneout=True,
                                        is_training=is_training,
                                        zoneout_keep_h=config.zoneout_h,
                                        zoneout_keep_c=config.zoneout_c)
        elif config.cell == "fs-rum":
            S_cell = RUM.RUMCell(hidden_size=S_size,
                                 T_norm=config.T_norm,
                                 use_zoneout=config.use_zoneout,
                                 use_layer_norm=config.use_layer_norm,
                                 is_training=is_training)
        elif config.cell == "fs-goru":
            with tf.variable_scope("goru"):
                S_cell = GORU.GORUCell(hidden_size=S_size)
        else:
            # Previously this fell through and failed later with an
            # unbound-local error on S_cell.
            raise ValueError(
                "Unsupported config.cell %r; expected one of 'rum', "
                "'fs-lstm', 'fs-rum', 'fs-goru'." % (config.cell,))

        cell = FSRNN.FSRNNCell(F_cells, S_cell, config.keep_prob,
                               is_training)
        self._initial_state = cell.zero_state(batch_size, tf.float32)
        state = self._initial_state
        # Function-call form parses on Python 3 and prints the same single
        # value on Python 2.
        print(cell)
    else:
        def rum_cell():
            return RUM.RUMCell(hidden_size=config.cell_size,
                               T_norm=config.T_norm,
                               use_zoneout=config.use_zoneout,
                               use_layer_norm=config.use_layer_norm,
                               is_training=is_training)

        cell = MultiRNNCell(
            [rum_cell() for _ in range(config.num_layers)],
            state_is_tuple=True)
        self._initial_state = cell.zero_state(batch_size, tf.float32)
        state = self._initial_state

    print('generating graph')

    ## Dynamic RNN ##
    # with tf.variable_scope("RNN"):
    #   if config.cell != 'rum':
    #     outputs, _ = tf.nn.dynamic_rnn(F_cells[0], inputs, dtype=tf.float32)
    #   else:
    #     outputs, _ = tf.nn.dynamic_rnn(mcell, inputs, dtype=tf.float32)

    ## For Loop RNN ##
    outputs = []
    # Unroll inside a *copy* of the current variable scope. Calling
    # reuse_variables() directly on the enclosing scope would leave it in
    # reuse mode after the loop, so the softmax variables and the optimizer
    # slots created below could not be created. Re-entering the scope by
    # object keeps every variable name unchanged.
    with tf.variable_scope(tf.get_variable_scope()):
        for time_step in range(num_steps):
            if time_step > 0:
                tf.get_variable_scope().reuse_variables()
            out, state = cell(inputs[:, time_step, :], state)
            outputs.append(out)
    outputs = tf.concat(axis=1, values=outputs)
    print('graph generated')
    outputs = tf.reshape(outputs, [-1, F_size])

    # Output layer and cross entropy loss
    out_init = aux.orthogonal_initializer(1.0)
    with tf.variable_scope("softmax"):
        softmax_w = tf.get_variable("softmax_w", [F_size, vocab_size],
                                    initializer=out_init,
                                    dtype=tf.float32)
        softmax_b = tf.get_variable("softmax_b", [vocab_size],
                                    dtype=tf.float32)
    logits = tf.matmul(outputs, softmax_w) + softmax_b
    loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
        [logits],
        [tf.reshape(input_.targets, [-1])],
        [tf.ones([batch_size * num_steps], dtype=tf.float32)])
    # Summed per-token loss divided by the batch size.
    self._cost = cost = tf.reduce_sum(loss) / batch_size
    self._final_state = state

    if not is_training:
        return

    # Create the parameter update ops if training
    self._lr = tf.Variable(0.0, trainable=False, dtype=tf.float32)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(
            cost, tvars,
            aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N),
        config.max_grad_norm)
    optimizer = tf.train.AdamOptimizer(self._lr)
    self._train_op = optimizer.apply_gradients(
        zip(grads, tvars),
        global_step=tf.contrib.framework.get_or_create_global_step())

    # The learning rate is a non-trainable variable updated by feeding
    # _new_lr and running _lr_update.
    self._new_lr = tf.placeholder(tf.float32, shape=[],
                                  name="new_learning_rate")
    self._lr_update = tf.assign(self._lr, self._new_lr)
def __init__(self, is_training, config, input_):
    """Build the model graph: embedding, recurrent core, softmax and loss.

    ``config.cell`` selects the recurrent core: "rum" and "lstm" build a
    ``MultiRNNCell`` of ``config.num_layers`` layers, while "fs-lstm",
    "fs-rum" and "fs-goru" build an ``FSRNN.FSRNNCell`` with
    ``config.fast_layers`` LN-LSTM fast cells and the matching slow cell.
    The network is unrolled for ``input_.num_steps`` steps inside the
    "RNN" variable scope.

    Sets ``_input``, ``_initial_state``, ``_cost`` and ``_final_state``;
    when ``is_training`` is true it also sets ``_lr``, ``_train_op``,
    ``_new_lr`` and ``_lr_update``.

    Args:
        is_training: when false, construction stops after the loss and no
            optimizer ops are created; the flag is also forwarded to the
            cells.
        config: hyper-parameter object. Attributes read here: activation,
            embed_size, vocab_size, cell_size, cell, hyper_size,
            fast_layers, zoneout_h, zoneout_c, use_zoneout,
            use_layer_norm, update_gate, keep_prob, num_layers,
            max_grad_norm (which ones are read depends on ``config.cell``).
        input_: input object exposing batch_size, num_steps, input_data
            and targets.

    Raises:
        ValueError: if ``config.cell`` is not a supported cell name, or if
            a RUM-based cell is requested with a ``config.activation``
            other than "tanh", "sigmoid", "softsign" or "relu".
    """
    activations = {
        "tanh": tf.nn.tanh,
        "sigmoid": tf.nn.sigmoid,
        "softsign": tf.nn.softsign,
        "relu": tf.nn.relu,
    }
    act = activations.get(config.activation)
    # Only the RUM cells consume the activation, so an unknown name is
    # rejected for them alone; the other cell types keep working as before.
    if act is None and config.cell in ("rum", "fs-rum"):
        raise ValueError(
            "Unsupported config.activation %r; expected one of 'tanh', "
            "'sigmoid', 'softsign', 'relu'." % (config.activation,))

    self._input = input_

    # prelim
    batch_size = input_.batch_size
    num_steps = input_.num_steps
    emb_size = config.embed_size
    vocab_size = config.vocab_size
    # A truthy command-line fast_size overrides the configured cell size.
    F_size = FLAGS.fast_size or config.cell_size
    is_plain_stack = config.cell in ("rum", "lstm")
    if not is_plain_stack:
        S_size = config.hyper_size

    # embedding
    emb_init = aux.orthogonal_initializer(1.0)
    with tf.device("/cpu:0"):
        embedding = tf.get_variable("embedding", [vocab_size, emb_size],
                                    initializer=emb_init,
                                    dtype=tf.float32)
        inputs = tf.nn.embedding_lookup(embedding, input_.input_data)

    if is_plain_stack:
        # test pure RUM/LSTM models (room for experiments)
        if config.cell == "rum":
            def make_layer():
                return RUM.RUMCell(F_size,
                                   eta_=FLAGS.eta,
                                   use_zoneout=config.use_zoneout,
                                   use_layer_norm=config.use_layer_norm,
                                   is_training=is_training,
                                   update_gate=config.update_gate,
                                   lambda_=0,
                                   activation=act)
        else:
            def make_layer():
                return LNLSTM.LN_LSTMCell(F_size,
                                          use_zoneout=True,
                                          is_training=is_training,
                                          zoneout_keep_h=config.zoneout_h,
                                          zoneout_keep_c=config.zoneout_c)

        cell = MultiRNNCell(
            [make_layer() for _ in range(config.num_layers)],
            state_is_tuple=True)
    else:
        # construct Fast and Slow states
        F_cells = [
            LNLSTM.LN_LSTMCell(F_size,
                               use_zoneout=True,
                               is_training=is_training,
                               zoneout_keep_h=config.zoneout_h,
                               zoneout_keep_c=config.zoneout_c)
            for _ in range(config.fast_layers)
        ]
        if config.cell == "fs-lstm":
            S_cell = LNLSTM.LN_LSTMCell(S_size,
                                        use_zoneout=True,
                                        is_training=is_training,
                                        zoneout_keep_h=config.zoneout_h,
                                        zoneout_keep_c=config.zoneout_c)
        elif config.cell == "fs-rum":
            # eta_ is taken from FLAGS.eta; the config.T_norm alternative
            # is left disabled.
            S_cell = RUM.RUMCell(S_size,
                                 eta_=FLAGS.eta,
                                 use_zoneout=config.use_zoneout,
                                 use_layer_norm=config.use_layer_norm,
                                 is_training=is_training,
                                 activation=act)
        elif config.cell == "fs-goru":
            with tf.variable_scope("goru"):
                S_cell = GORU.GORUCell(hidden_size=S_size)
        else:
            # Previously this fell through and failed later with an
            # unbound-local error on S_cell.
            raise ValueError(
                "Unsupported config.cell %r; expected one of 'rum', "
                "'lstm', 'fs-lstm', 'fs-rum', 'fs-goru'." % (config.cell,))

        cell = FSRNN.FSRNNCell(F_cells, S_cell, config.keep_prob,
                               is_training)

    self._initial_state = cell.zero_state(batch_size, tf.float32)
    state = self._initial_state
    print(colored(cell, "yellow"))

    outputs = []
    print(colored('generating graph', "blue"))
    # The dedicated scope confines reuse_variables() to the unrolled steps.
    with tf.variable_scope("RNN"):
        for time_step in range(num_steps):
            if time_step > 0:
                tf.get_variable_scope().reuse_variables()
            out, state = cell(inputs[:, time_step, :], state)
            outputs.append(out)
    print(colored('graph generated', "blue"))
    output = tf.reshape(tf.concat(axis=1, values=outputs), [-1, F_size])

    # Output layer and cross entropy loss
    out_init = aux.orthogonal_initializer(1.0)
    softmax_w = tf.get_variable("softmax_w", [F_size, vocab_size],
                                initializer=out_init,
                                dtype=tf.float32)
    softmax_b = tf.get_variable("softmax_b", [vocab_size],
                                dtype=tf.float32)
    logits = tf.matmul(output, softmax_w) + softmax_b
    loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
        [logits],
        [tf.reshape(input_.targets, [-1])],
        [tf.ones([batch_size * num_steps], dtype=tf.float32)])
    # Summed per-token loss divided by the batch size.
    self._cost = cost = tf.reduce_sum(loss) / batch_size
    tf.summary.scalar('cost', cost)
    self._final_state = state

    if not is_training:
        return

    # Create the parameter update ops if training
    self._lr = tf.Variable(0.0, trainable=False, dtype=tf.float32)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(
            cost, tvars,
            aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N),
        config.max_grad_norm)
    optimizer = tf.train.AdamOptimizer(self._lr)
    self._train_op = optimizer.apply_gradients(
        zip(grads, tvars),
        global_step=tf.contrib.framework.get_or_create_global_step())

    # The learning rate is a non-trainable variable updated by feeding
    # _new_lr and running _lr_update.
    self._new_lr = tf.placeholder(tf.float32, shape=[],
                                  name="new_learning_rate")
    self._lr_update = tf.assign(self._lr, self._new_lr)