Example #1
    def call(self, x, state):
        with tf.variable_scope(type(self).__name__):
            h, c = state

            h_size = self.num_units
            x_size = x.get_shape().as_list()[1]

            w_init = aux.orthogonal_initializer(1.0)
            h_init = aux.orthogonal_initializer(1.0)
            b_init = tf.constant_initializer(0.0)

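            # joint weight matrices and bias for the four LSTM gates (i, j, f, o)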
            W_xh = tf.get_variable('W_xh',
                                   [x_size, 4 * h_size], initializer=w_init, dtype=tf.float32)
            W_hh = tf.get_variable('W_hh',
                                   [h_size, 4 * h_size], initializer=h_init, dtype=tf.float32)
            bias = tf.get_variable('bias', [4 * h_size], initializer=b_init, dtype=tf.float32)

            concat = tf.concat(axis=1, values=[x, h])  # concat for speed.
            W_full = tf.concat(axis=0, values=[W_xh, W_hh])
            concat = tf.matmul(concat, W_full) + bias
            concat = aux.layer_norm_all(concat, 4, h_size, 'ln')

            # i = input_gate, j = new_input, f = forget_gate, o = output_gate
            i, j, f, o = tf.split(axis=1, num_or_size_splits=4, value=concat)

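            # standard LSTM update: f_bias shifts the forget gate toward remembering;
            # the new cell state is layer-normalized before producing the hidden output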
            new_c = c * tf.sigmoid(f + self.f_bias) + tf.sigmoid(i) * tf.tanh(j)
            new_h = tf.tanh(aux.layer_norm(new_c, 'ln_c')) * tf.sigmoid(o)

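            # zoneout stochastically mixes the new state with the previous one during training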
            if self.use_zoneout:
                new_h, new_c = aux.zoneout(new_h, new_c, h, c, self.zoneout_keep_h,
                                           self.zoneout_keep_c, self.is_training)

        return new_h, (new_h, new_c)
Example #2
    def __init__(self,
                 hidden_size,
                 activation=None,
                 reuse=None,
                 kernel_initializer=None,
                 bias_initializer=None,
                 T_norm=None,
                 eps=1e-12,
                 use_zoneout=False,
                 zoneout_keep_h=0.9,
                 use_layer_norm=False,
                 is_training=False,
                 lambda_pow=0):
        """Initialization of the Associative RUM cell.

        Args:
            hidden_size: number of neurons in the hidden state
            activation: activation of the temporary new state (defaults to relu)
            reuse: whether to reuse the variables of an existing scope
            kernel_initializer: initializer for the kernel weights (defaults to orthogonal)
            bias_initializer: initializer for the biases
            T_norm: norm for time normalization, `eta` in the paper
            eps: the cutoff for the normalizations
            use_zoneout: zoneout, True or False
            zoneout_keep_h: keep probability for the zoneout on the hidden state
            use_layer_norm: layer normalization, True or False
            is_training: marker for the zoneout
            lambda_pow: the power for the associative memory (an integer)
        """
        super(ARUMCell, self).__init__(_reuse=reuse)
        self._hidden_size = hidden_size
        self._activation = activation or relu
        self._T_norm = T_norm
        self._kernel_initializer = kernel_initializer or aux.orthogonal_initializer(
            1.0)
        self._bias_initializer = bias_initializer
        self._eps = eps
        self._use_zoneout = use_zoneout
        self._zoneout_keep_h = zoneout_keep_h
        self._use_layer_norm = use_layer_norm
        self._is_training = is_training
        self._lambda_pow = lambda_pow
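
A minimal usage sketch for the cell above (TensorFlow 1.x). It assumes the full ARUMCell
class also implements the usual state_size/output_size/call methods so that it can be
driven by tf.nn.dynamic_rnn; the shapes and hyperparameters below are made up for
illustration only.

    import tensorflow as tf

    # hypothetical input: a batch of 32 sequences, 100 time steps, 64 features each
    inputs = tf.placeholder(tf.float32, [32, 100, 64])
    cell = ARUMCell(hidden_size=256, T_norm=1.0, use_zoneout=True,
                    zoneout_keep_h=0.9, is_training=True, lambda_pow=2)
    outputs, final_state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
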
Example #3
    def __init__(self, is_training, config, input_):
        self._input = input_

        batch_size = input_.batch_size
        num_steps = input_.num_steps
        emb_size = config.embed_size
        vocab_size = config.vocab_size
        F_size = config.cell_size
        if config.cell != "rum":
            S_size = config.hyper_size

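        # input token embedding, looked up on the CPU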
        emb_init = aux.orthogonal_initializer(1.0)
        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding", [vocab_size, emb_size],
                                        initializer=emb_init,
                                        dtype=tf.float32)
            inputs = tf.nn.embedding_lookup(embedding, input_.input_data)

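        # build the fast LSTM stack and the slow cell of the FS-RNN (skipped for a pure RUM model)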
        if config.cell != "rum":
            F_cells = [
                LNLSTM.LN_LSTMCell(F_size,
                                   use_zoneout=True,
                                   is_training=is_training,
                                   zoneout_keep_h=config.zoneout_h,
                                   zoneout_keep_c=config.zoneout_c)
                for _ in range(config.fast_layers)
            ]
        if config.cell == "fs-lstm":
            S_cell = LNLSTM.LN_LSTMCell(S_size,
                                        use_zoneout=True,
                                        is_training=is_training,
                                        zoneout_keep_h=config.zoneout_h,
                                        zoneout_keep_c=config.zoneout_c)
        elif config.cell == "fs-rum":
            S_cell = RUM.RUMCell(hidden_size=S_size,
                                 T_norm=config.T_norm,
                                 use_zoneout=config.use_zoneout,
                                 use_layer_norm=config.use_layer_norm,
                                 is_training=is_training)
        elif config.cell == "fs-goru":
            with tf.variable_scope("goru"):
                S_cell = GORU.GORUCell(hidden_size=S_size)
        if config.cell != "rum":
            FS_cell = FSRNN.FSRNNCell(F_cells, S_cell, config.keep_prob,
                                      is_training)
            self._initial_state = FS_cell.zero_state(batch_size, tf.float32)
            state = self._initial_state
            print(FS_cell)
        else:

            def rum_cell():
                return RUM.RUMCell(hidden_size=config.cell_size,
                                   T_norm=config.T_norm,
                                   use_zoneout=config.use_zoneout,
                                   use_layer_norm=config.use_layer_norm,
                                   is_training=is_training)

            mcell = MultiRNNCell(
                [rum_cell() for _ in range(config.num_layers)],
                state_is_tuple=True)
            self._initial_state = mcell.zero_state(batch_size, tf.float32)
            state = self._initial_state
        print('generating graph')

        ## Dynamic RNN ##
        # with tf.variable_scope("RNN"):
        # if config.cell != 'rum':
        #     outputs, _ = tf.nn.dynamic_rnn(F_cells[0], inputs, dtype=tf.float32)
        # else:
        #     outputs, _ = tf.nn.dynamic_rnn(mcell, inputs, dtype=tf.float32)

        ## For Loop RNN ##
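        # unroll the network explicitly over time, reusing variables after the first step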
        outputs = []
        for time_step in range(num_steps):
            if time_step > 0: tf.get_variable_scope().reuse_variables()
            if config.cell != "rum":
                out, state = FS_cell(inputs[:, time_step, :], state)
            else:
                out, state = mcell(inputs[:, time_step, :], state)
            outputs.append(out)
        outputs = tf.concat(axis=1, values=outputs)
        print('graph generated')
        outputs = tf.reshape(outputs, [-1, F_size])

        # Output layer and cross entropy loss

        out_init = aux.orthogonal_initializer(1.0)
        with tf.variable_scope("softmax"):
            softmax_w = tf.get_variable("softmax_w", [F_size, vocab_size],
                                        initializer=out_init,
                                        dtype=tf.float32)
            softmax_b = tf.get_variable("softmax_b", [vocab_size],
                                        dtype=tf.float32)
            logits = tf.matmul(outputs, softmax_w) + softmax_b
            loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
                [logits], [tf.reshape(input_.targets, [-1])],
                [tf.ones([batch_size * num_steps], dtype=tf.float32)])
            self._cost = cost = tf.reduce_sum(loss) / batch_size
            self._final_state = state

        if not is_training: return

        # Create the parameter update ops if training

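        # clip gradients by global norm and apply them with Adam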
        self._lr = tf.Variable(0.0, trainable=False, dtype=tf.float32)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(cost, tvars,
                         aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N),
            config.max_grad_norm)
        optimizer = tf.train.AdamOptimizer(self._lr)
        self._train_op = optimizer.apply_gradients(
            zip(grads, tvars),
            global_step=tf.contrib.framework.get_or_create_global_step())

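        # placeholder and assign op so the learning rate can be updated from outside the graph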
        self._new_lr = tf.placeholder(tf.float32,
                                      shape=[],
                                      name="new_learning_rate")
        self._lr_update = tf.assign(self._lr, self._new_lr)
    def __init__(self, is_training, config, input_):
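        # pick the activation for the RUM cells from the config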
        if config.activation == "tanh":
            act = tf.nn.tanh
        elif config.activation == "sigmoid":
            act = tf.nn.sigmoid
        elif config.activation == "softsign":
            act = tf.nn.softsign
        elif config.activation == "relu":
            act = tf.nn.relu

        self._input = input_

        # prelim
        batch_size = input_.batch_size
        num_steps = input_.num_steps
        emb_size = config.embed_size
        vocab_size = config.vocab_size
        F_size = FLAGS.fast_size if FLAGS.fast_size else config.cell_size
        if config.cell not in ["rum", "lstm"]:
            S_size = config.hyper_size

        # embedding
        emb_init = aux.orthogonal_initializer(1.0)
        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding", [vocab_size, emb_size],
                                        initializer=emb_init,
                                        dtype=tf.float32)
            inputs = tf.nn.embedding_lookup(embedding, input_.input_data)

        # construct Fast and Slow states
        if config.cell not in ["rum", "lstm"]:
            F_cells = [
                LNLSTM.LN_LSTMCell(F_size,
                                   use_zoneout=True,
                                   is_training=is_training,
                                   zoneout_keep_h=config.zoneout_h,
                                   zoneout_keep_c=config.zoneout_c)
                for _ in range(config.fast_layers)
            ]
        if config.cell == "fs-lstm":
            S_cell = LNLSTM.LN_LSTMCell(S_size,
                                        use_zoneout=True,
                                        is_training=is_training,
                                        zoneout_keep_h=config.zoneout_h,
                                        zoneout_keep_c=config.zoneout_c)
        elif config.cell == "fs-rum":

            S_cell = RUM.RUMCell(
                S_size,
                # eta_=config.T_norm,
                eta_=FLAGS.eta,
                use_zoneout=config.use_zoneout,
                use_layer_norm=config.use_layer_norm,
                is_training=is_training,
                activation=act)
        elif config.cell == "fs-goru":
            with tf.variable_scope("goru"):
                S_cell = GORU.GORUCell(hidden_size=S_size)
        # test pure RUM/LSTM models (room for experiments)
        if config.cell == "rum":

            def rum_cell():
                return RUM.RUMCell(F_size,
                                   eta_=FLAGS.eta,
                                   use_zoneout=config.use_zoneout,
                                   use_layer_norm=config.use_layer_norm,
                                   is_training=is_training,
                                   update_gate=config.update_gate,
                                   lambda_=0,
                                   activation=act)

            mcell = MultiRNNCell(
                [rum_cell() for _ in range(config.num_layers)],
                state_is_tuple=True)
            self._initial_state = mcell.zero_state(batch_size, tf.float32)
            state = self._initial_state
            print(colored(mcell, "yellow"))
        elif config.cell == "lstm":

            def lstm_cell():
                return LNLSTM.LN_LSTMCell(F_size,
                                          use_zoneout=True,
                                          is_training=is_training,
                                          zoneout_keep_h=config.zoneout_h,
                                          zoneout_keep_c=config.zoneout_c)

            mcell = MultiRNNCell(
                [lstm_cell() for _ in range(config.num_layers)],
                state_is_tuple=True)
            self._initial_state = mcell.zero_state(batch_size, tf.float32)
            state = self._initial_state
            print(colored(mcell, "yellow"))
        else:
            FS_cell = FSRNN.FSRNNCell(F_cells, S_cell, config.keep_prob,
                                      is_training)
            self._initial_state = FS_cell.zero_state(batch_size, tf.float32)
            state = self._initial_state
            print(colored(FS_cell, "yellow"))

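        # unroll the chosen cell over the sequence inside a shared "RNN" variable scope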
        outputs = []
        print(colored('generating graph', "blue"))
        with tf.variable_scope("RNN"):
            for time_step in range(num_steps):
                if time_step > 0:
                    tf.get_variable_scope().reuse_variables()
                if config.cell not in ["rum", "lstm"]:
                    out, state = FS_cell(inputs[:, time_step, :], state)
                else:
                    out, state = mcell(inputs[:, time_step, :], state)
                outputs.append(out)

        print(colored('graph generated', "blue"))
        output = tf.reshape(tf.concat(axis=1, values=outputs), [-1, F_size])

        # Output layer and cross entropy loss

        out_init = aux.orthogonal_initializer(1.0)
        softmax_w = tf.get_variable("softmax_w", [F_size, vocab_size],
                                    initializer=out_init,
                                    dtype=tf.float32)
        softmax_b = tf.get_variable("softmax_b", [vocab_size],
                                    dtype=tf.float32)
        logits = tf.matmul(output, softmax_w) + softmax_b
        loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
            [logits], [tf.reshape(input_.targets, [-1])],
            [tf.ones([batch_size * num_steps], dtype=tf.float32)])
        self._cost = cost = tf.reduce_sum(loss) / batch_size
        tf.summary.scalar('cost', cost)

        self._final_state = state

        if not is_training:
            return

        # Create the parameter update ops if training

        self._lr = tf.Variable(0.0, trainable=False, dtype=tf.float32)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(cost, tvars,
                         aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N),
            config.max_grad_norm)
        optimizer = tf.train.AdamOptimizer(self._lr)
        self._train_op = optimizer.apply_gradients(
            zip(grads, tvars),
            global_step=tf.contrib.framework.get_or_create_global_step())

        self._new_lr = tf.placeholder(tf.float32,
                                      shape=[],
                                      name="new_learning_rate")
        self._lr_update = tf.assign(self._lr, self._new_lr)
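
A minimal sketch of driving the learning-rate update defined above from a session. The
model instance name and the direct access to its private attributes are hypothetical and
for illustration only; the ops themselves are the ones created in the constructor above.

    import tensorflow as tf

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # push a new learning rate into the graph via the placeholder/assign pair
        sess.run(model._lr_update, feed_dict={model._new_lr: 1e-3})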