コード例 #1
0
    def __init__(self, is_training, config):
        """Build the unrolled multi-layer LSTM language-model graph.

        Args:
            is_training: when true, dropout is applied (provided
                ``config.keep_prob < 1``) and the gradient-descent training
                op is created; otherwise only the forward pass and cost are
                built.
            config: object exposing ``batch_size``, ``num_steps``,
                ``hidden_size``, ``vocab_size``, ``keep_prob``,
                ``num_layers`` and ``max_grad_norm``.
        """
        batch_size = config.batch_size
        num_steps = config.num_steps
        self.batch_size = batch_size
        self.num_steps = num_steps
        hidden = config.hidden_size
        n_words = config.vocab_size

        # Dropout is only active while training with a keep probability < 1.
        apply_dropout = is_training and config.keep_prob < 1

        # Token-id feeds for the inputs and for the prediction targets.
        token_shape = [batch_size, num_steps]
        self._input_data = tf.placeholder(tf.int32, token_shape)
        self._targets = tf.placeholder(tf.int32, token_shape)

        # A forget-gate bias of 1 can give slightly better results, but the
        # model hyperparameters would then have to differ from the ones
        # reported in the paper, so 0 is kept here.
        base_cell = rnn_cell.BasicLSTMCell(hidden, forget_bias=0.0)
        if apply_dropout:
            base_cell = rnn_cell.DropoutWrapper(
                base_cell, output_keep_prob=config.keep_prob)
        stacked_cell = rnn_cell.MultiRNNCell([base_cell] * config.num_layers)

        self._initial_state = stacked_cell.zero_state(batch_size, tf.float32)

        # The embedding table and its lookup are pinned to the CPU.
        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding", [n_words, hidden])
            inputs = tf.nn.embedding_lookup(embedding, self._input_data)

        if apply_dropout:
            inputs = tf.nn.dropout(inputs, config.keep_prob)

        # Hand-unrolled recurrence, a simplified form of rnn() from
        # tensorflow.models.rnn.rnn.py kept for tutorial purposes; in general
        # prefer rnn() or state_saving_rnn() from rnn.py. The equivalent
        # library-based code would be:
        #
        #   from tensorflow.models.rnn import rnn
        #   inputs = [tf.squeeze(input_, [1])
        #             for input_ in tf.split(1, num_steps, inputs)]
        #   outputs, state = rnn.rnn(cell, inputs,
        #                            initial_state=self._initial_state)
        step_outputs = []
        state = self._initial_state
        with tf.variable_scope("RNN"):
            for t in range(num_steps):
                # Every step after the first shares the first step's weights.
                if t > 0:
                    tf.get_variable_scope().reuse_variables()
                step_output, state = stacked_cell(inputs[:, t, :], state)
                step_outputs.append(step_output)

        # Flatten all time steps into rows and project onto the vocabulary.
        flat_output = tf.reshape(tf.concat(1, step_outputs), [-1, hidden])
        softmax_w = tf.get_variable("softmax_w", [hidden, n_words])
        softmax_b = tf.get_variable("softmax_b", [n_words])
        logits = tf.matmul(flat_output, softmax_w) + softmax_b

        # Every target position carries a weight of one.
        flat_targets = tf.reshape(self._targets, [-1])
        unit_weights = tf.ones([batch_size * num_steps])
        per_token_loss = seq2seq.sequence_loss_by_example(
            [logits], [flat_targets], [unit_weights])
        cost = tf.reduce_sum(per_token_loss) / batch_size
        self._cost = cost
        self._final_state = state

        if is_training:
            # The learning rate starts at 0.0 and is not trainable.
            self._lr = tf.Variable(0.0, trainable=False)
            params = tf.trainable_variables()
            raw_grads = tf.gradients(cost, params)
            clipped_grads, _ = tf.clip_by_global_norm(raw_grads,
                                                      config.max_grad_norm)
            # NOTE(review): self.lr is not defined in this method; presumably
            # an accessor for self._lr declared elsewhere on the class.
            sgd = tf.train.GradientDescentOptimizer(self.lr)
            self._train_op = sgd.apply_gradients(zip(clipped_grads, params))
コード例 #2
0
        m = PTBModel(is_training=False, config=config)
    # Restore the variables from a checkpoint at a hard-coded local path.
    # NOTE(review): `session`, `vocab_size`, `size`, `num_layers`, `voc` and
    # `cov` are not defined in this fragment; they must come from the
    # enclosing code.
    saver = tf.train.Saver()
    saver.restore(session, "/Users/marting/scratch/tensorflow/model.ckpt")
    # reuse=True makes get_variable return existing variables under "model"
    # instead of creating new ones.
    with tf.variable_scope("model", reuse=True):
        print("Model restored.")
        embedding = tf.get_variable("embedding", [vocab_size, size])
        softmax_w = tf.get_variable("softmax_w", [size, vocab_size])
        softmax_b = tf.get_variable("softmax_b", [vocab_size])
        # Evaluate the variables; from here on these names hold the fetched
        # values rather than graph variables.
        softmax_w = session.run(softmax_w)
        softmax_b = session.run(softmax_b)
        embedding = session.run(embedding)
        # Look up the embedding row of the seed word.
        # Presumably `voc` maps a word to its integer id -- TODO confirm.
        nextword = 'food'
        wordvec = embedding[voc[nextword]]
        print(wordvec)
        # Build a fresh multi-layer LSTM cell and a zero state for batch size 1.
        lstm_cell = rnn_cell.BasicLSTMCell(size, forget_bias=0.0)
        cell = rnn_cell.MultiRNNCell([lstm_cell] * num_layers)
        state = cell.zero_state(1, tf.float32)
        # NOTE(review): `input` shadows the Python builtin of the same name.
        input = tf.convert_to_tensor(wordvec)
        with tf.variable_scope("RNN"):
            #state = tf.reshape(state, [1,800])
            # NOTE(review): 200 is hard-coded here; presumably it equals
            # `size` -- confirm.
            input = tf.reshape(input, [1, 200])
            # Run a single step of the cell on the seed word vector.
            (cell_output, state) = cell(input, state)
            cell_output = cell_output.eval()
            print(cell_output.shape)
            # Project the cell output with the softmax weights and take the
            # highest-scoring index.
            # Presumably `cov` maps an id back to its word -- TODO confirm.
            nextword = cov[np.argmax((cell_output.dot(softmax_w) + softmax_b))]
            print(nextword)

# Open a fresh default graph together with a session, then build the
# inference-mode model and a Saver inside the "model" variable scope.
# NOTE(review): the scope is opened with reuse=True before any variable
# creation is visible in this fragment -- confirm the variables already exist.
with tf.Graph().as_default(), tf.Session() as session:
    with tf.variable_scope("model", reuse=True):
        m = PTBModel(is_training=False, config=config)
        saver = tf.train.Saver()
コード例 #3
0
 def __init__(self, is_training, length):
     """Build an unrolled recurrent regression graph of ``length`` steps.

     The cell type, layer count, sizes and dropout come from the
     module-level ``FLAGS`` object (``model``, ``layer``, ``hidden_dim``,
     ``input_dim``, ``output_dim``, ``batch_size``, ``keep_prob``,
     ``max_grad_norm``).

     Args:
         is_training: when true, output dropout is applied (if
             ``FLAGS.keep_prob < 1``) and the training op is created.
         length: number of time steps to unroll.

     Raises:
         ValueError: if ``FLAGS.model`` is not "rnn", "lstm" or "gru", or
             if ``FLAGS.layer`` is neither 1 nor 2 for the "rnn"/"lstm"
             models.
     """
     self.batch_size = batch_size = FLAGS.batch_size
     self.num_steps = num_steps = length
     hidden_size = FLAGS.hidden_dim

     # The time dimension is left open in the feeds; the unroll below still
     # indexes exactly num_steps steps.
     self._input_data = tf.placeholder(tf.float32, [batch_size, None, FLAGS.input_dim])
     self._targets = tf.placeholder(tf.float32, [batch_size, None, FLAGS.output_dim])

     # Pick the basic cell for the requested model.
     if FLAGS.model == "rnn":
         base_cell = rnn_cell.BasicRNNCell(num_units=FLAGS.hidden_dim)
     elif FLAGS.model == "lstm":
         base_cell = rnn_cell.BasicLSTMCell(num_units=FLAGS.hidden_dim,
                                            forget_bias=1.0)
     elif FLAGS.model == "gru":
         base_cell = rnn_cell.GRUCell(num_units=FLAGS.hidden_dim)
     else:
         # Format the message explicitly; passing the value as a second
         # argument would leave the "%s" placeholder unexpanded.
         raise ValueError("Invalid model: %s" % FLAGS.model)

     if is_training and FLAGS.keep_prob < 1:
         base_cell = rnn_cell.DropoutWrapper(base_cell,
                                             output_keep_prob=FLAGS.keep_prob)

     # The GRU model is always single-layer and ignores FLAGS.layer; the
     # other models support one or two layers.
     if FLAGS.model == "gru" or FLAGS.layer == 1:
         cell = base_cell
     elif FLAGS.layer == 2:
         cell = rnn_cell.MultiRNNCell([base_cell] * 2)
     else:
         # Fail here with a clear message instead of leaving `cell` unbound.
         raise ValueError("Invalid layer count: %s" % FLAGS.layer)

     self._initial_state = cell.zero_state(batch_size, tf.float32)

     # Unroll the recurrence by hand, sharing weights after the first step.
     outputs = []
     state = self._initial_state
     with tf.variable_scope("RNN"):
         for time_step in range(num_steps):
             if time_step > 0:
                 tf.get_variable_scope().reuse_variables()
             (cell_output, state) = cell(self._input_data[:, time_step, :], state)
             outputs.append(cell_output)
     self._final_state = state

     # Flatten all time steps into rows of width hidden_size.
     hidden_output = tf.reshape(tf.concat(1, outputs), [-1, hidden_size])

     # Linear read-out layer with uniform initialisation in
     # [-sqrt(1/hidden_size), sqrt(1/hidden_size)].
     V_1 = tf.get_variable("v_1", shape=[hidden_size, FLAGS.output_dim],
       initializer=tf.random_uniform_initializer(-tf.sqrt(1./hidden_size),tf.sqrt(1./hidden_size)))
     b_1 = tf.get_variable("b_1", shape=[FLAGS.output_dim], initializer=tf.constant_initializer(0.1))
     logits = tf.add(tf.matmul(hidden_output, V_1), b_1)

     target = tf.reshape(self._targets, [-1, FLAGS.output_dim])
     # Gradients use half the summed squared error; the reported cost is the
     # mean squared error.
     training_loss = tf.reduce_sum(tf.pow(logits-target, 2)) / 2
     mse = tf.reduce_mean(tf.pow(logits-target, 2))
     self._cost = mse

     if not is_training:
         return

     self._lr = tf.Variable(0.0, trainable=False)
     tvars = tf.trainable_variables()
     grads, _ = tf.clip_by_global_norm(tf.gradients(training_loss, tvars), FLAGS.max_grad_norm)
     # NOTE(review): self.lr is not defined in this method; presumably an
     # accessor for self._lr declared elsewhere on the class.
     optimizer = tf.train.GradientDescentOptimizer(self.lr)
     self._train_op = optimizer.apply_gradients(zip(grads, tvars))
コード例 #4
0
ファイル: optimizer.py プロジェクト: synpon/optimization
    def __init__(self):
        """Build the graph of the RNN-based learned optimizer.

        Creates the input placeholders, unrolls the optimizer RNN for
        ``seq_length`` steps with ``tf.while_loop``, derives the training loss
        and training ops, and finally builds a single-step "comparison" path
        that reuses the trained weights.

        NOTE(review): ``m``, ``k``, ``rnn_type``, ``rnn_size``,
        ``num_rnn_layers``, ``seq_length``, ``loss_asymmetry``, ``snf``,
        ``inv_scale_grads`` and ``tf_norm`` are not defined in this method;
        they must be provided by the enclosing module.
        """
        # Input
        self.point = tf.placeholder(tf.float32, [m, 1],
                                    'points')  # Used in training only
        self.variances = tf.placeholder(tf.float32, [k, 1], 'variances')
        self.weights = tf.placeholder(tf.float32, [k, 1], 'weights')
        self.hyperplanes = tf.placeholder(
            tf.float32, [m, m, k],
            'hyperplanes')  # Points which define the hyperplanes

        # The LSTM variant reserves twice the state width of the other cells.
        if rnn_type == 'lstm':
            self.initial_rnn_state = tf.placeholder_with_default(
                input=tf.zeros([m, 2 * num_rnn_layers * rnn_size]),
                shape=[None, 2 * num_rnn_layers * rnn_size])
        else:
            # initial_rnn_state is passed during evaluation but not during training
            # each dimension has an independent hidden state, required in order to simulate Adam, RMSProp etc.
            self.initial_rnn_state = tf.placeholder_with_default(
                input=tf.zeros([m, num_rnn_layers * rnn_size]),
                shape=[None, num_rnn_layers * rnn_size])

        # The scope allows these variables to be excluded from being reinitialized during the comparison phase
        with tf.variable_scope("optimizer"):
            # NOTE(review): there is no else branch, so any other rnn_type
            # leaves `cell` unbound and the MultiRNNCell line below fails.
            if rnn_type == 'rnn':
                cell = rnn_cell.BasicRNNCell(rnn_size)
            elif rnn_type == 'gru':
                cell = rnn_cell.GRUCell(rnn_size)
            elif rnn_type == 'lstm':
                cell = rnn_cell.LSTMCell(rnn_size)

            self.cell = rnn_cell.MultiRNNCell([cell] * num_rnn_layers)

            # NOTE(review): `updates` is rebound after the loop, and
            # `snf_losses` is appended to but never read in this method.
            updates = []
            snf_losses = []

            # Arguments passed to the condition and body functions
            time = tf.constant(0)
            point = self.point

            # Loss and gradient at the starting point, used as the first RNN input.
            snf_loss = snf.calc_snf_loss_tf(point, self.hyperplanes,
                                            self.variances, self.weights)
            snf_losses.append(snf_loss)
            snf_grads = snf.calc_grads_tf(snf_loss, point)
            # Drop the leading axis of the gradient.
            # Presumably this leaves an [m, 1] tensor -- TODO confirm.
            snf_grads = tf.squeeze(snf_grads, [0])

            # Per-step records of the loss and of the proposed update.
            snf_loss_ta = tf.TensorArray(dtype=tf.float32, size=seq_length)
            update_ta = tf.TensorArray(dtype=tf.float32, size=seq_length)
            # NOTE(review): this width is num_rnn_layers * rnn_size for every
            # rnn_type, whereas initial_rnn_state above doubles it for 'lstm'
            # -- confirm the LSTM path is consistent.
            rnn_state = tf.zeros([m, rnn_size * num_rnn_layers])

            loop_vars = [
                time, point, snf_grads, rnn_state, snf_loss_ta, update_ta,
                self.hyperplanes, self.variances, self.weights
            ]

            def condition(time, point, snf_grads, rnn_state, snf_loss_ta,
                          update_ta, hyperplanes, variances, weights):
                # Keep iterating until seq_length steps have been taken.
                return tf.less(time, seq_length)

            def body(time, point, snf_grads, rnn_state, snf_loss_ta, update_ta,
                     hyperplanes, variances, weights):
                # One optimizer step: feed the current gradient to the RNN,
                # turn its output into an update, and move the point.

                h, rnn_state_out = self.cell(snf_grads, rnn_state)

                # Final layer of the optimizer
                # Cannot use fc_layer due to a 'must be from the same frame' error
                d = np.sqrt(1.0) / np.sqrt(
                    rnn_size + 1)  ### should be sqrt(2, 3 or 6?)
                initializer = tf.random_uniform_initializer(-d, d)
                W = tf.get_variable("W", [rnn_size, 1],
                                    initializer=initializer)

                # No bias, linear activation function
                update = tf.matmul(h, W)
                update = tf.reshape(update, [m, 1])
                update = inv_scale_grads(update)

                new_point = point + update

                snf_loss = snf.calc_snf_loss_tf(new_point, hyperplanes,
                                                variances, weights)
                snf_losses.append(snf_loss)

                # Record this step's loss and update.
                snf_loss_ta = snf_loss_ta.write(time, snf_loss)
                update_ta = update_ta.write(time, update)

                # NOTE(review): the gradient is taken with respect to `point`
                # (the position before the update), not `new_point` -- confirm
                # this is intended.
                snf_grads_out = snf.calc_grads_tf(snf_loss, point)
                snf_grads_out = tf.reshape(snf_grads_out, [m, 1])

                time += 1
                return [
                    time, new_point, snf_grads_out, rnn_state_out, snf_loss_ta,
                    update_ta, hyperplanes, variances, weights
                ]

            # Do the computation
            # The loop runs inside scope "o1", so "W" and the cell weights are
            # created under "optimizer/o1" and can be reused further below.
            with tf.variable_scope("o1"):
                res = tf.while_loop(condition, body, loop_vars)

            # Indices follow the order of loop_vars.
            self.new_point = res[1]
            self.rnn_state_out = res[3]
            losses = res[4].pack()
            updates = res[5].pack()

            # Total change in the SNF loss
            # Improvement: 2 - 3 = -1 (small loss)
            snf_loss_change = losses[seq_length - 1] - losses[0]
            snf_loss_change = tf.maximum(snf_loss_change, loss_asymmetry *
                                         snf_loss_change)  # Asymmetric loss
            self.loss_change_sign = tf.sign(snf_loss_change)

            # Oscillation cost
            # Ratio of the summed per-step update norms to the norm of the
            # summed update.
            overall_update = tf.zeros([m, 1])
            norm_sum = 0.0

            for i in range(seq_length):
                overall_update += updates[i, :, :]
                norm_sum += tf_norm(updates[i, :, :])

            osc_cost = norm_sum / tf_norm(overall_update)  # > 1

            # Raising osc_cost to sign(change) multiplies a positive change by
            # osc_cost and divides a negative change by it.
            self.total_loss = snf_loss_change * tf.pow(
                osc_cost, tf.sign(snf_loss_change))

            #===# Model training #===#
            #opt = tf.train.RMSPropOptimizer(0.01,momentum=0.5)
            opt = tf.train.AdamOptimizer()
            # NOTE(review): `vars` shadows the Python builtin of the same name.
            vars = tf.trainable_variables()

            gvs = opt.compute_gradients(self.total_loss, vars)

            # Gradients clipped element-wise to [-1, 1], paired with their variables.
            self.gvs = [(tf.clip_by_value(grad, -1.0, 1.0), var)
                        for (grad, var) in gvs]

            # One placeholder per variable; the training step applies whatever
            # gradients are fed into these placeholders.
            # Presumably the caller evaluates self.gvs and feeds the results
            # back in here -- TODO confirm.
            self.grads_input = [(tf.placeholder(tf.float32,
                                                shape=v.get_shape()), v)
                                for (g, v) in gvs]
            self.train_step = opt.apply_gradients(self.grads_input)

            #===# Comparison code #===#
            self.input_grads = tf.placeholder(
                tf.float32, [1, None, 1],
                'input_grads')  ### Remove first dimension?
            input_grads = tf.squeeze(self.input_grads, [0])

            # Reopen scope "o1" with reuse=True so this single step uses the
            # same cell weights and "W" as the training loop above.
            with tf.variable_scope("o1", reuse=True) as scope:
                h, self.rnn_state_out_compare = self.cell(
                    input_grads, self.initial_rnn_state)

                W = tf.get_variable("W")
                update = tf.matmul(h, W)

                update = tf.reshape(update, [-1, 1])
                self.update = inv_scale_grads(update)
コード例 #5
0
    def __init__(self,
                 source_vocab_size,
                 target_vocab_size,
                 buckets,
                 hidden_edim,
                 hidden_units,
                 num_layers,
                 keep_prob,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 beam_size,
                 use_lstm=False,
                 forward_only=False):
        """Create the model.

        Args:
            source_vocab_size: size of the source vocabulary.
            target_vocab_size: size of the target vocabulary.
            buckets: a list of pairs (I, O), where I specifies maximum input length
                that will be processed in that bucket, and O specifies maximum output
                length. Training instances that have inputs longer than I or outputs
                longer than O will be pushed to the next bucket and padded accordingly.
                We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
            hidden_edim: number of dimensions for word embedding
            hidden_units: number of hidden units for each layer
            num_layers: number of layers in the model.
            keep_prob: input keep probability of the dropout wrapper that is
                put around the cell when the backward pass is built (i.e. when
                forward_only is false).
            max_gradient_norm: gradients will be clipped to maximally this norm.
            batch_size: the size of the batches used during training;
                the model construction is independent of batch_size, so it can be
                changed after initialization if this is convenient, e.g., for decoding.
            learning_rate: learning rate to start with.
            learning_rate_decay_factor: decay learning rate by this much when needed.
            beam_size: the beam size used in beam search.
            use_lstm: if true, we use LSTM cells instead of GRU cells.
            forward_only: if set, we do not construct the backward pass in the model.
        """
        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)

        def loss_function(logit, target, output_projection):
            # Project the decoder output with the transposed projection
            # matrix, then take the per-example sparse softmax cross-entropy.
            logit = math_ops.matmul(logit, output_projection, transpose_b=True)
            target = array_ops.reshape(target, [-1])
            crossent = nn_ops.sparse_softmax_cross_entropy_with_logits(
                logit, target)
            return crossent

        softmax_loss_function = loss_function

        # Create the internal multi-layer cell for our RNN.
        single_cell = rnn_cell.GRUCell(hidden_units)
        if use_lstm:
            single_cell = rnn_cell.BasicLSTMCell(
                hidden_units)  # added by yfeng
        cell = single_cell
        if num_layers > 1:
            cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)
        if not forward_only:
            # Input dropout is only applied when the backward pass is built.
            cell = rnn_cell.DropoutWrapper(cell,
                                           input_keep_prob=keep_prob,
                                           seed=SEED)

        # The seq2seq function: we use embedding for the input and attention.
        def seq2seq_f(encoder_inputs, encoder_mask, encoder_probs, encoder_ids,
                      encoder_hs, mem_mask, decoder_inputs, do_decode):
            return seq2seq_fy.embedding_attention_seq2seq(
                encoder_inputs,
                encoder_mask,
                encoder_probs,
                encoder_ids,
                encoder_hs,
                mem_mask,
                decoder_inputs,
                cell,
                num_encoder_symbols=source_vocab_size,
                num_decoder_symbols=target_vocab_size,
                embedding_size=hidden_edim,
                beam_size=beam_size,
                num_layers=num_layers,
                feed_previous=do_decode)

        # Feeds for inputs.
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []
        self.decoder_aligns = []
        self.decoder_align_weights = []
        for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
            self.encoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="encoder{0}".format(i)))

        for i in xrange(buckets[-1][1] + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="decoder{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(tf.float32,
                               shape=[None],
                               name="weight{0}".format(i)))
            self.decoder_aligns.append(
                tf.placeholder(tf.float32,
                               shape=[None, None],
                               name="align{0}".format(i)))
            self.decoder_align_weights.append(
                tf.placeholder(tf.float32,
                               shape=[None],
                               name="align_weight{0}".format(i)))
        self.encoder_mask = tf.placeholder(tf.int32,
                                           shape=[None, None],
                                           name="encoder_mask")
        self.encoder_probs = tf.placeholder(
            tf.float32,
            shape=[None, None, self.target_vocab_size],
            name="encoder_prob")
        self.encoder_ids = tf.placeholder(tf.int32,
                                          shape=[None, None],
                                          name="encoder_id")
        self.encoder_hs = tf.placeholder(tf.float32,
                                         shape=[None, None, None],
                                         name="encoder_h")
        self.mem_mask = tf.placeholder(tf.float32,
                                       shape=[None, None],
                                       name="mem_mask")

        # Our targets are decoder inputs shifted by one.
        targets = [
            self.decoder_inputs[i + 1]
            for i in xrange(len(self.decoder_inputs) - 1)
        ]

        # Training outputs and losses. The decoding and training graphs differ
        # only in the flag handed to seq2seq_f. It is normalised to a real
        # bool so downstream code sees exactly True or False.
        do_decode = bool(forward_only)
        self.outputs, self.losses, self.symbols = seq2seq_fy.model_with_buckets(
            self.encoder_inputs,
            self.encoder_mask,
            self.encoder_probs,
            self.encoder_ids,
            self.encoder_hs,
            self.mem_mask,
            self.decoder_inputs,
            targets,
            self.target_weights,
            self.decoder_aligns,
            self.decoder_align_weights,
            buckets,
            lambda x, y, z, s, a, b, c: seq2seq_f(x, y, z, s, a, b, c,
                                                  do_decode),
            softmax_loss_function=softmax_loss_function)

        # Names of the memory-attention variables together with their Adam
        # bookkeeping variables. Defined once and shared by the update, load
        # and save filters below so the three cannot drift apart.
        attention_scope = (
            u'embedding_attention_seq2seq/embedding_attention_decoder/'
            u'attention_decoder/attention/')
        attention_suffixes = (
            u'AttnVt_0:0',
            u'AttnWt_0:0',
            u'AttnU_0/Linear_mem/Matrix:0',
            u'AttnU_0/Linear_mem/Bias:0',
            u'AttnVt_0/Adam:0',
            u'AttnVt_0/Adam_1:0',
            u'AttnWt_0/Adam:0',
            u'AttnWt_0/Adam_1:0',
            u'AttnU_0/Linear_mem/Matrix/Adam:0',
            u'AttnU_0/Linear_mem/Matrix/Adam_1:0',
            u'AttnU_0/Linear_mem/Bias/Adam:0',
            u'AttnU_0/Linear_mem/Bias/Adam_1:0',
        )
        memory_attention_names = (u'beta1_power:0', u'beta2_power:0') + tuple(
            [attention_scope + suffix for suffix in attention_suffixes])
        # The saver additionally keeps 'Variable:0' and 'Variable_1:0'.
        # Presumably these are the unnamed learning-rate and global-step
        # variables created above -- TODO confirm.
        saved_names = (u'Variable:0', u'Variable_1:0') + memory_attention_names

        # only update memory attention parameters
        params_to_update = [
            p for p in tf.trainable_variables()
            if p.name in memory_attention_names
        ]
        if not forward_only:
            self.gradient_norms = []
            self.gradient_norms_print = []
            self.updates = []
            opt = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
            # One clipped-gradient update op per bucket.
            for b in xrange(len(buckets)):
                gradients = tf.gradients(
                    self.losses[b],
                    params_to_update,
                    aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(
                    opt.apply_gradients(zip(clipped_gradients,
                                            params_to_update),
                                        global_step=self.global_step))

        # load trained NMT parameters
        params_to_load = [
            p for p in tf.all_variables()
            if p.name not in memory_attention_names
        ]

        # only save memory attention parameters
        params_to_save = [
            p for p in tf.all_variables() if p.name in saved_names
        ]

        self.saver_old = tf.train.Saver(params_to_load,
                                        max_to_keep=1000,
                                        keep_checkpoint_every_n_hours=6)
        self.saver = tf.train.Saver(params_to_save,
                                    max_to_keep=1000,
                                    keep_checkpoint_every_n_hours=6)
コード例 #6
0
ファイル: seq2seq_model.py プロジェクト: CCIIPLab/EACM
    def __init__(self,
                 source_vocab_size,
                 target_vocab_size,
                 buckets,
                 hidden_size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 num_samples=-1,
                 embedding_size=200,
                 forward_only=False,
                 beam_search=False,
                 beam_size=10,
                 category=6,
                 use_emb=False,
                 use_autoEM=False,
                 use_imemory=False,
                 use_ememory=False,
                 emotion_size=200,
                 imemory_size=256,
                 dtype=tf.float32):


        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(
                float(learning_rate), trainable=False, dtype=dtype)
        self.learning_rate_decay_op = self.learning_rate.assign(
                self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)



        # If we use sampled softmax, we need an output projection.
        output_projection = None
        softmax_loss_function = None

        # Sampled softmax only makes sense if we sample less than vocabulary hidden_size.
        if num_samples > 0 and num_samples < self.target_vocab_size:
            w_t = tf.get_variable("proj_w", [self.target_vocab_size, hidden_size], dtype=dtype)
            w = tf.transpose(w_t)
            b = tf.get_variable("proj_b", [self.target_vocab_size], dtype=dtype)
            output_projection = (w, b)

            def sampled_loss(inputs, labels):
                labels = tf.reshape(labels, [-1, 1])
                # We need to compute the sampled_softmax_loss using 32bit floats to
                # avoid numerical instabilities.
                local_w_t = tf.cast(w_t, tf.float32)
                local_b = tf.cast(b, tf.float32)
                local_inputs = tf.cast(inputs, tf.float32)
                return tf.cast(
                        tf.nn.sampled_softmax_loss(local_w_t, local_b, local_inputs, labels,
                                                                        num_samples, self.target_vocab_size), dtype)
            softmax_loss_function = sampled_loss
        else:
            w_t = tf.get_variable("proj_w", [self.target_vocab_size, hidden_size], dtype=dtype)
            w = tf.transpose(w_t)
            b = tf.get_variable("proj_b", [self.target_vocab_size], dtype=dtype)
            output_projection = (w, b)

        # Create the internal multi-layer cell for our RNN.
        gru = tf.nn.rnn_cell.GRUCell(hidden_size)
        encoder_cell = gru
        if num_layers > 1:
            encoder_cell = rnn_cell.MultiRNNCell([gru] * num_layers)
        # Create the internal multi-layer cell for our RNN.
        decoder_cell = encoder_cell
        if use_imemory or use_emb:
            decoder_cell = rnn_cell.MEMGRUCell(hidden_size)
            if num_layers > 1:
                decoder_cell = rnn_cell.MEMMultiRNNCell([decoder_cell]+[gru] * (num_layers-1))

        # The seq2seq function: we use embedding for the input and attention.
        def seq2seq_f(encoder_inputs, decoder_inputs, encoder_emotions, decoder_emotions, do_decode, autoEM_logit):
            return seq2seq.embedding_attention_seq2seq(
                    encoder_inputs,
                    decoder_inputs,
                    encoder_emotions,
                    decoder_emotions,
                    autoEM_logit,
                    encoder_cell,
                    decoder_cell,
                    num_encoder_symbols=source_vocab_size,
                    num_decoder_symbols=target_vocab_size,
                    embedding_size=embedding_size,
                    hidden_size=hidden_size,
                    emotion_category=category,
                    emotion_size=emotion_size,
                    imemory_size=imemory_size,
                    use_emb=use_emb,
                    use_imemory=use_imemory,
                    use_ememory=use_ememory,
                    output_projection=output_projection,
                    initial_state_attention=True,
                    feed_previous=do_decode,
                    dtype=dtype,
                    beam_search=beam_search,
                    beam_size=beam_size)

        # Feeds for inputs.
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []
        for i in range(buckets[-1][0]):    # Last bucket is the biggest one.
            self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i)))
        for i in range(buckets[-1][1] + 1):
            self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None], name="decoder{0}".format(i)))
            self.target_weights.append(tf.placeholder(dtype, shape=[None], name="weight{0}".format(i)))

        # Our targets are decoder inputs shifted by one.
        targets = [self.decoder_inputs[i + 1]
                             for i in range(len(self.decoder_inputs) - 1)]
        
        self.decoder_emotions = tf.placeholder(tf.int32, shape=[None,2], name="decoder_emotion")
        self.encoder_emotions = tf.placeholder(tf.int32, shape=[None,2], name="encoder_emotion")

        if use_autoEM:
            senti_cell = tf.nn.rnn_cell.GRUCell(hidden_size)
            grammar_cell = tf.nn.rnn_cell.GRUCell(hidden_size)
            if num_layers > 1:
                senti_cell = rnn_cell.MultiRNNCell([senti_cell] * num_layers)
                grammar_cell = rnn_cell.MultiRNNCell([grammar_cell] * num_layers)
            self.autoEM_losses, self.pos_logits, self.res_logits, self.res_losses ,self.res_cross_entropy,self.rlabels,self.weight = seq2seq.classify_model_with_buckets(senti_cell,
                        grammar_cell, self.encoder_inputs, self.encoder_emotions, self.decoder_emotions, buckets,
                         hidden_size, embedding_size, category, source_vocab_size)

            self.pos_predics = []
            self.res_predics = []
            for each in self.pos_logits:
                self.pos_predics.append(tf.arg_max(each, 1))
            for each in self.res_logits:
                self.res_predics.append(tf.arg_max(each, 1))


        # Training outputs and losses.
        if forward_only:
            if beam_search:
                self.outputs, self.beam_results, self.beam_symbols, self.beam_parents = seq2seq.decode_model_with_buckets(
                    self.encoder_inputs, self.decoder_inputs, targets,
                    self.target_weights, self.encoder_emotions, self.decoder_emotions, buckets,
                    lambda w, x, y, z,m: seq2seq_f(w, x, y, z, True, m),
                    softmax_loss_function=softmax_loss_function)
            else:
                self.outputs, self.losses, self.ppxes= seq2seq.model_with_buckets(
                        self.encoder_inputs, self.decoder_inputs, targets,
                        self.target_weights, self.encoder_emotions, self.decoder_emotions, buckets,
                        lambda w, x, y, z, m: seq2seq_f(w, x, y, z, True, m),self.res_logits,
                        softmax_loss_function=softmax_loss_function, use_imemory=use_imemory, use_ememory=use_ememory)
        else:
            self.outputs, self.losses, self.ppxes = seq2seq.model_with_buckets(
                    self.encoder_inputs, self.decoder_inputs, targets,
                    self.target_weights, self.encoder_emotions, self.decoder_emotions, buckets,
                    lambda w, x, y, z, m: seq2seq_f(w, x, y, z, False, m),self.res_logits,
                    softmax_loss_function=softmax_loss_function, use_imemory=use_imemory, use_ememory=use_ememory)

        self.total_losses = (1 * np.array(self.autoEM_losses) + 1 * np.array(self.losses)).tolist()
        # self.total_losses = self.autoEM_losses

            # Gradients and SGD update operation for training the model.
        params = tf.trainable_variables()
        if not forward_only:
            self.gradient_norms = []
            self.updates = []
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            for b in range(len(buckets)):  # len(buckets) is 4 on this occasion
                gradients = tf.gradients(self.total_losses[b], params)
                clipped_gradients, norm = tf.clip_by_global_norm(gradients, max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(opt.apply_gradients(
                        zip(clipped_gradients, params), global_step=self.global_step))

        self.pretrain_var = []
        self.initial_var = []
        for i in tf.trainable_variables():
            if 'Emotion' not in i.name and 'emotion' not in i.name and 'memory' not in i.name and 'Memory' not in i.name and "classify" not in i.name and "Attention_0" not in i.name:
                self.pretrain_var.append(i)
        for i in tf.all_variables():
            if i not in self.pretrain_var:
                self.initial_var.append(i)
        self.pretrain_saver = tf.train.Saver(self.pretrain_var, write_version=tf.train.SaverDef.V2)
        self.saver = tf.train.Saver(tf.all_variables(), write_version=tf.train.SaverDef.V2, max_to_keep=400)
コード例 #7
0
ファイル: seq2seq_model.py プロジェクト: zerkh/seq2seq_copy
    def __init__(self,
                 source_vocab_size,
                 target_vocab_size,
                 buckets,
                 size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 use_lstm=False,
                 num_samples=512,
                 forward_only=False):
        """Create the model.

        Args:
          source_vocab_size: size of the source vocabulary.
          target_vocab_size: size of the target vocabulary.
          buckets: a list of pairs (I, O), where I specifies maximum input
            length that will be processed in that bucket, and O specifies
            maximum output length. Training instances that have inputs longer
            than I or outputs longer than O will be pushed to the next bucket
            and padded accordingly. We assume that the list is sorted, e.g.,
            [(2, 4), (8, 16)]; the last bucket sizes the placeholder lists.
          size: number of units in each layer of the model.
          num_layers: number of layers in the model.
          max_gradient_norm: gradients will be clipped to maximally this norm.
          batch_size: the size of the batches used during training;
            the model construction is independent of batch_size, so it can be
            changed after initialization if this is convenient, e.g., for
            decoding.
          learning_rate: learning rate to start with.
          learning_rate_decay_factor: decay learning rate by this much when
            needed.
          use_lstm: if true, we use LSTM cells instead of GRU cells.
          num_samples: number of samples for sampled softmax. The sampled loss
            and the output projection are only created when
            0 < num_samples < target_vocab_size.
          forward_only: if set, we do not construct the backward pass in the
            model and the decoder is run with feed_previous=True.
        """
        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)

        # If we use sampled softmax, we need an output projection.
        output_projection = None
        softmax_loss_function = None
        # Sampled softmax only makes sense if we sample less than vocabulary size.
        if num_samples > 0 and num_samples < self.target_vocab_size:
            # The projection tensors get dedicated local names: sampled_loss
            # closes over them, and a short name such as `b` would be rebound
            # by the per-bucket loops further down.
            with tf.device("/cpu:0"):
                proj_w = tf.get_variable("proj_w",
                                         [size, self.target_vocab_size])
                proj_w_t = tf.transpose(proj_w)
                proj_b = tf.get_variable("proj_b", [self.target_vocab_size])
            output_projection = (proj_w, proj_b)

            def sampled_loss(inputs, labels):
                with tf.device("/cpu:0"):
                    labels = tf.reshape(labels, [-1, 1])
                    return tf.nn.sampled_softmax_loss(proj_w_t, proj_b,
                                                      inputs, labels,
                                                      num_samples,
                                                      self.target_vocab_size)

            softmax_loss_function = sampled_loss

        # Create the internal multi-layer cell for our RNN.
        single_cell = rnn_cell.GRUCell(size)
        if use_lstm:
            single_cell = rnn_cell.BasicLSTMCell(size)
        cell = single_cell
        if num_layers > 1:
            cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)

        # The seq2seq function: we use embedding for the input and attention.
        def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
            return seq2seq.embedding_attention_seq2seq(
                encoder_inputs,
                decoder_inputs,
                cell,
                source_vocab_size,
                target_vocab_size,
                output_projection=output_projection,
                feed_previous=do_decode)

        # Feeds for inputs.
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []
        for i in range(buckets[-1][0]):  # Last bucket is the biggest one.
            self.encoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="encoder{0}".format(i)))
        # One extra decoder step so that the shifted targets below still cover
        # buckets[-1][1] positions.
        for i in range(buckets[-1][1] + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="decoder{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(tf.float32,
                               shape=[None],
                               name="weight{0}".format(i)))

        # Our targets are decoder inputs shifted by one.
        targets = self.decoder_inputs[1:]

        # Training outputs and losses. The graph is the same for training and
        # decoding apart from whether the decoder feeds back its own output.
        do_decode = bool(forward_only)
        self.outputs, self.losses = seq2seq.model_with_buckets(
            self.encoder_inputs,
            self.decoder_inputs,
            targets,
            self.target_weights,
            buckets,
            self.target_vocab_size,
            lambda x, y: seq2seq_f(x, y, do_decode),
            softmax_loss_function=softmax_loss_function)

        # If we use output projection, we need to project outputs for decoding.
        if forward_only and output_projection is not None:
            for bucket_id in range(len(buckets)):
                self.outputs[bucket_id] = [
                    tf.nn.xw_plus_b(output, output_projection[0],
                                    output_projection[1])
                    for output in self.outputs[bucket_id]
                ]

        # Gradients and SGD update operation for training the model.
        params = tf.trainable_variables()
        if not forward_only:
            self.gradient_norms = []
            self.updates = []
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            # One clipped-gradient update op per bucket.
            for bucket_id in range(len(buckets)):
                gradients = tf.gradients(self.losses[bucket_id], params)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(
                    opt.apply_gradients(zip(clipped_gradients, params),
                                        global_step=self.global_step))

        self.saver = tf.train.Saver(tf.all_variables())
コード例 #8
0
    def __init__(
            self,
            source_vocab_size_1,
            source_vocab_size_2,
            target_vocab_size,
            buckets,
            hidden_edim,
            hidden_units,
            num_layers,
            max_gradient_norm,
            batch_size,
            learning_rate,
            learning_rate_decay_factor,
            beam_size,
            constant_emb_en,
            constant_emb_fr,
            use_lstm=False,
            num_samples=10240,
            forward_only=False):
        """Create the two-encoder seq2seq model.

        Args:
          source_vocab_size_1: size of the first source vocabulary (passed on
            as num_encoder_symbols_1).
          source_vocab_size_2: size of the second source vocabulary (passed on
            as num_encoder_symbols_2).
          target_vocab_size: size of the target vocabulary.
          buckets: a sorted list of (I1, I2, O) triples. The last bucket is
            treated as the biggest one: its entries decide how many
            encoder_inputs_1, encoder_inputs_2 and decoder/weight placeholders
            are created (O + 1 on the decoder side).
          hidden_edim: number of dimensions for word embedding.
          hidden_units: number of hidden units for each layer. The output
            projection matrix is created with hidden_units // 2 rows.
          num_layers: number of layers in the model.
          max_gradient_norm: gradients will be clipped to maximally this norm.
          batch_size: the size of the batches used during training;
            the model construction is independent of batch_size, so it can be
            changed after initialization if this is convenient, e.g., for
            decoding.
          learning_rate: learning rate to start with.
          learning_rate_decay_factor: decay learning rate by this much when
            needed.
          beam_size: forwarded to seq2seq_al.embedding_attention_seq2seq.
          constant_emb_en: forwarded unchanged to
            seq2seq_al.embedding_attention_seq2seq.
          constant_emb_fr: forwarded unchanged to
            seq2seq_al.embedding_attention_seq2seq.
          use_lstm: if true, we use LSTM cells instead of GRU cells.
          num_samples: only compared against 0 here. Any positive value
            enables the output projection and the projected softmax loss; no
            sampling is performed by this constructor.
          forward_only: if set, we do not construct the backward pass in the
            model, the decoder feeds back its own output, and dropout is
            disabled.
        """
        self.source_vocab_size_1 = source_vocab_size_1
        self.source_vocab_size_2 = source_vocab_size_2
        self.target_vocab_size = target_vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)

        # Output projection applied before the softmax.
        output_projection = None
        softmax_loss_function = None
        if num_samples > 0:
            proj_w = tf.get_variable(
                "proj_w",
                [hidden_units // 2, self.target_vocab_size],
                initializer=tf.random_normal_initializer(0, 0.01, seed=SEED))
            # The bias is a fixed zero vector (trainable=False).
            proj_b = tf.get_variable(
                "proj_b", [self.target_vocab_size],
                initializer=tf.constant_initializer(0.0),
                trainable=False)
            output_projection = (proj_w, proj_b)

            def sampled_loss(logit, target):
                # Despite the historical name this is a full softmax: project
                # to the vocabulary, then take the sparse cross entropy.
                logit = nn_ops.xw_plus_b(logit, output_projection[0],
                                         output_projection[1])
                target = array_ops.reshape(target, [-1])
                return nn_ops.sparse_softmax_cross_entropy_with_logits(
                    logit, target)

            softmax_loss_function = sampled_loss

        # Create the internal multi-layer cell for our RNN.
        single_cell = rnn_cell.GRUCell(hidden_units)
        if use_lstm:
            single_cell = rnn_cell.BasicLSTMCell(hidden_units)
        cell = single_cell
        if num_layers > 1:
            cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)
        # Dropout is a training-time regulariser. Wrapping the cell while
        # decoding would randomly zero inputs at inference time, so it is
        # only applied when the backward pass is built.
        if not forward_only:
            cell = rnn_cell.DropoutWrapper(cell, input_keep_prob=0.8,
                                           seed=SEED)

        # The seq2seq function: we use embedding for the input and attention.
        def seq2seq_f(encoder_inputs_1, encoder_inputs_2, encoder_mask_1,
                      encoder_mask_2, decoder_inputs, do_decode):
            return seq2seq_al.embedding_attention_seq2seq(
                encoder_inputs_1,
                encoder_inputs_2,
                encoder_mask_1,
                encoder_mask_2,
                decoder_inputs,
                cell,
                num_encoder_symbols_1=source_vocab_size_1,
                num_encoder_symbols_2=source_vocab_size_2,
                num_decoder_symbols=target_vocab_size,
                embedding_size=hidden_edim,
                beam_size=beam_size,
                constant_emb_en=constant_emb_en,
                constant_emb_fr=constant_emb_fr,
                output_projection=output_projection,
                feed_previous=do_decode)

        # Feeds for inputs. The last bucket is the biggest one.
        self.encoder_inputs_1 = []
        self.encoder_inputs_2 = []
        self.decoder_inputs = []
        self.target_weights = []
        for i in range(buckets[-1][0]):
            self.encoder_inputs_1.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="encoder{0}_1".format(i)))

        for i in range(buckets[-1][1]):
            self.encoder_inputs_2.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="encoder{0}_2".format(i)))

        # One extra decoder step so the shifted targets below still cover
        # buckets[-1][2] positions.
        for i in range(buckets[-1][2] + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="decoder{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(tf.float32,
                               shape=[None],
                               name="weight{0}".format(i)))
        self.encoder_mask_1 = tf.placeholder(tf.int32,
                                             shape=[None, None],
                                             name="encoder_mask_1")
        self.encoder_mask_2 = tf.placeholder(tf.int32,
                                             shape=[None, None],
                                             name="encoder_mask_2")

        # Our targets are decoder inputs shifted by one.
        targets = self.decoder_inputs[1:]

        # Training outputs and losses. Training and decoding build the same
        # graph apart from whether the decoder feeds back its own output.
        # NOTE: unlike the stock TensorFlow model, decoder outputs are not
        # projected here when decoding; the original authors noted this is
        # unnecessary with beam search.
        do_decode = bool(forward_only)
        self.outputs, self.losses, self.symbols = seq2seq_al.model_with_buckets(
            self.encoder_inputs_1,
            self.encoder_inputs_2,
            self.encoder_mask_1,
            self.encoder_mask_2,
            self.decoder_inputs,
            targets,
            self.target_weights,
            buckets,
            lambda x1, x2, y1, y2, z: seq2seq_f(x1, x2, y1, y2, z, do_decode),
            softmax_loss_function=softmax_loss_function)

        # Gradients and Adam update operation for training the model.
        params_to_update = tf.trainable_variables()
        if not forward_only:
            self.gradient_norms = []
            # Kept for callers that read it; nothing is appended here.
            self.gradient_norms_print = []
            self.updates = []
            opt = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
            # One clipped-gradient update op per bucket.
            for bucket_id in range(len(buckets)):
                gradients = tf.gradients(
                    self.losses[bucket_id],
                    params_to_update,
                    aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(
                    opt.apply_gradients(zip(clipped_gradients,
                                            params_to_update),
                                        global_step=self.global_step))

        self.saver = tf.train.Saver(
            tf.all_variables(),
            max_to_keep=1000,
            keep_checkpoint_every_n_hours=6)
コード例 #9
0
ファイル: model.py プロジェクト: mattx7/seq2seq_experiments
    def __init__(self,
                 is_training=False,
                 hidden_units=128,
                 num_layers=1,
                 input_sequence_len=20,
                 output_sequence_len=10,
                 num_input_symbols=20,
                 num_output_symbols=20,
                 weight_amplitude=0.08,
                 batch_size=32,
                 peep=False):
        """Build the seq2seq graph and, when training, its loss and update op.

        Args:
          is_training: when true, the cost, learning-rate variable and Adam
            train op are created; otherwise the decoder is built with
            feed_previous=True and only the output probabilities are exposed.
          hidden_units: number of units of every LSTM cell.
          num_layers: number of stacked LSTM cells; values above 1 produce a
            MultiRNNCell.
          input_sequence_len: number of encoder placeholders.
          output_sequence_len: number of decoding steps; one additional
            decoder placeholder is created so the targets can be the decoder
            inputs shifted by one.
          num_input_symbols: width of each encoder placeholder.
          num_output_symbols: width of each decoder placeholder and of the
            softmax layer.
          weight_amplitude: bound a of the uniform initializer on [-a, a].
          batch_size: only used to normalise the cost.
          peep: forwarded to LSTMCell as use_peepholes.
        """

        def uniform_init():
            # A fresh initializer object for every variable / cell.
            return tf.random_uniform_initializer(-weight_amplitude,
                                                 weight_amplitude)

        def make_lstm(**extra):
            # All cells share size, peephole setting and initializer range.
            return rnn_cell.LSTMCell(hidden_units,
                                     use_peepholes=peep,
                                     initializer=uniform_init(),
                                     **extra)

        # Input feeds: one float placeholder per time step.
        self.encoder_inputs = [
            tf.placeholder(tf.float32,
                           shape=(None, num_input_symbols),
                           name="encoder_{0}".format(step))
            for step in range(input_sequence_len)
        ]
        self.decoder_inputs = [
            tf.placeholder(tf.float32,
                           shape=(None, num_output_symbols),
                           name="decoder_{0}".format(step))
            for step in range(output_sequence_len + 1)
        ]

        # Recurrent cell: a single LSTM, or a stack whose first layer is told
        # the symbol width and whose remaining layers are told the hidden width.
        if num_layers <= 1:
            self.cell = make_lstm()
        else:
            bottom_layer = make_lstm(input_size=num_input_symbols)
            upper_layers = [make_lstm(input_size=hidden_units)
                            for _ in range(num_layers - 1)]
            self.cell = rnn_cell.MultiRNNCell([bottom_layer] + upper_layers)

        # Softmax layer parameters.
        self.w_softmax = tf.get_variable(
            'w_softmax',
            shape=(hidden_units, num_output_symbols),
            initializer=uniform_init())
        self.b_softmax = tf.get_variable(
            'b_softmax',
            shape=(num_output_symbols,),
            initializer=uniform_init())

        # The decoder feeds back its own predictions whenever we are not training.
        decoder_outputs, _ = self._init_seq2seq(
            self.encoder_inputs,
            self.decoder_inputs,
            self.cell,
            feed_previous=not is_training)

        # Project every decoder output to symbol logits, then to probabilities.
        output_logits = []
        for decoder_output in decoder_outputs:
            output_logits.append(
                tf.matmul(decoder_output, self.w_softmax) + self.b_softmax)
        self.output_probs = [tf.nn.softmax(logit) for logit in output_logits]

        # Inference models stop here; the rest is the training machinery.
        if not is_training:
            return

        # Targets are the decoder inputs shifted by one step.
        self.targets = self.decoder_inputs[1:]
        step_losses = []
        for logit, target in zip(output_logits, self.targets):
            step_losses.append(
                tf.nn.softmax_cross_entropy_with_logits(logit, target))

        # Sum over steps and examples, then normalise by both.
        summed_loss = tf.reduce_sum(tf.add_n(step_losses))
        self.cost = summed_loss / output_sequence_len / batch_size
        self.learning_rate = tf.Variable(DEFAULT_LEARNING_RATE,
                                         trainable=False)

        trainable = tf.trainable_variables()
        gradients = tf.gradients(self.cost, trainable)
        adam = tf.train.AdamOptimizer(self.learning_rate)

        self.train_op = adam.apply_gradients(zip(gradients, trainable))
コード例 #10
0
ファイル: seq2seq_model.py プロジェクト: wenanshi/CSLT_NMT
    def __init__(self, source_vocab_size, target_vocab_size, buckets, hidden_edim, hidden_units,
                 num_layers, keep_prob, max_gradient_norm, batch_size, learning_rate, learning_rate_decay_factor,
                 beam_size, forward_only=False):
        """Create the model.

        Args:
          source_vocab_size: size of the source vocabulary.
          target_vocab_size: size of the target vocabulary.
          buckets: a list of pairs (I, O), where I specifies maximum input length
            that will be processed in that bucket, and O specifies maximum output
            length. Training instances that have inputs longer than I or outputs
            longer than O will be pushed to the next bucket and padded accordingly.
            We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
          hidden_edim: number of dimensions for word embedding
          hidden_units: number of hidden units for each layer
          num_layers: number of layers in the model.
          keep_prob: keep probability used for dropout.
          max_gradient_norm: gradients will be clipped to maximally this norm.
          batch_size: the size of the batches used during training;
            the model construction is independent of batch_size, so it can be
            changed after initialization if this is convenient, e.g., for decoding.
          learning_rate: learning rate to start with.
          learning_rate_decay_factor: decay learning rate by this much when needed.
          beam_size: the beam size for beam search decoding
          forward_only: if set, we do not construct the backward pass in the model.
        """
        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
                self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)

        w = tf.get_variable("proj_w", [hidden_units // 2, self.target_vocab_size],
                            initializer=tf.random_normal_initializer(0, 0.01, seed=123))
        b = tf.get_variable("proj_b", [self.target_vocab_size],
                            initializer=tf.constant_initializer(0.0), trainable=False)
        output_projection = (w, b)  # before softmax, there is an output projection

        def softmax_loss_function(logit, target):  # loss function of seq2seq model
            logit = nn_ops.xw_plus_b(logit, output_projection[0], output_projection[1])
            target = array_ops.reshape(target, [-1])
            return nn_ops.sparse_softmax_cross_entropy_with_logits(
                    logit, target)

        single_cell = rnn_cell.GRUCell(hidden_units)
        cell = single_cell
        if num_layers > 1:
            cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)
        if not forward_only:
            cell = rnn_cell.DropoutWrapper(cell, output_keep_prob=float(keep_prob), seed=123)

        # The seq2seq function: we use embedding for the input and attention.
        def seq2seq_f(encoder_inputs, encoder_mask, decoder_inputs, do_decode):
            return seq2seq.embedding_attention_seq2seq(
                    encoder_inputs, encoder_mask, decoder_inputs, cell,
                    num_encoder_symbols=source_vocab_size,
                    num_decoder_symbols=target_vocab_size,
                    embedding_size=hidden_edim,
                    beam_size=beam_size,
                    output_projection=output_projection,
                    num_layers=num_layers,
                    feed_previous=do_decode)

        # Feeds for inputs.
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []
        for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
            self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
                                                      name="encoder{0}".format(i)))

        for i in xrange(buckets[-1][1] + 1):
            self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
                                                      name="decoder{0}".format(i)))
            self.target_weights.append(tf.placeholder(tf.float32, shape=[None],
                                                      name="weight{0}".format(i)))
        self.encoder_mask = tf.placeholder(tf.int32, shape=[None, None],
                                           name="encoder_mask")

        # Our targets are decoder inputs shifted by one.
        targets = [self.decoder_inputs[i + 1]
                   for i in xrange(len(self.decoder_inputs) - 1)]

        # Training outputs and losses.
        if forward_only:
            self.outputs, self.losses, self.symbols = seq2seq.model_with_buckets(
                    self.encoder_inputs, self.encoder_mask, self.decoder_inputs, targets,
                    self.target_weights, buckets, lambda x, y, z: seq2seq_f(x, y, z, True),
                    softmax_loss_function=softmax_loss_function)
        else:
            self.outputs, self.losses, self.symbols = seq2seq.model_with_buckets(
                    self.encoder_inputs, self.encoder_mask, self.decoder_inputs, targets,
                    self.target_weights, buckets,
                    lambda x, y, z: seq2seq_f(x, y, z, False),
                    softmax_loss_function=softmax_loss_function)

        # Gradients and Adam update operations (one per bucket) for training the model.
        params_to_update = tf.trainable_variables()
        if not forward_only:
            self.gradient_norms = []
            self.gradient_norms_print = []
            self.updates = []
            opt = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
            for b in xrange(len(buckets)):
                gradients = tf.gradients(self.losses[b], params_to_update,
                                         aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE)
                clipped_gradients, norm = tf.clip_by_global_norm(gradients,
                                                                 max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(opt.apply_gradients(
                        zip(clipped_gradients, params_to_update), global_step=self.global_step))

        self.saver = tf.train.Saver(tf.all_variables(), max_to_keep=1000,  # keep all checkpoints
                                    keep_checkpoint_every_n_hours=6)