def __init__(self,
               source_vocab_size,
               target_vocab_size,
               buckets,
               size,
               num_layers,
               max_gradient_norm,
               batch_size,
               learning_rate,
               learning_rate_decay_factor,
               use_lstm=False,
               num_samples=512,
               forward_only=False,
               scope_name='seq2seq',
               dtype=tf.float32):
    """Create the model.

    Every variable, placeholder and op is created inside
    `tf.variable_scope(scope_name)`.

    Args:
      source_vocab_size: size of the source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: a list of pairs (I, O), where I specifies maximum input length
        that will be processed in that bucket, and O specifies maximum output
        length. Training instances that have inputs longer than I or outputs
        longer than O will be pushed to the next bucket and padded accordingly.
        We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
      size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training; it is only
        stored on the instance, the placeholders are built with an unknown
        batch dimension (shape=[None]).
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when needed.
      use_lstm: if true, we use LSTM cells instead of GRU cells.
      num_samples: number of samples for sampled softmax. Sampled softmax
        (and the output projection) is only used when
        0 < num_samples < target_vocab_size.
      forward_only: if set, the seq2seq function is built with
        feed_previous=True and, when an output projection exists, the bucket
        outputs are projected for decoding. The gradient and update ops are
        constructed regardless of this flag.
      scope_name: name of the variable scope the model is built in; the saver
        only covers global variables whose name starts with it.
      dtype: the data type to use to store internal variables.
    """
    self.scope_name = scope_name
    with tf.variable_scope(self.scope_name):
      self.source_vocab_size = source_vocab_size
      self.target_vocab_size = target_vocab_size
      self.buckets = buckets
      self.batch_size = batch_size
      self.learning_rate = tf.Variable(
          float(learning_rate), trainable=False, dtype=dtype)
      self.learning_rate_decay_op = self.learning_rate.assign(
          self.learning_rate * learning_rate_decay_factor)
      self.global_step = tf.Variable(0, trainable=False)
      self.dummy_dialogs = []  # TODO: load dummy sentences

      # If we use sampled softmax, we need an output projection.
      output_projection = None
      softmax_loss_function = None
      # Sampled softmax only makes sense if we sample less than vocabulary size.
      if num_samples > 0 and num_samples < self.target_vocab_size:
        w_t = tf.get_variable("proj_w", [self.target_vocab_size, size], dtype=dtype)
        w = tf.transpose(w_t)
        b = tf.get_variable("proj_b", [self.target_vocab_size], dtype=dtype)
        output_projection = (w, b)

        def sampled_loss(labels, inputs):
          labels = tf.reshape(labels, [-1, 1])
          # We need to compute the sampled_softmax_loss using 32bit floats to
          # avoid numerical instabilities.
          local_w_t = tf.cast(w_t, tf.float32)
          local_b = tf.cast(b, tf.float32)
          local_inputs = tf.cast(inputs, tf.float32)
          return tf.cast(
              tf.nn.sampled_softmax_loss(
                  weights=local_w_t,
                  biases=local_b,
                  labels=labels,
                  inputs=local_inputs,
                  num_sampled=num_samples,
                  num_classes=self.target_vocab_size),
              dtype)
        softmax_loss_function = sampled_loss

      # Create the internal multi-layer cell for our RNN.
      def single_cell():
        if use_lstm:
          return tf.contrib.rnn.BasicLSTMCell(size)
        return tf.contrib.rnn.GRUCell(size)

      # A single cell is always built first (as before) so that the order in
      # which cell objects are constructed stays unchanged.
      cell = single_cell()
      if num_layers > 1:
        cell = tf.contrib.rnn.MultiRNNCell([single_cell() for _ in range(num_layers)])

      # The seq2seq function: we use embedding for the input and attention.
      def seq2seq_f(encoder_inputs, decoder_inputs, feed_previous):
        return tf_seq2seq.embedding_attention_seq2seq(
            encoder_inputs,
            decoder_inputs,
            cell,
            num_encoder_symbols=source_vocab_size,
            num_decoder_symbols=target_vocab_size,
            embedding_size=size,
            output_projection=output_projection,
            feed_previous=feed_previous,
            dtype=dtype)

      # Feeds for inputs.
      self.encoder_inputs = []
      self.decoder_inputs = []
      self.target_weights = []
      for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
        self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
                                                  name="encoder{0}".format(i)))
      for i in xrange(buckets[-1][1] + 1):
        self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
                                                  name="decoder{0}".format(i)))
        self.target_weights.append(tf.placeholder(dtype, shape=[None],
                                                  name="weight{0}".format(i)))

      # Our targets are decoder inputs shifted by one.
      targets = self.decoder_inputs[1:]

      # Outputs and losses. Both modes build the same bucketed model; they
      # differ only in the feed_previous flag handed to the seq2seq function.
      # A real Python bool is passed, exactly as the literal True/False before.
      feed_previous = bool(forward_only)
      self.outputs, self.losses, self.encoder_state = tf_seq2seq.model_with_buckets(
          self.encoder_inputs, self.decoder_inputs, targets,
          self.target_weights, buckets,
          lambda x, y: seq2seq_f(x, y, feed_previous),
          softmax_loss_function=softmax_loss_function)
      # If we use output projection, we need to project outputs for decoding.
      if forward_only and output_projection is not None:
        proj_w, proj_b = output_projection
        # The loop variable is deliberately not called `b`: that name is the
        # projection bias captured by the sampled_loss closure above.
        for bucket_id in xrange(len(buckets)):
          self.outputs[bucket_id] = [
              tf.matmul(output, proj_w) + proj_b
              for output in self.outputs[bucket_id]
          ]

      # Gradients and SGD update operation for training the model.
      # These are built even when forward_only is set.
      params = tf.trainable_variables()
      self.gradient_norms = []
      self.updates = []
      # The advantage placeholders are exposed on the model but are not wired
      # into the losses in this variant.
      self.advantage = [
          tf.placeholder(tf.float32, name="advantage_%i" % i)
          for i in xrange(len(buckets))
      ]
      opt = tf.train.GradientDescentOptimizer(self.learning_rate)
      for bucket_id in xrange(len(buckets)):
        gradients = tf.gradients(self.losses[bucket_id], params)
        clipped_gradients, norm = tf.clip_by_global_norm(gradients,
                                                         max_gradient_norm)
        self.gradient_norms.append(norm)
        self.updates.append(opt.apply_gradients(
            zip(clipped_gradients, params), global_step=self.global_step))

      # Only save/restore the variables that belong to this model's scope.
      scoped_variables = [
          k for k in tf.global_variables()
          if k.name.startswith(self.scope_name)
      ]
      self.saver = tf.train.Saver(scoped_variables)
    def __init__(self,
                 source_vocab_size,
                 target_vocab_size,
                 buckets,
                 size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 use_lstm=False,
                 num_samples=512,
                 forward_only=False,
                 scope_name='seq2seq',
                 dtype=tf.float32):
        """Create the model.

        Every variable, placeholder and op is created inside
        `tf.variable_scope(scope_name)`. As a last step the constructor also
        reads the gzipped training corpus via the local `get_gold` helper
        (the result is not kept).

        Args:
          source_vocab_size: size of the source vocabulary.
          target_vocab_size: size of the target vocabulary.
          buckets: a list of pairs (I, O), where I specifies maximum input
            length that will be processed in that bucket, and O specifies
            maximum output length. Training instances that have inputs longer
            than I or outputs longer than O will be pushed to the next bucket
            and padded accordingly. We assume that the list is sorted, e.g.,
            [(2, 4), (8, 16)].
          size: number of units in each layer of the model.
          num_layers: number of layers in the model.
          max_gradient_norm: gradients will be clipped to maximally this norm.
          batch_size: the size of the batches used during training; it is only
            stored on the instance, the placeholders are built with an unknown
            batch dimension (shape=[None]).
          learning_rate: learning rate to start with.
          learning_rate_decay_factor: decay learning rate by this much when
            needed.
          use_lstm: if true, we use LSTM cells instead of GRU cells.
          num_samples: number of samples for sampled softmax. Sampled softmax
            (and the output projection) is only used when
            0 < num_samples < target_vocab_size.
          forward_only: if set, the seq2seq function is built with
            feed_previous=True and, when an output projection exists, the
            bucket outputs are projected for decoding. The gradient and update
            ops are constructed regardless of this flag.
          scope_name: name of the variable scope the model is built in; the
            saver only covers global variables whose name starts with it.
          dtype: the data type to use to store internal variables.
        """
        self.scope_name = scope_name
        with tf.variable_scope(self.scope_name):
            self.source_vocab_size = source_vocab_size
            self.target_vocab_size = target_vocab_size
            self.buckets = buckets
            self.batch_size = batch_size
            self.learning_rate = tf.Variable(float(learning_rate),
                                             trainable=False,
                                             dtype=dtype)
            self.learning_rate_decay_op = self.learning_rate.assign(
                self.learning_rate * learning_rate_decay_factor)
            self.global_step = tf.Variable(0, trainable=False)
            self.dummy_dialogs = []  # TODO: load (and encode) dummy sentences

            # If we use sampled softmax, we need an output projection.
            output_projection = None
            softmax_loss_function = None
            # Sampled softmax only makes sense if we sample less than vocabulary size.
            if num_samples > 0 and num_samples < self.target_vocab_size:
                w_t = tf.get_variable("proj_w", [self.target_vocab_size, size],
                                      dtype=dtype)
                w = tf.transpose(w_t)
                b = tf.get_variable("proj_b", [self.target_vocab_size],
                                    dtype=dtype)
                output_projection = (w, b)

                def sampled_loss(labels, inputs):
                    labels = tf.reshape(labels, [-1, 1])
                    # We need to compute the sampled_softmax_loss using 32bit floats to
                    # avoid numerical instabilities.
                    local_w_t = tf.cast(w_t, tf.float32)
                    local_b = tf.cast(b, tf.float32)
                    local_inputs = tf.cast(inputs, tf.float32)
                    return tf.cast(
                        tf.nn.sampled_softmax_loss(
                            weights=local_w_t,
                            biases=local_b,
                            labels=labels,
                            inputs=local_inputs,
                            num_sampled=num_samples,
                            num_classes=self.target_vocab_size), dtype)

                softmax_loss_function = sampled_loss

            # Create the internal multi-layer cell for our RNN.
            def single_cell():
                if use_lstm:
                    return tf.contrib.rnn.BasicLSTMCell(size)
                return tf.contrib.rnn.GRUCell(size)

            # A single cell is always built first (as before) so that the
            # order in which cell objects are constructed stays unchanged.
            cell = single_cell()
            if num_layers > 1:
                cell = tf.contrib.rnn.MultiRNNCell(
                    [single_cell() for _ in range(num_layers)])

            # Load the training corpus.
            # The default is read from the module-level `args` at the moment
            # this helper is defined, i.e. while the constructor runs.
            def get_gold(workspace=args.workspace):
                data_dir = "%s/data" % (workspace)
                full_path = str(
                    sys.path[-1]) + "/" + data_dir + "/train/chat.txt.gz"
                print(full_path)
                # The with-statement closes the file; no explicit close needed.
                # NOTE(review): under Python 3 read() returns bytes and
                # split("\n") would raise TypeError - confirm the interpreter.
                with gzip.open(full_path, 'rb') as zi:
                    return zi.read().split("\n")

            # The seq2seq function: we use embedding for the input and attention.
            def seq2seq_f(encoder_inputs, decoder_inputs, feed_previous):
                return tf_seq2seq.embedding_attention_seq2seq(
                    encoder_inputs,
                    decoder_inputs,
                    cell,
                    num_encoder_symbols=source_vocab_size,
                    num_decoder_symbols=target_vocab_size,
                    embedding_size=size,
                    output_projection=output_projection,
                    feed_previous=feed_previous,
                    dtype=dtype)

            # Feeds for inputs.
            self.encoder_inputs = []
            self.decoder_inputs = []
            self.target_weights = []
            for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
                self.encoder_inputs.append(
                    tf.placeholder(tf.int32,
                                   shape=[None],
                                   name="encoder{0}".format(i)))
            for i in xrange(buckets[-1][1] + 1):
                self.decoder_inputs.append(
                    tf.placeholder(tf.int32,
                                   shape=[None],
                                   name="decoder{0}".format(i)))
                self.target_weights.append(
                    tf.placeholder(dtype,
                                   shape=[None],
                                   name="weight{0}".format(i)))

            # Our targets are decoder inputs shifted by one.
            targets = self.decoder_inputs[1:]

            # Placeholders for reinforcement learning. They are exposed on the
            # model but nothing built in this constructor consumes them.
            self.force_dec_input = tf.placeholder(tf.bool,
                                                  name="force_dec_input")
            self.en_output_proj = tf.placeholder(tf.bool,
                                                 name="en_output_proj")

            # Outputs and losses. Both modes build the same bucketed model;
            # they differ only in the feed_previous flag handed to the seq2seq
            # function. A real Python bool is passed, exactly as the literal
            # True/False before.
            feed_previous = bool(forward_only)
            self.outputs, self.losses, self.encoder_state = tf_seq2seq.model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                buckets,
                lambda x, y: seq2seq_f(x, y, feed_previous),
                softmax_loss_function=softmax_loss_function)
            # If we use output projection, we need to project outputs for decoding.
            if forward_only and output_projection is not None:
                proj_w, proj_b = output_projection
                # The loop variable is deliberately not called `b`: that name
                # is the projection bias captured by the sampled_loss closure.
                for bucket_id in xrange(len(buckets)):
                    self.outputs[bucket_id] = [
                        tf.matmul(output, proj_w) + proj_b
                        for output in self.outputs[bucket_id]
                    ]

            # Gradients and SGD update operation for training the model.
            # These are built even when forward_only is set.
            params = tf.trainable_variables()
            self.gradient_norms = []
            self.updates = []
            self.advantage = [
                tf.placeholder(tf.float32, name="advantage_%i" % i)
                for i in xrange(len(buckets))
            ]
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            for bucket_id in xrange(len(buckets)):
                # Each bucket's loss has its advantage placeholder subtracted
                # before the gradients are taken.
                self.losses[bucket_id] = tf.subtract(self.losses[bucket_id],
                                                     self.advantage[bucket_id])
                gradients = tf.gradients(self.losses[bucket_id], params)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(
                    opt.apply_gradients(zip(clipped_gradients, params),
                                        global_step=self.global_step))

            # Only save/restore the variables that belong to this model's scope.
            scoped_variables = [
                k for k in tf.global_variables()
                if k.name.startswith(self.scope_name)
            ]
            print("updating new weights...................")
            self.saver = tf.train.Saver(scoped_variables)
            # NOTE(review): the loaded corpus is not used after this point; the
            # call is kept so the path print and the file read still happen.
            get_gold()
    def __init__(self,
                 source_vocab_size,
                 target_vocab_size,
                 buckets,
                 size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 use_lstm=False,
                 num_samples=512,
                 scope_name='seq2seq',
                 dtype=tf.float32):
        """Create the model.

        Every variable, placeholder and op is created inside
        `tf.variable_scope(scope_name)`. Two boolean placeholders are exposed
        for run-time control:

          * `self.force_dec_input` - the seq2seq function receives
            `tf.select(self.force_dec_input, False, True)` as feed_previous.
          * `self.en_output_proj` - when an output projection exists, each
            bucket output is passed through a `cond` on this placeholder that
            yields either the projected or the raw output.

        Args:
          source_vocab_size: size of the source vocabulary.
          target_vocab_size: size of the target vocabulary.
          buckets: a list of pairs (I, O), where I specifies maximum input
            length that will be processed in that bucket, and O specifies
            maximum output length. Training instances that have inputs longer
            than I or outputs longer than O will be pushed to the next bucket
            and padded accordingly. We assume that the list is sorted, e.g.,
            [(2, 4), (8, 16)].
          size: number of units in each layer of the model.
          num_layers: number of layers in the model.
          max_gradient_norm: gradients will be clipped to maximally this norm.
          batch_size: the size of the batches; the input placeholders are
            created with the fixed shape [batch_size].
          learning_rate: learning rate to start with.
          learning_rate_decay_factor: decay learning rate by this much when
            needed.
          use_lstm: if true, we use LSTM cells instead of GRU cells.
          num_samples: number of samples for sampled softmax. Sampled softmax
            (and the output projection) is only used when
            0 < num_samples < target_vocab_size.
          scope_name: name of the variable scope the model is built in; the
            saver only covers variables whose name starts with it.
          dtype: the data type to use to store internal variables.
        """
        self.scope_name = scope_name
        with tf.variable_scope(self.scope_name):
            self.source_vocab_size = source_vocab_size
            self.target_vocab_size = target_vocab_size
            self.buckets = buckets
            self.batch_size = batch_size
            self.learning_rate = tf.Variable(float(learning_rate),
                                             trainable=False,
                                             dtype=dtype)
            self.learning_rate_decay_op = self.learning_rate.assign(
                self.learning_rate * learning_rate_decay_factor)
            self.global_step = tf.Variable(0, trainable=False)
            self.dummy_dialogs = []  # TODO: load dummy sentences

            # If we use sampled softmax, we need an output projection.
            output_projection = None
            softmax_loss_function = None
            # Sampled softmax only makes sense if we sample less than vocabulary size.
            if num_samples > 0 and num_samples < self.target_vocab_size:
                w = tf.get_variable("proj_w", [size, self.target_vocab_size],
                                    dtype=dtype)
                w_t = tf.transpose(w)
                b = tf.get_variable("proj_b", [self.target_vocab_size],
                                    dtype=dtype)
                output_projection = (w, b)

                def sampled_loss(inputs, labels):
                    labels = tf.reshape(labels, [-1, 1])
                    # We need to compute the sampled_softmax_loss using 32bit floats to
                    # avoid numerical instabilities.
                    local_w_t = tf.cast(w_t, tf.float32)
                    local_b = tf.cast(b, tf.float32)
                    local_inputs = tf.cast(inputs, tf.float32)
                    return tf.cast(
                        tf.nn.sampled_softmax_loss(local_w_t, local_b,
                                                   local_inputs, labels,
                                                   num_samples,
                                                   self.target_vocab_size),
                        dtype)

                softmax_loss_function = sampled_loss

            # Create the internal multi-layer cell for our RNN.
            single_cell = tf.nn.rnn_cell.GRUCell(size)
            if use_lstm:
                single_cell = tf.nn.rnn_cell.BasicLSTMCell(size)
            cell = single_cell
            if num_layers > 1:
                cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * num_layers)

            # The seq2seq function: we use embedding for the input and attention.
            def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
                return tf_seq2seq.embedding_attention_seq2seq(
                    encoder_inputs,
                    decoder_inputs,
                    cell,
                    num_encoder_symbols=source_vocab_size,
                    num_decoder_symbols=target_vocab_size,
                    embedding_size=size,
                    output_projection=output_projection,
                    feed_previous=do_decode,
                    dtype=dtype)

            # Feeds for inputs.
            self.encoder_inputs = []
            self.decoder_inputs = []
            self.target_weights = []
            for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
                self.encoder_inputs.append(
                    tf.placeholder(tf.int32,
                                   shape=[batch_size],
                                   name="encoder{0}".format(i)))
            for i in xrange(buckets[-1][1] + 1):
                self.decoder_inputs.append(
                    tf.placeholder(tf.int32,
                                   shape=[batch_size],
                                   name="decoder{0}".format(i)))
                self.target_weights.append(
                    tf.placeholder(dtype,
                                   shape=[batch_size],
                                   name="weight{0}".format(i)))

            # Our targets are decoder inputs shifted by one.
            targets = self.decoder_inputs[1:]

            # Run-time switches for reinforcement learning.
            self.force_dec_input = tf.placeholder(tf.bool,
                                                  name="force_dec_input")
            self.en_output_proj = tf.placeholder(tf.bool,
                                                 name="en_output_proj")

            # Outputs and losses: a single graph serves both modes, with
            # feed_previous chosen by the force_dec_input placeholder.
            self.outputs, self.losses, self.encoder_state = tf_seq2seq.model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                buckets,
                lambda x, y: seq2seq_f(
                    x, y, tf.select(self.force_dec_input, False, True)),
                softmax_loss_function=softmax_loss_function)

            self.projected_outputs = {}
            # Projection is only possible when sampled softmax created an
            # output projection. Without this guard the cond branch below
            # would index into None and fail during graph construction.
            if output_projection is not None:
                proj_w, proj_b = output_projection

                def project_if_enabled(output):
                    # `output` is bound as an argument so each cond branch
                    # refers to exactly the tensor it was created for.
                    return control_flow_ops.cond(
                        self.en_output_proj,
                        lambda: tf.matmul(output, proj_w) + proj_b,
                        lambda: output)

                # The loop variable is deliberately not called `b`: that name
                # is the projection bias captured by the sampled_loss closure.
                for bucket_id in xrange(len(buckets)):
                    self.outputs[bucket_id] = [
                        project_if_enabled(output)
                        for output in self.outputs[bucket_id]
                    ]

            # Gradients and SGD update operation for training the model.
            params = tf.trainable_variables()
            self.gradient_norms = []
            self.updates = []
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            for bucket_id in xrange(len(buckets)):
                gradients = tf.gradients(self.losses[bucket_id], params)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(
                    opt.apply_gradients(zip(clipped_gradients, params),
                                        global_step=self.global_step))

            # Only save/restore the variables that belong to this model's scope.
            scoped_variables = [
                k for k in tf.all_variables()
                if k.name.startswith(self.scope_name)
            ]
            self.saver = tf.train.Saver(scoped_variables)