Example #1
0
	def add_optimizer(self, global_step):
		'''Builds the training op: a clipped Adam step followed by a moving-average update.

		Reads self._hparams, self.loss, self.variables and self.ema, so those must be set
		before this is called. Stores the unclipped gradients in self.gradients and the
		final op to run for training in self.optimize.

		Args:
			global_step: forwarded to apply_gradients as its global_step argument.
		'''
		with tf.variable_scope('optimizer'):
			hparams = self._hparams

			#Adam, learning rate taken directly from the hyper-parameters
			adam = tf.train.AdamOptimizer(
				hparams.wavenet_learning_rate,
				hparams.wavenet_adam_beta1,
				hparams.wavenet_adam_beta2,
				hparams.wavenet_adam_epsilon)

			grads_and_vars = adam.compute_gradients(self.loss)
			gradients, variables = zip(*grads_and_vars)
			self.gradients = gradients

			#Clip by global norm with a threshold of 1.
			clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.)

			#Run everything registered in UPDATE_OPS before the Adam step
			update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
			with tf.control_dependencies(update_ops):
				adam_optimize = adam.apply_gradients(
					zip(clipped_gradients, variables), global_step=global_step)

		#Exponential moving average of the weights
		#https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage
		#The EMA update is created under a dependency on the Adam step, and it is the
		#EMA op (not adam_optimize) that callers run as the training op.
		with tf.control_dependencies([adam_optimize]):
			#Sanity check: the variables being averaged are exactly self.variables
			assert tuple(self.variables) == variables
			self.optimize = self.ema.apply(variables)
Example #2
0
  def add_training_ops(self, learning_rate: float = 1e-3, learning_rate_decay_factor: float = 0,
                       max_gradient_norm: float = 5.0, momentum: float = 0.9):
    """
    Add the ops for training

    ``self.learning_rate`` and ``self.learning_rate_decay_op`` are always created. The loss,
    the optimizer and the ``self.update`` op are only built when ``self.labels`` is not None.

    Args:
      learning_rate: the initial learning rate
      learning_rate_decay_factor: the factor to multiply the learning rate with when it should be decreased.
        Note that with the default of 0, running ``learning_rate_decay_op`` sets the learning rate to 0.
      max_gradient_norm: the maximum global gradient norm; gradients are clipped to this value
      momentum: the momentum parameter (not referenced anywhere in this method's body)
    """

    # Non-trainable variable so the rate can be changed at run time through the decay op
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False, dtype=tf.float32, name='learning_rate')
    self.learning_rate_decay_op = self.learning_rate.assign(self.learning_rate * learning_rate_decay_factor)

    # Variable summaries
    tf.summary.scalar('learning_rate', self.learning_rate)

    # Define loss and optimizer
    if self.labels is not None:
      with tf.name_scope('training'):
        # NOTE(review): the sequence lengths are halved before being given to the CTC loss —
        # presumably the network downsamples the time axis by 2; confirm against the model.
        self.cost = tf.nn.ctc_loss(self.labels, self.logits, self.sequence_lengths // 2)
        self.avg_loss = tf.reduce_mean(self.cost, name='average_loss')
        tf.summary.scalar('loss', self.avg_loss)
        optimizer = tf.train.AdamOptimizer(self.learning_rate, epsilon=1e-3)
        # Split compute/apply so the gradients can be clipped by global norm in between
        gvs = optimizer.compute_gradients(self.avg_loss)
        gradients, trainables = zip(*gvs)
        clipped_gradients, norm = tf.clip_by_global_norm(gradients, max_gradient_norm, name='clip_gradients')
        self.update = optimizer.apply_gradients(zip(clipped_gradients, trainables),
                                                global_step=self.global_step, name='apply_gradients')
Example #3
0
    def __init__(self, config):
        """Build the model graph: embedding, bidirectional LSTM, linear layer, sigmoid, Adam train op.

        Reads sent_len, batch_size, vocab_size, embed_size, num_layers, state_size,
        keep_prob and max_grad_norm from ``config``. Exposes the placeholders
        (input_data, lengths, targets) and the probabilities, cost, lr, grad_norm
        and train_op tensors/ops as attributes.
        """
        sent_len = config.sent_len
        batch_size = config.batch_size
        vocab_size = config.vocab_size
        embed_size = config.embed_size
        num_layers = config.num_layers
        state_size = config.state_size
        keep_prob = config.keep_prob

        self.input_data = tf.placeholder(tf.int32, [batch_size, sent_len])
        self.lengths = tf.placeholder(tf.int64, [batch_size])
        self.targets = tf.placeholder(tf.float32, [batch_size, 1])

        # Get embedding layer which requires CPU
        with tf.device("/cpu:0"):
            embeding = tf.get_variable("embeding", [vocab_size, embed_size])
            inputs = tf.nn.embedding_lookup(embeding, self.input_data)

        #LSTM 1 -> Encode the characters of every tok into a fixed dense representation
        with tf.variable_scope("rnn1", reuse=None):
            cell = rnn_cell.LSTMCell(state_size, input_size=embed_size, initializer=tf.contrib.layers.xavier_initializer())
            back_cell = rnn_cell.LSTMCell(state_size, input_size=embed_size, initializer=tf.contrib.layers.xavier_initializer())
            # Dropout is applied to both the inputs and the outputs of each cell
            cell = rnn_cell.DropoutWrapper(
              cell, input_keep_prob=keep_prob,
                         output_keep_prob=keep_prob)
            back_cell = rnn_cell.DropoutWrapper(
              back_cell, input_keep_prob=keep_prob,
                              output_keep_prob=keep_prob)
            # The same wrapped cell object is repeated num_layers times in each stack
            cell = rnn_cell.MultiRNNCell([cell] * num_layers)
            backcell = rnn_cell.MultiRNNCell([back_cell] * num_layers)

            # Split along axis 1 into sent_len pieces and drop that axis, giving one tensor per step
            rnn_splits = [tf.squeeze(input_, [1]) for input_ in tf.split(1, sent_len, inputs)]

            # Run the bidirectional rnn
            outputs, last_fw_state, last_bw_state = rnn.bidirectional_rnn(
                                                        cell, backcell, rnn_splits,
                                                        sequence_length=self.lengths,
                                                        dtype=tf.float32)

        # The sentence representation is the concatenated final forward and backward states
        sent_out = tf.concat(1, [last_fw_state, last_bw_state])
        #sent_out = outputs[-1]
        #sent_out = tf.add_n(outputs)
        # NOTE(review): presumably 2 directions x (cell + hidden) parts of size state_size each;
        # confirm this still matches sent_out's width when num_layers > 1.
        output_size = state_size*4

        with tf.variable_scope("linear", reuse=None):
            w = tf.get_variable("w", [output_size, 1])
            b = tf.get_variable("b", [1], initializer=tf.constant_initializer(0.0))
            raw_logits = tf.matmul(sent_out, w) + b
        self.probabilities = tf.sigmoid(raw_logits)
        self.cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(raw_logits, self.targets))

        #Calculate gradients and propagate
        #Aggregation method 2 is really important for rnn per the tensorflow issues list
        tvars = tf.trainable_variables()
        # NOTE(review): self.lr is created here but is not passed to the optimizer below,
        # which is constructed with its default arguments — confirm whether that is intended.
        self.lr = tf.Variable(0.0, trainable=False) #Assign to overwrite
        optimizer = tf.train.AdamOptimizer()
        grads, _vars = zip(*optimizer.compute_gradients(self.cost, tvars, aggregation_method=2))
        # Clip by global norm; the pre-clip global norm is kept in self.grad_norm
        grads, self.grad_norm = tf.clip_by_global_norm(grads,
                                      config.max_grad_norm)
        self.train_op = optimizer.apply_gradients(zip(grads, _vars))
Example #4
0
  def _add_shared_train_op(self):
    """Build self._shared_train_op: clipped Adagrad update on the selected loss.

    The loss is chosen from the rl_training / ac_training / coverage flags in
    self._hps. A 'global_norm' scalar summary of the gradient norm is also added.
    """
    # Pick the loss to minimize. The base loss is always read first and the
    # coverage variant replaces it when coverage is enabled.
    reinforce = self._hps.rl_training or self._hps.ac_training
    if reinforce:
      loss_to_minimize = self._reinforce_shared_loss
    else:
      loss_to_minimize = self._pgen_loss
    if self._hps.coverage:
      if reinforce:
        loss_to_minimize = self._reinforce_cov_total_loss
      else:
        loss_to_minimize = self._pointer_cov_total_loss

    # Gradients of the chosen loss w.r.t. every trainable variable
    trainable = tf.trainable_variables()
    raw_grads = tf.gradients(
        loss_to_minimize, trainable,
        aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE)

    # Clipping is placed on the configured GPU
    with tf.device("/gpu:{}".format(self._hps.gpu_num)):
      clipped_grads, global_norm = tf.clip_by_global_norm(raw_grads, self._hps.max_grad_norm)

    tf.summary.scalar('global_norm', global_norm)

    # Adagrad update, also placed on the configured GPU
    adagrad = tf.train.AdagradOptimizer(
        self._hps.lr, initial_accumulator_value=self._hps.adagrad_init_acc)
    with tf.device("/gpu:{}".format(self._hps.gpu_num)):
      self._shared_train_op = adagrad.apply_gradients(
          zip(clipped_grads, trainable), global_step=self.global_step, name='train_step')
    def __init__(self, is_training, config):
        """Build a summed-embedding two-class classifier, plus a train op when training.

        Reads batch_size, hidden_size, max_len, vocab_size and (only when
        is_training is truthy) max_grad_norm from ``config``.
        """
        batch_size = config.batch_size
        hidden_size = config.hidden_size
        max_len = config.max_len
        vocab_size = config.vocab_size
        self.batch_size = batch_size
        self.max_len = max_len

        self._input_data = tf.placeholder(tf.int32, [batch_size, max_len])
        self._targets = tf.placeholder(tf.int32, [batch_size])

        # Look up the embeddings and sum them over axis 1
        embedding = tf.get_variable("embedding", [vocab_size, hidden_size])
        embedded = tf.nn.embedding_lookup(embedding, self._input_data)
        summed = tf.reduce_sum(embedded, 1)

        # Linear layer to 2 logits, followed by softmax
        softmax_w = tf.get_variable("softmax_w", [hidden_size, 2])
        softmax_b = tf.get_variable("softmax_b", [2])
        logits = tf.matmul(summed, softmax_w) + softmax_b
        self._prediction = tf.nn.softmax(logits)

        # Cost is the summed cross-entropy divided by the batch size
        per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, self._targets)
        cost = tf.reduce_sum(per_example_loss) / batch_size
        self._cost = cost

        if is_training:
            self._lr = tf.Variable(0.0, trainable=False)
            trainable = tf.trainable_variables()
            clipped, _ = tf.clip_by_global_norm(tf.gradients(cost, trainable),
                                                config.max_grad_norm)
            # self.lr is not assigned in this method; presumably a property over
            # self._lr defined elsewhere on the class — TODO confirm.
            sgd = tf.train.GradientDescentOptimizer(self.lr)
            self._train_op = sgd.apply_gradients(zip(clipped, trainable))
Example #6
0
    def __init__(self):
        """Set hard-coded hyper-parameters, build the loss and Adam update, and start a session.

        Calls self.decoder() and afterwards reads self.d_w2 and self.d_b2, none of which
        are defined in this method. A tf.Session is created and all variables are
        initialized before the constructor returns.
        """
        self.batchsize = 32
        # presumably 10k words plus 2 special tokens — TODO confirm
        self.vocabsize = (10 * 1000) + 2
        self.word_embed_size = 300
        self.sentence_length = 30
        # NOTE(review): 10 is not a probability; how this is interpreted depends on code
        # outside this method — confirm.
        self.dropout_prob = 10
        self.num_layers = 1
        self.decoder_hidden_size = 500
        self.max_gradient_norm = 5.0
        self.sentence_embed_size = 500

        self.sentences_in = tf.placeholder(tf.int32, [self.batchsize, self.sentence_length])
        self.sentences_in_decoded = tf.placeholder(tf.int32, [self.batchsize, self.sentence_length])
        self.d = self.decoder()

        # Flatten to one row per (sentence, position) for the sampled softmax
        flat_in = tf.reshape(self.sentences_in, [self.batchsize * self.sentence_length,1])
        flat_d = tf.reshape(self.d, [self.batchsize * self.sentence_length, self.decoder_hidden_size])
        # Sampled softmax with 512 sampled classes out of self.vocabsize
        cross_entropy = tf.nn.sampled_softmax_loss(tf.transpose(self.d_w2), self.d_b2, flat_d, flat_in, 512, self.vocabsize)
        # Sum the per-position losses within each sentence, then average over the batch
        self.generation_loss = tf.reduce_sum(tf.reshape(cross_entropy, [self.batchsize, self.sentence_length]), reduction_indices=1)
        self.cost = tf.reduce_mean(self.generation_loss)

        # Adam update with gradients clipped by global norm
        params = tf.trainable_variables()
        gradients = tf.gradients(self.cost, params)
        clipped_gradients, norm = tf.clip_by_global_norm(gradients, self.max_gradient_norm)
        self.optim = tf.train.AdamOptimizer(0.0001)
        self.update = self.optim.apply_gradients(zip(clipped_gradients, params))

        self.sess = tf.Session()
        self.sess.run(tf.initialize_all_variables())
 def optimizer(someloss):
     """Return an Adam (lr=0.001) training op for `someloss` with gradients clipped to global norm 1.25.

     A fresh step-counter variable is created on every call and passed as global_step.
     """
     step_counter = tf.Variable(0)
     adam = tf.train.AdamOptimizer(learning_rate=0.001)
     grads_and_vars = adam.compute_gradients(someloss)
     raw_grads, variables = zip(*grads_and_vars)
     clipped_grads, _ = tf.clip_by_global_norm(raw_grads, 1.25)
     return adam.apply_gradients(zip(clipped_grads, variables), global_step=step_counter)
Example #8
0
def create_critic_train_op(hparams, critic_loss, global_step):
  """Create Critic train op.

  Args:
    hparams: hyperparameters; only `critic_learning_rate` is read here.
    critic_loss: loss tensor differentiated w.r.t. the critic variables.
    global_step: passed to `apply_gradients` as its global_step.

  Returns:
    Tuple of (critic_train_op, critic_grads_clipped, critic_vars).

  Raises:
    NotImplementedError: if FLAGS.critic_update_dis_vars is set and
      FLAGS.discriminator_model is neither 'bidirectional_vd' nor 'seq2seq_vd'.
  """
  with tf.name_scope('train_critic'):
    critic_optimizer = tf.train.AdamOptimizer(hparams.critic_learning_rate)
    # Variables whose op name starts with 'critic' are always trained.
    output_vars = [
        v for v in tf.trainable_variables() if v.op.name.startswith('critic')
    ]

    if FLAGS.critic_update_dis_vars:
      # Additionally train the discriminator RNN variables of the selected model.
      if FLAGS.discriminator_model == 'bidirectional_vd':
        critic_vars = [
            v for v in tf.trainable_variables()
            if v.op.name.startswith('dis/rnn')
        ]
      elif FLAGS.discriminator_model == 'seq2seq_vd':
        critic_vars = [
            v for v in tf.trainable_variables()
            if v.op.name.startswith('dis/decoder/rnn/multi_rnn_cell')
        ]
      else:
        # Without this branch critic_vars would be unbound below and surface
        # as an opaque UnboundLocalError.
        raise NotImplementedError(
            'critic_update_dis_vars is not supported for discriminator_model=%r'
            % FLAGS.discriminator_model)
      critic_vars.extend(output_vars)
    else:
      critic_vars = output_vars
    print('\nOptimizing Critic vars:')
    for v in critic_vars:
      print(v)
    critic_grads = tf.gradients(critic_loss, critic_vars)
    critic_grads_clipped, _ = tf.clip_by_global_norm(critic_grads,
                                                     FLAGS.grad_clipping)
    critic_train_op = critic_optimizer.apply_gradients(
        zip(critic_grads_clipped, critic_vars), global_step=global_step)
    return critic_train_op, critic_grads_clipped, critic_vars
Example #9
0
def create_gen_train_op(hparams, learning_rate, gen_loss, global_step, mode):
  """Create Generator train op.

  Args:
    hparams: unused; deleted immediately.
    learning_rate: learning rate given to the optimizer.
    gen_loss: generator loss tensor.
    global_step: passed to `apply_gradients` as its global_step.
    mode: 'MINIMIZE' to descend on gen_loss, 'MAXIMIZE' to descend on -gen_loss.

  Returns:
    Tuple of (gen_train_op, gen_grads_clipped, gen_vars).

  Raises:
    NotImplementedError: FLAGS.generator_optimizer is neither 'sgd' nor 'adam'.
    ValueError: mode is neither 'MINIMIZE' nor 'MAXIMIZE'.
  """
  del hparams
  with tf.name_scope('train_generator'):
    optimizer_name = FLAGS.generator_optimizer
    if optimizer_name == 'sgd':
      gen_optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    elif optimizer_name == 'adam':
      gen_optimizer = tf.train.AdamOptimizer(learning_rate)
    else:
      raise NotImplementedError

    # Only variables whose op name starts with 'gen' are trained.
    gen_vars = []
    for variable in tf.trainable_variables():
      if variable.op.name.startswith('gen'):
        gen_vars.append(variable)

    print('Optimizing Generator vars.')
    for variable in gen_vars:
      print(variable)

    # Maximizing the loss is done by differentiating its negation.
    if mode == 'MINIMIZE':
      objective = gen_loss
    elif mode == 'MAXIMIZE':
      objective = -gen_loss
    else:
      raise ValueError("Must be one of 'MINIMIZE' or 'MAXIMIZE'")
    gen_grads = tf.gradients(objective, gen_vars)

    gen_grads_clipped, _ = tf.clip_by_global_norm(gen_grads,
                                                  FLAGS.grad_clipping)
    gen_train_op = gen_optimizer.apply_gradients(
        zip(gen_grads_clipped, gen_vars), global_step=global_step)
    return gen_train_op, gen_grads_clipped, gen_vars
Example #10
0
def create_optimizer(cost,learning_rate):
    """Return an Adam training op for `cost` over all trainable variables.

    Gradients are clipped to a global norm of 5 before being applied.
    """
    adam = tf.train.AdamOptimizer(learning_rate)
    max_global_norm = 5.
    trainable = tf.trainable_variables()
    raw_grads = tf.gradients(cost, trainable)
    clipped_grads, _ = tf.clip_by_global_norm(raw_grads, max_global_norm)
    return adam.apply_gradients(zip(clipped_grads, trainable))
Example #11
0
    def __init__(self, loss, global_step, optimizer, learning_rate, clip_gradients=5.0):
        """Build a trainer part of graph.

        Args:
          loss: Tensor that evaluates to model's loss.
          global_step: Tensor with global step of the model.
          optimizer: Name of the optimizer class (SGD, Adam, Adagrad) or class.
            A string is looked up in OPTIMIZER_CLS_NAMES; anything else is
            called directly with the learning-rate variable.
          learning_rate: Initial value of the "learning_rate" variable.
          clip_gradients: Global-norm clipping threshold. When it is not
            greater than 0, clipping is skipped and `self.gradients_norm`
            is not set.
        """
        self.loss = loss
        self.global_step = global_step
        self._learning_rate = tf.get_variable(
            "learning_rate",
            [],
            initializer=tf.constant_initializer(learning_rate))
        params = tf.trainable_variables()
        self.gradients = tf.gradients(loss, params)
        if clip_gradients > 0.0:
            self.gradients, self.gradients_norm = tf.clip_by_global_norm(
                self.gradients, clip_gradients)
        grads_and_vars = zip(self.gradients, params)
        # Both the named and the class form receive the variable created above.
        # The class form used to read `self.learning_rate`, which this
        # constructor never assigns.
        if isinstance(optimizer, str):
            optimizer_cls = OPTIMIZER_CLS_NAMES[optimizer]
        else:
            optimizer_cls = optimizer
        self._optimizer = optimizer_cls(self._learning_rate)
        self.trainer = self._optimizer.apply_gradients(grads_and_vars,
                                                       global_step=global_step,
                                                       name="train")
        # Op that initializes all variables created so far in the graph.
        self._initializers = tf.initialize_all_variables()
Example #12
0
    def build_training(self):
        """Create self.global_step and self.train_op (Adam with global-norm clipping).

        Reads self.learning_rate, self.loss and self.clip_norm, and registers a
        scalar summary of the global gradient norm.
        """
        print('  Building training')
        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        adam = tf.train.AdamOptimizer(self.learning_rate)

        # Gradient clipping by global norm.
        # NOTE: this is the correct but slower variant; the faster
        # tf.clip_by_norm() may be worth trying (see the documentation of
        # tf.clip_by_global_norm() for more info).
        grads_and_vars = adam.compute_gradients(self.loss)
        raw_grads, train_vars = zip(*grads_and_vars)
        clipped_grads, global_norm = tf.clip_by_global_norm(raw_grads, self.clip_norm)

        # TensorBoard scalar summary for the global gradient norm
        tf.scalar_summary('train/global gradient norm', global_norm)

        # Per-gradient TensorBoard histograms (disabled):
        # for grad, var in grads_and_vars:
        #     # Sparse tensor updates can't be summarized, so avoid doing that:
        #     if isinstance(grad, tf.Tensor):
        #         tf.histogram_summary('grad_' + var.name, grad)

        # Training op that applies the clipped gradients
        self.train_op = adam.apply_gradients(zip(clipped_grads, train_vars),
                                             global_step=self.global_step)
Example #13
0
	def build_rmsprop_optimizer(self, learning_rate, rmsprop_decay, rmsprop_constant, gradient_clip, version):
		'''Build a training op for self.loss on top of a GradientDescentOptimizer.

		Args:
			learning_rate: learning rate given to the underlying gradient-descent optimizer.
			rmsprop_decay: decay of the running averages (used by 'graves_rmsprop' only).
			rmsprop_constant: constant added inside the square root (used by 'graves_rmsprop' only).
			gradient_clip: global-norm clipping threshold; clipping is skipped when it is not > 0.
			version: 'rmsprop' applies the (optionally clipped) gradients directly;
				'graves_rmsprop' first divides each gradient by
				sqrt(avg(g^2) - avg(g)^2 + rmsprop_constant) computed from running averages.

		Returns:
			The training op, or None when `version` is neither of the two values above.
		'''
		with tf.name_scope('rmsprop'):
			optimizer = tf.train.GradientDescentOptimizer(learning_rate)

			grads_and_vars = optimizer.compute_gradients(self.loss)
			grads = [grad for grad, _ in grads_and_vars]
			params = [var for _, var in grads_and_vars]

			if gradient_clip > 0:
				# clip_by_global_norm returns (clipped_list, global_norm). Keep only the
				# list, otherwise the zip() calls below would pair the variables with
				# the tuple's two elements instead of with the gradients.
				grads, _ = tf.clip_by_global_norm(grads, gradient_clip)

			if version == 'rmsprop':
				return optimizer.apply_gradients(zip(grads, params))
			elif version == 'graves_rmsprop':
				# Running averages of g and g^2, one per parameter, initialised to ones
				avg_grads = [tf.Variable(tf.ones(var.get_shape())) for var in params]
				avg_square_grads = [tf.Variable(tf.ones(var.get_shape())) for var in params]

				update_avg_grads = [
					avg.assign((rmsprop_decay * avg) + ((1 - rmsprop_decay) * grad))
					for avg, grad in zip(avg_grads, grads)]
				update_avg_square_grads = [
					avg_sq.assign((rmsprop_decay * avg_sq) + ((1 - rmsprop_decay) * tf.square(grad)))
					for avg_sq, grad in zip(avg_square_grads, grads)]
				avg_grad_updates = update_avg_grads + update_avg_square_grads

				# sqrt(E[g^2] - E[g]^2 + constant)
				rms = [
					tf.sqrt(avg_sq - tf.square(avg) + rmsprop_constant)
					for avg, avg_sq in zip(avg_grads, avg_square_grads)]

				rms_updates = [grad / scale for grad, scale in zip(grads, rms)]
				train = optimizer.apply_gradients(zip(rms_updates, params))

				# One op that runs the parameter update and refreshes the running averages
				return tf.group(train, tf.group(*avg_grad_updates))
Example #14
0
def train_neural_network():
    """Build the loss and Adam train op, then train for 50 epochs, saving every 7th epoch.

    Relies on names defined outside this function: neural_network, output_targets,
    input_data, words, n_chunk, x_batches and y_batches. The learning rate is set
    to 0.002 * 0.97 ** epoch at the start of each epoch.
    """
    logits, last_state, _, _, _ = neural_network()
    targets = tf.reshape(output_targets, [-1])
    loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([logits], [targets], [tf.ones_like(targets, dtype=tf.float32)],
                                                  len(words))
    cost = tf.reduce_mean(loss)  # average value
    learning_rate = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    # If the weights are updated too aggressively within a single iteration the loss can
    # easily diverge. Gradient clipping keeps the weight updates within a reasonable range.
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 5)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    train_op = optimizer.apply_gradients(zip(grads, tvars))

    # Build the learning-rate assignment once and feed the value per epoch.
    # Calling tf.assign inside the loop would add a new op to the graph every epoch.
    new_learning_rate = tf.placeholder(tf.float32, shape=[])
    update_learning_rate = tf.assign(learning_rate, new_learning_rate)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        saver = tf.train.Saver(tf.global_variables())

        for epoch in range(50):
            sess.run(update_learning_rate,
                     feed_dict={new_learning_rate: 0.002 * (0.97 ** epoch)})
            for batche in range(n_chunk):
                train_loss, _, _ = sess.run([cost, last_state, train_op],
                                            feed_dict={input_data: x_batches[batche], output_targets: y_batches[batche]})
                print(epoch, batche, train_loss)
            if epoch % 7 == 0:
                saver.save(sess, 'poetry.module', global_step=epoch)
Example #15
0
  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """Applying gradients and tune hyperparams with YellowFin.

    Pairs whose gradient is None are dropped before anything else. If no pair
    remains, unpacking the filtered list into (gradients, variables) fails.

    Args:
      grads_and_vars: List of (gradient, variable) pairs as returned by
        compute_gradients().
      global_step: Optional Variable to increment by one after the
        variables have been updated.
      name:  Optional name for the returned operation. Default to the
        name passed to the Optimizer constructor.

    Returns:
        (A group of operations)
        Variable Update with Momentum ops,
        YellowFin ops(Curvature, Variance, Distance) ops,
        SingleStep and lr_mu tuning ops,
        Step increment ops.
    """
    self._grad, self._vars = zip(*[(g, t)
                                   for g, t in grads_and_vars if g is not None])

    # Var update with Momentum.
    with tf.variable_scope("apply_updates"):
      # Clip by global norm only when a clipping threshold is configured.
      if self._clip_thresh_var is not None:
        self._grad, _ = tf.clip_by_global_norm(
            self._grad, self._clip_thresh_var)

      # The update itself is the same whether or not clipping happened.
      apply_grad_op = self._momentum_optimizer.apply_gradients(
          zip(self._grad, self._vars),
          global_step=global_step,
          name=name)

    # Begin lr and mu tuning.
    with tf.variable_scope("prepare_yellowFin_variables"):
      # the dependencies ideally only need to be after clip is done,
      # i.e. depends on self._grad. However, the control_dependencies
      # does not support indexed slice for sparse gradients.
      # The alternative dependencies here might be slightly slower due
      # to less parallelization.
      with tf.control_dependencies([apply_grad_op,]):
        prepare_variables_op = self._prepare_variables()

    with tf.variable_scope("yellowfin"):
      with tf.control_dependencies([prepare_variables_op]):
        yellowfin_op = self._yellowfin()

    # Update YellowFin step variable.
    with tf.control_dependencies([yellowfin_op]):
      self._increment_step_op = tf.assign_add(self._step, 1).op

    return tf.group(apply_grad_op,
                    prepare_variables_op,
                    yellowfin_op,
                    self._increment_step_op)
Example #16
0
def clip_by_global_norm_summary(t_list, clip_norm, norm_name, variables):
    """Clip `t_list` by global norm and build scalar summaries of the norms.

    For each (tensor, variable) pair a summary of the tensor's own norm is
    created before clipping ('norm_pre_clip/<var>') and after clipping
    ('norm_post_clip/<var>'), with ':' in the variable name replaced by '_'.
    A final summary named `norm_name` records the global norm returned by
    tf.clip_by_global_norm.

    Returns:
        (clipped_t_list, global_norm, summary_ops)
    """
    def _per_tensor_summaries(prefix, tensors):
        # global_norm over a one-element list handles IndexedSlices and dense alike
        tensor_norms = [tf.global_norm([tensor]) for tensor in tensors]
        return [
            tf.summary.scalar(prefix + var.name.replace(":", "_"), tensor_norm)
            for tensor_norm, var in zip(tensor_norms, variables)
        ]

    summary_ops = _per_tensor_summaries('norm_pre_clip/', t_list)

    clipped_t_list, tf_norm = tf.clip_by_global_norm(t_list, clip_norm)

    summary_ops.extend(_per_tensor_summaries('norm_post_clip/', clipped_t_list))
    summary_ops.append(tf.summary.scalar(norm_name, tf_norm))

    return clipped_t_list, tf_norm, summary_ops
Example #17
0
 def training_ops(self, loss):
   """Return the op that applies gradients of `loss`, clipped to global norm 5.0.

   Uses the optimizer from self.get_optimizer() over all trainable variables
   and passes self.global_step to apply_gradients.
   """
   optimizer = self.get_optimizer()
   trainable = tf.trainable_variables()
   raw_grads = tf.gradients(loss, trainable)
   clipped_grads, _ = tf.clip_by_global_norm(raw_grads, 5.0)
   grads_and_vars = zip(clipped_grads, trainable)
   return optimizer.apply_gradients(grads_and_vars, global_step=self.global_step)
Example #18
0
def train_op(loss, global_step, hparams):
  """Builds an Adagrad training op with gradients clipped by global norm.

  The learning rate is either an exponentially decaying (staircase) schedule
  driven by `global_step`, or a non-trainable variable holding the initial
  rate, depending on `hparams.exponentially_decay_learning_rate`.

  Args:
    loss: loss tensor to minimize.
    global_step: A tf.Variable of type int holding the global training step.
    hparams: HParams instance containing model hyperparameters. Reads
        exponentially_decay_learning_rate, initial_learning_rate, clip_norm
        and, when decay is enabled, decay_steps and decay_rate.

  Returns:
    training_op: An op that performs weight updates on the model.
    learning_rate: The learning-rate tensor (decay schedule) or variable
        handed to the optimizer.
  """
  if not hparams.exponentially_decay_learning_rate:
    learning_rate = tf.Variable(hparams.initial_learning_rate, trainable=False)
  else:
    learning_rate = tf.train.exponential_decay(
        hparams.initial_learning_rate,
        global_step,
        hparams.decay_steps,
        hparams.decay_rate,
        staircase=True,
        name='learning_rate')

  adagrad = tf.train.AdagradOptimizer(learning_rate)
  trainable = tf.trainable_variables()
  raw_grads = tf.gradients(loss, trainable)
  clipped_grads, _ = tf.clip_by_global_norm(raw_grads, hparams.clip_norm)
  training_op = adagrad.apply_gradients(
      zip(clipped_grads, trainable), global_step=global_step)

  return training_op, learning_rate
Example #19
0
def make_train_op(loss, ema_decay=None, prefix=None):
    """Build a COCOB training op with global-norm clipping and optional weight averaging.

    Args:
        loss: loss tensor; every loss in the REGULARIZATION_LOSSES collection is added to it.
        ema_decay: when truthy, an ExponentialMovingAverage with this decay is maintained
            and its update becomes the returned training op.
        prefix: when truthy, only variables whose name starts with it are averaged.

    Returns:
        (training_op, glob_norm, ema): the op to run per step, the global gradient
        norm from clipping, and the ExponentialMovingAverage object (None when
        ema_decay is falsy).
    """
    optimizer = COCOB()
    glob_step = tf.train.get_global_step()

    # Add regularization losses.
    # get_collection returns a Python list of tensors. Adding that list to `loss`
    # directly would turn the list into a tensor and broadcast, counting `loss`
    # once per regularizer, so sum it to a single tensor first.
    reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    total_loss = (loss + tf.add_n(reg_losses)) if reg_losses else loss

    # Clip gradients
    grads_and_vars = optimizer.compute_gradients(total_loss)
    gradients, variables = zip(*grads_and_vars)
    clipped_gradients, glob_norm = tf.clip_by_global_norm(gradients, GRAD_CLIP_THRESHOLD)
    sgd_op = optimizer.apply_gradients(zip(clipped_gradients, variables))

    # Apply SGD averaging
    if ema_decay:
        ema = tf.train.ExponentialMovingAverage(decay=ema_decay, num_updates=glob_step)
        if prefix:
            # Some magic to handle multiple models trained in single graph
            ema_vars = [var for var in variables if var.name.startswith(prefix)]
        else:
            ema_vars = variables
        # Create the averaging op under the dependency so that it runs after the
        # optimizer step. An op created outside this block would carry no
        # ordering constraint relative to sgd_op.
        with tf.control_dependencies([sgd_op]):
            training_op = tf.group(ema.apply(ema_vars))
    else:
        training_op = sgd_op
        ema = None
    return training_op, glob_norm, ema
Example #20
0
    def _update_network(self, trainer):
        """Build the loss, the local gradients and the op applying them to the 'global' variables.

        Reads self.a_dim, self.policy, self.value and self.scope. Creates the
        placeholders actions, target_v and advantages, and the tensors/ops
        outputs, value_loss, entropy, policy_loss, loss, gradients, var_norms,
        grad_norms and apply_grads as attributes.

        Args:
            trainer: optimizer whose apply_gradients is used for the update.
        """
        # placeholders fed at training time
        self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
        self.actions_onehot = tf.one_hot(
            self.actions, self.a_dim, dtype=tf.float32)
        self.target_v = tf.placeholder(shape=[None], dtype=tf.float32)
        self.advantages = tf.placeholder(shape=[None], dtype=tf.float32)

        # policy output selected by the one-hot action, summed over axis 1
        chosen = self.policy * self.actions_onehot
        self.outputs = tf.reduce_sum(chosen, [1])

        # value loss: half the summed squared error against target_v
        flat_value = tf.reshape(self.value, [-1])
        squared_error = tf.square(self.target_v - flat_value)
        self.value_loss = 0.5 * tf.reduce_sum(squared_error)

        # higher entropy -> lower loss -> encourage exploration
        self.entropy = -tf.reduce_sum(self.policy * tf.log(self.policy))

        # policy loss: negative sum of log-output times advantage
        weighted_log = tf.log(self.outputs) * self.advantages
        self.policy_loss = -tf.reduce_sum(weighted_log)

        self.loss = 0.5 * self.value_loss \
            + self.policy_loss - 0.01 * self.entropy

        # gradients w.r.t. the trainable variables collected under self.scope
        scope_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
        self.gradients = tf.gradients(self.loss, scope_vars)
        self.var_norms = tf.global_norm(scope_vars)

        # grads[i] * clip_norm / max(global_norm, clip_norm)
        clipped, self.grad_norms = tf.clip_by_global_norm(self.gradients, 40.0)

        # the clipped local gradients are applied to the 'global' variables
        target_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
        self.apply_grads = trainer.apply_gradients(zip(clipped, target_vars))
Example #21
0
    def add_train_op(self, lr_method, lr, loss, clip=-1):
        """Create self.train_op, the op that performs one update on a batch.

        Args:
            lr_method: (string) optimizer name, case-insensitive; one of
                "adam", "adagrad", "sgd", "rmsprop"
            lr: (tf.placeholder) tf.float32, learning rate
            loss: (tensor) tf.float32 loss to minimize
            clip: (python float) global-norm clipping threshold; clipping is
                only applied when it is > 0

        Raises:
            NotImplementedError: lr_method is not one of the names above.

        """
        method = lr_method.lower() # normalise the case first

        with tf.variable_scope("train_step"):
            optimizer_classes = {
                'adam': tf.train.AdamOptimizer,
                'adagrad': tf.train.AdagradOptimizer,
                'sgd': tf.train.GradientDescentOptimizer,
                'rmsprop': tf.train.RMSPropOptimizer,
            }
            if method not in optimizer_classes:
                raise NotImplementedError("Unknown method {}".format(method))
            optimizer = optimizer_classes[method](lr)

            if not clip > 0: # no clipping requested
                self.train_op = optimizer.minimize(loss)
            else:
                grads_and_vars = optimizer.compute_gradients(loss)
                raw_grads, train_vars = zip(*grads_and_vars)
                clipped_grads, _ = tf.clip_by_global_norm(raw_grads, clip)
                self.train_op = optimizer.apply_gradients(zip(clipped_grads, train_vars))
Example #22
0
    def __init__(self,
                 length_batch,
                 features_batch,
                 labels_batch):
        """Build an LSTM sequence labeller over 26 classes with a masked loss and an SGD train op.

        Args:
            length_batch: passed to dynamic_rnn as sequence_length.
            features_batch: input to the LSTM (time_major=False).
            labels_batch: class indices; flattened for the mask and accuracy, and
                one-hot encoded with depth 26 for the loss.
        """
        self.labels_flat = tf.reshape(labels_batch, [-1])
        self.labels_one_hot = tf.one_hot(labels_batch, 26)
        self.labels_one_hot_flat = tf.reshape(self.labels_one_hot, [-1, 26])

        # Single LSTM layer with 128 units; outputs are flattened to rows of 128
        self.lstm = tf.nn.rnn_cell.BasicLSTMCell(128)
        self.lstm_outputs, _ = tf.nn.dynamic_rnn(
            self.lstm, features_batch, sequence_length=length_batch, time_major=False, dtype=tf.float32)
        self.flat_lstm_outputs = tf.reshape(self.lstm_outputs, [-1, 128])
        self.outputs = tflearn.fully_connected(self.flat_lstm_outputs, 26)

        # mask out padding
        self.losses = tf.nn.softmax_cross_entropy_with_logits(self.outputs, self.labels_one_hot_flat)
        # sign() gives 0 for label 0, so those positions get zero weight
        # (assumes label 0 marks padding and labels are non-negative — TODO confirm)
        self.mask = tf.to_float(tf.sign(self.labels_flat))
        self.masked_losses = self.mask * self.losses
        # Each masked loss is divided by the mask total before summing, i.e. a masked mean
        self.mean_loss = tf.reduce_sum(self.masked_losses / tf.reduce_sum(self.mask))

        self.predictions = tf.argmax(self.outputs, 1)
        self.accurate = tf.equal(self.predictions, self.labels_flat)
        # Accuracy over the unmasked positions only
        self.accuracy = tf.reduce_sum(tf.to_float(self.accurate) * self.mask) / tf.reduce_sum(self.mask)

        # Gradients clipped to a global norm of 5.0, applied with plain SGD (learning rate 0.1)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.mean_loss, tvars), 5.0)

        self.train = tf.train.GradientDescentOptimizer(0.1).apply_gradients(zip(grads, tvars))
def make_train_op(local_net, global_net):
  """
  Build the op that applies one network's gradients to another's variables.

  Gradient tensors are taken from `local_net.grads_and_vars` and clipped
  jointly to a global norm of 5.0. They are then paired positionally with the
  variables found in `global_net.grads_and_vars`, and the pairs are handed to
  `global_net.optimizer.apply_gradients` together with the graph's global
  step.
  """
  max_global_norm = 5.0

  # Gradients come from the local network; its variables are discarded.
  worker_grads, _ = zip(*local_net.grads_and_vars)
  worker_grads, _ = tf.clip_by_global_norm(worker_grads, max_global_norm)

  # Variables come from the global network; its gradients are discarded.
  _, shared_vars = zip(*global_net.grads_and_vars)

  # (local gradient, global variable) pairs, in matching order.
  update_pairs = list(zip(worker_grads, shared_vars))

  step = tf.train.get_global_step()
  return global_net.optimizer.apply_gradients(update_pairs, global_step=step)
  def defineTensorGradientDescent(self):
    """Create the learning-rate variable and the clipped-gradient training op.

    Gradients of `self.cost` with respect to all trainable variables are
    clipped to a global norm of `self.config.max_grad_norm` and applied with
    an Adam optimizer (despite the method name).
    """
    self._learningRate = tf.Variable(0.0, trainable=False)

    variables = tf.trainable_variables()
    rawGradients = tf.gradients(self.cost, variables)
    clippedGradients, _ = tf.clip_by_global_norm(rawGradients, self.config.max_grad_norm)

    # NOTE(review): reads `self.learningRate`, presumably an accessor for the
    # `_learningRate` variable created above - confirm against the class.
    adam = tf.train.AdamOptimizer(self.learningRate)
    self._tensorGradientDescentTrainingOperation = adam.apply_gradients(
        zip(clippedGradients, variables))
Example #25
0
  def __init__(self, vocab_size, size, num_layers, max_gradient_norm, batch_size, learning_rate,
               learning_rate_decay_factor, dropout, forward_only=False):
    """Assemble the model graph: placeholders, sub-networks, loss and training op.

    Args:
      vocab_size: stored as `self.vocab_size`.
      size: stored as `self.size`.
      num_layers: stored as `self.num_layers`.
      max_gradient_norm: global-norm threshold used when clipping gradients.
      batch_size: fixed second dimension of every token/mask placeholder.
      learning_rate: initial value of the non-trainable learning-rate variable.
      learning_rate_decay_factor: multiplier applied by `learning_rate_decay_op`.
      dropout: drop probability; `self.keep_prob` is `1.0 - dropout`.
      forward_only: when true, no optimizer or update op is created.
    """
    # Plain hyper-parameters.
    self.size, self.vocab_size = size, vocab_size
    self.batch_size, self.num_layers = batch_size, num_layers
    self.keep_prob = 1.0 - dropout

    # Bookkeeping variables that the optimizer must not train.
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    decayed_rate = self.learning_rate * learning_rate_decay_factor
    self.learning_rate_decay_op = self.learning_rate.assign(decayed_rate)
    self.global_step = tf.Variable(0, trainable=False)

    def int_placeholder(name):
      # int32 placeholder of shape [None, batch_size].
      return tf.placeholder(tf.int32, shape=[None, self.batch_size], name=name)

    self.source_tokens = int_placeholder("source_tokens")
    self.target_tokens = int_placeholder("target_tokens")
    self.source_mask = int_placeholder("source_mask")
    self.target_mask = int_placeholder("target_mask")
    # Summing each mask over its first axis yields one value per batch column.
    self.source_length = tf.reduce_sum(self.source_mask, reduction_indices=0)
    self.target_length = tf.reduce_sum(self.target_mask, reduction_indices=0)

    # Build the sub-graphs in dependency order.
    for build_stage in (self.setup_embeddings, self.setup_encoder,
                        self.setup_decoder, self.setup_loss):
      build_stage()

    trainable = tf.trainable_variables()
    if not forward_only:
      adam = tf.train.AdamOptimizer(self.learning_rate)

      raw_grads = tf.gradients(self.losses, trainable)
      clipped_grads, global_norm = tf.clip_by_global_norm(raw_grads, max_gradient_norm)
      self.gradient_norms = global_norm
      self.updates = adam.apply_gradients(
        zip(clipped_grads, trainable), global_step=self.global_step)

    self.saver = tf.train.Saver(tf.all_variables())
Example #26
0
    def fit(self, data_function):
        """Train a stacked-LSTM model on `data_function.train` for `self.n_step` steps.

        A fresh graph and session are created for the call and discarded when
        it returns. The last RNN output is projected to logits, the loss is
        `loss_dict['ce']`, and gradients are clipped to a global norm of
        `self.max_grad_norm` before being applied with a default Adam optimizer.

        Args:
            data_function: Dataset object exposing `train.X` (a 3-D array),
                `train.next_batch(batch_size)` and `train.epochs_completed`.
        """
        with tf.Graph().as_default(), tf.Session() as sess:
            # Only the sequence length and feature width are needed here.
            _, s, p = data_function.train.X.shape
            X_pl = tf.placeholder(tf.float32, [self.batch_size, s, p])
            Y_pl = tf.placeholder(tf.float32, [self.batch_size, p])
            lstm_cell = rnn_cell.BasicLSTMCell(self.hidden_size)
            cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * self.num_layers)
            # Unroll over time: one [batch_size, p] slice per step.
            outputs, _ = rnn.rnn(cell, [X_pl[:,i,:] for i in xrange(s)],
                dtype = tf.float32)

            softmax_w = tf.get_variable("softmax_w", [self.hidden_size, p])
            softmax_b = tf.get_variable("softmax_b", [p])
            # Only the final time step feeds the output projection.
            logits = tf.matmul(outputs[-1], softmax_w) + softmax_b
            loss = loss_dict['ce'](logits, Y_pl)
            tvars = tf.trainable_variables()
            print([i.get_shape() for i in tvars])
            grads, _ = tf.clip_by_global_norm(tf.gradients(loss,
                tvars), self.max_grad_norm)
            optimizer = tf.train.AdamOptimizer()
            train_op  = optimizer.apply_gradients(zip(grads, tvars))

            # NOTE(review): all variables use their default initializers.
            # A uniform initializer built from `self.init_scale` used to be
            # created here but was never attached to any variable, so it was
            # dead code and has been removed.
            tf.initialize_all_variables().run()
            for i in xrange(self.n_step):
                batch_xs, batch_ys = data_function.train.next_batch(
                                        self.batch_size)
                feed_dict = {X_pl: batch_xs, Y_pl: batch_ys}
                _, loss_value = sess.run([train_op, loss],
                        feed_dict = feed_dict)
                # Report progress every 100 steps.
                if i % 100 == 0:
                    PrintMessage(data_function.train.epochs_completed,
                            loss_value , 0, 0)
Example #27
0
def training(hypes, loss, global_step, learning_rate, opt=None):
    """Sets up the training Ops.

    Creates an optimizer (unless one is supplied), optionally clips the
    gradients by global norm, and applies them to the variables returned by
    `opt.compute_gradients`.

    The Op returned by this function is what must be passed to the
    `sess.run()` call to cause the model to train.

    Args:
      hypes: Hyper-parameter dict. Reads `hypes['solver']` (keys `opt`, and
        `epsilon` or `adam_eps` depending on the optimizer) and
        `hypes['clip_norm']`; writes `hypes['tensors']` and `hypes['opt']`.
      loss: Dict holding the scalar loss tensor under `'total_loss'`.
      global_step: Integer Variable counting the number of training steps
        processed.
      learning_rate: The learning rate to use for gradient descent.
      opt: Optional pre-built optimizer. When None, one is created according
        to `hypes['solver']['opt']` ('RMS', 'Adam' or 'SGD').

    Returns:
      train_op: The Op for training.

    Raises:
      ValueError: If `opt` is None and the solver type is not recognized.
    """
    sol = hypes["solver"]
    hypes['tensors'] = {}
    hypes['tensors']['global_step'] = global_step
    total_loss = loss['total_loss']
    with tf.name_scope('training'):

        if opt is None:

            if sol['opt'] == 'RMS':
                opt = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
                                                decay=0.9,
                                                epsilon=sol['epsilon'])
            elif sol['opt'] == 'Adam':
                opt = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                             epsilon=sol['adam_eps'])
            elif sol['opt'] == 'SGD':
                opt = tf.train.GradientDescentOptimizer(
                    learning_rate=learning_rate)
            else:
                raise ValueError('Unrecognized opt type')

        hypes['opt'] = opt

        grads_and_vars = opt.compute_gradients(total_loss)

        # A non-positive clip_norm disables clipping.
        if hypes['clip_norm'] > 0:
            grads, tvars = zip(*grads_and_vars)
            clipped_grads, _ = tf.clip_by_global_norm(grads,
                                                      hypes['clip_norm'])
            # Materialize the pairs: on Python 3 `zip` is a one-shot iterator.
            grads_and_vars = list(zip(clipped_grads, tvars))

        # Build the update op exactly once, and only after the ops registered
        # in UPDATE_OPS have run. A second, unconditional apply_gradients call
        # would add a duplicate update to the graph and consume the gradient
        # pairs.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

        with tf.control_dependencies(update_ops):
            train_op = opt.apply_gradients(grads_and_vars,
                                           global_step=global_step)

    return train_op
Example #28
0
 def __init__(self, model, optimizer, learning_rate, clip_gradients=5.0):
     """Build the trainer part of the graph.

     Args:
       model: Model object, that has loss and global_step attributes.
       optimizer: Name of the optimizer class (a key of OPTIMIZER_CLS_NAMES)
         or a callable that builds an optimizer from a learning rate.
       learning_rate: Initial value of the "learning_rate" variable.
       clip_gradients: Global-norm clipping threshold; clipping is skipped
         when this is not positive.
     """
     self.model = model

     lr_initializer = tf.constant_initializer(learning_rate)
     self._learning_rate = tf.get_variable(
         "learning_rate", [], initializer=lr_initializer)

     trainable = tf.trainable_variables()
     self.gradients = tf.gradients(model.loss, trainable)
     if clip_gradients > 0.0:
       clipped, global_norm = tf.clip_by_global_norm(
           self.gradients, clip_gradients)
       self.gradients = clipped
       self.gradients_norm = global_norm

     if not isinstance(optimizer, str):
       # NOTE(review): this branch reads `self.learning_rate` while the string
       # branch reads `self._learning_rate`; presumably the class exposes a
       # `learning_rate` accessor - confirm.
       self._optimizer = optimizer(self.learning_rate)
     else:
       self._optimizer = OPTIMIZER_CLS_NAMES[optimizer](self._learning_rate)

     self.trainer = self._optimizer.apply_gradients(
         zip(self.gradients, trainable),
         global_step=model.global_step,
         name="train")
     # Single op that initializes every variable created so far.
     self._initializers = tf.initialize_all_variables()
Example #29
0
def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
  """Builds the training op: LR schedule, weight-decay Adam, gradient clipping."""
  step = tf.train.get_or_create_global_step()

  # Linear decay from `init_lr` down to 0 over `num_train_steps`
  # (polynomial decay with power 1).
  base_lr = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
  lr = tf.train.polynomial_decay(
      base_lr,
      step,
      num_train_steps,
      end_learning_rate=0.0,
      power=1.0,
      cycle=False)

  # Linear warmup: while step < num_warmup_steps the learning rate is
  # `step / num_warmup_steps * init_lr` instead of the decayed value.
  if num_warmup_steps:
    step_i32 = tf.cast(step, tf.int32)
    warmup_i32 = tf.constant(num_warmup_steps, dtype=tf.int32)

    step_f32 = tf.cast(step_i32, tf.float32)
    warmup_f32 = tf.cast(warmup_i32, tf.float32)

    fraction_done = step_f32 / warmup_f32
    warmup_lr = init_lr * fraction_done

    # 1.0 during warmup, 0.0 afterwards; used to blend the two schedules.
    in_warmup = tf.cast(step_i32 < warmup_i32, tf.float32)
    lr = (1.0 - in_warmup) * lr + in_warmup * warmup_lr

  # This optimizer is recommended for fine tuning because it matches how the
  # model was trained (the Adam m/v variables are NOT loaded from
  # init_checkpoint).
  optimizer = AdamWeightDecayOptimizer(
      learning_rate=lr,
      weight_decay_rate=0.01,
      beta_1=0.9,
      beta_2=0.999,
      epsilon=1e-6,
      exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

  if use_tpu:
    optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

  variables = tf.trainable_variables()
  gradients = tf.gradients(loss, variables)

  # Same clipping as used during pre-training.
  clipped_gradients, _ = tf.clip_by_global_norm(gradients, clip_norm=1.0)

  apply_op = optimizer.apply_gradients(
      zip(clipped_gradients, variables), global_step=step)

  # `apply_gradients` normally increments the global step, but
  # `AdamWeightDecayOptimizer` does not, so it is done explicitly here.
  # Remove this increment if a different optimizer is substituted.
  increment_step = step.assign(step + 1)
  return tf.group(apply_op, [increment_step])
Example #30
0
 def get_graph(self, tensor_input=0):
   """Build the LSTM step, classifier, loss and training op.

   Sets `self.wCells`, `self.bCells`, `self.train_labels`, `self.logits`,
   `self.loss`, `self.train_prediction`, `self.learning_rate` and
   `self.optimizer` (the op that applies the clipped gradients).

   Args:
     tensor_input: Input handed to `self.lstm_cell` for a single step.
   """
   #
   # Define variables
   #  - weights & bias on cells
   #  - memory of previous cell's values
   #  - output classifier
   #
   self.wCells = tf.Variable(tf.truncated_normal([self.nbInputs+self.nbCells, self.nbCells*4], -0.1, 0.1))
   self.bCells = tf.Variable(tf.zeros([1, self.nbCells*4]))
   saved_output = tf.Variable(tf.truncated_normal([self.batchSize, self.nbCells], -0.1, 0.1), trainable=False)
   saved_state  = tf.Variable(tf.truncated_normal([self.batchSize, self.nbCells], -0.1, 0.1), trainable=False)
   wClassif = tf.Variable(tf.truncated_normal([self.nbCells, self.nbOutputs], -0.1, 0.1))
   bClassif = tf.Variable(tf.zeros([self.nbOutputs]))
   self.train_labels = tf.placeholder(tf.float32, shape=[1,self.nbOutputs])

   # A single LSTM step is applied to `tensor_input`. The earlier per-frame
   # unrolling over `self.nbFrames` placeholders and the state-saving
   # control dependency are disabled, so `saved_output` / `saved_state` are
   # rebound to the step results rather than assigned back to the variables.
   saved_output, saved_state = self.lstm_cell(tensor_input, saved_output, saved_state)

   # Classifier.
   self.logits = tf.nn.xw_plus_b(saved_output, wClassif, bClassif)
   # Keyword arguments are required: TensorFlow >= 1.0 rejects positional
   # logits/labels for this op.
   self.loss = tf.reduce_mean(
     tf.nn.softmax_cross_entropy_with_logits(
       logits=self.logits, labels=self.train_labels))

   # Predictions.
   self.train_prediction = tf.nn.softmax(self.logits)

   # Optimizer: SGD with a staircase exponential decay (x0.8 every 5000
   # steps, starting at 50.0) and gradients clipped to a global norm of 1.25.
   global_step = tf.Variable(0)
   self.learning_rate = tf.train.exponential_decay(
     50.0, global_step, 5000, 0.8, staircase=True)
   sgd = tf.train.GradientDescentOptimizer(self.learning_rate)
   gradients, v = zip(*sgd.compute_gradients(self.loss))
   gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
   # Despite its name, `self.optimizer` holds the training op, as before.
   self.optimizer = sgd.apply_gradients(
     zip(gradients, v), global_step=global_step)
    def __init__(self,
                 num_emb,
                 batch_size,
                 emb_dim,
                 hidden_dim,
                 sequence_length,
                 start_token,
                 good_id,
                 pos,
                 learning_rate=0.01,
                 reward_gamma=0.95):
        """Build the generator graph: generation loops, losses and update ops.

        The constructor creates the embedding matrix and the recurrent/output
        units, the `x` and `rewards` placeholders, four unrolled
        `tf.while_loop` passes (teacher-forced-first-step generation, argmax
        generation, "wgan" predictions, supervised pretraining predictions),
        the pretraining loss, the reward-weighted loss, and one clipped
        update op for each of the two losses.

        Args:
            num_emb: Number of embedding rows; also the one-hot depth used for
                token probabilities.
            batch_size: Fixed batch size of every placeholder and state tensor.
            emb_dim: Embedding width.
            hidden_dim: Width of each of the two stacked hidden-state tensors.
            sequence_length: Number of generation steps.
            start_token: Token id replicated `batch_size` times as first input.
            good_id: Multiplied by `batch_size` and stored as an int32
                constant (presumably a list of token ids - TODO confirm).
            pos: Passed through to `self.init_matrix_embedding`.
            learning_rate: Initial value of the non-trainable learning-rate
                variable.
            reward_gamma: Stored on the instance; not used in this method.
        """
        self.num_emb = num_emb
        self.batch_size = batch_size
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.sequence_length = sequence_length
        self.start_token = tf.constant([start_token] * self.batch_size,
                                       dtype=tf.int32)
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        self.reward_gamma = reward_gamma
        self.g_params = []
        self.d_params = []
        self.good_id = tf.constant(good_id * self.batch_size, dtype=tf.int32)
        self.temperature = 2
        # Global-norm threshold used for both update ops below.
        self.grad_clip = 5.0
        self.pos = pos

        self.expected_reward = tf.Variable(tf.zeros([self.sequence_length]))

        # Generator parameters are collected in self.g_params as they are made.
        with tf.variable_scope('generator'):
            self.g_embeddings = tf.Variable(
                self.init_matrix_embedding([self.num_emb, self.emb_dim],
                                           self.pos))
            self.g_params.append(self.g_embeddings)
            self.g_recurrent_unit = self.create_recurrent_unit(
                self.g_params)  # maps h_tm1 to h_t for generator
            self.g_output_unit = self.create_output_unit(
                self.g_params)  # maps h_t to o_t (output token logits)

        # placeholder definition
        self.x = tf.placeholder(tf.int32,
                                shape=[self.batch_size, self.sequence_length])
        # sequence of indices of true data, not including start token

        self.rewards = tf.placeholder(
            tf.float32, shape=[self.batch_size, self.sequence_length])
        # get from rollout policy and discriminator

        # processed for batch
        with tf.device("/cpu:0"):
            inputs = tf.split(
                1, self.sequence_length,
                tf.nn.embedding_lookup(self.g_embeddings, self.x))
            self.processed_x = tf.pack([
                tf.squeeze(input_, [1]) for input_ in inputs
            ])  # seq_length x batch_size x emb_dim
        # Same time-major re-layout, but for the raw token ids.
        with tf.device("/cpu:0"):
            inputs = tf.split(1, self.sequence_length, self.x)
            self.processed_token_x = tf.pack(
                [tf.squeeze(input_, [1]) for input_ in inputs])

        # Initial state: two stacked zero tensors of [batch_size, hidden_dim].
        self.h0 = tf.zeros([self.batch_size, self.hidden_dim])
        self.h0 = tf.pack([self.h0, self.h0])

        # Per-step outputs of the first generation loop:
        # gen_o holds the probability of the chosen token, gen_x the token ids.
        gen_o = tensor_array_ops.TensorArray(dtype=tf.float32,
                                             size=self.sequence_length,
                                             dynamic_size=False,
                                             infer_shape=True)
        gen_x = tensor_array_ops.TensorArray(dtype=tf.int32,
                                             size=self.sequence_length,
                                             dynamic_size=False,
                                             infer_shape=True)

        # Time-major token ids of x, readable by step index.
        ta_emb_x2 = tensor_array_ops.TensorArray(dtype=tf.int32,
                                                 size=self.sequence_length)
        ta_emb_x2 = ta_emb_x2.unpack(self.processed_token_x)

        # Step 0 is handled outside the loop: the emitted token is read from
        # the input x (position 0) instead of being chosen from o_t.
        x_t = tf.nn.embedding_lookup(self.g_embeddings, self.start_token)
        h_t1 = self.g_recurrent_unit(x_t, self.h0)  # hidden_memory_tuple
        o_t = self.g_output_unit(h_t1)  # batch x vocab , logits not prob
        # NOTE(review): log_prob is only referenced by the commented-out
        # sampling line below.
        log_prob = tf.divide(tf.log(tf.nn.softmax(o_t)), self.temperature)
        next_token = tf.cast(tf.reshape(ta_emb_x2.read(0), [self.batch_size]),
                             tf.int32)
        #next_token = tf.cast(tf.reshape(tf.multinomial(log_prob, 1), [self.batch_size]), tf.int32)
        x_tp1 = tf.nn.embedding_lookup(self.g_embeddings,
                                       next_token)  # batch x emb_dim
        gen_o = gen_o.write(
            tf.constant(0, dtype=tf.int32),
            tf.reduce_sum(
                tf.mul(tf.one_hot(next_token, self.num_emb, 1.0, 0.0),
                       tf.nn.softmax(o_t)), 1))  # [batch_size] , prob
        gen_x = gen_x.write(tf.constant(0, dtype=tf.int32),
                            next_token)  # indices, batch_size

        #random sampling:
        # The string literal below is disabled code kept as a no-op expression.
        '''
        x_t = tf.nn.embedding_lookup(self.g_embeddings,self.start_token)
        h_t1 = self.g_recurrent_unit(x_t, self.h0)  # hidden_memory_tuple
        o_t = self.g_output_unit(h_t1)  # batch x vocab , logits not prob
        next_token = tf.multinomial(tf.nn.softmax(o_t),1)
        next_token = tf.cast(tf.reshape(tf.multinomial(tf.nn.softmax(o_t),1),[self.batch_size]),tf.int32)    
        x_tp1 = tf.nn.embedding_lookup(self.g_embeddings, next_token)  # batch x emb_dim
        gen_o = gen_o.write(tf.constant(0,dtype=tf.int32), tf.reduce_sum(tf.mul(tf.one_hot(next_token, self.num_emb, 1.0, 0.0),tf.nn.softmax(o_t)), 1))  # [batch_size] , prob
        gen_x = gen_x.write(tf.constant(0,dtype=tf.int32), next_token)  # indices, batch_size
        '''
        # Loop body for steps 1..sequence_length-1: pick the argmax token,
        # record its probability and id, and feed its embedding to the next step.
        def _g_recurrence(i, x_t, h_tm1, gen_o, gen_x):
            h_t = self.g_recurrent_unit(x_t, h_tm1)  # hidden_memory_tuple
            o_t = self.g_output_unit(h_t)  # batch x vocab , logits not prob
            # NOTE(review): looks like leftover debugging output.
            print('now here')
            # The first assignment is immediately overwritten by the cast below.
            next_token = tf.argmax(tf.nn.softmax(o_t), 1)
            next_token = tf.cast(
                tf.reshape(tf.argmax(tf.nn.softmax(o_t), 1),
                           [self.batch_size]), tf.int32)
            x_tp1 = tf.nn.embedding_lookup(self.g_embeddings, next_token)
            gen_o = gen_o.write(
                i,
                tf.reduce_sum(
                    tf.mul(tf.one_hot(next_token, self.num_emb, 1.0, 0.0),
                           tf.nn.softmax(o_t)), 1))  # [batch_size] , prob
            gen_x = gen_x.write(i, next_token)  # indices, batch_size
            return i + 1, x_tp1, h_t, gen_o, gen_x

        # Starts at i = 1 because step 0 was written above.
        _, _, _, self.gen_o, self.gen_x = tf.while_loop(
            cond=lambda i, _1, _2, _3, _4: i < self.sequence_length,
            body=_g_recurrence,
            loop_vars=(tf.constant(1,
                                   dtype=tf.int32), x_tp1, h_t1, gen_o, gen_x))

        self.gen_x = self.gen_x.pack()  # seq_length x batch_size
        self.gen_x = tf.transpose(self.gen_x,
                                  perm=[1, 0])  # batch_size x seq_length

        # Re-create the zero initial state (same value as before).
        self.h0 = tf.zeros([self.batch_size, self.hidden_dim])
        self.h0 = tf.pack([self.h0, self.h0])

        # Second generation pass: argmax at every step, starting from the
        # start token at step 0.
        gen_o_argmax = tensor_array_ops.TensorArray(dtype=tf.float32,
                                                    size=self.sequence_length,
                                                    dynamic_size=False,
                                                    infer_shape=True)
        gen_x_argmax = tensor_array_ops.TensorArray(dtype=tf.int32,
                                                    size=self.sequence_length,
                                                    dynamic_size=False,
                                                    infer_shape=True)

        def _g_recurrence_argmax(i, x_t, h_tm1, gen_o, gen_x):
            h_t = self.g_recurrent_unit(x_t, h_tm1)  # hidden_memory_tuple
            o_t = self.g_output_unit(h_t)  # batch x vocab , logits not prob

            # The first assignment is immediately overwritten by the cast below.
            next_token = tf.argmax(tf.nn.softmax(o_t), 1)
            next_token = tf.cast(
                tf.reshape(tf.argmax(tf.nn.softmax(o_t), 1),
                           [self.batch_size]), tf.int32)

            x_tp1 = tf.nn.embedding_lookup(self.g_embeddings,
                                           next_token)  # batch x emb_dim
            # NOTE(review): looks like leftover debugging output.
            print(x_tp1)
            gen_o = gen_o.write(
                i,
                tf.reduce_sum(
                    tf.mul(tf.one_hot(next_token, self.num_emb, 1.0, 0.0),
                           tf.nn.softmax(o_t)), 1))  # [batch_size] , prob
            gen_x = gen_x.write(i, next_token)  # indices, batch_size
            return i + 1, x_tp1, h_t, gen_o, gen_x

        _, _, _, self.gen_o_argmax, self.gen_x_argmax = tf.while_loop(
            cond=lambda i, _1, _2, _3, _4: i < self.sequence_length,
            body=_g_recurrence_argmax,
            loop_vars=(tf.constant(0, dtype=tf.int32),
                       tf.nn.embedding_lookup(self.g_embeddings,
                                              self.start_token), self.h0,
                       gen_o_argmax, gen_x_argmax))

        self.gen_x_argmax = self.gen_x_argmax.pack()  # seq_length x batch_size
        self.gen_x_argmax = tf.transpose(self.gen_x_argmax,
                                         perm=[1,
                                               0])  # batch_size x seq_length
        ###################################################################
        # wgan pretraining for generator
        # Records the full softmax distribution at each step while feeding
        # back the argmax token (free-running, no ground-truth input).
        g_predictions_wgan = tensor_array_ops.TensorArray(
            dtype=tf.float32,
            size=self.sequence_length,
            dynamic_size=False,
            infer_shape=True)

        def _wgantrain_recurrence(i, x_t, h_tm1, g_predictions_wgan):
            h_t = self.g_recurrent_unit(x_t, h_tm1)
            o_t = self.g_output_unit(h_t)
            g_predictions_wgan = g_predictions_wgan.write(
                i, tf.nn.softmax(o_t))  # batch x vocab_size
            next_token = tf.argmax(tf.nn.softmax(o_t), 1)
            x_tp1 = tf.nn.embedding_lookup(self.g_embeddings, next_token)
            #x_tp1 = ta_emb_x.read(i)
            return i + 1, x_tp1, h_t, g_predictions_wgan

        _, _, _, self.g_predictions_wgan = tf.while_loop(
            cond=lambda i, _1, _2, _3: i < self.sequence_length,
            body=_wgantrain_recurrence,
            loop_vars=(tf.constant(0, dtype=tf.int32),
                       tf.nn.embedding_lookup(self.g_embeddings,
                                              self.start_token), self.h0,
                       g_predictions_wgan))

        self.g_predictions_wgan = tf.transpose(
            self.g_predictions_wgan.pack(),
            perm=[1, 0, 2])  # batch_size x seq_length x vocab_size

        ###################################################################
        # supervised pretraining for generator
        # Teacher forcing: the next input is always the embedding of the
        # ground-truth token at step i, read from ta_emb_x.
        g_predictions = tensor_array_ops.TensorArray(dtype=tf.float32,
                                                     size=self.sequence_length,
                                                     dynamic_size=False,
                                                     infer_shape=True)

        ta_emb_x = tensor_array_ops.TensorArray(dtype=tf.float32,
                                                size=self.sequence_length)
        ta_emb_x = ta_emb_x.unpack(self.processed_x)

        def _pretrain_recurrence(i, x_t, h_tm1, g_predictions):
            h_t = self.g_recurrent_unit(x_t, h_tm1)
            o_t = self.g_output_unit(h_t)
            g_predictions = g_predictions.write(
                i, tf.nn.softmax(o_t))  # batch x vocab_size
            x_tp1 = ta_emb_x.read(i)
            return i + 1, x_tp1, h_t, g_predictions

        _, _, _, self.g_predictions = tf.while_loop(
            cond=lambda i, _1, _2, _3: i < self.sequence_length,
            body=_pretrain_recurrence,
            loop_vars=(tf.constant(0, dtype=tf.int32),
                       tf.nn.embedding_lookup(self.g_embeddings,
                                              self.start_token), self.h0,
                       g_predictions))

        self.g_predictions = tf.transpose(
            self.g_predictions.pack(),
            perm=[1, 0, 2])  # batch_size x seq_length x vocab_size

        # pretraining loss
        # Cross-entropy of the true tokens under g_predictions, averaged over
        # sequence_length * batch_size; probabilities are clipped to
        # [1e-20, 1.0] before the log.
        self.pretrain_loss = -tf.reduce_sum(
            tf.one_hot(tf.to_int32(tf.reshape(
                self.x, [-1])), self.num_emb, 1.0, 0.0) * tf.log(
                    tf.clip_by_value(
                        tf.reshape(self.g_predictions, [-1, self.num_emb]),
                        1e-20, 1.0))) / (self.sequence_length *
                                         self.batch_size)

        # training updates
        pretrain_opt = self.g_optimizer(self.learning_rate)

        # Gradients w.r.t. the generator parameters, clipped by global norm.
        self.pretrain_grad, _ = tf.clip_by_global_norm(
            tf.gradients(self.pretrain_loss, self.g_params), self.grad_clip)
        self.pretrain_updates = pretrain_opt.apply_gradients(
            zip(self.pretrain_grad, self.g_params))

        #######################################################################################################
        #  Unsupervised Training
        #######################################################################################################
        # Per-token log-likelihood of x under g_predictions, weighted by the
        # matching entry of `rewards` and summed (not averaged).
        self.g_loss = -tf.reduce_sum(
            tf.reduce_sum(
                tf.one_hot(tf.to_int32(tf.reshape(
                    self.x, [-1])), self.num_emb, 1.0, 0.0) * tf.log(
                        tf.clip_by_value(
                            tf.reshape(self.g_predictions, [-1, self.num_emb]),
                            1e-20, 1.0)), 1) * tf.reshape(self.rewards, [-1]))

        g_opt = self.g_optimizer(self.learning_rate)

        self.g_grad, _ = tf.clip_by_global_norm(
            tf.gradients(self.g_loss, self.g_params), self.grad_clip)
        self.g_updates = g_opt.apply_gradients(zip(self.g_grad, self.g_params))
Example #32
0
    def __init__(self,
                 batchloader,
                 is_training=True,
                 without_label=False,
                 ru=False):
        self.batchloader = batchloader
        self.ru = ru
        self.is_training = is_training
        self.without_label = without_label

        self.lr = tf.placeholder(tf.float32, shape=(), name="learning_rate")
        self.gumbel_temperature = tf.placeholder(tf.float32,
                                                 shape=(),
                                                 name="gumbel_temperature")

        with tf.name_scope("Placeholders"):
            self.encoder_input = tf.placeholder(tf.int64,
                                                shape=(FLAGS.BATCH_SIZE,
                                                       FLAGS.SEQ_LEN),
                                                name="encoder_input")

            self.decoder_input = tf.placeholder(tf.int64,
                                                shape=(FLAGS.BATCH_SIZE,
                                                       FLAGS.SEQ_LEN),
                                                name="decoder_input")

            self.target = tf.placeholder(tf.int64,
                                         shape=(FLAGS.BATCH_SIZE,
                                                FLAGS.SEQ_LEN),
                                         name="target")

            encoder_input_t = tf.transpose(self.encoder_input, perm=[1, 0])
            self.encoder_input_list = []
            decoder_input_t = tf.transpose(self.decoder_input, perm=[1, 0])
            self.decoder_input_list = []
            target_t = tf.transpose(self.target, perm=[1, 0])
            self.target_list = []

            self.step = tf.placeholder(tf.float32, shape=(), name="step")

            for i in range(FLAGS.SEQ_LEN):
                self.encoder_input_list.append(encoder_input_t[i])
                assert self.encoder_input_list[i].shape == (FLAGS.BATCH_SIZE)

                self.decoder_input_list.append(decoder_input_t[i])
                assert self.decoder_input_list[i].shape == (FLAGS.BATCH_SIZE)

                self.target_list.append(target_t[i])
                assert self.target_list[i].shape == (FLAGS.BATCH_SIZE)

            if not without_label:
                self.label = tf.placeholder(tf.int64,
                                            shape=(FLAGS.BATCH_SIZE),
                                            name="label")

                self.label_onehot = tf.one_hot(self.label,
                                               FLAGS.LABEL_CLASS,
                                               name="label_onehot")
                assert self.label_onehot.shape == (FLAGS.BATCH_SIZE,
                                                   FLAGS.LABEL_CLASS)

        with tf.variable_scope("Embedding"):
            self.embedding = tf.get_variable(
                name="embedding",
                shape=[FLAGS.VOCAB_SIZE, FLAGS.EMBED_SIZE],
                dtype=tf.float32,
                initializer=tf.random_normal_initializer(stddev=0.1))

        with tf.variable_scope("Encoder"):
            self.encoder = Encoder[FLAGS.ENCODER_NAME](
                self.embedding,
                self.encoder_input_list,
                is_training=self.is_training,
                ru=self.ru)

        with tf.variable_scope("Discriminator"):
            self.discriminator = Discriminator(self.encoder.encoder_rnn_output,
                                               self.gumbel_temperature)

            if self.without_label:
                self.label_onehot = self.discriminator.discriminator_sampling_onehot
                assert self.label_onehot.shape == (FLAGS.BATCH_SIZE,
                                                   FLAGS.LABEL_CLASS)

        with tf.name_scope("Latent_variables"):
            self.sampler = Sampler(self.encoder.encoder_rnn_output,
                                   self.label_onehot,
                                   is_training=self.is_training)

            if self.is_training:
                self.latent_variables = self.sampler.latent_variables
            else:
                self.latent_variables = tf.placeholder(
                    tf.float32,
                    shape=(FLAGS.BATCH_SIZE, FLAGS.LATENT_VARIABLE_SIZE),
                    name="latent_variables_input")

        with tf.variable_scope("Decoder"):
            self.decoder = Decoder[FLAGS.DECODER_NAME](
                self.decoder_input,
                self.latent_variables,
                self.label_onehot,
                self.embedding,
                self.batchloader,
                is_training=self.is_training,
                ru=self.ru)

        with tf.name_scope("Loss"):
            if not self.without_label:
                discriminator_correct = tf.equal(
                    self.discriminator.discriminator_predict, self.label)
                self.discriminator_accuracy = tf.reduce_mean(
                    tf.cast(discriminator_correct, tf.float32))
                self.discriminator_loss = tf.reduce_mean(
                    tf.nn.sparse_softmax_cross_entropy_with_logits(
                        logits=self.discriminator.discriminator_logits,
                        labels=self.label,
                        name="labeled_discriminator_cross_entropy")
                ) * FLAGS.SEQ_LEN
            else:
                true_y = tf.fill([FLAGS.BATCH_SIZE, FLAGS.LABEL_CLASS],
                                 1 / FLAGS.LABEL_CLASS,
                                 name="true_y_distribution")
                self.kld2 = tf.reduce_mean(
                                tf.reduce_sum(self.discriminator.discriminator_prob * \
                                                (tf.log(self.discriminator.discriminator_prob + 1e-6) - \
                                                 tf.log(true_y)),
                                              axis=1))

            self.logits = self.decoder.logits

            self.kld = tf.reduce_mean(
                -0.5 * tf.reduce_sum(self.sampler.logvar - tf.square(
                    self.sampler.mu) - tf.exp(self.sampler.logvar) + 1,
                                     axis=1))
            self.kld_weight = tf.clip_by_value(
                FLAGS.INIT_KLD_WEIGHT + (1 - FLAGS.INIT_KLD_WEIGHT) *
                (self.step - FLAGS.KLD_ANNEAL_START) /
                (FLAGS.KLD_ANNEAL_END - FLAGS.KLD_ANNEAL_START), 0, 1)

            reconst_losses = [tf.nn.sparse_softmax_cross_entropy_with_logits( \
                                                    logits=logits, labels=targets) \
                                        for logits, targets in zip(self.logits, self.target_list)]
            self.reconst_loss = tf.reduce_mean(reconst_losses) * FLAGS.SEQ_LEN

            if not self.without_label:
                self.loss = self.reconst_loss + self.kld_weight * self.kld \
                                + tf.log(1/FLAGS.LABEL_CLASS) + self.discriminator_loss
            else:
                self.loss = self.reconst_loss + self.kld_weight * self.kld + self.kld2

        with tf.name_scope("Summary"):
            if self.is_training and not self.without_label:
                reconst_loss_summary = tf.summary.scalar(
                    "labeled_reconst_loss",
                    self.reconst_loss,
                    family="train_loss")
                kld_summary = tf.summary.scalar("labeled_kld",
                                                self.kld,
                                                family="kld")
                disc_loss_summary = tf.summary.scalar(
                    "labeled_disc_train_loss",
                    self.discriminator_loss,
                    family="disc_loss")
                disc_acc_summary = tf.summary.scalar(
                    "labeled_disc_train_acc",
                    self.discriminator_accuracy,
                    family="disc_acc")

                kld_weight_summary = tf.summary.scalar("kld_weight",
                                                       self.kld_weight,
                                                       family="parameters")
                mu_summary = tf.summary.histogram(
                    "labeled_mu", tf.reduce_mean(self.sampler.mu, 0))
                var_summary = tf.summary.histogram(
                    "labeled_var",
                    tf.reduce_mean(tf.exp(self.sampler.logvar), 0))
                lr_summary = tf.summary.scalar("lr",
                                               self.lr,
                                               family="parameters")

                self.merged_summary = tf.summary.merge([
                    reconst_loss_summary, kld_summary, disc_loss_summary,
                    disc_acc_summary, kld_weight_summary, mu_summary,
                    var_summary, lr_summary
                ])
            elif self.is_training and self.without_label:
                reconst_loss_summary = tf.summary.scalar(
                    "unlabeled_reconst_loss",
                    self.reconst_loss,
                    family="train_loss")
                kld_summary = tf.summary.scalar("unlabeled_kld",
                                                self.kld,
                                                family="kld")
                gumbel_summary = tf.summary.scalar("gumbel_temperature",
                                                   self.gumbel_temperature,
                                                   family="parameters")
                kld2_summary = tf.summary.scalar("unlabeled_kld2",
                                                 self.kld2,
                                                 family="kld")

                mu_summary = tf.summary.histogram(
                    "unlabeled_mu", tf.reduce_mean(self.sampler.mu, 0))
                var_summary = tf.summary.histogram(
                    "unlabeled_var",
                    tf.reduce_mean(tf.exp(self.sampler.logvar), 0))

                self.merged_summary = tf.summary.merge([
                    reconst_loss_summary, kld_summary, gumbel_summary,
                    kld2_summary, mu_summary, var_summary
                ])
            else:
                valid_reconst_loss_summary = tf.summary.scalar(
                    "valid_reconst_loss",
                    self.reconst_loss,
                    family="valid_loss")
                disc_loss_summary = tf.summary.scalar("disc_valid_loss",
                                                      self.discriminator_loss,
                                                      family="disc_loss")
                disc_acc_summary = tf.summary.scalar(
                    "disc_valid_acc",
                    self.discriminator_accuracy,
                    family="disc_acc")

                self.merged_summary = tf.summary.merge([
                    valid_reconst_loss_summary, disc_loss_summary,
                    disc_acc_summary
                ])

        if self.is_training:
            tvars = tf.trainable_variables()
            with tf.name_scope("Optimizer"):
                tvars = tf.trainable_variables()
                grads, _ = tf.clip_by_global_norm(
                    tf.gradients(self.loss, tvars), FLAGS.MAX_GRAD)
                optimizer = tf.train.AdamOptimizer(self.lr, beta1=0.5)

                self.train_op = optimizer.apply_gradients(zip(grads, tvars))
Example #33
0
        y_logits2 = dense(1, name='predictions2')(x_de2)

        y_logits = tf.concat([y_logits0, y_logits1, y_logits2], 1)

        n_samples = tf.reduce_sum(tf.count_nonzero(m, axis=1,
                                                   dtype=tf.float32))

    with tf.variable_scope('loss'):
        masked_cross_entropy = masked_sigmoid_cross_entropy_with_logits(
            logits=y_logits, labels=y, masks=m)
        loss = tf.reduce_sum(masked_cross_entropy) / n_samples

    with tf.variable_scope('train'):
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        gradients, variables = zip(*optimizer.compute_gradients(loss))
        gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
        optimize = optimizer.apply_gradients(zip(gradients, variables))

    with tf.variable_scope('performance'):
        probabilities = tf.multiply(tf.sigmoid(y_logits), m)
        prediction = tf.round(probabilities)
        correct_prediction = tf.reduce_sum(
            tf.multiply(tf.cast(tf.equal(prediction, y), dtype=tf.float32), m))
        accuracy = correct_prediction / n_samples

    with tf.variable_scope('tasks'):
        t0 = tf.constant([1, 0, 0], dtype=tf.float32)  # psoriasis
        t1 = tf.constant([0, 1, 0], dtype=tf.float32)  # acne or rosacea
        t2 = tf.constant([0, 0, 1], dtype=tf.float32)  # mycosis
        t02 = tf.constant([1, 0, 1], dtype=tf.float32)  # eczema
Example #34
0
File: tfpolicy.py Project: zcli/ray
 def setup_gradients(self):
     """Create the clipped gradients and the Adam op that applies them.

     Stores the gradients of ``self.loss`` with respect to
     ``self.var_list``, clipped to a global norm of 40.0, on
     ``self.grads``, and the resulting update op on
     ``self._apply_gradients``.
     """
     raw_gradients = tf.gradients(self.loss, self.var_list)
     # Only the clipped tensors are kept; the computed global norm is unused.
     clipped_gradients, _unused_norm = tf.clip_by_global_norm(
         raw_gradients, 40.0)
     self.grads = clipped_gradients
     gradient_variable_pairs = [
         (gradient, variable)
         for gradient, variable in zip(self.grads, self.var_list)
     ]
     adam = tf.train.AdamOptimizer(1e-4)
     self._apply_gradients = adam.apply_gradients(gradient_variable_pairs)
Example #35
0
    def __init__(self,
                 source_vocab_size,
                 target_vocab_size,
                 buckets,
                 size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 use_lstm=False,
                 num_samples=512,
                 forward_only=False):
        """Create the model.

        Args:
          source_vocab_size: size of the source vocabulary.
          target_vocab_size: size of the target vocabulary.
          buckets: a list of pairs (I, O), where I specifies maximum input
            length that will be processed in that bucket, and O specifies
            maximum output length. Training instances that have inputs longer
            than I or outputs longer than O will be pushed to the next bucket
            and padded accordingly. We assume that the list is sorted, e.g.,
            [(2, 4), (8, 16)].
          size: number of units in each layer of the model.
          num_layers: number of layers in the model.
          max_gradient_norm: gradients will be clipped to maximally this norm.
          batch_size: the size of the batches used during training;
            the model construction is independent of batch_size, so it can be
            changed after initialization if this is convenient, e.g., for
            decoding.
          learning_rate: learning rate to start with.
          learning_rate_decay_factor: decay learning rate by this much when
            needed.
          use_lstm: if true, we use LSTM cells instead of GRU cells.
          num_samples: number of samples for sampled softmax.
          forward_only: if set, we do not construct the backward pass in the
            model.
        """
        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        # The second positional parameter of tf.Variable is `trainable`, so
        # the dtype must be passed by keyword. The learning rate is only
        # changed through the decay op below, never by the optimizer, so it
        # is explicitly excluded from the trainable variables.
        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False,
                                         dtype=tf.float32)

        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)

        # If we use sampled softmax, we need an output projection.
        output_projection = None
        softmax_loss_function = None
        # Sampled softmax only makes sense if we sample less than vocabulary size.
        if num_samples > 0 and num_samples < self.target_vocab_size:
            w_t = tf.get_variable("proj_w", [self.target_vocab_size, size],
                                  tf.float32)
            w = tf.transpose(w_t)
            b = tf.get_variable("proj_b", [self.target_vocab_size], tf.float32)
            output_projection = (w, b)

            def sampled_loss(labels, inputs):
                """Candidate-sampling (sampled softmax) loss."""
                labels = tf.reshape(labels, [-1, 1])
                # sampled_softmax_loss is computed on 32-bit floats to avoid
                # numerical instabilities.
                local_w_t = tf.cast(w_t, tf.float32)
                local_b = tf.cast(b, tf.float32)
                local_inputs = tf.cast(inputs, tf.float32)
                return tf.cast(
                    tf.nn.sampled_softmax_loss(
                        weights=local_w_t,
                        biases=local_b,
                        labels=labels,
                        inputs=local_inputs,
                        num_sampled=num_samples,
                        num_classes=self.target_vocab_size), tf.float32)

            softmax_loss_function = sampled_loss

        # Create the internal multi-layer cell for our RNN.
        def single_cell():
            """Return a fresh recurrent cell of the configured kind."""
            if use_lstm:
                return tf.contrib.rnn.BasicLSTMCell(size, state_is_tuple=True)
            return tf.contrib.rnn.GRUCell(size)

        if num_layers > 1:
            # Every layer gets its own cell object; repeating a single
            # instance ([cell] * n) makes all layers refer to the same cell.
            cell = tf.contrib.rnn.MultiRNNCell(
                [single_cell() for _ in range(num_layers)],
                state_is_tuple=True)
        else:
            cell = single_cell()

        # The seq2seq function: we use embedding for the input and attention.
        def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
            return tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
                encoder_inputs,  # source sentence tokens
                decoder_inputs,  # target sentence tokens
                cell,  # the recurrent cell (LSTM or GRU) built above
                num_encoder_symbols=source_vocab_size,  # source vocabulary size
                num_decoder_symbols=target_vocab_size,  # target vocabulary size
                embedding_size=size,  # embedding dimension
                output_projection=output_projection,  # set only for sampled softmax
                feed_previous=do_decode,  # True when decoding, False when training
                dtype=tf.float32)

        # Feeds for inputs.
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []
        # One placeholder per time step, named encoder0, encoder1, ...
        for i in range(buckets[-1][0]):  # Last bucket is the biggest one.
            self.encoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="encoder{0}".format(i)))
        for i in range(buckets[-1][1] + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="decoder{0}".format(i)))
            # Per the original author's note, target_weights is meant to be a
            # 0/1 mask that zeroes positions beyond the target sequence length.
            self.target_weights.append(
                tf.placeholder(tf.float32,
                               shape=[None],
                               name="weight{0}".format(i)))

        # Our targets are decoder inputs shifted by one.
        targets = self.decoder_inputs[1:]

        # Outputs and losses. The decode and training graphs differ only in
        # whether the decoder is fed its own previous output. bool() keeps
        # feed_previous a Python bool, as the two literal branches did before.
        feed_previous = bool(forward_only)
        self.outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets(
            self.encoder_inputs,
            self.decoder_inputs,
            targets,
            self.target_weights,
            buckets,
            lambda x, y: seq2seq_f(x, y, feed_previous),
            softmax_loss_function=softmax_loss_function)
        # If we use output projection, we need to project outputs for decoding.
        if forward_only and output_projection is not None:
            # `bucket_id` rather than `b`, so the projection bias captured by
            # sampled_loss is not rebound by the loop.
            for bucket_id in range(len(buckets)):
                self.outputs[bucket_id] = [
                    tf.matmul(output, output_projection[0]) +
                    output_projection[1]
                    for output in self.outputs[bucket_id]
                ]

        # Gradients and SGD update operation for training the model.
        params = tf.trainable_variables()
        if not forward_only:
            self.gradient_norms = []
            self.updates = []
            # Plain gradient descent driven by the decayable learning rate.
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            for bucket_id in range(len(buckets)):
                # Gradients of this bucket's loss w.r.t. every parameter.
                gradients = tf.gradients(self.losses[bucket_id], params)
                # Clip by global norm to guard against exploding gradients.
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms.append(norm)
                # Parameter update op; also increments global_step.
                self.updates.append(
                    opt.apply_gradients(
                        zip(clipped_gradients, params),
                        global_step=self.global_step))

        self.saver = tf.train.Saver(
            tf.global_variables())  # tf.all_variables() is deprecated
Example #36
0
    def __init__(self, flags, vocab_size, is_training=True):
        """Build the LSTM language-model graph and, when training, its update op.

        Args:
          flags: configuration object; this constructor reads `batch_size`,
            `unroll`, `hidden_dim`, `drop_prob`, `layers` and, when training,
            `clip_norm` and `optimizer`.
          vocab_size: number of rows of the embedding matrix and number of
            columns of the softmax weights.
          is_training: when false, dropout is not applied and the constructor
            returns before creating the learning rate, train op and summaries.
        """
        batch_size, unroll = flags.batch_size, flags.unroll
        hidden_dim = flags.hidden_dim
        apply_dropout = is_training and flags.drop_prob > 0

        # Input tokens, target tokens and per-example lengths.
        self._x = tf.placeholder(tf.int32, [batch_size, unroll])
        self._y = tf.placeholder(tf.int32, [batch_size, unroll])
        self._len = tf.placeholder(tf.int32, [None])

        base_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_dim,
                                                 forget_bias=1.0,
                                                 state_is_tuple=True)
        if apply_dropout:
            base_cell = tf.nn.rnn_cell.DropoutWrapper(
                base_cell, output_keep_prob=1.0 - flags.drop_prob)
        stacked_cell = tf.nn.rnn_cell.MultiRNNCell([base_cell] * flags.layers,
                                                   state_is_tuple=True)
        self._initial_state = stacked_cell.zero_state(batch_size, tf.float32)

        with tf.device("/cpu:0"):
            self.embeddings = tf.get_variable("embeddings",
                                              [vocab_size, hidden_dim])
            embedded = tf.nn.embedding_lookup(self.embeddings, self._x)
        if apply_dropout:
            embedded = tf.nn.dropout(embedded, 1.0 - flags.drop_prob)

        # Both unrolling strategies should give the same results; the fixed
        # unroll is hard-wired on because it is faster.
        use_fixed_unroll = True
        if use_fixed_unroll:
            step_outputs = []
            state = self._initial_state
            with tf.variable_scope("RNN"):
                for step in range(unroll):
                    # Variables are created on the first step and reused after.
                    if step > 0:
                        tf.get_variable_scope().reuse_variables()
                    step_output, state = stacked_cell(embedded[:, step, :],
                                                      state)
                    step_outputs.append(step_output)
            joined = tf.concat(1, step_outputs)
            flat_outputs = tf.reshape(joined, [-1, hidden_dim])
        else:
            with tf.variable_scope("RNN"):
                rnn_outputs, state = tf.nn.dynamic_rnn(
                    stacked_cell,
                    embedded,
                    sequence_length=self._len,
                    initial_state=self._initial_state,
                    dtype=tf.float32,
                    time_major=False)
            flat_outputs = tf.reshape(rnn_outputs, [-1, hidden_dim])

        # Softmax projection and summed per-example sequence loss.
        softmax_w = tf.get_variable("softmax_w", [hidden_dim, vocab_size])
        softmax_b = tf.get_variable("softmax_b", [vocab_size])
        logits = tf.matmul(flat_outputs, softmax_w) + softmax_b
        flat_logits = tf.reshape(logits, [-1, vocab_size])
        flat_targets = tf.reshape(self._y, [-1])
        uniform_weights = tf.ones([batch_size * unroll])
        seq_loss = tf.nn.seq2seq.sequence_loss_by_example(
            [flat_logits], [flat_targets], [uniform_weights])
        self.loss = tf.reduce_sum(seq_loss) / batch_size
        self._final_state = state

        if is_training:
            self.lr = tf.Variable(0.0, trainable=False)
            train_vars = tf.trainable_variables()
            param_count = np.sum(
                [np.prod(var.get_shape()) for var in train_vars])
            log_info("# params: %d" % param_count)

            grads = tf.gradients(self.loss, train_vars)
            if flags.clip_norm is None:
                # No clipping requested: only measure the norm for reporting.
                grads_norm = tf.global_norm(grads)
            else:
                grads, grads_norm = tf.clip_by_global_norm(
                    grads, flags.clip_norm)
            optimizer = get_optimizer(flags.optimizer)(self.lr)
            self._train_op = optimizer.apply_gradients(zip(grads, train_vars))

            # Summaries for TensorBoard; only created in the training graph.
            with tf.name_scope("summaries"):
                scalars = (
                    ("loss", self.loss / unroll),
                    ("learning_rate", self.lr),
                    ("grads_norm", grads_norm),
                )
                for tag, value in scalars:
                    tf.scalar_summary(tag, value)
Example #37
0
    def __init__(self, user_count, item_count, cate_count, cate_list,
                 predict_batch_size, predict_ads_num):
        """Build the ranking graph: placeholders, scoring towers, loss and train op.

        Args:
          user_count: number of rows of the `user_emb_w` embedding table.
          item_count: number of rows of the item embedding table and length
            of the per-item bias vector.
          cate_count: number of rows of the category embedding table.
          cate_list: converted to an int64 tensor and indexed by item id to
            obtain each item's category id.
          predict_batch_size: number of times the candidate-item embeddings
            are tiled along the batch axis in the multi-item scoring branch.
          predict_ads_num: number of leading items (ids 0..predict_ads_num-1)
            scored by the multi-item scoring branch.
        """

        # Inputs. `self.u` and `user_emb_w` are created here but are not
        # referenced again anywhere in this constructor.
        self.u = tf.placeholder(tf.int32, [
            None,
        ])  # [B]
        self.i = tf.placeholder(tf.int32, [
            None,
        ])  # [B]
        self.j = tf.placeholder(tf.int32, [
            None,
        ])  # [B]
        self.y = tf.placeholder(tf.float32, [
            None,
        ])  # [B]
        self.hist_i = tf.placeholder(tf.int32, [None, None])  # [B, T]
        self.sl = tf.placeholder(tf.int32, [
            None,
        ])  # [B]
        # Learning rate is fed at run time as a float64 scalar.
        self.lr = tf.placeholder(tf.float64, [])

        hidden_units = 128

        # Item and category embeddings are each hidden_units // 2 wide, so
        # their concatenation is hidden_units wide.
        user_emb_w = tf.get_variable("user_emb_w", [user_count, hidden_units])
        item_emb_w = tf.get_variable("item_emb_w",
                                     [item_count, hidden_units // 2])
        item_b = tf.get_variable("item_b", [item_count],
                                 initializer=tf.constant_initializer(0.0))
        cate_emb_w = tf.get_variable("cate_emb_w",
                                     [cate_count, hidden_units // 2])
        cate_list = tf.convert_to_tensor(cate_list, dtype=tf.int64)

        # Embedding (item ++ category) and bias of item `i`.
        ic = tf.gather(cate_list, self.i)
        i_emb = tf.concat(values=[
            tf.nn.embedding_lookup(item_emb_w, self.i),
            tf.nn.embedding_lookup(cate_emb_w, ic),
        ],
                          axis=1)
        i_b = tf.gather(item_b, self.i)

        # Embedding (item ++ category) and bias of item `j`.
        jc = tf.gather(cate_list, self.j)
        j_emb = tf.concat([
            tf.nn.embedding_lookup(item_emb_w, self.j),
            tf.nn.embedding_lookup(cate_emb_w, jc),
        ],
                          axis=1)
        j_b = tf.gather(item_b, self.j)

        # Embeddings of the history items, concatenated on the last axis.
        hc = tf.gather(cate_list, self.hist_i)
        h_emb = tf.concat([
            tf.nn.embedding_lookup(item_emb_w, self.hist_i),
            tf.nn.embedding_lookup(cate_emb_w, hc),
        ],
                          axis=2)

        # `attention` is defined elsewhere; presumably it pools h_emb over
        # time using the candidate embedding and lengths `sl` — TODO confirm.
        hist_i = attention(i_emb, h_emb, self.sl)
        #-- attention end ---

        # NOTE(review): this batch-norm layer is unnamed; the name 'hist_bn'
        # below is given to the reshape op, not to the batch-norm layer.
        hist_i = tf.layers.batch_normalization(inputs=hist_i)
        hist_i = tf.reshape(hist_i, [-1, hidden_units], name='hist_bn')
        hist_i = tf.layers.dense(hist_i, hidden_units, name='hist_fcn')

        u_emb_i = hist_i

        hist_j = attention(j_emb, h_emb, self.sl)
        #-- attention end ---

        # NOTE(review): reuse=True without a name — confirm this actually
        # shares variables with the unnamed batch-norm layer used for hist_i.
        hist_j = tf.layers.batch_normalization(inputs=hist_j, reuse=True)
        hist_j = tf.reshape(hist_j, [-1, hidden_units], name='hist_bn')
        hist_j = tf.layers.dense(hist_j,
                                 hidden_units,
                                 name='hist_fcn',
                                 reuse=True)

        u_emb_j = hist_j
        # Debug output of the static shapes.
        print(u_emb_i.get_shape().as_list())
        print(u_emb_j.get_shape().as_list())
        print(i_emb.get_shape().as_list())
        print(j_emb.get_shape().as_list())
        #-- fcn begin -------
        # Scoring tower for item `i`: b1 -> f1 -> dice_1 -> f2 -> dice_2 -> f3.
        din_i = tf.concat([u_emb_i, i_emb, u_emb_i * i_emb], axis=-1)
        din_i = tf.layers.batch_normalization(inputs=din_i, name='b1')
        #d_layer_1_i = tf.layers.dense(din_i, 80, activation=tf.nn.sigmoid, name='f1')
        # To use dice instead of sigmoid, set the dense activation to None and
        # add a dice layer as in the following two lines. You can also find
        # model_dice.py in this folder.
        d_layer_1_i = tf.layers.dense(din_i, 80, activation=None, name='f1')
        d_layer_1_i = dice(d_layer_1_i, name='dice_1')
        d_layer_2_i = tf.layers.dense(d_layer_1_i,
                                      40,
                                      activation=None,
                                      name='f2')
        d_layer_2_i = dice(d_layer_2_i, name='dice_2')
        d_layer_3_i = tf.layers.dense(d_layer_2_i,
                                      1,
                                      activation=None,
                                      name='f3')
        # Same tower for item `j`, reusing the b1/f1/f2/f3 layers by name.
        din_j = tf.concat([u_emb_j, j_emb, u_emb_j * j_emb], axis=-1)
        din_j = tf.layers.batch_normalization(inputs=din_j,
                                              name='b1',
                                              reuse=True)
        d_layer_1_j = tf.layers.dense(din_j,
                                      80,
                                      activation=None,
                                      name='f1',
                                      reuse=True)
        # NOTE(review): dice is called again with name='dice_1' / 'dice_2';
        # whether its parameters are shared with the `i` tower depends on
        # dice's implementation, which is not visible here — confirm.
        d_layer_1_j = dice(d_layer_1_j, name='dice_1')
        d_layer_2_j = tf.layers.dense(d_layer_1_j,
                                      40,
                                      activation=None,
                                      name='f2',
                                      reuse=True)
        d_layer_2_j = dice(d_layer_2_j, name='dice_2')
        d_layer_3_j = tf.layers.dense(d_layer_2_j,
                                      1,
                                      activation=None,
                                      name='f3',
                                      reuse=True)
        d_layer_3_i = tf.reshape(d_layer_3_i, [-1])
        d_layer_3_j = tf.reshape(d_layer_3_j, [-1])
        # Score difference between item i and item j (bias + tower output).
        x = i_b - j_b + d_layer_3_i - d_layer_3_j  # [B]
        self.logits = i_b + d_layer_3_i

        # Prediction for selected items: score the first `predict_ads_num`
        # items for every row of the batch.
        item_emb_all = tf.concat(
            [item_emb_w,
             tf.nn.embedding_lookup(cate_emb_w, cate_list)],
            axis=1)
        item_emb_sub = item_emb_all[:predict_ads_num, :]
        item_emb_sub = tf.expand_dims(item_emb_sub, 0)
        item_emb_sub = tf.tile(item_emb_sub, [predict_batch_size, 1, 1])
        hist_sub = attention_multi_items(item_emb_sub, h_emb, self.sl)
        #-- attention end ---

        # NOTE(review): this layer is named 'hist_bn' with AUTO_REUSE, while
        # the batch-norm layers applied to hist_i / hist_j above are unnamed;
        # it may therefore create new variables rather than share — confirm.
        hist_sub = tf.layers.batch_normalization(inputs=hist_sub,
                                                 name='hist_bn',
                                                 reuse=tf.AUTO_REUSE)
        # print hist_sub.get_shape().as_list()
        hist_sub = tf.reshape(hist_sub, [-1, hidden_units])
        hist_sub = tf.layers.dense(hist_sub,
                                   hidden_units,
                                   name='hist_fcn',
                                   reuse=tf.AUTO_REUSE)

        u_emb_sub = hist_sub
        item_emb_sub = tf.reshape(item_emb_sub, [-1, hidden_units])
        din_sub = tf.concat(
            [u_emb_sub, item_emb_sub, u_emb_sub * item_emb_sub], axis=-1)
        din_sub = tf.layers.batch_normalization(inputs=din_sub,
                                                name='b1',
                                                reuse=True)
        # NOTE(review): f1/f2 are reused here with a sigmoid activation and no
        # dice layer, whereas the i/j towers use activation=None followed by
        # dice — confirm this difference is intended.
        d_layer_1_sub = tf.layers.dense(din_sub,
                                        80,
                                        activation=tf.nn.sigmoid,
                                        name='f1',
                                        reuse=True)
        #d_layer_1_sub = dice(d_layer_1_sub, name='dice_1_sub')
        d_layer_2_sub = tf.layers.dense(d_layer_1_sub,
                                        40,
                                        activation=tf.nn.sigmoid,
                                        name='f2',
                                        reuse=True)
        #d_layer_2_sub = dice(d_layer_2_sub, name='dice_2_sub')
        d_layer_3_sub = tf.layers.dense(d_layer_2_sub,
                                        1,
                                        activation=None,
                                        name='f3',
                                        reuse=True)
        d_layer_3_sub = tf.reshape(d_layer_3_sub, [-1, predict_ads_num])
        # Despite the name, `logits_sub` holds sigmoid outputs, not raw logits.
        self.logits_sub = tf.sigmoid(item_b[:predict_ads_num] + d_layer_3_sub)
        self.logits_sub = tf.reshape(self.logits_sub, [-1, predict_ads_num, 1])
        #-- fcn end -------

        # Fraction of rows in which item i scores higher than item j.
        self.mf_auc = tf.reduce_mean(tf.to_float(x > 0))
        self.score_i = tf.sigmoid(i_b + d_layer_3_i)
        self.score_j = tf.sigmoid(j_b + d_layer_3_j)
        self.score_i = tf.reshape(self.score_i, [-1, 1])
        self.score_j = tf.reshape(self.score_j, [-1, 1])
        # Column 0 is the score of item i, column 1 the score of item j.
        self.p_and_n = tf.concat([self.score_i, self.score_j], axis=-1)
        print(self.p_and_n.get_shape().as_list())

        # Step variable
        self.global_step = tf.Variable(0, trainable=False, name='global_step')
        self.global_epoch_step = \
            tf.Variable(0, trainable=False, name='global_epoch_step')
        self.global_epoch_step_op = \
            tf.assign(self.global_epoch_step, self.global_epoch_step+1)

        # Sigmoid cross-entropy between item i's logit and the label `y`.
        self.loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits,
                                                    labels=self.y))

        # Plain SGD on all trainable variables, with gradients clipped to a
        # global norm of 5; applying them also increments global_step.
        trainable_params = tf.trainable_variables()
        self.opt = tf.train.GradientDescentOptimizer(learning_rate=self.lr)
        gradients = tf.gradients(self.loss, trainable_params)
        clip_gradients, _ = tf.clip_by_global_norm(gradients, 5)
        self.train_op = self.opt.apply_gradients(zip(clip_gradients,
                                                     trainable_params),
                                                 global_step=self.global_step)
Example #38
0
    def build_train_graph(self,
                          inputs,
                          min_depth,
                          max_depth,
                          cube_res,
                          theta_res,
                          phi_res,
                          r_res,
                          scale_factors,
                          num_mpi_planes,
                          learning_rate=0.0001,
                          vgg_model_weights=None,
                          global_step=0,
                          depth_clip=20.0):
        """Construct the training computation graph.

    Args:
      inputs: dictionary of tensors (see 'input_data' below) needed for training
      min_depth: minimum depth for the PSV and MPI planes
      max_depth: maximum depth for the PSV and MPI planes
      cube_res: per-side cube resolution
      theta_res: environment map width
      phi_res: environment map height
      r_res: number of radii to use when sampling spheres for rendering
      scale_factors: downsampling factors of cubes relative to the coarsest
      num_mpi_planes: number of MPI planes to infer
      learning_rate: learning rate
      vgg_model_weights: vgg weights (needed when vgg loss is used)
      global_step: training iteration
      depth_clip: maximum depth for coarsest resampled volumes

    Returns:
      A train_op to be used for training.
    """
        with tf.name_scope('setup'):
            psv_planes = pj.inv_depths(min_depth, max_depth, num_mpi_planes)
            mpi_planes = pj.inv_depths(min_depth, max_depth, num_mpi_planes)

        with tf.name_scope('input_data'):

            tgt_image = inputs['tgt_image']
            ref_image = inputs['ref_image']
            src_images = inputs['src_images']
            env_image = inputs['env_image']

            ref_depth = inputs['ref_depth']

            tgt_pose = inputs['tgt_pose']
            ref_pose = inputs['ref_pose']
            src_poses = inputs['src_poses']
            env_pose = inputs['env_pose']

            intrinsics = inputs['intrinsics']

            _, _, _, num_source = src_poses.get_shape().as_list()

        with tf.name_scope('inference'):
            num_mpi_planes = tf.shape(mpi_planes)[0]
            pred = self.infer_mpi(src_images, ref_image, ref_pose, src_poses,
                                  intrinsics, psv_planes)
            rgba_layers = pred['rgba_layers']
            psv = pred['psv']

        with tf.name_scope('synthesis'):
            output_image, output_alpha_acc, _ = self.mpi_render_view(
                rgba_layers, ref_pose, tgt_pose, mpi_planes, intrinsics)
        with tf.name_scope('environment_rendering'):
            mpi_gt = self.img2mpi(ref_image, ref_depth, mpi_planes)
            output_image_gt, _, _ = self.mpi_render_view(
                mpi_gt, ref_pose, tgt_pose, mpi_planes, intrinsics)

            lightvols_gt, _, _, _, _ = self.predict_lighting_vol(
                mpi_gt,
                mpi_planes,
                intrinsics,
                cube_res,
                scale_factors,
                depth_clip=depth_clip)

            lightvols, lightvol_centers, \
            lightvol_side_lengths, \
            cube_rel_shapes, \
            cube_nest_inds = self.predict_lighting_vol(rgba_layers, mpi_planes,
                                                       intrinsics, cube_res,
                                                       scale_factors,
                                                       depth_clip=depth_clip)

            lightvols_out = nets.cube_net_multires(lightvols, cube_rel_shapes,
                                                   cube_nest_inds)

            gt_envmap, gt_shells = self.render_envmap(
                lightvols_gt, lightvol_centers, lightvol_side_lengths,
                cube_rel_shapes, cube_nest_inds, ref_pose, env_pose, theta_res,
                phi_res, r_res)

            prenet_envmap, prenet_shells = self.render_envmap(
                lightvols, lightvol_centers, lightvol_side_lengths,
                cube_rel_shapes, cube_nest_inds, ref_pose, env_pose, theta_res,
                phi_res, r_res)

            output_envmap, output_shells = self.render_envmap(
                lightvols_out, lightvol_centers, lightvol_side_lengths,
                cube_rel_shapes, cube_nest_inds, ref_pose, env_pose, theta_res,
                phi_res, r_res)

        with tf.name_scope('loss'):
            # mask loss for pixels outside reference frustum
            loss_mask = tf.where(
                tf.equal(output_alpha_acc[Ellipsis, tf.newaxis], 0.0),
                tf.zeros_like(output_image[:, :, :, 0:1]),
                tf.ones_like(output_image[:, :, :, 0:1]))
            loss_mask = tf.stop_gradient(loss_mask)
            tf.summary.image('loss_mask', loss_mask)

            # helper functions for loss
            def compute_error(real, fake, mask):
                mask = tf.ones_like(real) * mask
                return tf.reduce_sum(mask * tf.abs(fake - real)) / (
                    tf.reduce_sum(mask) + 1.0e-8)

            # Normalized VGG loss
            def downsample(tensor, ds):
                return tf.nn.avg_pool(tensor, [1, ds, ds, 1], [1, ds, ds, 1],
                                      'SAME')

            def vgg_loss(tgt_image, output_image, loss_mask, vgg_weights):
                """VGG activation loss definition."""

                vgg_real = nets.build_vgg19(tgt_image * 255.0, vgg_weights)
                rescaled_output_image = output_image * 255.0
                vgg_fake = nets.build_vgg19(rescaled_output_image, vgg_weights)
                p0 = compute_error(vgg_real['input'], vgg_fake['input'],
                                   loss_mask)
                p1 = compute_error(vgg_real['conv1_2'], vgg_fake['conv1_2'],
                                   loss_mask) / 2.6
                p2 = compute_error(vgg_real['conv2_2'], vgg_fake['conv2_2'],
                                   downsample(loss_mask, 2)) / 4.8
                p3 = compute_error(vgg_real['conv3_2'], vgg_fake['conv3_2'],
                                   downsample(loss_mask, 4)) / 3.7
                p4 = compute_error(vgg_real['conv4_2'], vgg_fake['conv4_2'],
                                   downsample(loss_mask, 8)) / 5.6
                p5 = compute_error(vgg_real['conv5_2'], vgg_fake['conv5_2'],
                                   downsample(loss_mask, 16)) * 10 / 1.5
                total_loss = p0 + p1 + p2 + p3 + p4 + p5
                return total_loss

            # rendered image loss
            render_loss = vgg_loss(tgt_image, output_image, loss_mask,
                                   vgg_model_weights) / 100.0
            total_loss = render_loss

            # rendered envmap loss
            envmap_loss = vgg_loss(env_image, output_envmap[Ellipsis, :3],
                                   tf.ones_like(env_image[Ellipsis, 0:1]),
                                   vgg_model_weights) / 100.0

            # set envmap loss to 0 when only training mpi network (see paper)
            envmap_loss = tf.where(tf.greater(global_step, 240000),
                                   envmap_loss, 0.0)

            total_loss += envmap_loss

            # adversarial loss for envmap
            real_logit = nets.discriminator(env_image, scope='discriminator')
            fake_logit = nets.discriminator(output_envmap[Ellipsis, :3],
                                            scope='discriminator')
            adv_loss_list = []
            for i in range(len(fake_logit)):
                adv_loss_list.append(0.1 * -1.0 *
                                     tf.reduce_mean(fake_logit[i][-1]))
            adv_loss = tf.reduce_mean(adv_loss_list)
            real_loss_list = []
            fake_loss_list = []
            for i in range(len(fake_logit)):
                real_loss_list.append(
                    -1.0 *
                    tf.reduce_mean(tf.minimum(real_logit[i][-1] - 1, 0.0)))
                fake_loss_list.append(-1.0 * tf.reduce_mean(
                    tf.minimum(-1.0 * fake_logit[i][-1] - 1, 0.0)))
            real_loss = tf.reduce_mean(real_loss_list)
            fake_loss = tf.reduce_mean(fake_loss_list)
            disc_loss = real_loss + fake_loss

            # set adv/disc losses to 0 until end of training
            adv_loss = tf.where(tf.greater(global_step, 690000), adv_loss, 0.0)
            disc_loss = tf.where(tf.greater(global_step, 690000), disc_loss,
                                 0.0)

            tf.summary.scalar('loss_disc', disc_loss)
            tf.summary.scalar('loss_disc_real', real_loss)
            tf.summary.scalar('loss_disc_fake', fake_loss)
            tf.summary.scalar('loss_adv', adv_loss)

            total_loss += adv_loss

        with tf.name_scope('train_op'):
            train_variables = [
                var for var in tf.trainable_variables()
                if 'discriminator' not in var.name
            ]
            optim = tf.train.AdamOptimizer(learning_rate, epsilon=1e-4)
            grads_and_variables = optim.compute_gradients(
                total_loss, var_list=train_variables)
            grads = [gv[0] for gv in grads_and_variables]
            variables = [gv[1] for gv in grads_and_variables]

            def denan(x):
                return tf.where(tf.is_nan(x), tf.zeros_like(x), x)

            grads_clipped = [denan(g) for g in grads]
            grads_clipped, _ = tf.clip_by_global_norm(grads_clipped, 100.0)
            train_op = [optim.apply_gradients(zip(grads_clipped, variables))]
            tf.summary.scalar('gradient global norm',
                              tf.linalg.global_norm(grads))
            tf.summary.scalar('clipped gradient global norm',
                              tf.linalg.global_norm(grads_clipped))

            d_variables = [
                var for var in tf.trainable_variables()
                if 'discriminator' in var.name
            ]
            optim_d = tf.train.AdamOptimizer(learning_rate, beta1=0.0)
            train_op.append(optim_d.minimize(disc_loss, var_list=d_variables))

        with tf.name_scope('envmap_gt'):
            tf.summary.image('envmap', gt_envmap)
            tf.summary.image('envmap_alpha', gt_envmap[Ellipsis, -1:])
            for i in range(len(gt_shells)):
                i_envmap = pj.over_composite(gt_shells[i])
                tf.summary.image('envmap_level_' + str(i), i_envmap)
        with tf.name_scope('envmap_prenet'):
            tf.summary.image('envmap', prenet_envmap)
            tf.summary.image('envmap_alpha', prenet_envmap[Ellipsis, -1:])
            for i in range(len(prenet_shells)):
                i_envmap = pj.over_composite(prenet_shells[i])
                tf.summary.image('envmap_level_' + str(i), i_envmap)
        with tf.name_scope('envmap_output'):
            tf.summary.image('envmap', output_envmap)
            tf.summary.image('envmap_alpha', output_envmap[Ellipsis, -1:])
            for i in range(len(output_shells)):
                i_envmap = pj.over_composite(output_shells[i])
                tf.summary.image('envmap_level_' + str(i), i_envmap)

        tf.summary.scalar('loss_total', total_loss)
        tf.summary.scalar('loss_render', render_loss)
        tf.summary.scalar('loss_envmap', envmap_loss)
        tf.summary.scalar('min_depth', min_depth)
        tf.summary.scalar('max_depth', max_depth)

        with tf.name_scope('level_stats'):
            for i in range(len(lightvols)):
                tf.summary.scalar('cube_side_length_' + str(i),
                                  lightvol_side_lengths[i])
                tf.summary.scalar('cube_center_' + str(i),
                                  lightvol_centers[i][0, -1])

        # Source images
        for i in range(num_source):
            src_image = src_images[:, :, :, i * 3:(i + 1) * 3]
            tf.summary.image('image_src_%d' % i, src_image)
        # Output image
        tf.summary.image('image_output', output_image)
        tf.summary.image('image_output_Gt', output_image_gt)
        # Target image
        tf.summary.image('image_tgt', tgt_image)
        tf.summary.image('envmap_tgt', env_image)
        # Ref image
        tf.summary.image('image_ref', ref_image)
        # Predicted color and alpha layers, and PSV
        num_summ = 8  # number of plane summaries to show in tensorboard
        for i in range(num_summ):
            ind = tf.to_int32(i * num_mpi_planes / num_summ)
            rgb = rgba_layers[:, :, :, ind, :3]
            alpha = rgba_layers[:, :, :, ind, -1:]
            ref_plane = psv[:, :, :, ind, :3]
            source_plane = psv[:, :, :, ind, 3:6]
            tf.summary.image('layer_rgb_%d' % i, rgb)
            tf.summary.image('layer_alpha_%d' % i, alpha)
            tf.summary.image('layer_rgba_%d' % i, rgba_layers[:, :, :, ind, :])
            tf.summary.image('psv_avg_%d' % i,
                             0.5 * ref_plane + 0.5 * source_plane)
            tf.summary.image('psv_ref_%d' % i, ref_plane)
            tf.summary.image('psv_source_%d' % i, source_plane)

        return train_op
Example #39
0
    def __init__(self, num_gpus=1, res_block_nums=7):
        """Build the multi-tower training graph and open a session for it.

        In order, this method: resets the default graph, creates a session,
        defines the input placeholders, builds one loss tower per device,
        averages and clips the tower gradients, creates the training op,
        summaries, summary writers and saver, runs the global variable
        initializer and finally calls ``self.train_restore()``.

        Args:
            num_gpus: number of towers to build. The input, policy and value
                batches are each split into this many pieces along axis 0.
            res_block_nums: stored on ``self.res_block_nums``; not used
                anywhere else in this method (presumably read by
                ``tower_loss`` -- TODO confirm).
        """
        #         self.ckpt = os.path.join(os.getcwd(), 'models/best_model.ckpt-13999')    # TODO
        self.num_gpus = num_gpus
        self.save_dir = "./gpu_models"
        # Hard-coded: enables the histogram summaries created further below.
        self.is_logging = True
        self.res_block_nums = res_block_nums
        """reset TF Graph"""
        tf.reset_default_graph()
        """Creat a new graph for the network"""
        # g = tf.Graph()

        config = tf.ConfigProto(inter_op_parallelism_threads=4,
                                intra_op_parallelism_threads=4)
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        """Assign a Session that excute the network"""
        # config.gpu_options.per_process_gpu_memory_fraction = 0.75
        # self.sess = tf.Session(config=config, graph=g)

        # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.75)
        # config = tf.ConfigProto(gpu_options=gpu_options)
        self.sess = tf.Session(config=config)
        # self.sess = tf.InteractiveSession()

        # Everything below is created under the CPU device scope; the towers
        # override the device inside the loop further down.
        with tf.device('/cpu:0'):
            # Variables
            self.filters_size = 128  # or 256
            self.prob_size = 2086
            self.digest = None
            self.training = tf.placeholder(tf.bool, name='training')
            self.inputs_ = tf.placeholder(
                tf.float32, [None, 9, 10, 14],
                name='inputs')  # + 2    # TODO C plain x 2
            self.c_l2 = 0.0001
            self.momentum = 0.9
            # Used below as the clip_norm argument of tf.clip_by_global_norm.
            self.global_norm = 100
            # The learning rate is fed at run time through this placeholder.
            self.learning_rate = tf.placeholder(
                tf.float32,
                name='learning_rate')  #0.001    #5e-3    #0.05    #

            self.global_step = tf.Variable(0,
                                           name="global_step",
                                           trainable=False)
            # self.learning_rate = tf.maximum(tf.train.exponential_decay(
            #     0.001, self.global_step, 1e3, 0.66), 1e-5)
            # self.learning_rate = tf.Variable(self.hps.lrn_rate, dtype=tf.float32, trainable=False)
            tf.summary.scalar('learning_rate', self.learning_rate)

            # Optimizer used to minimize the loss.
            optimizer = tf.train.MomentumOptimizer(
                learning_rate=self.learning_rate,
                momentum=self.momentum,
                use_nesterov=True)  # , use_locking=True
            # optimizer = tf.train.AdamOptimizer(self.learning_rate)

            # Target placeholders: `pi` has prob_size columns, `z` has one.
            self.pi_ = tf.placeholder(tf.float32, [None, self.prob_size],
                                      name='pi')
            self.z_ = tf.placeholder(tf.float32, [None, 1], name='z')

            # batch_queue = tf.contrib.slim.prefetch_queue.prefetch_queue([self.inputs_, self.pi_, self.z_], capacity=3 * self.num_gpus)

            # One slice of every fed batch per tower.
            # NOTE(review): with an integer split count tf.split presumably
            # requires the batch size to be divisible by num_gpus -- confirm.
            inputs_batches = tf.split(self.inputs_, self.num_gpus, axis=0)
            pi_batches = tf.split(self.pi_, self.num_gpus, axis=0)
            z_batches = tf.split(self.z_, self.num_gpus, axis=0)

            tower_grads = [None] * self.num_gpus

            # Initialised here and divided by num_gpus after the tower loop;
            # presumably accumulated inside tower_loss (not visible here).
            self.loss = 0
            self.accuracy = 0
            self.policy_head = []
            self.value_head = []

            with tf.variable_scope(tf.get_variable_scope()):
                """Build the core model within the graph."""
                for i in range(self.num_gpus):
                    with tf.device(
                            self.assign_to_device(
                                '/gpu:{}'.format(i),
                                ps_device='/cpu:0')):  #tf.device('/gpu:{i}'):
                        with tf.name_scope('TOWER_{}'.format(i)) as scope:
                            inputs_batch, pi_batch, z_batch = inputs_batches[
                                i], pi_batches[i], z_batches[
                                    i]  # batch_queue.dequeue() #
                            # NWHC format
                            # batch, 9 * 10, 14 channels
                            # inputs_ = tf.reshape(self.inputs_, [-1, 9, 10, 14])
                            loss = self.tower_loss(inputs_batch, pi_batch,
                                                   z_batch, i)
                            # Switch the scope to reuse mode so that later
                            # towers share the variables created by this one.
                            tf.get_variable_scope().reuse_variables()
                            grad = optimizer.compute_gradients(loss)
                            tower_grads[i] = grad

            self.loss /= self.num_gpus
            self.accuracy /= self.num_gpus
            # Combine the per-tower (gradient, variable) lists into one list.
            grads = self.average_gradients(tower_grads)
            # Defensive step 2: clip by global norm; self.norm receives the
            # second value returned by tf.clip_by_global_norm.
            clipped_grads, self.norm = tf.clip_by_global_norm(
                [g for g, _ in grads], self.global_norm)

            # Defensive step 3: check the clipped gradients with
            # tf.check_numerics before the update is allowed to run.
            # See: https://stackoverflow.com/questions/40701712/how-to-check-nan-in-gradients-in-tensorflow-when-updating
            grad_check = [
                tf.check_numerics(g, message='NaN Found!')
                for g in clipped_grads
            ]
            # The update op only runs after every numeric check has run.
            with tf.control_dependencies(grad_check):
                self.train_op = optimizer.apply_gradients(
                    zip(clipped_grads, [v for _, v in grads]),
                    global_step=self.global_step,
                    name='train_step')

            if self.is_logging:
                # Histograms of the averaged (unclipped) gradients and of
                # every trainable variable.
                for grad, var in grads:
                    if grad is not None:
                        tf.summary.histogram(var.op.name + '/gradients', grad)
                for var in tf.trainable_variables():
                    tf.summary.histogram(var.op.name, var)

            self.summaries_op = tf.summary.merge_all()
            # Train Summaries
            self.train_writer = tf.summary.FileWriter(
                os.path.join(os.getcwd(), "cchesslogs/train"), self.sess.graph)

            # Test summaries
            self.test_writer = tf.summary.FileWriter(
                os.path.join(os.getcwd(), "cchesslogs/test"), self.sess.graph)

            # Initialise all variables first; train_restore() is called last
            # (presumably it loads a checkpoint over them -- TODO confirm).
            self.sess.run(tf.global_variables_initializer())
            #         self.sess.run(tf.local_variables_initializer())
            #         self.sess.run(tf.initialize_all_variables())
            self.saver = tf.train.Saver()
            self.train_restore()
Example #40
0
    def __init__(self,
                 source_vocab_size,
                 target_vocab_size,
                 buckets,
                 hidden_size,
                 embedding_size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 use_lstm=False,
                 forward_only=False):
        """Assemble a bucketed embedding-attention seq2seq graph.

        Args:
            source_vocab_size: number of encoder symbols.
            target_vocab_size: number of decoder symbols.
            buckets: sequence of (source_length, target_length) pairs; the
                last entry is treated as the largest one.
            hidden_size: number of units of every recurrent cell.
            embedding_size: embedding size handed to the seq2seq function.
            num_layers: number of stacked cells (stacked only when > 1).
            max_gradient_norm: clip norm for the global-norm clipping.
            batch_size: stored on the instance; the placeholders themselves
                have an unspecified batch dimension.
            learning_rate: initial value of the learning-rate variable.
            learning_rate_decay_factor: multiplier applied whenever
                ``learning_rate_decay_op`` is run.
            use_lstm: use BasicLSTMCell instead of GRUCell.
            forward_only: when true, feed previous outputs to the decoder and
                skip building the gradient/update ops.
        """
        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.buckets = buckets
        self.batch_size = batch_size

        # Learning rate is a non-trainable variable with an explicit decay op.
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        decayed_rate = self.learning_rate * learning_rate_decay_factor
        self.learning_rate_decay_op = self.learning_rate.assign(decayed_rate)
        self.global_step = tf.Variable(0, trainable=False)

        # Recurrent cell: GRU unless LSTM is requested, optionally stacked.
        base_cell = tf.nn.rnn_cell.GRUCell(hidden_size)
        if use_lstm:
            base_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_size)
        if num_layers > 1:
            model_cell = tf.nn.rnn_cell.MultiRNNCell([base_cell] * num_layers)
        else:
            model_cell = base_cell

        def run_bucket(enc_inputs, dec_inputs):
            """Embedding + attention seq2seq for one bucket's inputs."""
            return tf.nn.seq2seq.embedding_attention_seq2seq(
                enc_inputs,
                dec_inputs,
                model_cell,
                num_encoder_symbols=source_vocab_size,
                num_decoder_symbols=target_vocab_size,
                embedding_size=embedding_size,
                feed_previous=forward_only)

        # Placeholders are sized for the last (largest) bucket.
        largest_bucket = buckets[-1]
        self.encoder_inputs = [
            tf.placeholder(tf.int32,
                           shape=[None],
                           name="encoder{0}".format(step))
            for step in xrange(largest_bucket[0])
        ]
        self.decoder_inputs = []
        self.target_weights = []
        for step in xrange(largest_bucket[1] + 1):
            decoder_feed = tf.placeholder(tf.int32,
                                          shape=[None],
                                          name="decoder{0}".format(step))
            self.decoder_inputs.append(decoder_feed)
            weight_feed = tf.placeholder(tf.float32,
                                         shape=[None],
                                         name="weight{0}".format(step))
            self.target_weights.append(weight_feed)

        # Targets are the decoder inputs shifted left by one position.
        targets = self.decoder_inputs[1:]

        # Per-bucket outputs and losses.
        self.outputs, self.losses = tf.nn.seq2seq.model_with_buckets(
            self.encoder_inputs, self.decoder_inputs, targets,
            self.target_weights, buckets, run_bucket)

        # One clipped SGD update op (and its gradient norm) per bucket.
        trainable = tf.trainable_variables()
        if not forward_only:
            sgd = tf.train.GradientDescentOptimizer(self.learning_rate)
            self.gradient_norms = []
            self.updates = []
            for bucket_id, _ in enumerate(buckets):
                bucket_grads = tf.gradients(self.losses[bucket_id], trainable)
                clipped, global_norm = tf.clip_by_global_norm(
                    bucket_grads, max_gradient_norm)
                update_op = sgd.apply_gradients(zip(clipped, trainable),
                                                global_step=self.global_step)
                self.gradient_norms.append(global_norm)
                self.updates.append(update_op)

        self.saver = tf.train.Saver(tf.all_variables(), max_to_keep=1000)
Example #41
0
    def __init__(self, s_size, a_size, scope, trainer):
        """Create the actor-critic network inside variable scope `scope`.

        Args:
            s_size: width of the flat input placeholder. The reshape below
                uses the outer name ``s_shape`` (not a parameter), so
                presumably s_size == s_shape[0] * s_shape[1] -- TODO confirm.
            a_size: number of actions (width of the policy output).
            scope: variable scope name; for any scope other than 'global'
                the loss, gradient and apply-gradient ops are also built.
            trainer: optimizer whose ``apply_gradients`` is used to push the
                local gradients onto the 'global' scope's variables.
        """
        with tf.variable_scope(scope):
            # Flat input -> single-channel image -> two conv layers -> dense.
            self.inputs = tf.placeholder(dtype=tf.float32,
                                         shape=[None, s_size])
            image_shape = [-1, s_shape[0], s_shape[1], 1]
            self.imageIn = tf.reshape(self.inputs, shape=image_shape)
            self.conv1 = slim.conv2d(inputs=self.imageIn,
                                     num_outputs=16,
                                     kernel_size=[2, 2],
                                     stride=[1, 1],
                                     padding='SAME',
                                     activation_fn=tf.nn.elu)
            self.conv2 = slim.conv2d(inputs=self.conv1,
                                     num_outputs=32,
                                     kernel_size=[2, 2],
                                     stride=[1, 1],
                                     padding='SAME',
                                     activation_fn=tf.nn.elu)
            flat_features = slim.flatten(self.conv2)
            dense = slim.fully_connected(flat_features,
                                         256,
                                         activation_fn=tf.nn.elu)

            # LSTM over the batch treated as one sequence (batch axis of 1).
            cell = tf.contrib.rnn.BasicLSTMCell(256, state_is_tuple=True)
            zero_c = np.zeros((1, cell.state_size.c), np.float32)
            zero_h = np.zeros((1, cell.state_size.h), np.float32)
            self.state_init = [zero_c, zero_h]
            c_feed = tf.placeholder(tf.float32, [1, cell.state_size.c])
            h_feed = tf.placeholder(tf.float32, [1, cell.state_size.h])
            self.state_in = (c_feed, h_feed)
            sequence = tf.expand_dims(dense, [0])
            sequence_len = tf.shape(self.imageIn)[:1]
            start_state = tf.contrib.rnn.LSTMStateTuple(c_feed, h_feed)
            rnn_outputs, end_state = tf.nn.dynamic_rnn(
                cell,
                sequence,
                initial_state=start_state,
                sequence_length=sequence_len,
                time_major=False)
            end_c, end_h = end_state
            self.state_out = (end_c[:1, :], end_h[:1, :])
            features = tf.reshape(rnn_outputs, [-1, 256])

            # Policy (softmax over actions) and value (scalar) heads.
            self.policy = slim.fully_connected(
                features,
                a_size,
                activation_fn=tf.nn.softmax,
                weights_initializer=normalized_columns_initializer(0.01),
                biases_initializer=None)
            self.value = slim.fully_connected(
                features,
                1,
                activation_fn=None,
                weights_initializer=normalized_columns_initializer(1.0),
                biases_initializer=None)

            # The global network only holds parameters; loss and update ops
            # are needed by worker networks alone.
            if scope == 'global':
                return

            self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
            self.actions_onehot = tf.one_hot(self.actions,
                                             a_size,
                                             dtype=tf.float32)
            self.target_v = tf.placeholder(shape=[None], dtype=tf.float32)
            self.advantages = tf.placeholder(shape=[None], dtype=tf.float32)

            # Probability the policy assigned to each action actually taken.
            chosen_probs = self.policy * self.actions_onehot
            self.responsible_outputs = tf.reduce_sum(chosen_probs, [1])

            # Value, entropy and policy-gradient terms of the loss.
            value_flat = tf.reshape(self.value, [-1])
            value_error = tf.square(self.target_v - value_flat)
            self.value_loss = 0.5 * tf.reduce_sum(value_error)
            log_policy = tf.log(self.policy)
            self.entropy = -tf.reduce_sum(self.policy * log_policy)
            log_chosen = tf.log(self.responsible_outputs)
            self.policy_loss = -tf.reduce_sum(log_chosen * self.advantages)
            self.loss = 0.5 * self.value_loss + self.policy_loss - self.entropy * 0.01

            # Gradients of the local loss w.r.t. this scope's variables,
            # clipped to a global norm of 40.
            own_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                         scope)
            self.gradients = tf.gradients(self.loss, own_vars)
            self.var_norms = tf.global_norm(own_vars)
            clipped, self.grad_norms = tf.clip_by_global_norm(
                self.gradients, 40.0)

            # The clipped local gradients are applied to the global variables.
            shared_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                            'global')
            self.apply_grads = trainer.apply_gradients(
                zip(clipped, shared_vars))
Example #42
0
    def __init__(self, is_training, config):
        """Build an unrolled multi-layer LSTM model and, optionally, its SGD op.

        Args:
            is_training: when true, dropout is applied (if
                ``config.keep_prob < 1``) and the learning-rate variable and
                training op are created.
            config: object providing ``batch_size``, ``num_steps``,
                ``hidden_size``, ``vocab_size``, ``keep_prob``,
                ``num_layers`` and ``max_grad_norm``.

        Note: the target placeholder's second dimension comes from the outer
        name ``o.numClass``, which is not defined in this method.
        """
        batch_size = config.batch_size
        self.batch_size = batch_size
        num_steps = config.num_steps
        self.num_steps = num_steps
        hidden_units = config.hidden_size
        n_vocab = config.vocab_size

        self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
        self._targets = tf.placeholder(tf.int32, [batch_size, o.numClass])

        use_dropout = is_training and config.keep_prob < 1

        # Forget-gate bias is kept at 0.0; a bias of 1 can give slightly
        # better results but would need different hyperparameters than the
        # ones reported in the paper.
        layer_cell = rnn_cell.BasicLSTMCell(hidden_units, forget_bias=0.0)
        if use_dropout:
            layer_cell = rnn_cell.DropoutWrapper(
                layer_cell, output_keep_prob=config.keep_prob)
        stacked_cell = rnn_cell.MultiRNNCell([layer_cell] * config.num_layers)

        self._initial_state = stacked_cell.zero_state(batch_size, tf.float32)

        # Embedding lookup, then one input tensor per time step.
        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding", [n_vocab, hidden_units])
            embedded = tf.nn.embedding_lookup(embedding, self._input_data)
            step_inputs = [
                tf.squeeze(piece, [1])
                for piece in tf.split(1, num_steps, embedded)
            ]

        if use_dropout:
            step_inputs = [
                tf.nn.dropout(piece, config.keep_prob) for piece in step_inputs
            ]

        # Manually unrolled recurrence (tutorial-style simplification of
        # rnn.rnn(cell, inputs, initial_state=...)); variables are created on
        # the first step and reused on every later one.
        step_outputs = []
        step_states = []
        current_state = self._initial_state
        with tf.variable_scope("RNN"):
            for step_index, step_input in enumerate(step_inputs):
                if step_index > 0:
                    tf.get_variable_scope().reuse_variables()
                step_output, current_state = stacked_cell(step_input,
                                                          current_state)
                step_outputs.append(step_output)
                step_states.append(current_state)

        flat_output = tf.reshape(tf.concat(1, step_outputs),
                                 [-1, hidden_units])

        # Softmax projection and summed per-example sequence loss.
        softmax_w = tf.get_variable("softmax_w", [hidden_units, n_vocab])
        softmax_b = tf.get_variable("softmax_b", [n_vocab])
        logits = tf.nn.xw_plus_b(flat_output, softmax_w, softmax_b)
        flat_targets = tf.reshape(self._targets, [-1])
        unit_weights = tf.ones([batch_size * num_steps])
        example_loss = seq2seq.sequence_loss_by_example(
            [logits], [flat_targets], [unit_weights], n_vocab)
        cost = tf.div(tf.reduce_sum(example_loss), batch_size)
        self._cost = cost
        self._final_state = step_states[-1]

        if is_training:
            # Clipped-gradient SGD on all trainable variables; the learning
            # rate is a non-trainable variable read through ``self.lr``.
            self._lr = tf.Variable(0.0, trainable=False)
            trainable = tf.trainable_variables()
            raw_grads = tf.gradients(cost, trainable)
            clipped, _ = tf.clip_by_global_norm(raw_grads,
                                                config.max_grad_norm)
            sgd = tf.train.GradientDescentOptimizer(self.lr)
            self._train_op = sgd.apply_gradients(zip(clipped, trainable))
Example #43
0
    def __init__(self, is_training, config):
        """Build an LSTM regressor graph and, when training, its update op.

        Args:
            is_training: when false, only the forward graph, the error and
                the merged summaries are built and the method returns early.
            config: object providing ``seq_width``, ``batch_size`` (used as
                the number of unrolled time steps), ``num_hidden``,
                ``num_layers``, ``max_grad_norm`` and ``useGDO``.
        """
        seq_width = config.seq_width
        n_steps = config.batch_size
        num_hidden = config.num_hidden
        num_layers = config.num_layers

        #tensors for input, target and sequence length placeholders
        self._seq_input = tf.placeholder(tf.float32, [n_steps, seq_width])
        self._seq_target = tf.placeholder(tf.float32, [n_steps, 1])
        self._early_stop = tf.placeholder(tf.int32)

        #inputs should be a list of tensors at each timestamp
        #NOTE(review): self.seq_input is not set in this method; presumably a
        #property returning self._seq_input -- confirm against the class.
        inputs = [
            tf.reshape(data, (1, seq_width))
            for data in tf.split(0, n_steps, self.seq_input)
        ]
        initializer = tf.random_uniform_initializer(-.1, .1)

        cell = rnn_cell.LSTMCell(num_hidden,
                                 seq_width,
                                 initializer=initializer)
        if num_layers > 1:
            cell = rnn_cell.MultiRNNCell([cell] * num_layers)

        #initial state (batch dimension of 1)
        self._initial_state = cell.zero_state(1, tf.float32)

        outputs, states = rnn(cell,
                              inputs,
                              initial_state=self._initial_state,
                              sequence_length=self._early_stop)

        #save final state of the rnn
        self._final_state = states[-1]

        #outputs originally come as a list of tensors, but we need a single tensor for tf.matmul
        outputs = tf.reshape(tf.concat(1, outputs), [-1, num_hidden])

        #linear read-out on top of the rnn outputs
        W = tf.get_variable('W', [num_hidden, 1])
        b = tf.get_variable('b', [1])
        _output = tf.matmul(outputs, W) + b
        self._output = _output

        #error is the square root of the summed squared differences
        error = tf.pow(
            tf.reduce_sum(tf.pow(tf.sub(_output, self._seq_target), 2)), .5)
        tf.scalar_summary("error", error)

        self._error = error
        self._merge_summaries_op = tf.merge_all_summaries()

        if not is_training:
            return

        #learning rate
        #trainable must be the boolean False: the string 'False' is truthy and
        #would register the learning rate itself as a trainable variable.
        self._lr = tf.Variable(0., trainable=False, name='lr')

        #trainable variables for gradient computation
        tvars = tf.trainable_variables()
        #compute gradients, clipped by global norm
        grads, _ = tf.clip_by_global_norm(tf.gradients(self._error, tvars),
                                          config.max_grad_norm)

        #2 options here: either to use GradientDescentOptimizer (config.useGDO:True) or AdamOptimizer (config.useGDO:False)
        if config.useGDO:
            optimizer = tf.train.GradientDescentOptimizer(self._lr)
        else:
            optimizer = tf.train.AdamOptimizer(self._lr)

        #ops for training
        self._train_op = optimizer.apply_gradients(zip(grads, tvars))
Example #44
0
def main():
    """Build the multi-GPU training graph and run the training loop.

    Steps, in order:
      1. Parse the arguments and create the audio reader input queue.
      2. Create the model, the global step variable and the optimizer.
      3. For every GPU, create input/state holders and a tower computing
         the loss; towers share variables. Collect per-tower gradients.
      4. Average the tower gradients, clip them by global norm (5.0) and
         build the apply-gradients op.
      5. Call ``load`` to restore a checkpoint from ``<logdir_root>/train``
         and train with truncated BPTT: every dequeued audio batch is cut
         into windows and the RNN states returned for one window are fed
         back as the initial states of the next one.

    A checkpoint is written whenever ``step % args.ckpt_every == 0`` and
    once more on exit if a step newer than the last saved one was reached.
    Samples are generated after every training step.
    """
    args = get_args()
    if args.l2_reg_strength == 0:
        args.l2_reg_strength = None
    logdir = os.path.join(args.logdir_root, 'train')
    coord = tf.train.Coordinator()

    # Create inputs
    with tf.name_scope('create_inputs'):
        reader = AudioReader(args.nb_data_dir,
                             args.wb_data_dir,
                             coord,
                             sample_rate=args.sample_rate,
                             sample_size=args.sample_size,
                             silence_threshold=args.silence_threshold)
        nb_audio_batch, wb_audio_batch = reader.dequeue(args.batch_size)

    # Create model
    net = create_model(args)
    global_step = tf.get_variable('global_step', [],
                                  initializer=tf.constant_initializer(0),
                                  trainable=False)
    optim = optimizer_factory[args.optimizer](learning_rate=args.learning_rate,
                                              momentum=args.momentum)

    # Set up placeholders and variables on each GPU
    tower_grads = []
    losses = []
    wb_input_batch_rnn = []
    nb_input_batch_rnn = []
    train_big_frame_state = []
    train_frame_state = []
    final_big_frame_state = []
    final_frame_state = []
    for i in range(args.num_gpus):
        with tf.device('/gpu:%d' % (i)):
            # Input holders: non-trainable variables whose values are
            # overridden through feed_dict on every training run.
            nb_input_batch_rnn.append(
                tf.Variable(tf.zeros([net.batch_size, net.seq_len, 1]),
                            trainable=False,
                            name='nb_input_batch_rnn',
                            dtype=tf.float32))
            wb_input_batch_rnn.append(
                tf.Variable(tf.zeros([net.batch_size, net.seq_len, 1]),
                            trainable=False,
                            name='wb_input_batch_rnn',
                            dtype=tf.float32))
            # Create initial states
            train_big_frame_state.append(
                net.big_cell.zero_state(net.batch_size, tf.float32))
            final_big_frame_state.append(
                net.big_cell.zero_state(net.batch_size, tf.float32))
            train_frame_state.append(
                net.cell.zero_state(net.batch_size, tf.float32))
            final_frame_state.append(
                net.cell.zero_state(net.batch_size, tf.float32))

    with tf.variable_scope(tf.get_variable_scope()):
        for i in range(args.num_gpus):
            with tf.device('/gpu:%d' % (i)):
                with tf.name_scope('TOWER_%d' % i):
                    # Create variables
                    print("Creating model on GPU:%d" % i)
                    loss, final_big_frame_state[i], final_frame_state[i] = \
                        net.loss_HRNN(nb_input_batch_rnn[i],
                                      wb_input_batch_rnn[i],
                                      train_big_frame_state[i],
                                      train_frame_state[i],
                                      l2_reg_strength=args.l2_reg_strength)
                    # Reuse variables for the next tower
                    tf.get_variable_scope().reuse_variables()
                    losses.append(loss)
                    trainable = tf.trainable_variables()
                    gradients = optim.compute_gradients(
                        loss,
                        trainable,
                        aggregation_method=tf.AggregationMethod.
                        EXPERIMENTAL_ACCUMULATE_N)
                    tower_grads.append(gradients)
    grad_vars = average_gradients(tower_grads)
    grads, variables = zip(*grad_vars)
    grads_clipped, _ = tf.clip_by_global_norm(grads, 5.0)
    grad_vars = list(zip(grads_clipped, variables))

    apply_gradient_op = optim.apply_gradients(grad_vars,
                                              global_step=global_step)

    # -----------------------------------------------------------------------
    # Start/continue training
    # -----------------------------------------------------------------------
    writer = tf.summary.FileWriter(logdir)
    writer.add_graph(tf.get_default_graph())
    summaries = tf.summary.merge_all()

    # Configure session
    tf_config = tf.ConfigProto(allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = True
    sess = tf.Session(config=tf_config)
    init = tf.global_variables_initializer()
    sess.run(init)

    # Load checkpoint
    saver = tf.train.Saver(var_list=tf.trainable_variables(),
                           max_to_keep=args.max_checkpoints)
    try:
        saved_global_step = load(saver, sess, logdir)
        if saved_global_step is None:
            # No usable checkpoint: the first training step will be 0.
            saved_global_step = -1
    except Exception:
        print("Something went wrong while restoring checkpoint.")
        raise

    # Start queue runners
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    reader.start_threads(sess)

    # Train
    step = None
    last_saved_step = saved_global_step
    infe_para = create_gen_wav_para(net)
    try:
        for step in range(saved_global_step + 1, args.num_steps):
            final_big_s = []
            final_s = []
            for _ in range(args.num_gpus):
                # Initialize cells
                final_big_s.append(sess.run(net.big_initial_state))
                final_s.append(sess.run(net.initial_state))
            # Timing starts once the initial states have been fetched.
            start_time = time.time()

            nb_inputs_list = []
            wb_inputs_list = []
            for _ in range(args.num_gpus):
                # Get input batches
                nb_inputs, wb_inputs = sess.run(
                    [nb_audio_batch, wb_audio_batch])
                nb_inputs_list.append(nb_inputs)
                wb_inputs_list.append(wb_inputs)

            loss_sum = 0
            idx_begin = 0
            audio_length = args.sample_size - args.big_frame_size
            bptt_length = args.seq_len - args.big_frame_size
            # Floor division: the window count is used as a range() bound
            # and must stay an integer on Python 3 as well.
            stateful_rnn_length = audio_length // bptt_length
            output_list = [
                summaries, losses, apply_gradient_op, final_big_frame_state,
                final_frame_state
            ]

            for _ in range(0, stateful_rnn_length):
                inp_dict = {}
                for g in range(args.num_gpus):
                    # Add seq_len samples as input for truncated BPTT
                    inp_dict[nb_input_batch_rnn[g]] = \
                        nb_inputs_list[g][:, idx_begin:idx_begin+args.seq_len, :]
                    inp_dict[wb_input_batch_rnn[g]] = \
                        wb_inputs_list[g][:, idx_begin:idx_begin+args.seq_len, :]
                    # Carry over the states returned for the previous window.
                    inp_dict[train_big_frame_state[g]] = final_big_s[g]
                    inp_dict[train_frame_state[g]] = final_s[g]
                # Consecutive windows overlap by big_frame_size samples.
                idx_begin += bptt_length

                # Forward pass
                summary, loss_gpus, _, final_big_s, final_s = \
                    sess.run(output_list,
                             feed_dict=inp_dict)

                writer.add_summary(summary, step)
                for g in range(args.num_gpus):
                    loss_gpu = loss_gpus[g] / stateful_rnn_length
                    loss_sum += loss_gpu / args.num_gpus
            duration = time.time() - start_time
            print("Step {:d}: loss = {:.3f}, ({:.3f} sec/step)".format(
                step, loss_sum, duration))

            if step % args.ckpt_every == 0:
                save(saver, sess, logdir, step)
                last_saved_step = step

            # Samples are generated after every step (the former
            # "every 20 steps" condition is disabled).
            generate_and_save_samples(step, net, infe_para, sess,
                                      nb_inputs_list[0])

    except KeyboardInterrupt:
        print()
    finally:
        # step is still None when the loop body never ran (e.g. the restored
        # step already reached num_steps); there is nothing new to save then.
        if step is not None and step > last_saved_step:
            print('Saving model...')
            save(saver, sess, logdir, step)
        coord.request_stop()
        coord.join(threads)
Example #45
0
    def __init__(self, embedding_size, rnn_size, layer_size, vocab_size,
                 attn_size, sequence_length, n_classes, grad_clip,
                 learning_rate):
        """Build a bidirectional multi-layer LSTM classifier with attention.

        Args:
            embedding_size: word embedding dimension.
            rnn_size: hidden state dimension of every LSTM cell.
            layer_size: number of stacked LSTM layers per direction.
            vocab_size: vocabulary size (rows of the embedding matrix).
            attn_size: attention layer dimension.
            sequence_length: fixed (maximum) sequence length of the input.
            n_classes: number of target labels.
            grad_clip: global-norm threshold used for gradient clipping.
            learning_rate: learning rate handed to the Adam optimizer.

        Attributes created:
            output_keep_prob, input_data, targets: input placeholders.
            alpha: attention weights, one per time step.
            final_output: attention-weighted sum of the RNN outputs.
            logits, prob, cost, train_op, accuracy: classification head,
                loss, training op and accuracy metric.
        """

        self.output_keep_prob = tf.placeholder(tf.float32,
                                               name='output_keep_prob')
        self.input_data = tf.placeholder(tf.int32,
                                         shape=[None, sequence_length],
                                         name='input_data')
        self.targets = tf.placeholder(tf.float32,
                                      shape=[None, n_classes],
                                      name='targets')

        # Forward RNN cell stack
        with tf.name_scope('fw_rnn'), tf.variable_scope('fw_rnn'):
            print(tf.get_variable_scope().name)
            lstm_fw_cell_list = [
                tf.contrib.rnn.LSTMCell(rnn_size) for _ in range(layer_size)
            ]
            lstm_fw_cell_m = tf.contrib.rnn.DropoutWrapper(
                tf.contrib.rnn.MultiRNNCell(lstm_fw_cell_list),
                output_keep_prob=self.output_keep_prob)

        # Backward RNN cell stack. It must be built from its own cell list:
        # reusing the forward cells would make both directions share the
        # same cell objects.
        with tf.name_scope('bw_rnn'), tf.variable_scope('bw_rnn'):
            print(tf.get_variable_scope().name)
            lstm_bw_cell_list = [
                tf.contrib.rnn.LSTMCell(rnn_size) for _ in range(layer_size)
            ]
            lstm_bw_cell_m = tf.contrib.rnn.DropoutWrapper(
                tf.contrib.rnn.MultiRNNCell(lstm_bw_cell_list),
                output_keep_prob=self.output_keep_prob)

        with tf.device('/cpu:0'):
            embedding = tf.Variable(tf.truncated_normal(
                [vocab_size, embedding_size], stddev=0.1),
                                    name='embedding')
            inputs = tf.nn.embedding_lookup(embedding, self.input_data)

        # self.input_data shape: (batch_size, sequence_length)
        # inputs shape: (batch_size, sequence_length, embedding_size)

        # The static bidirectional RNN takes a list with one
        # (batch_size, embedding_size) tensor per time step, so go
        # time-major first: (sequence_length, batch_size, embedding_size).
        inputs = tf.transpose(inputs, [1, 0, 2])
        # Flatten to (sequence_length * batch_size, embedding_size). The last
        # dimension is the embedding width, not rnn_size.
        inputs = tf.reshape(inputs, [-1, embedding_size])
        # Split into a list of sequence_length tensors, each of shape
        # (batch_size, embedding_size).
        inputs = tf.split(inputs, sequence_length, 0)

        with tf.name_scope('bi_rnn'), tf.variable_scope('bi_rnn'):
            outputs, _, _ = tf.contrib.rnn.static_bidirectional_rnn(
                lstm_fw_cell_m, lstm_bw_cell_m, inputs, dtype=tf.float32)

        # Attention layer
        attention_size = attn_size
        with tf.name_scope('attention'), tf.variable_scope('attention'):
            attention_w = tf.Variable(tf.truncated_normal(
                [2 * rnn_size, attention_size], stddev=0.1),
                                      name='attention_w')
            attention_b = tf.Variable(tf.constant(0.1, shape=[attention_size]),
                                      name='attention_b')
            # Per-step hidden representation u_t = tanh(h_t * W + b).
            u_list = [
                tf.tanh(tf.matmul(outputs[t], attention_w) + attention_b)
                for t in range(sequence_length)
            ]
            u_w = tf.Variable(tf.truncated_normal([attention_size, 1],
                                                  stddev=0.1),
                              name='attention_uw')
            # Per-step unnormalized score, one column per time step.
            attn_z = [tf.matmul(u_t, u_w) for u_t in u_list]
            # transform to batch_size * sequence_length
            attn_zconcat = tf.concat(attn_z, axis=1)
            self.alpha = tf.nn.softmax(attn_zconcat)
            # transform to sequence_length * batch_size * 1 , same rank as outputs
            alpha_trans = tf.reshape(tf.transpose(self.alpha, [1, 0]),
                                     [sequence_length, -1, 1])
            # NOTE(review): `outputs` is a Python list here; the product
            # relies on TensorFlow packing it into one tensor - confirm.
            self.final_output = tf.reduce_sum(outputs * alpha_trans, 0)

        print(self.final_output.shape)
        # final_output shape: (batch_size, 2 * rnn_size)
        fc_w = tf.Variable(tf.truncated_normal([2 * rnn_size, n_classes],
                                               stddev=0.1),
                           name='fc_w')
        fc_b = tf.Variable(tf.zeros([n_classes]), name='fc_b')

        # Classification head on top of the attention-pooled output.
        self.logits = tf.matmul(self.final_output, fc_w) + fc_b
        self.prob = tf.nn.softmax(self.logits)

        self.cost = tf.losses.softmax_cross_entropy(self.targets, self.logits)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                          grad_clip)

        optimizer = tf.train.AdamOptimizer(learning_rate)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))
        self.accuracy = tf.reduce_mean(
            tf.cast(
                tf.equal(tf.argmax(self.targets, axis=1),
                         tf.argmax(self.prob, axis=1)), tf.float32))
Example #46
0
    def __init__(self,
                 word_embedding,
                 word_to_idx=None,
                 use_glove=True,
                 is_training=True,
                 dim_feat=2048,
                 config=Config(),
                 num_input=2):
        """Build a two-class context/candidate classifier graph.

        Args:
            word_embedding: initial value of the embedding matrix; only used
                when ``use_glove`` is true.
            word_to_idx: mapping that must contain the key ``'<eos>'``.
                Required despite the ``None`` default.
            use_glove: initialize the embedding from ``word_embedding`` when
                true, otherwise from a uniform distribution in [-1, 1] with
                shape ``[config.vocab_size, config.embedding_size]``.
            is_training: when true, dropout is enabled (if
                ``config.dropout_prob < 1``) and the training op is built.
            dim_feat: width of the image feature placeholder.
            config: hyper-parameter object. NOTE: the default ``Config()``
                is created once, when the function is defined, and is
                therefore shared by every call that relies on the default.
            num_input: ignored; the value is overwritten with
                ``config.num_input``.

        Raises:
            TypeError: if ``word_to_idx`` is None.
            Exception: if ``config.use_img_feat == 'concat_bf_lstm'``.
        """
        if word_to_idx is None:
            raise TypeError("word_to_idx is required and must contain the "
                            "'<eos>' key")

        self.x = tf.placeholder(tf.int32, [None, config.num_steps])
        self.y_ = tf.placeholder(tf.float32, [None, 2])
        self.img_feat = tf.placeholder(tf.float32, [None, dim_feat])
        self.lr = tf.placeholder(tf.float32)
        self._eos = word_to_idx['<eos>']
        # 1.0 at the positions holding the <eos> token, 0.0 elsewhere.
        mask = tf.to_float(tf.equal(self.x, self._eos))

        num_steps = config.num_steps
        hidden_size = config.hidden_size
        vocab_size = config.vocab_size
        embedding_size = config.embedding_size
        num_input = config.num_input
        use_img_feat = config.use_img_feat
        use_lstm = config.use_lstm
        combine_typ = config.combine_typ
        cls_hidden = config.cls_hidden
        use_residual = config.use_residual

        # Despite its name, config.dropout_prob is passed to TensorFlow as a
        # keep probability (output_keep_prob / tf.nn.dropout's keep_prob).
        use_dropout = is_training and config.dropout_prob < 1

        img_feat = tf.layers.dense(inputs=self.img_feat,
                                   units=hidden_size,
                                   activation=None)

        def lstm_cell():
            """Return one LSTM cell, residual-wrapped if configured."""
            base_cell = tf.contrib.rnn.BasicLSTMCell(hidden_size,
                                                     forget_bias=1.0,
                                                     state_is_tuple=True)
            if use_residual:
                return ResidualWrapper(base_cell)
            return base_cell

        def attn_cell():
            """Return one LSTM cell with output dropout while training."""
            if use_dropout:
                return tf.contrib.rnn.DropoutWrapper(
                    lstm_cell(), output_keep_prob=config.dropout_prob)
            return lstm_cell()

        cell = tf.contrib.rnn.MultiRNNCell(
            [attn_cell() for _ in range(config.num_layers)],
            state_is_tuple=True)

        if use_glove:
            embedding = tf.get_variable(
                "embedding",
                dtype=tf.float32,
                initializer=tf.constant(word_embedding))
        else:
            embedding = tf.get_variable(
                "embedding", [vocab_size, embedding_size],
                initializer=tf.random_uniform_initializer(minval=-1.0,
                                                          maxval=1.0))
        inputs = tf.nn.embedding_lookup(embedding, self.x)

        if use_img_feat == 'concat_bf_lstm':
            # Concatenating image features before the LSTM is not
            # implemented; fail loudly.
            raise Exception("use_img_feat=concat_bf_lstm not supported")

        if use_dropout:
            inputs = tf.nn.dropout(inputs, config.dropout_prob)

        if use_lstm:
            ta_d_outputs = tf.TensorArray(dtype=tf.float32,
                                          size=num_steps,
                                          dynamic_size=False,
                                          infer_shape=True)

            state = cell.zero_state(tf.shape(inputs)[0], tf.float32)
            with tf.variable_scope("RNN"):
                # Manually unrolled RNN; variables are created on the first
                # step and reused afterwards.
                for time_step in range(num_steps):
                    if time_step > 0:
                        tf.get_variable_scope().reuse_variables()
                    (output, state) = cell(inputs[:, time_step, :], state)
                    ta_d_outputs = ta_d_outputs.write(time_step, output)

                # batch_size x seq_length x hidden_size
                ta_d_outputs = tf.transpose(ta_d_outputs.stack(),
                                            perm=[1, 0, 2])

                # Apply the mask: keep only the outputs at <eos> positions
                # and sum them over time.
                mask = tf.expand_dims(mask, -1)
                mask = tf.tile(mask, tf.stack([1, 1, hidden_size]))
                masked_out = ta_d_outputs * mask
                output = tf.reduce_sum(masked_out, axis=1)
                # The batch is split along axis 0 into a context part and a
                # candidate part.
                output_context, output_candidate = tf.split(
                    output, num_or_size_splits=num_input, axis=0)
        else:
            # No LSTM: use the flattened embeddings directly.
            inputs = tf.reshape(inputs, [-1, num_steps * embedding_size])
            output_context, output_candidate = tf.split(
                inputs, num_or_size_splits=num_input, axis=0)

        print("-" * 80)
        if use_img_feat == 'concat_af_lstm':
            print(
                "Image feature concatenate after the contextfeature from LSTM")
            imgf_1, _ = tf.split(img_feat,
                                 num_or_size_splits=num_input,
                                 axis=0)
            output_context = tf.concat([imgf_1, output_context], axis=1)
        elif use_img_feat == 'only_img':
            print("Image Feature Replacing the Context Feature from LSTM")
            imgf_1, _ = tf.split(img_feat,
                                 num_or_size_splits=num_input,
                                 axis=0)
            output_context = imgf_1
        else:
            print("Not using image feature")
        print("-" * 80)

        # Combining candidate information with context information
        print("-" * 80)
        if combine_typ == 'concat':
            print("Directly concatenate context and candidate feature.")
            output = tf.concat([output_context, output_candidate], axis=1)
        elif combine_typ == 'bilinpool':  # compact bilinear
            print(
                "Use compact bilinear pooling between candidate/context features."
            )
            out_dim = 8192
            output_context = tf.expand_dims(tf.expand_dims(output_context, 1),
                                            1)
            output_candidate = tf.expand_dims(
                tf.expand_dims(output_candidate, 1), 1)
            output = compact_bilinear_pooling(output_context, output_candidate,
                                              out_dim)
            output = tf.reshape(output,
                                [-1, out_dim])  # make static time shape
        else:
            print("Use only the candidate feature.")
            output = output_candidate
        print("-" * 80)

        # Classifier: cls_hidden ReLU layers of 512 units, then 2 logits.
        for _ in range(cls_hidden):
            output = tf.layers.dense(inputs=output,
                                     units=512,
                                     activation=tf.nn.relu)
            if use_dropout:
                output = tf.nn.dropout(output, config.dropout_prob)

        y = tf.layers.dense(inputs=output, units=2, activation=None)

        score = tf.nn.softmax(y, dim=-1, name=None)

        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=self.y_, logits=y))

        correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(self.y_, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

        self._logits = y
        self._score = score
        self._loss = loss
        self._accuracy = accuracy

        if not is_training:
            return

        # Training op: Adam on gradients clipped by global norm; the learning
        # rate is fed through the self.lr placeholder.
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars),
                                          config.grad_clip)
        optimizer = tf.train.AdamOptimizer(self.lr)
        self._train_op = optimizer.apply_gradients(
            zip(grads, tvars),
            global_step=tf.contrib.framework.get_or_create_global_step())
    def __init__(self, worker_index, env, render, num_local_steps,
                 learning_rate, entropy_regularization, max_gradient_norm,
                 discount, summary_writer, summary_update_interval):
        """Create one A3C worker: a global network plus a local copy.

        The constructor builds the global policy network and global step,
        a local policy network on this worker's device, the A3C loss on the
        local network, an op applying the clipped local gradients to the
        global parameters, an op copying global parameters into the local
        network, and the summary op.

        Args:
            worker_index: Index of the worker thread running this agent;
                also used as the task id in the worker device string.
            env: Simulator object wrapping the environment.
            render: Whether the game screen should be displayed.
            num_local_steps: Number of experiences used per worker for one
                model update.
            learning_rate: Learning rate of the Adam optimizer.
            entropy_regularization: Weight of the entropy term subtracted
                from the policy loss.
            max_gradient_norm: Global-norm threshold the gradients are
                clipped to.
            discount: Discount factor for future rewards.
            summary_writer: TensorFlow summary writer.
            summary_update_interval: Number of training steps between
                summary updates.
        """

        # Plain bookkeeping; none of these values enter the graph here.
        self.worker_index = worker_index
        self.env = env
        self.render = render
        self.num_local_steps = num_local_steps
        self.discount = discount
        self.summary_writer = summary_writer
        self.summary_update_interval = summary_update_interval
        self.num_times_trained = 0

        device_kind = "gpu" if USE_GPU else "cpu"
        worker_device = '/job:thread/task:{}/{}:0'.format(
            worker_index, device_kind)

        # Global parameters. Arguments of the device setter are, in order:
        # ps_tasks, ps_device, worker_device.
        global_placement = tf.train.replica_device_setter(
            1, '/job:master', worker_device)
        with tf.device(global_placement), tf.variable_scope('global'):
            self.global_network = a3c.PolicyNetwork()
            self.global_step = tf.get_variable(
                'global_step',
                shape=[],
                dtype=tf.int32,
                initializer=tf.constant_initializer(0, tf.int32),
                trainable=False)

        with tf.device(worker_device):
            with tf.variable_scope('local'):
                self.local_network = a3c.PolicyNetwork()
                self.local_network.global_step = self.global_step

            self.action = tf.placeholder(tf.int32, [None], 'Action')
            self.advantage = tf.placeholder(tf.float32, [None], 'Advantage')
            self.discounted_reward = tf.placeholder(tf.float32, [None],
                                                    'Discounted_Reward')

            # Policy-gradient term: cross-entropy of the taken actions,
            # weighted by the advantage and summed over the batch.
            logits = self.local_network.action_logits
            action_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=self.action)
            policy_loss = tf.reduce_sum(self.advantage * action_xent)

            # Entropy of the action distribution, summed over the batch.
            # Subtracting it from the loss rewards less peaked policies.
            action_probs = tf.nn.softmax(logits)
            action_log_probs = tf.nn.log_softmax(logits)
            entropy = -tf.reduce_sum(action_probs * action_log_probs)
            policy_loss = policy_loss - entropy_regularization * entropy

            # Value term: sum-of-squares error against the discounted reward.
            value_error = self.local_network.value - self.discounted_reward
            value_loss = tf.nn.l2_loss(value_error)

            # Total loss.
            self.loss = policy_loss + 0.5 * value_loss

            # Gradients are taken w.r.t. the local parameters, then clipped.
            raw_gradients = tf.gradients(self.loss,
                                         self.local_network.parameters)
            clipped, _ = tf.clip_by_global_norm(raw_gradients,
                                                max_gradient_norm)

            # The clipped local gradients are applied to the global
            # parameters; the global step advances by the batch size.
            batch_size = tf.shape(self.local_network.s)[0]
            global_updates = list(
                zip(clipped, self.global_network.parameters))
            adam = tf.train.AdamOptimizer(learning_rate)
            apply_op = adam.apply_gradients(global_updates)
            advance_step_op = self.global_step.assign_add(batch_size)
            self.train_step = [apply_op, advance_step_op]

            # Ops copying every global parameter into its local counterpart.
            sync_ops = []
            for local_p, global_p in zip(self.local_network.parameters,
                                         self.global_network.parameters):
                sync_ops.append(local_p.assign(global_p))
            self.reset_local_network = sync_ops

            # Batch totals are reported divided by the batch size.
            per_sample_scalars = (
                ('model/loss', self.loss),
                ('model/policy_loss', policy_loss),
                ('model/value_loss', value_loss),
                ('model/entropy', entropy),
            )
            for tag, batch_total in per_sample_scalars:
                tf.summary.scalar(tag, batch_total / tf.to_float(batch_size))

            tf.summary.scalar('model/global_norm',
                              tf.global_norm(self.local_network.parameters))
            tf.summary.scalar('model/gradient_global_norm',
                              tf.global_norm(raw_gradients))
            self.summary_step = tf.summary.merge_all()
Example #48
0
def train(train_dir,
          config,
          dataset_fn,
          checkpoints_to_keep=5,
          keep_checkpoint_every_n_hours=1,
          num_steps=None,
          master='',
          num_sync_workers=0,
          num_ps_tasks=0,
          task=0):
    """Build the training graph for `config.model` and run the train loop.

    Args:
      train_dir: Directory that is created if missing and used as the log /
        checkpoint directory.
      config: Object providing `hparams`, `train_examples_path`, `model` and
        `data_converter`.
      dataset_fn: Zero-argument callable whose result is handed to
        `_get_input_tensors` to produce the model's input tensors.
      checkpoints_to_keep: `max_to_keep` for the checkpoint saver.
      keep_checkpoint_every_n_hours: Passed through to the checkpoint saver.
      num_steps: If truthy, training stops once this global step is reached.
      master: Session master address passed to the training loop.
      num_sync_workers: If non-zero, the optimizer is wrapped in a
        `SyncReplicasOptimizer` aggregating this many replicas.
      num_ps_tasks: Number of parameter-server tasks for device placement.
      task: Task index; task 0 is the chief.

    Raises:
      ValueError: If `config.hparams.clip_mode` is neither 'value' nor
        'global_norm'.
    """
    tf.gfile.MakeDirs(train_dir)
    is_chief = (task == 0)
    if is_chief:
        _trial_summary(config.hparams, config.train_examples_path, train_dir)
    with tf.Graph().as_default():
        with tf.device(
                tf.train.replica_device_setter(num_ps_tasks,
                                               merge_devices=True)):

            model = config.model
            model.build(config.hparams,
                        config.data_converter.output_depth,
                        is_training=True)

            optimizer = model.train(**_get_input_tensors(dataset_fn(), config))

            hooks = []
            if num_sync_workers:
                optimizer = tf.train.SyncReplicasOptimizer(
                    optimizer, num_sync_workers)
                hooks.append(optimizer.make_session_run_hook(is_chief))

            grads, var_list = zip(*optimizer.compute_gradients(model.loss))
            global_norm = tf.global_norm(grads)
            tf.summary.scalar('global_norm', global_norm)

            if config.hparams.clip_mode == 'value':
                # Clip every gradient element into [-grad_clip, grad_clip].
                g = config.hparams.grad_clip
                clipped_grads = [
                    tf.clip_by_value(grad, -g, g) for grad in grads
                ]
            elif config.hparams.clip_mode == 'global_norm':
                # Rescale to `grad_clip` while the global norm stays below
                # `grad_norm_clip_to_zero`; otherwise drop the update
                # entirely by substituting all-zero gradients.
                clipped_grads = tf.cond(
                    global_norm < config.hparams.grad_norm_clip_to_zero,
                    lambda: tf.clip_by_global_norm(  # pylint:disable=g-long-lambda
                        grads,
                        config.hparams.grad_clip,
                        use_norm=global_norm)[0],
                    lambda: [tf.zeros(tf.shape(g)) for g in grads])
            else:
                raise ValueError('Unknown clip_mode: {}'.format(
                    config.hparams.clip_mode))
            train_op = optimizer.apply_gradients(zip(clipped_grads, var_list),
                                                 global_step=model.global_step,
                                                 name='train_step')

            logging_dict = {
                'global_step': model.global_step,
                'loss': model.loss
            }
            # This runs at graph-construction time, so it prints the tensor
            # objects themselves; the evaluated values are reported by the
            # LoggingTensorHook registered below.
            print("logging global step: ", logging_dict['global_step'],
                  " loss at this point: ", logging_dict['loss'], " \n")
            hooks.append(
                tf.train.LoggingTensorHook(logging_dict, every_n_iter=100))
            if num_steps:
                hooks.append(tf.train.StopAtStepHook(last_step=num_steps))

            scaffold = tf.train.Scaffold(saver=tf.train.Saver(
                max_to_keep=checkpoints_to_keep,
                keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours))
            tf.contrib.training.train(train_op=train_op,
                                      logdir=train_dir,
                                      scaffold=scaffold,
                                      hooks=hooks,
                                      save_checkpoint_secs=60,
                                      master=master,
                                      is_chief=is_chief)
Example #49
0
    def __init__(self, s_size, a_size, scope, trainer):
        """Build a categorical (distributional) Q-network under `scope`.

        The network outputs, for each of the `a_size` actions, a softmax
        distribution over `self.atoms` fixed support values. For every scope
        other than 'global' the loss, clipped gradients and the op applying
        those gradients to the 'global' scope variables are built as well.

        Args:
            s_size: Width of the flat input placeholder.
            a_size: Number of actions.
            scope: Variable scope name; 'global' skips the training ops.
            trainer: Optimizer used to apply the gradients.
        """
        with tf.variable_scope(scope):
            # Support of the value distribution: evenly spaced atoms
            # covering [v_min, v_max].
            self.atoms = 21
            self.v_max = 10.
            self.v_min = -10.
            self.delta_z = (self.v_max - self.v_min) / (self.atoms - 1)
            self.z = [self.v_min + i * self.delta_z for i in range(self.atoms)]

            # Convolutional trunk over the input reshaped to 84x84x1.
            self.inputs = tf.placeholder(shape=[None, s_size],
                                         dtype=tf.float32)
            self.imageIn = tf.reshape(self.inputs, shape=[-1, 84, 84, 1])
            conv_specs = (
                ('conv1', 32, [8, 8], [4, 4]),
                ('conv2', 64, [4, 4], [2, 2]),
                ('conv3', 64, [3, 3], [1, 1]),
            )
            features = self.imageIn
            for attr_name, depth, kernel, strides in conv_specs:
                features = slim.conv2d(activation_fn=tf.nn.relu,
                                       inputs=features,
                                       num_outputs=depth,
                                       kernel_size=kernel,
                                       stride=strides,
                                       padding='VALID')
                setattr(self, attr_name, features)

            flattened = slim.flatten(self.conv3)
            hidden = slim.fully_connected(flattened,
                                          512,
                                          activation_fn=tf.nn.relu)

            # One logit per (action, atom) pair, then reshape so that the
            # softmax runs over the atom axis.
            flat_logits = slim.fully_connected(
                hidden,
                a_size * self.atoms,
                activation_fn=None,
                weights_initializer=normalized_columns_initializer(0.1),
                biases_initializer=None)
            self.out = tf.reshape(flat_logits, [-1, a_size, self.atoms])

            self.p = tf.nn.softmax(self.out, dim=2)
            # Expected value per action under the predicted distribution.
            self.Q = tf.reduce_sum(self.z * self.p, axis=2)

            # Only the worker networks need loss and gradient-update ops.
            if scope == 'global':
                return

            self.m_input = tf.placeholder(shape=[None, self.atoms],
                                          dtype=tf.float32)
            self.actions_p = tf.placeholder(
                shape=[None, a_size, self.atoms], dtype=tf.float32)

            # Mask out everything but the chosen action's distribution.
            self.p_actiona = tf.multiply(self.p, self.actions_p)
            self.p_action = tf.reduce_sum(self.p_actiona, axis=1)

            # log(m) - log(p), weighted by m below and summed over atoms.
            neg_log_pred = -tf.log(self.p_action + 1e-20)
            log_target = tf.log(self.m_input + 1e-20)
            self.p_alog = neg_log_pred + log_target
            per_sample = tf.reduce_sum(self.m_input * self.p_alog, axis=1)
            self.loss = tf.reduce_mean(per_sample)

            worker_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope)
            self.gradients = tf.gradients(self.loss, worker_vars)
            self.var_norms = tf.global_norm(worker_vars)
            clipped, self.grad_norms = tf.clip_by_global_norm(
                self.gradients, 40.0)

            # Apply the worker's gradients to the global network's variables.
            shared_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
            self.apply_grads = trainer.apply_gradients(
                zip(clipped, shared_vars))
Example #50
0
    def __init__(self,
                 vocab_size,
                 batch_size,
                 num_epochs,
                 check_point_step,
                 num_train_samples,
                 num_valid_samples,
                 num_layers,
                 num_hidden_units,
                 max_gradient_norm,
                 initial_learning_rate=1,
                 final_learning_rate=0.001
                 ):
        """Build the graph of an LSTM language model.

        Args:
            vocab_size: Number of rows of the input and output embeddings.
            batch_size: Batch size for the training and validation datasets.
            num_epochs: Stored on the instance; not used while building.
            check_point_step: Stored on the instance; not used while building.
            num_train_samples: Stored on the instance; not used while building.
            num_valid_samples: Stored on the instance; not used while building.
            num_layers: Number of stacked LSTM layers.
            num_hidden_units: LSTM size, also used as the embedding width.
            max_gradient_norm: Global-norm threshold for gradient clipping.
            initial_learning_rate: Starting value of the decayed learning rate.
            final_learning_rate: Lower bound for the decayed learning rate.
        """

        self.vocab_size = vocab_size
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.check_point_step = check_point_step
        self.num_train_samples = num_train_samples
        self.num_valid_samples = num_valid_samples
        self.num_layers = num_layers
        self.num_hidden_units = num_hidden_units
        self.max_gradient_norm = max_gradient_norm

        self.global_step = tf.Variable(0, trainable=False)

        # Dynamic learning rate: multiplied by 0.96 every 150 global steps
        # (staircase), and never allowed below `final_learning_rate`.
        self.learning_rate = tf.train.exponential_decay(initial_learning_rate, self.global_step,
                                           150, 0.96, staircase=True)
        self.learning_rate = tf.cond(tf.less(self.learning_rate, final_learning_rate), lambda: tf.constant(final_learning_rate),
                                     lambda: self.learning_rate)

        # NOTE: despite its name this placeholder is fed to
        # `input_keep_prob` below, i.e. it is a keep probability.
        self.dropout_rate = tf.placeholder(tf.float32, name="dropout_rate")

        self.file_name_train = tf.placeholder(tf.string)
        self.file_name_validation = tf.placeholder(tf.string)
        self.file_name_test = tf.placeholder(tf.string)

        def parse(line):
            # Each line is a whitespace-separated sequence of integer ids;
            # the target sequence is the input shifted left by one token.
            line_split = tf.string_split([line])
            input_seq = tf.string_to_number(line_split.values[:-1], out_type=tf.int32)
            output_seq = tf.string_to_number(line_split.values[1:], out_type=tf.int32)
            return input_seq, output_seq

        training_dataset = tf.data.TextLineDataset(self.file_name_train).map(parse).shuffle(256).padded_batch(self.batch_size, padded_shapes=([None], [None]))
        validation_dataset = tf.data.TextLineDataset(self.file_name_validation).map(parse).padded_batch(self.batch_size, padded_shapes=([None], [None]))
        test_dataset = tf.data.TextLineDataset(self.file_name_test).map(parse).batch(1)

        # A single re-initialisable iterator shared by all three datasets.
        iterator = tf.contrib.data.Iterator.from_structure(training_dataset.output_types,
                                              training_dataset.output_shapes)

        self.input_batch, self.output_batch = iterator.get_next()

        self.trining_init_op = iterator.make_initializer(training_dataset)
        self.validation_init_op = iterator.make_initializer(validation_dataset)
        self.test_init_op = iterator.make_initializer(test_dataset)


        # Input embedding matrix
        self.input_embedding_mat = tf.get_variable("input_embedding_mat",
                                                   [self.vocab_size, self.num_hidden_units],
                                                   dtype=tf.float32)

        self.input_embedded = tf.nn.embedding_lookup(self.input_embedding_mat, self.input_batch)

        # LSTM stack. Each layer gets its own cell object: repeating one
        # instance (`[cell] * num_layers`) makes every layer share the same
        # weights on TensorFlow >= 1.1.
        # NOTE: with num_layers > 1 this creates per-layer variables, so
        # checkpoints written by the weight-sharing version will not load.
        def make_cell():
            lstm = tf.contrib.rnn.LSTMCell(self.num_hidden_units, state_is_tuple=True)
            return tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob=self.dropout_rate)

        cell = tf.contrib.rnn.MultiRNNCell(
            cells=[make_cell() for _ in range(self.num_layers)],
            state_is_tuple=True)

        self.cell = cell

        # Output embedding
        self.output_embedding_mat = tf.get_variable("output_embedding_mat",
                                                    [self.vocab_size, self.num_hidden_units],
                                                    dtype=tf.float32)

        self.output_embedding_bias = tf.get_variable("output_embedding_bias",
                                                     [self.vocab_size],
                                                     dtype=tf.float32)

        # 1 where the input id is positive, 0 where it is 0.
        # Assumes ids are non-negative with 0 reserved for padding
        # (the padded batches above pad with 0) - TODO confirm.
        non_zero_weights = tf.sign(self.input_batch)
        self.valid_words = tf.reduce_sum(non_zero_weights)

        # Compute sequence length
        def get_length(non_zero_place):
            real_length = tf.reduce_sum(non_zero_place, 1)
            real_length = tf.cast(real_length, tf.int32)
            return real_length

        batch_length = get_length(non_zero_weights)


        # The shape of outputs is [batch_size, max_length, num_hidden_units]
        outputs, _ = tf.nn.dynamic_rnn(
            cell=self.cell,
            inputs=self.input_embedded,
            sequence_length=batch_length,
            dtype=tf.float32
        )

        def output_embedding(current_output):
            return tf.add(
                tf.matmul(current_output, tf.transpose(self.output_embedding_mat)), self.output_embedding_bias)

        # Project every sequence onto the vocabulary to get the logits.
        logits = tf.map_fn(output_embedding, outputs)
        logits = tf.reshape(logits, [-1, vocab_size])
        # Per-position cross entropy, zeroed at padded positions. The loss
        # is left unreduced (one value per position).
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf.reshape(self.output_batch, [-1]), logits=logits) \
               * tf.cast(tf.reshape(non_zero_weights, [-1]), tf.float32)

        self.loss = loss

        # Train: Adagrad on gradients clipped by global norm.

        params = tf.trainable_variables()

        opt = tf.train.AdagradOptimizer(self.learning_rate)
        gradients = tf.gradients(self.loss, params, colocate_gradients_with_ops=True)
        clipped_gradients, _ = tf.clip_by_global_norm(gradients, self.max_gradient_norm)
        self.updates = opt.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step)
# finstate = array_ops.concat(1, [le_state, inp_state])
# outputs, finstate = rnn.rnn(neurons_out, outputs_le, finstate, scope="out")

# official seq2seq (perfect regression)

# outputs, finstate = ss.basic_rnn_seq2seq(inputs, targets, neurons)

# Regression loss: add up tf.nn.l2_loss (half the sum of squared errors) of
# every (output, target) pair, then normalise by the number of BPTT steps,
# the batch size and the net size.
loss = tf.add_n([
    tf.nn.l2_loss(target - output) for output, target in zip(outputs, targets)
]) / bptt_steps / batch_size / net_size

# The learning rate is a non-trainable variable that starts at 0.0;
# presumably it is assigned a real value elsewhere before training - TODO confirm.
lr = tf.Variable(0.0, trainable=False)

# Gradients of the loss w.r.t. every trainable variable, clipped so that
# their global norm does not exceed 5.0.
tvars = tf.trainable_variables()
grads_raw = tf.gradients(loss, tvars)
grads, _ = tf.clip_by_global_norm(grads_raw, 5.0)

# Adam is the active optimizer; the commented-out lines are alternatives.
# optimizer = tf.train.GradientDescentOptimizer(lr)
# optimizer = tf.train.AdagradOptimizer(lr)
optimizer = tf.train.AdamOptimizer(lr)
# optimizer = tf.train.RMSPropOptimizer(lr)
# optimizer = tf.train.AdadeltaOptimizer(lr)

train_step = optimizer.apply_gradients(zip(grads, tvars))

# Load the training array from $HOME/Music/ml/test_licks.data.npy.
train_data = np.load(
    pj(os.environ["HOME"], "Music", "ml", "test_licks.data.npy"))
# The first axis of the array is taken as the input size.
input_size = train_data.shape[0]
# Arrange the data into batches for truncated BPTT
# (exact layout depends on dispatch_array, which is not visible here).
corpus = dispatch_array(train_data, bptt_steps, batch_size)

# NOTE(review): the meaning of `sfn` is not visible in this fragment.
sfn = 9
    def build(self):
        """Build the tagger graph: placeholders, encoders, loss and train ops.

        Creates the input placeholders, combines word embeddings with a
        character-level LSTM (plus optional character attention), runs the
        word-level LSTM and label prediction, then defines the loss (CRF
        negative log-likelihood or masked softmax cross entropy) and the
        training ops `self.total_train_op` and `self.word_train_op`.

        Reads from `self.params`: 'char_attention', 'dropout',
        'use_crf_loss', 'lr_method', 'lr_rate', 'clip_norm' and, for the
        momentum optimizer, 'momentum'.

        Raises:
            ValueError: If `self.params['lr_method']` is not one of 'adam',
                'adagrad', 'adadelta', 'sgd', 'rmsprop' or 'momentum'.
        """
        self.word_input_ids = tf.placeholder(tf.int32, [None, None],
                                             name='word_input')
        self.tag_input_ids = tf.placeholder(tf.int32, [None, None],
                                            name='tag_input')
        self.dropout_keep_prob = tf.placeholder(tf.float32,
                                                name='dropout_keep_prob')
        self.sequence_lengths = tf.placeholder(tf.int32, [None],
                                               name='sequence_lengths')
        self.char_input_ids = tf.placeholder(tf.int32, [None, None, None],
                                             name='char_input')
        self.word_lengths = tf.placeholder(tf.int32, [None, None],
                                           name='word_lengths')

        # word embedding
        embedded_words = self._word_embedding(self.word_input_ids)
        self.batch_size = tf.shape(embedded_words)[0]
        self.max_sent_len = tf.shape(embedded_words)[1]

        # char embedding
        embedded_chars = self._char_embedding(self.char_input_ids)
        self.max_char_len = tf.shape(embedded_chars)[2]

        char_output, char_hiddens = self._char_lstm(embedded_chars,
                                                    self.word_lengths)
        word_lstm_input = tf.concat([embedded_words, char_output], axis=-1)

        if self.params['char_attention']:
            context, self.batch_alphas = self._char_attention_layer(
                embedded_words, char_hiddens, self.word_lengths)
            word_lstm_input = tf.concat([word_lstm_input, context], axis=-1)

        if self.params['dropout']:
            word_lstm_input = tf.nn.dropout(word_lstm_input,
                                            self.dropout_keep_prob)

        word_bilstm_output = self._word_lstm(word_lstm_input,
                                             self.sequence_lengths)

        self.logits = self._label_prediction(word_bilstm_output)

        with tf.variable_scope('loss') as loss_scope:
            if self.params['use_crf_loss']:
                log_likelihood, self.transition_params = tf.contrib.crf.crf_log_likelihood(
                    self.logits, self.tag_input_ids, self.sequence_lengths)
                self.word_loss = tf.reduce_mean(-log_likelihood,
                                                name='crf_negloglik_loss')
            else:
                # Softmax loss, averaged over the non-padded positions only.
                self.pred_tags = tf.cast(tf.argmax(self.logits, axis=-1),
                                         tf.int32)
                losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=self.tag_input_ids)
                mask = tf.sequence_mask(self.sequence_lengths)
                losses = tf.boolean_mask(losses, mask)
                self.word_loss = tf.reduce_mean(losses)

            # Single pre-formatted argument so the output is identical under
            # both the Python 2 print statement and the Python 3 function.
            print('%s %s' % (loss_scope.name,
                             tf.get_collection(
                                 tf.GraphKeys.TRAINABLE_VARIABLES,
                                 scope=loss_scope.name)))

        self.total_loss = self.word_loss

        # optimization
        optimizer_classes = {
            'adam': tf.train.AdamOptimizer,
            'adagrad': tf.train.AdagradOptimizer,
            'adadelta': tf.train.AdadeltaOptimizer,
            'sgd': tf.train.GradientDescentOptimizer,
            'rmsprop': tf.train.RMSPropOptimizer,
            'momentum': tf.train.MomentumOptimizer,
        }
        lr_method = self.params['lr_method'].lower()
        if lr_method not in optimizer_classes:
            # Fail here with a clear message instead of hitting an
            # unbound-variable error further down.
            raise ValueError('Unknown lr_method: {}'.format(
                self.params['lr_method']))

        optimizer_args = [self.params['lr_rate']]
        if lr_method == 'momentum':
            optimizer_args.append(self.params['momentum'])

        # Two separate optimizer instances so that each train op keeps its
        # own optimizer state.
        optimizer_total = optimizer_classes[lr_method](*optimizer_args)
        optimizer_word = optimizer_classes[lr_method](*optimizer_args)

        if self.params['clip_norm'] > 0:
            # Clip by global norm before applying the gradients.
            grads, grad_vars = zip(
                *optimizer_total.compute_gradients(self.total_loss))
            grads, _ = tf.clip_by_global_norm(grads,
                                              self.params['clip_norm'])
            self.total_train_op = optimizer_total.apply_gradients(
                zip(grads, grad_vars))

            grads, grad_vars = zip(
                *optimizer_word.compute_gradients(self.word_loss))
            grads, _ = tf.clip_by_global_norm(grads,
                                              self.params['clip_norm'])
            self.word_train_op = optimizer_word.apply_gradients(
                zip(grads, grad_vars))
        else:
            self.total_train_op = optimizer_total.minimize(self.total_loss)
            self.word_train_op = optimizer_word.minimize(self.word_loss)
Example #53
0
def train():
    """
    Train the NER model.

    Loads the vocabularies, defines the hyper-parameter flags, builds the
    model and an Adam optimizer with global-norm gradient clipping, restores
    the latest checkpoint from ./ckpt_3/ when one exists, then runs the
    training loop. Every `evaluate_every` steps the dev set is evaluated and
    a checkpoint is saved whenever the summed dev loss improves.

    :return: None
    """
    char2id, ner2id, pos2id = load_dict(char_dict="train_data_4/char2id.json",
                                        ner_dict="train_data_4/ner2id.json",
                                        pos_dict="train_data_4/pos2id.json")
    # NOTE(review): several help strings below quote defaults that differ
    # from the actual default values passed to the flag.
    # tf.flags.DEFINE_string("data_dir", "data/data.dat", "data directory")
    tf.flags.DEFINE_integer("vocab_size_c", len(char2id), "vocabulary size")
    tf.flags.DEFINE_integer("vocab_size_p", len(pos2id), "vocabulary size")
    tf.flags.DEFINE_integer("num_classes", len(ner2id), "number of classes")
    tf.flags.DEFINE_integer("max_num", 384, "max_sentence_num")
    tf.flags.DEFINE_integer(
        "embedding_size_c", 256,
        "Dimensionality of character embedding (default: 200)")
    tf.flags.DEFINE_integer(
        "embedding_size_p", 256,
        "Dimensionality of character embedding (default: 200)")
    tf.flags.DEFINE_integer(
        "hidden_size", 128, "Dimensionality of GRU hidden layer (default: 50)")
    tf.flags.DEFINE_integer("batch_size", 256, "Batch Size (default: 64)")
    tf.flags.DEFINE_integer("num_epochs", 10,
                            "Number of training epochs (default: 50)")
    tf.flags.DEFINE_integer("checkpoint_every", 100,
                            "Save model after this many steps (default: 100)")
    tf.flags.DEFINE_integer("num_checkpoints", 3,
                            "Number of checkpoints to store (default: 5)")
    tf.flags.DEFINE_integer("evaluate_every", 300,
                            "evaluate every this many batches")
    tf.flags.DEFINE_float("learning_rate", 0.01, "learning rate")
    tf.flags.DEFINE_float("grad_clip", 5,
                          "grad clip to prevent gradient explode")
    FLAGS = tf.flags.FLAGS
    # NOTE(review): `config` is not defined inside this function; presumably
    # a module-level session config - confirm.
    with tf.Session(config=config) as sess:
        ner = NER(vocab_size_c=FLAGS.vocab_size_c,
                  vocab_size_p=FLAGS.vocab_size_p,
                  num_classes=FLAGS.num_classes,
                  embedding_size_c=FLAGS.embedding_size_c,
                  embedding_size_p=FLAGS.embedding_size_p,
                  hidden_size=FLAGS.hidden_size,
                  max_num=FLAGS.max_num)

        # The optimizer is defined here, outside the model.
        global_step = tf.Variable(0, trainable=False)
        optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)
        # Gradient clipping, commonly used with RNNs, to guard against
        # exploding gradients.
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(ner.loss, tvars),
                                          FLAGS.grad_clip)
        grads_and_vars = tuple(zip(grads, tvars))
        train_op = optimizer.apply_gradients(grads_and_vars,
                                             global_step=global_step)

        saver = tf.train.Saver(tf.global_variables(),
                               max_to_keep=FLAGS.num_checkpoints)
        if not os.path.exists('./ckpt_3/'):
            os.makedirs("./ckpt_3/")

        # Restore the model from a checkpoint, or initialise the parameters
        # from scratch when no checkpoint exists.
        # model_file = tf.train.latest_checkpoint('./ckpt/')
        ckpt = tf.train.get_checkpoint_state('./ckpt_3/')
        if ckpt:
            print("load saved model:\t", ckpt.model_checkpoint_path)
            # NOTE(review): this replaces the saver created above, so the
            # `max_to_keep` setting no longer applies on this path.
            saver = tf.train.Saver()
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print("init model...")
            sess.run(tf.global_variables_initializer())

        def extract(p):
            """Convert a tag-id sequence into a list of (start, end) spans.

            A value of 0 stops the scan (presumably padding - confirm) and a
            value of 1 closes any open span (presumably the "outside" tag -
            confirm). Any other value belongs to an entity; a change of value
            closes the current span and opens a new one. `end` is the index
            one past the last position of the span.

            NOTE(review): a span still open when the sequence ends without a
            0 or 1 is not emitted.
            """

            IOS = []
            index = 0
            start = None
            for i in p:
                if i == 0:
                    if start is None:
                        pass
                    else:
                        IOS.append((start, index))
                    break
                elif i == 1:
                    if start is None:
                        pass
                    else:
                        if index > 0:
                            IOS.append((start, index))
                        start = None
                else:  # part of an entity
                    if start is None:
                        start = index
                    else:
                        if i == p[index - 1]:
                            pass
                        else:
                            IOS.append((start, index))
                            start = index
                index += 1

            return IOS

        def evaluate(viterbi_sequence, Y):
            '''
            Count span-level matches between predictions and gold labels.

            :return: (TP, P_, R_) - the number of predicted spans that also
                appear in the gold spans, the number of predicted spans, and
                the number of gold spans.
            '''
            TP = 0
            P_ = 0
            R_ = 0
            for p, y in zip(viterbi_sequence, Y):
                # Extract the spans of the predicted and the gold sequence.
                pre_ = extract(p)
                tru_ = extract(y)
                # Count the predicted spans that also occur in the gold spans.
                comm = [i for i in pre_ if i in tru_]
                TP += len(comm)
                P_ += len(pre_)
                R_ += len(tru_)
                # l = len(np.nonzero(y))
                # # compute the accuracy from the two sequences
                # t_all += l
                # t_true += np.sum(np.equal(p[:l], y[:l]))

            return TP, P_, R_

        def train_step(x, pos, y):
            """Run one optimisation step, print batch P/R/F, return the step."""
            feed_dict = {
                ner.input_chars: x,
                ner.input_pos: pos,
                ner.output: y,
                ner.is_training: True,
            }
            _, step, predicts_t, cost, accuracy = sess.run([
                train_op, global_step, ner.viterbi_sequence, ner.loss, ner.acc
            ], feed_dict)
            tp, p_, r_ = evaluate(np.array(predicts_t), y)
            time_str = str(int(time.time()))
            # Precision, recall and F1, each 0 when its denominator is 0.
            p = float(tp) / p_ if p_ else 0
            r = float(tp) / r_ if r_ else 0
            if p + r:
                f = 2 * p * r / (p + r)
            else:
                f = 0
            print("{}: step {}, loss {},  p {}, r {}, f {}".format(
                time_str, step, cost, p, r, f))
            # train_summary_writer.add_summary(summaries, step)
            return step

        def dev_step(x, pos, y, writer=None):
            """Evaluate one dev batch without training.

            Returns the batch loss together with the TP / predicted / gold
            span counts. `writer` is accepted but not used.
            """
            feed_dict = {
                ner.input_chars: x,
                ner.input_pos: pos,
                ner.output: y,
                ner.is_training: False,
            }
            step, predicts_d, cost, accuracy = sess.run(
                [global_step, ner.viterbi_sequence, ner.loss, ner.acc],
                feed_dict)

            tp, p_, r_ = evaluate(np.array(predicts_d), y)

            time_str = str(int(time.time()))
            p = float(tp) / p_ if p_ else 0
            r = float(tp) / r_ if r_ else 0
            if p + r:
                f = 2 * p * r / (p + r)
            else:
                f = 0
            print("+dev+{}: step {}, loss {}, p {}, r {}, f {}".format(
                time_str, step, cost, p, r, f))
            # time_str = str(int(time.time()))
            # print("+dev+{}: step {}, loss {}, f_acc {}, t_acc {}".format(time_str, step, cost, accuracy, acc_d))
            return cost, tp, p_, r_

        # NOTE(review): these two names are never read in this function.
        best_accuracy, best_at_step = 0, 0

        # Hard-coded dataset sizes, used to derive the number of steps.
        train_example_len = 173109
        dev_example_len = 21639
        num_train_steps = int(train_example_len / FLAGS.batch_size *
                              FLAGS.num_epochs)
        num_dev_steps = int(dev_example_len / FLAGS.batch_size)

        # Best (lowest) summed dev loss seen so far.
        min_loss = 99999

        input_ids_train, input_pos_train, output_types_train = get_input_data(
            "./train_data_4/train_ner.tf_record", FLAGS.batch_size)
        input_ids_dev, input_pos_dev, output_types_dev = get_input_data(
            "./train_data_4/dev_ner.tf_record", FLAGS.batch_size)
        for i in range(num_train_steps):
            # Fetch one training batch.
            input_ids_train_, input_pos_train_, output_types_train_ = sess.run(
                [input_ids_train, input_pos_train, output_types_train])
            step = train_step(input_ids_train_, input_pos_train_,
                              output_types_train_)
            if step % FLAGS.evaluate_every == 0:
                # The dev set is too large to evaluate in one go, so it is
                # processed in batches as well.
                TP = 0
                P_ = 0
                R_ = 0
                total_loss = 0
                for j in range(num_dev_steps):
                    input_ids_dev_, input_pos_dev_, output_types_dev_ = sess.run(
                        [input_ids_dev, input_pos_dev, output_types_dev])
                    loss, tp, p_, r_ = dev_step(input_ids_dev_, input_pos_dev_,
                                                output_types_dev_)
                    TP += tp
                    P_ += p_
                    R_ += r_
                    total_loss += loss
                    # total_dev_correct += count
                    # total_devs += total
                # Precision / recall / F1 over the whole dev pass.
                p = float(TP) / P_ if P_ else 0
                r = float(TP) / R_ if R_ else 0
                f = 2 * p * r / (p + r) if p + r else 0
                print("tp:p", TP, p)
                print("p_:r", P_, r)
                print("r_:f", R_, f)
                # Checkpoint only when the summed dev loss improves.
                if total_loss < min_loss:
                    print("save model:\t%f\t>%f\t%f\t>%f" %
                          (total_loss, p, r, f))
                    min_loss = total_loss
                    saver.save(sess, './ckpt_3/ner.ckpt', global_step=step)

        # NOTE(review): redundant - the enclosing `with` block already closes
        # the session on exit.
        sess.close()
Example #54
0
def train():
    """Train the `Very_deep_cnn` text classifier and evaluate it periodically.

    Builds the model, the train/test input pipelines and a momentum optimizer
    with global-norm gradient clipping in a fresh graph, then runs
    ``FLAGS.num_epochs`` epochs. Every ``FLAGS.test_interval`` epochs the whole
    test set is evaluated: the sample-weighted mean loss/accuracy and the
    accumulated confusion matrix are written to ``logs.txt`` and TensorBoard.
    A checkpoint is saved whenever the test loss improves by more than
    ``FLAGS.es_min_delta``; training stops early when more than
    ``FLAGS.es_patience`` (if positive) epochs have passed since the best one.

    Side effects:
        * ``FLAGS.log_path`` is deleted (if present) and recreated.
        * ``FLAGS.saved_path`` is created if missing; ``logs.txt`` in it is
          overwritten and checkpoints are written there.
    """
    num_classes = get_num_classes(FLAGS.train_set)
    model = Very_deep_cnn(batch_size=FLAGS.batch_size,
                          num_classes=num_classes,
                          depth=FLAGS.depth,
                          num_embedding=len(FLAGS.alphabet))

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        session_conf.gpu_options.allow_growth = True

        training_set, num_training_iters = create_dataset(
            FLAGS.train_set, FLAGS.alphabet, FLAGS.max_length,
            FLAGS.batch_size, True)
        test_set, num_test_iters = create_dataset(FLAGS.test_set,
                                                  FLAGS.alphabet,
                                                  FLAGS.max_length,
                                                  FLAGS.batch_size, False)
        train_iterator = training_set.make_initializable_iterator()
        test_iterator = test_set.make_initializable_iterator()

        # Feedable iterator: the same graph reads from either dataset,
        # selected per `sess.run` call through the `handle` placeholder.
        handle = tf.placeholder(tf.string, shape=[])
        is_training = tf.placeholder(tf.bool, name='is_training')

        iterator = tf.data.Iterator.from_string_handle(
            handle, training_set.output_types, training_set.output_shapes)
        texts, labels = iterator.get_next()

        logits = model.forward(texts, is_training)
        loss = model.loss(logits, labels)
        loss_summary = tf.summary.scalar("loss", loss)
        accuracy = model.accuracy(logits, labels)
        accuracy_sumary = tf.summary.scalar("accuracy", accuracy)
        # Dynamic batch size: used to weight per-batch test metrics.
        batch_size = tf.unstack(tf.shape(texts))[0]
        confusion = model.confusion_matrix(logits, labels)
        global_step = tf.Variable(0, name="global_step", trainable=False)

        # Make every op registered in UPDATE_OPS run before a training step.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            lr = tf.train.exponential_decay(FLAGS.lr,
                                            global_step,
                                            FLAGS.num_epochs *
                                            num_training_iters,
                                            0.96,
                                            staircase=True)
            optimizer = tf.train.MomentumOptimizer(lr, FLAGS.momentum)
            gradients, variables = zip(*optimizer.compute_gradients(loss))
            gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
            train_op = optimizer.apply_gradients(zip(gradients, variables),
                                                 global_step=global_step)

        merged = tf.summary.merge([loss_summary, accuracy_sumary])
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        # Build the handle ops once. Calling `string_handle()` inside the
        # epoch loop would add two new ops to the graph every epoch.
        train_handle_op = train_iterator.string_handle()
        test_handle_op = test_iterator.string_handle()

        if os.path.isdir(FLAGS.log_path):
            shutil.rmtree(FLAGS.log_path)
        os.makedirs(FLAGS.log_path)
        # The checkpoint directory is kept across runs (not wiped), so it must
        # only be created when it does not exist yet.
        if not os.path.isdir(FLAGS.saved_path):
            os.makedirs(FLAGS.saved_path)

        best_loss = 1e5
        best_epoch = 0
        # `with` guarantees the log file is closed even if training raises.
        with open(FLAGS.saved_path + os.sep + "logs.txt", "w") as output_file:
            output_file.write("Model's parameters: {}".format(
                FLAGS.flag_values_dict()))
            with tf.Session(config=session_conf) as sess:
                train_writer = tf.summary.FileWriter(
                    FLAGS.log_path + os.sep + 'train', sess.graph)
                test_writer = tf.summary.FileWriter(FLAGS.log_path + os.sep +
                                                    'test')
                sess.run(init)
                # The string handles stay valid when the iterators are
                # re-initialized, so they are fetched a single time.
                train_handle = sess.run(train_handle_op)
                test_handle = sess.run(test_handle_op)
                for epoch in range(FLAGS.num_epochs):
                    sess.run(train_iterator.initializer)
                    sess.run(test_iterator.initializer)
                    train_iter = 0
                    # One pass over the training set; the iterator signals
                    # exhaustion with OutOfRangeError.
                    while True:
                        try:
                            _, tr_loss, tr_accuracy, summary, step = sess.run(
                                [train_op, loss, accuracy, merged, global_step],
                                feed_dict={
                                    handle: train_handle,
                                    is_training: True
                                })
                            print(
                                "Epoch: {}/{}, Iteration: {}/{}, Loss: {}, Accuracy: {}"
                                .format(epoch + 1, FLAGS.num_epochs,
                                        train_iter + 1, num_training_iters,
                                        tr_loss, tr_accuracy))
                            train_writer.add_summary(summary, step)
                            train_iter += 1
                        except (tf.errors.OutOfRangeError, StopIteration):
                            break
                    if epoch % FLAGS.test_interval == 0:
                        loss_ls = []
                        # Named differently from the `loss_summary` tensor
                        # above to avoid shadowing it.
                        test_loss_summary = tf.Summary()
                        accuracy_ls = []
                        accuracy_summary = tf.Summary()
                        confusion_matrix = np.zeros([num_classes, num_classes],
                                                    np.int32)
                        num_samples = 0
                        while True:
                            try:
                                test_loss, test_accuracy, test_confusion, samples = sess.run(
                                    [loss, accuracy, confusion, batch_size],
                                    feed_dict={
                                        handle: test_handle,
                                        is_training: False
                                    })
                                # Weight by batch size so a smaller final
                                # batch does not skew the means.
                                loss_ls.append(test_loss * samples)
                                accuracy_ls.append(test_accuracy * samples)
                                confusion_matrix += test_confusion
                                num_samples += samples
                            except (tf.errors.OutOfRangeError, StopIteration):
                                break

                        mean_test_loss = sum(loss_ls) / num_samples
                        test_loss_summary.value.add(tag='loss',
                                                    simple_value=mean_test_loss)
                        test_writer.add_summary(test_loss_summary, epoch)
                        mean_test_accuracy = sum(accuracy_ls) / num_samples
                        accuracy_summary.value.add(
                            tag='accuracy', simple_value=mean_test_accuracy)
                        test_writer.add_summary(accuracy_summary, epoch)

                        output_file.write(
                            "Epoch: {}/{} \nTest loss: {} Test accuracy: {} \nTest confusion matrix: \n{}\n\n"
                            .format(epoch + 1, FLAGS.num_epochs, mean_test_loss,
                                    mean_test_accuracy, confusion_matrix))
                        print("Epoch: {}/{}, Final loss: {}, Final accuracy: {}".
                              format(epoch + 1, FLAGS.num_epochs, mean_test_loss,
                                     mean_test_accuracy))
                        if mean_test_loss + FLAGS.es_min_delta < best_loss:
                            best_loss = mean_test_loss
                            best_epoch = epoch
                            saver.save(
                                sess,
                                "{}/char_level_cnn".format(FLAGS.saved_path))
                        # Early stopping, active only when es_patience > 0.
                        if epoch - best_epoch > FLAGS.es_patience > 0:
                            print(
                                "Stop training at epoch {}. The lowest loss achieved is {} at epoch {}"
                                .format(epoch, best_loss, best_epoch))
                            break

                # Flush pending events to disk before the process exits.
                train_writer.close()
                test_writer.close()
Example #55
0
def optimize(loss,
             global_step,
             max_grad_norm,
             lr,
             lr_decay,
             sync_replicas=False,
             replicas_to_aggregate=1,
             task_id=0):
    """Build the optimization sub-graph and return its training op.

    Steps performed, all under the ``optimization`` name scope:

    * gradients of ``loss`` w.r.t. every trainable variable are computed;
    * gradients of variables whose op name does not contain ``'embedding'``
      are clipped by global norm; embedding gradients are left untouched;
    * the (gradient, variable) pairs are handed to
      ``_summarize_vars_and_grads``;
    * an Adam optimizer with an exponentially decayed learning rate applies
      the gradients, optionally wrapped in ``SyncReplicasOptimizer``;
    * an exponential moving average (decay 0.999) of all trainable variables
      is maintained.

    Args:
        loss: scalar loss to minimize.
        global_step: integer scalar Variable.
        max_grad_norm: float scalar, global-norm threshold for the
            non-embedding gradients.
        lr: float scalar, initial learning rate.
        lr_decay: float scalar, decay rate applied every step.
        sync_replicas: bool, whether to use SyncReplicasOptimizer.
        replicas_to_aggregate: int, number of replicas to aggregate when
            using SyncReplicasOptimizer (also used as the total replica
            count).
        task_id: int, id of the current task; task 0 is treated as chief when
            registering the SyncReplicasOptimizer init ops.

    Returns:
        A no-op named ``train_op`` that depends on the gradient update (and,
        in the non-sync case, on the moving-average update).
    """
    with tf.name_scope('optimization'):
        trainable = tf.trainable_variables()
        raw_grads = tf.gradients(
            loss,
            trainable,
            aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N)

        # Split the pairs in one pass: embedding gradients bypass clipping.
        embedding_pairs = []
        other_pairs = []
        for grad, var in zip(raw_grads, trainable):
            if 'embedding' in var.op.name:
                embedding_pairs.append((grad, var))
            else:
                other_pairs.append((grad, var))

        other_grads, other_vars = zip(*other_pairs)
        clipped_grads, _ = tf.clip_by_global_norm(other_grads, max_grad_norm)
        other_pairs = list(zip(clipped_grads, other_vars))

        grads_and_vars = embedding_pairs + other_pairs

        _summarize_vars_and_grads(grads_and_vars)

        # Staircase decay with a period of one step.
        decayed_lr = tf.train.exponential_decay(lr,
                                                global_step,
                                                1,
                                                lr_decay,
                                                staircase=True)
        tf.summary.scalar('learning_rate', decayed_lr)
        opt = tf.train.AdamOptimizer(decayed_lr)

        ema = tf.train.ExponentialMovingAverage(0.999, global_step)

        if not sync_replicas:
            # Plain optimizer: the averages are updated by an explicit op.
            averages_op = ema.apply(trainable)
            apply_op = opt.apply_gradients(grads_and_vars, global_step)
            with tf.control_dependencies([apply_op, averages_op]):
                return tf.no_op(name='train_op')

        # Synchronous replicas: the wrapper takes care of the averages.
        opt = tf.train.SyncReplicasOptimizer(
            opt,
            replicas_to_aggregate,
            variable_averages=ema,
            variables_to_average=trainable,
            total_num_replicas=replicas_to_aggregate)
        apply_op = opt.apply_gradients(grads_and_vars,
                                       global_step=global_step)
        with tf.control_dependencies([apply_op]):
            train_op = tf.no_op(name='train_op')

        # Register the ops needed to initialize SyncReplicasOptimizer.
        tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS,
                             opt.get_chief_queue_runner())
        is_chief = task_id == 0
        if is_chief:
            local_init_op = opt.chief_init_op
            tf.add_to_collection('chief_init_op', opt.get_init_tokens_op())
        else:
            local_init_op = opt.local_step_init_op
        tf.add_to_collection('local_init_op', local_init_op)
        tf.add_to_collection('ready_for_local_init_op',
                             opt.ready_for_local_init_op)

        return train_op
Example #56
0
    def __init__(self, args, infer=False):
        """Build the RNN language-model graph.

        Args:
            args: configuration object; this constructor reads ``model``,
                ``device``, ``num_layers``, ``rnn_size``, ``vocab_size``,
                ``batch_size``, ``seq_length``, ``input_keep_prob``,
                ``output_keep_prob`` and ``grad_clip``.
            infer: when True, ``args.batch_size`` and ``args.seq_length`` are
                overwritten with 1 (``args`` is mutated), dropout is skipped
                and the decoder feeds its own arg-max prediction back as the
                next input.

        Raises:
            Exception: if ``args.model`` is not one of ``rnn``, ``gru``,
                ``lstm`` or ``nas``.
        """
        self.args = args
        if infer:
            args.batch_size = 1
            args.seq_length = 1

        # Resolve the cell class lazily so that only the requested class is
        # looked up on the `rnn` module.
        supported_cells = (('rnn', 'BasicRNNCell'), ('gru', 'GRUCell'),
                           ('lstm', 'BasicLSTMCell'), ('nas', 'NASCell'))
        for model_name, cell_attr in supported_cells:
            if args.model == model_name:
                cell_fn = getattr(rnn, cell_attr)
                break
        else:
            raise Exception("model type not supported: {}".format(args.model))

        with tf.device(args.device):

            def make_layer():
                """Return one recurrent layer, dropout-wrapped when training."""
                layer = cell_fn(args.rnn_size)
                if not infer and (args.output_keep_prob < 1.0
                                  or args.input_keep_prob < 1.0):
                    layer = rnn.DropoutWrapper(
                        layer,
                        input_keep_prob=args.input_keep_prob,
                        output_keep_prob=args.output_keep_prob)
                return layer

            layers = [make_layer() for _ in range(args.num_layers)]
            stacked = rnn.MultiRNNCell(layers, state_is_tuple=True)
            self.cell = stacked

            io_shape = [args.batch_size, args.seq_length]
            self.input_data = tf.placeholder(tf.int32, io_shape, name='input')
            self.targets = tf.placeholder(tf.int32, io_shape, name='target')
            self.initial_state = stacked.zero_state(args.batch_size,
                                                    tf.float32)

            # Output projection from the RNN state to vocabulary logits.
            with tf.variable_scope('rnnlm'):
                softmax_w = tf.get_variable("softmax_w",
                                            [args.rnn_size, args.vocab_size])
                softmax_b = tf.get_variable("softmax_b", [args.vocab_size])

            embedding = tf.get_variable("embedding",
                                        [args.vocab_size, args.rnn_size])
            embedded = tf.nn.embedding_lookup(embedding, self.input_data)

            if not infer and args.output_keep_prob:
                embedded = tf.nn.dropout(embedded, args.output_keep_prob)

            # One [batch, rnn_size] tensor per time step.
            per_step = tf.split(embedded, args.seq_length, 1)
            step_inputs = [tf.squeeze(piece, [1]) for piece in per_step]

            def feed_previous(prev, _):
                """Embed the arg-max symbol predicted from the previous output."""
                prev_logits = tf.matmul(prev, softmax_w) + softmax_b
                best_symbol = tf.stop_gradient(tf.argmax(prev_logits, 1))
                return tf.nn.embedding_lookup(embedding, best_symbol)

            loop_function = feed_previous if infer else None
            outputs, last_state = legacy_seq2seq.rnn_decoder(
                step_inputs,
                self.initial_state,
                stacked,
                loop_function=loop_function,
                scope='rnnlm')
            joined = tf.concat(outputs, 1)
            flat_output = tf.reshape(joined, [-1, args.rnn_size])
            self.logits = tf.matmul(flat_output, softmax_w) + softmax_b
            self.probs = tf.nn.softmax(self.logits)

            flat_targets = tf.reshape(self.targets, [-1])
            unit_weights = tf.ones([args.batch_size * args.seq_length])
            loss = legacy_seq2seq.sequence_loss_by_example(
                [self.logits], [flat_targets], [unit_weights],
                args.vocab_size)

            self.word_len = tf.placeholder(tf.int32,
                                           shape=[args.batch_size],
                                           name='word_lengths')

            # Zero out the loss at positions beyond each sequence's length.
            mask = tf.sequence_mask(self.word_len,
                                    args.seq_length,
                                    dtype=tf.float32)
            flat_mask = tf.reshape(mask, [-1])
            loss = tf.multiply(flat_mask, loss)

            self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
            self.final_state = last_state
            self.lr = tf.Variable(0.0, trainable=False)
            tvars = tf.trainable_variables()
            raw_grads = tf.gradients(self.cost, tvars)
            grads, _ = tf.clip_by_global_norm(raw_grads, args.grad_clip)
            with tf.name_scope('optimizer'):
                optimizer = tf.train.AdamOptimizer(self.lr)
            self.train_op = optimizer.apply_gradients(zip(grads, tvars))

        # TensorBoard instrumentation.
        tf.summary.histogram('logits', self.logits)
        tf.summary.histogram('loss', loss)
        tf.summary.scalar('train_loss', self.cost)
Example #57
0
    def __init__(self, is_training, config, ptb_input):
        """Build the stacked-LSTM language-model graph.

        Args:
            is_training: when truthy, dropout is enabled (if
                ``config.keep_prob < 1``) and the optimizer, learning-rate
                update op and merged summaries are created.
            config: hyper-parameters; this constructor reads ``hidden_size``,
                ``vocab_size``, ``keep_prob``, ``num_layers`` and
                ``max_grad_norm``.
            ptb_input: input pipeline providing ``batch_size``, ``num_steps``,
                ``input_data`` and ``targets``.
        """
        self._input = ptb_input
        self._is_training = is_training

        batch_size = ptb_input.batch_size
        num_steps = ptb_input.num_steps  # number of unrolled time steps
        hidden_size = config.hidden_size  # units per LSTM cell
        vocab_size = config.vocab_size  # width of the output layer

        apply_dropout = is_training and config.keep_prob < 1

        def build_layer():
            """Return one LSTM layer, wrapped with output dropout if enabled."""
            base = rnn.BasicLSTMCell(hidden_size, forget_bias=0.0,
                                     reuse=tf.get_variable_scope().reuse)
            if apply_dropout:
                return rnn.DropoutWrapper(base,
                                          output_keep_prob=config.keep_prob)
            return base

        # Stack the layers: each layer's output is the next layer's input.
        layers = []
        for _ in range(config.num_layers):
            layers.append(build_layer())
        cell = rnn.MultiRNNCell(layers)

        # Zero state for the stacked cell.
        self._initial_state = cell.zero_state(batch_size, tf.float32)

        # Input embedding, pinned to the CPU.
        with tf.device('/cpu:0'):
            embedding = tf.get_variable('embedding', (vocab_size, hidden_size),
                                        tf.float32)
            # Rank-3 tensor indexed below as [:, time_step, :]; presumably
            # [batch_size, num_steps, hidden_size] - confirm against ptb_input.
            inputs = tf.nn.embedding_lookup(embedding, ptb_input.input_data)
        if apply_dropout:
            inputs = tf.nn.dropout(inputs, config.keep_prob)

        # Unroll the network by hand, sharing variables after the first step.
        step_outputs = []
        state = self._initial_state
        with tf.variable_scope('RNN'):
            for time_step in range(num_steps):
                if time_step > 0:
                    tf.get_variable_scope().reuse_variables()
                step_input = inputs[:, time_step, :]
                cell_output, state = cell(step_input, state)
                step_outputs.append(cell_output)
        # Concatenate the steps on axis 1, then flatten to one row per
        # (example, step).
        joined = tf.concat(step_outputs, 1)
        outputs_flat = tf.reshape(joined, (-1, hidden_size))

        # Output layer producing [rows, vocab_size] logits.
        softmax_w = tf.get_variable('softmax_w', (hidden_size, vocab_size),
                                    tf.float32)
        softmax_b = tf.get_variable('sotfmax_b', [vocab_size], tf.float32)
        logits = tf.nn.bias_add(tf.matmul(outputs_flat, softmax_w), softmax_b)
        # Per-position loss with uniform weights.
        flat_targets = tf.reshape(ptb_input.targets, [-1])
        unit_weights = tf.ones([batch_size * num_steps])
        loss = legacy_seq2seq.sequence_loss_by_example(
            [logits], [flat_targets], [unit_weights])
        self._cost = tf.reduce_sum(loss) / batch_size
        self._final_state = state
        tf.summary.histogram('softmax_w', softmax_w)
        tf.summary.histogram('softmax_b', softmax_b)
        tf.summary.scalar('cost', self._cost)

        # Evaluation-only models need no optimizer.
        if not is_training:
            return

        # Optimization: SGD on globally clipped gradients.
        self._lr = tf.Variable(0.0, trainable=False)
        trainable_var = tf.trainable_variables()
        raw_grads = tf.gradients(self.cost, trainable_var)
        grads, _ = tf.clip_by_global_norm(raw_grads, config.max_grad_norm)
        optimizer = tf.train.GradientDescentOptimizer(self.lr)
        step_counter = framework.get_or_create_global_step()
        self._train_op = optimizer.apply_gradients(zip(grads, trainable_var),
                                                   global_step=step_counter)
        # Feed `new_lr` and run `lr_update` to change the learning rate.
        self.new_lr = tf.placeholder(tf.float32, [], name='new_learing_rate')
        self.lr_update = tf.assign(self.lr, self.new_lr)
        tf.summary.scalar('lr', self._lr)
        self._merge = tf.summary.merge_all()
Example #58
0
    def __init__(self,
                 source_vocab_size,
                 target_vocab_size,
                 buckets,
                 state_size,
                 num_layers,
                 embedding_size,
                 max_gradient,
                 batch_size,
                 learning_rate,
                 forward_only=False,
                 dtype=tf.float32):
        """Build a bidirectional-GRU encoder / attention-GRU decoder graph.

        Args:
            source_vocab_size: number of rows of the encoder embedding.
            target_vocab_size: number of rows of the decoder embedding and
                size of the decoder output projection.
            buckets: stored on ``self.buckets``; not otherwise used here.
            state_size: number of units of every GRU cell and of the
                attention mechanism.
            num_layers: accepted but not used by this constructor.
            embedding_size: width of both embedding matrices.
            max_gradient: global-norm threshold for gradient clipping.
            batch_size: static batch dimension of all placeholders.
            learning_rate: learning rate passed to the Adadelta optimizer.
            forward_only: if False, build the training graph (teacher-forced
                decoding, loss, update op); if True, build greedy decoding
                plus a single decoder step intended for beam search.
            dtype: dtype used for the ``seq2seq`` variable scope and the
                encoder RNN.

        NOTE(review): ``emb_init``, ``fc_layer`` and ``data_util`` are not
        defined in this block and are assumed to be provided by the
        enclosing module.
        """

        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.global_step = tf.Variable(0, trainable=False, name="global_step")
        self.state_size = state_size

        # Feeds: token ids are [batch_size, time]; lengths are [batch_size].
        self.encoder_inputs = tf.placeholder(tf.int32,
                                             shape=[self.batch_size, None])
        self.decoder_inputs = tf.placeholder(tf.int32,
                                             shape=[self.batch_size, None])
        self.decoder_targets = tf.placeholder(tf.int32,
                                              shape=[self.batch_size, None])
        self.encoder_len = tf.placeholder(tf.int32, shape=[self.batch_size])
        self.decoder_len = tf.placeholder(tf.int32, shape=[self.batch_size])
        # Feeds for the single-step decoder: the token to feed and the
        # attention vector used to build the initial wrapper state.
        self.beam_tok = tf.placeholder(tf.int32, shape=[self.batch_size])
        self.prev_att = tf.placeholder(tf.float32,
                                       shape=[self.batch_size, state_size * 2])

        encoder_fw_cell = tf.contrib.rnn.GRUCell(state_size)
        encoder_bw_cell = tf.contrib.rnn.GRUCell(state_size)
        decoder_cell = tf.contrib.rnn.GRUCell(state_size)

        # Output dropout (keep probability 0.5) is applied only when training.
        if not forward_only:
            encoder_fw_cell = tf.contrib.rnn.DropoutWrapper(
                encoder_fw_cell, output_keep_prob=0.50)
            encoder_bw_cell = tf.contrib.rnn.DropoutWrapper(
                encoder_bw_cell, output_keep_prob=0.50)
            decoder_cell = tf.contrib.rnn.DropoutWrapper(decoder_cell,
                                                         output_keep_prob=0.50)

        with tf.variable_scope("seq2seq", dtype=dtype):
            with tf.variable_scope("encoder"):

                encoder_emb = tf.get_variable(
                    "embedding", [source_vocab_size, embedding_size],
                    initializer=emb_init)

                encoder_inputs_emb = tf.nn.embedding_lookup(
                    encoder_emb, self.encoder_inputs)

                # Returns (fw, bw) pairs for both the outputs and the states.
                encoder_outputs, encoder_states = \
                    tf.nn.bidirectional_dynamic_rnn(
                        encoder_fw_cell, encoder_bw_cell, encoder_inputs_emb,
                        sequence_length=self.encoder_len, dtype=dtype)

            with tf.variable_scope("init_state"):
                # Decoder initial state: the concatenated fw/bw final states
                # passed through fc_layer with state_size outputs.
                init_state = fc_layer(tf.concat(encoder_states, 1), state_size)
                # bidirectional_dynamic_rnn leaves the batch dimension as
                # None, so the static shapes are restored explicitly.
                self.init_state = init_state
                self.init_state.set_shape([self.batch_size, state_size])
                # Attention memory: fw and bw outputs concatenated on the
                # feature axis.
                self.att_states = tf.concat(encoder_outputs, 2)
                self.att_states.set_shape(
                    [self.batch_size, None, state_size * 2])

            with tf.variable_scope("attention"):
                attention = tf.contrib.seq2seq.BahdanauAttention(
                    state_size, self.att_states, self.encoder_len)
                decoder_cell = tf.contrib.seq2seq.DynamicAttentionWrapper(
                    decoder_cell, attention, state_size * 2)
                # Initial decoder state: projected encoder state plus the fed
                # previous-attention vector.
                wrapper_state = tf.contrib.seq2seq.DynamicAttentionWrapperState(
                    self.init_state, self.prev_att)

            with tf.variable_scope("decoder") as scope:

                decoder_emb = tf.get_variable(
                    "embedding", [target_vocab_size, embedding_size],
                    initializer=emb_init)

                # Project decoder outputs to target-vocabulary logits.
                decoder_cell = tf.contrib.rnn.OutputProjectionWrapper(
                    decoder_cell, target_vocab_size)

                if not forward_only:
                    # Training: teacher forcing with the fed decoder inputs.
                    decoder_inputs_emb = tf.nn.embedding_lookup(
                        decoder_emb, self.decoder_inputs)

                    helper = tf.contrib.seq2seq.TrainingHelper(
                        decoder_inputs_emb, self.decoder_len)
                    decoder = tf.contrib.seq2seq.BasicDecoder(
                        decoder_cell, helper, wrapper_state)

                    outputs, final_state = \
                        tf.contrib.seq2seq.dynamic_decode(decoder)

                    outputs_logits = outputs[0]
                    self.outputs = outputs_logits

                    # Mask built from decoder_len so that positions past each
                    # sequence's length get zero weight.
                    weights = tf.sequence_mask(self.decoder_len,
                                               dtype=tf.float32)

                    loss_t = tf.contrib.seq2seq.sequence_loss(
                        outputs_logits,
                        self.decoder_targets,
                        weights,
                        average_across_timesteps=False,
                        average_across_batch=False)
                    # Sum over all positions, averaged over the batch only.
                    self.loss = tf.reduce_sum(loss_t) / self.batch_size

                    # Adadelta update on globally clipped gradients.
                    params = tf.trainable_variables()
                    opt = tf.train.AdadeltaOptimizer(self.learning_rate,
                                                     epsilon=1e-6)
                    gradients = tf.gradients(self.loss, params)
                    clipped_gradients, norm = \
                        tf.clip_by_global_norm(gradients, max_gradient)
                    self.updates = opt.apply_gradients(
                        zip(clipped_gradients, params),
                        global_step=self.global_step)

                    tf.summary.scalar('loss', self.loss)
                else:
                    # Inference: no loss is computed.
                    self.loss = tf.constant(0)
                    # NOTE(review): output_fn is defined here but not used
                    # anywhere else in this constructor.
                    with tf.variable_scope("proj") as scope:
                        output_fn = lambda x: fc_layer(
                            x, target_vocab_size, scope=scope)

                    # Every sequence starts from the GO token.
                    st_toks = tf.convert_to_tensor([data_util.ID_GO] *
                                                   batch_size,
                                                   dtype=tf.int32)

                    # Greedy decoding that ends at the EOS token.
                    helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                        decoder_emb, st_toks, data_util.ID_EOS)

                    decoder = tf.contrib.seq2seq.BasicDecoder(
                        decoder_cell, helper, wrapper_state)

                    outputs, final_state = \
                        tf.contrib.seq2seq.dynamic_decode(decoder)

                    self.outputs = outputs[0]

                    # single step decode for beam search
                    with tf.variable_scope("decoder", reuse=True):
                        beam_emb = tf.nn.embedding_lookup(
                            decoder_emb, self.beam_tok)
                        self.beam_outputs, self.beam_nxt_state, _, _ = \
                            decoder.step(0, beam_emb, wrapper_state)
                        self.beam_logsoftmax = \
                            tf.nn.log_softmax(self.beam_outputs[0])

        # max_to_keep=0 is passed so that old checkpoints are not deleted.
        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=0)
        self.summary_merge = tf.summary.merge_all()
    def __init__(self,
                 source_vocab_size,
                 target_vocab_size,
                 buckets,
                 size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 use_lstm=False,
                 num_samples=512,
                 forward_only=False,
                 scope_name='seq2seq',
                 dtype=tf.float32):
        """Create the model.

        Args:
          source_vocab_size: size of the source vocabulary.
          target_vocab_size: size of the target vocabulary.
          buckets: a list of pairs (I, O), where I specifies maximum input length
            that will be processed in that bucket, and O specifies maximum output
            length. Training instances that have inputs longer than I or outputs
            longer than O will be pushed to the next bucket and padded accordingly.
            We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
          size: number of units in each layer of the model.
          num_layers: number of layers in the model.
          max_gradient_norm: gradients will be clipped to maximally this norm.
          batch_size: the size of the batches used during training;
            the model construction is independent of batch_size, so it can be
            changed after initialization if this is convenient, e.g., for decoding.
          learning_rate: learning rate to start with.
          learning_rate_decay_factor: decay learning rate by this much when needed.
          use_lstm: if true, we use LSTM cells instead of GRU cells.
          num_samples: number of samples for sampled softmax.
          forward_only: if set, we do not construct the backward pass in the model.
          dtype: the data type to use to store internal variables.
        """

        self.scope_name = scope_name
        with tf.variable_scope(self.scope_name):
            self.source_vocab_size = source_vocab_size
            self.target_vocab_size = target_vocab_size
            self.buckets = buckets
            self.batch_size = batch_size
            self.learning_rate = tf.Variable(float(learning_rate),
                                             trainable=False,
                                             dtype=dtype)
            self.learning_rate_decay_op = self.learning_rate.assign(
                self.learning_rate * learning_rate_decay_factor)
            self.global_step = tf.Variable(0, trainable=False)
            self.dummy_dialogs = []  # [TODO] load dummy sentences

            # If we use sampled softmax, we need an output projection.
            output_projection = None
            softmax_loss_function = None
            # Sampled softmax only makes sense if we sample less than vocabulary size.
            if num_samples > 0 and num_samples < self.target_vocab_size:
                w_t = tf.get_variable("proj_w", [self.target_vocab_size, size],
                                      dtype=dtype)
                w = tf.transpose(w_t)
                b = tf.get_variable("proj_b", [self.target_vocab_size],
                                    dtype=dtype)
                output_projection = (w, b)

                def sampled_loss(labels, inputs):
                    labels = tf.reshape(labels, [-1, 1])
                    # We need to compute the sampled_softmax_loss using 32bit floats to
                    # avoid numerical instabilities.
                    local_w_t = tf.cast(w_t, tf.float32)
                    local_b = tf.cast(b, tf.float32)
                    local_inputs = tf.cast(inputs, tf.float32)
                    return tf.cast(
                        tf.nn.sampled_softmax_loss(
                            weights=local_w_t,
                            biases=local_b,
                            labels=labels,
                            inputs=local_inputs,
                            num_sampled=num_samples,
                            num_classes=self.target_vocab_size), dtype)

                softmax_loss_function = sampled_loss

            # cells=[]
            # for _ in range(num_layers):
            #     cell = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.GRUCell(size))
            #     cells.append(cell)
            # cell = tf.contrib.rnn.MultiRNNCell(cells)

            # Create the internal multi-layer cell for our RNN.
            def single_cell():
                return tf.contrib.rnn.GRUCell(size)

            if use_lstm:
                import pdb
                pdb.set_trace()

                def single_cell():
                    return tf.contrib.rnn.BasicLSTMCell(size)

            cell = single_cell()
            if num_layers > 1:
                cell = tf.contrib.rnn.MultiRNNCell(
                    [single_cell() for _ in range(num_layers)])

            # The seq2seq function: we use embedding for the input and attention.
            def seq2seq_f(encoder_inputs, decoder_inputs, feed_previous):
                """Call `tf_seq2seq.embedding_attention_seq2seq` with model settings.

                `encoder_inputs` and `decoder_inputs` are forwarded positionally
                together with the shared `cell`; `feed_previous` is passed
                through unchanged. Vocabulary sizes, embedding size, output
                projection and dtype come from the enclosing scope.
                """
                options = dict(
                    num_encoder_symbols=source_vocab_size,
                    num_decoder_symbols=target_vocab_size,
                    embedding_size=size,
                    output_projection=output_projection,
                    feed_previous=feed_previous,
                    dtype=dtype,
                )
                return tf_seq2seq.embedding_attention_seq2seq(
                    encoder_inputs, decoder_inputs, cell, **options)

            # Feeds for inputs.
            self.encoder_inputs = []
            self.decoder_inputs = []
            self.target_weights = []
            for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
                self.encoder_inputs.append(
                    tf.placeholder(tf.int32,
                                   shape=[None],
                                   name="encoder{0}".format(i)))
            for i in xrange(buckets[-1][1] + 1):
                self.decoder_inputs.append(
                    tf.placeholder(tf.int32,
                                   shape=[None],
                                   name="decoder{0}".format(i)))
                self.target_weights.append(
                    tf.placeholder(dtype,
                                   shape=[None],
                                   name="weight{0}".format(i)))

            # Our targets are decoder inputs shifted by one.
            targets = [
                self.decoder_inputs[i + 1]
                for i in xrange(len(self.decoder_inputs) - 1)
            ]

            # for reinforcement learning
            # self.force_dec_input = tf.placeholder(tf.bool, name="force_dec_input")
            # self.en_output_proj = tf.placeholder(tf.bool, name="en_output_proj")

            # Training outputs and losses.
            if forward_only:
                self.outputs, self.losses, self.encoder_state = tf_seq2seq.model_with_buckets(
                    self.encoder_inputs,
                    self.decoder_inputs,
                    targets,
                    self.target_weights,
                    buckets,
                    lambda x, y: seq2seq_f(x, y, True),
                    softmax_loss_function=softmax_loss_function)
                # If we use output projection, we need to project outputs for decoding.
                if output_projection is not None:
                    for b in xrange(len(buckets)):
                        self.outputs[b] = [
                            tf.matmul(output, output_projection[0]) +
                            output_projection[1] for output in self.outputs[b]
                        ]
            else:
                self.outputs, self.losses, self.encoder_state = tf_seq2seq.model_with_buckets(
                    self.encoder_inputs,
                    self.decoder_inputs,
                    targets,
                    self.target_weights,
                    buckets,
                    lambda x, y: seq2seq_f(x, y, False),
                    softmax_loss_function=softmax_loss_function)

            # Gradients and SGD update operation for training the model.
            params = tf.trainable_variables()
            # if not forward_only:
            self.gradient_norms = []
            self.updates = []
            self.advantage = [
                tf.placeholder(tf.float32, name="advantage_%i" % i)
                for i in xrange(len(buckets))
            ]
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            for b in xrange(len(buckets)):
                # self.losses[b] = tf.subtract(self.losses[b], self.advantage[b])
                gradients = tf.gradients(self.losses[b], params)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(
                    opt.apply_gradients(zip(clipped_gradients, params),
                                        global_step=self.global_step))

            all_variables = tf.global_variables()
            all_variables = [
                k for k in tf.global_variables()
                if k.name.startswith(self.scope_name)
            ]
            self.saver = tf.train.Saver(all_variables)
Example #60
0
    def __init__(self, sess, config, data_feed, log_dir):
        """Build the graph for a joint dialog-act / sentiment classifier.

        Creates the input placeholders, a word-embedding lookup, an RNN
        encoder (optionally dropout-wrapped and multi-layer), two prediction
        heads built with ``self.fnn``, the combined cross-entropy loss, a
        clipped-gradient training op, a saver and, when ``log_dir`` is given,
        a training summary writer.

        Args:
            sess: Session object; only ``sess.graph`` is read here, for the
                summary writer.
            config: Settings object. Attributes read: ``init_lr``,
                ``lr_decay``, ``embed_size``, ``cell_type`` ("gru", "lstm" or
                "rnn"), ``cell_size``, ``keep_prob``, ``num_layer``, ``op``
                ("adam", "rmsprop"; anything else selects plain SGD) and
                ``grad_clip``.
            data_feed: Data source. Attributes read: ``vocab``,
                ``feature_size``, ``DA_ID`` and ``SENTI_ID``.
            log_dir: Base directory for summaries, or None to skip creating
                the summary writer.

        Raises:
            ValueError: If ``config.cell_type`` is not "gru", "lstm" or "rnn".
        """

        vocab_size = len(data_feed.vocab)
        self.data_feed = data_feed

        # Placeholders fed at run time, plus the (non-trainable) learning rate.
        with tf.name_scope("io"):
            # Token ids; both dimensions are left dynamic.
            self.inputs = tf.placeholder(dtype=tf.int32,
                                         shape=(None, None),
                                         name="input_seq")
            # One length per row of `inputs`.
            self.input_lens = tf.placeholder(dtype=tf.int32,
                                             shape=(None, ),
                                             name="seq_len")
            # Integer class id per example (consumed by the sparse loss below).
            self.da_labels = tf.placeholder(dtype=tf.int32,
                                            shape=(None, ),
                                            name="dialog_acts")
            # Float target vector per example, one entry per sentiment feature.
            self.senti_labels = tf.placeholder(
                dtype=tf.float32,
                shape=(None, data_feed.feature_size[data_feed.SENTI_ID]),
                name="sentiments")

            self.learning_rate = tf.Variable(float(config.init_lr),
                                             trainable=False)
            # Running this op multiplies the learning rate by config.lr_decay.
            self.learning_rate_decay_op = self.learning_rate.assign(
                self.learning_rate * config.lr_decay)

        # Dynamic sizes read from the inputs placeholder: axis 1 and axis 0.
        max_sent_len = array_ops.shape(self.inputs)[1]
        batch_size = array_ops.shape(self.inputs)[0]

        with variable_scope.variable_scope("word-embedding"):
            embedding = tf.get_variable("embedding",
                                        [vocab_size, config.embed_size],
                                        dtype=tf.float32)
            # Flatten the ids to 1-D for the lookup ...
            input_embedding = embedding_ops.embedding_lookup(
                embedding,
                tf.squeeze(tf.reshape(self.inputs, [-1, 1]), squeeze_dims=[1]))

            # ... then restore the [batch, time, embed_size] layout.
            input_embedding = tf.reshape(input_embedding,
                                         [-1, max_sent_len, config.embed_size])

        with variable_scope.variable_scope("rnn"):
            # Pick the recurrent cell type from the config.
            if config.cell_type == "gru":
                cell = rnn_cell.GRUCell(config.cell_size)
            elif config.cell_type == "lstm":
                cell = rnn_cell.LSTMCell(config.cell_size,
                                         use_peepholes=False,
                                         forget_bias=1.0)
            elif config.cell_type == "rnn":
                cell = rnn_cell.BasicRNNCell(config.cell_size)
            else:
                raise ValueError("unknown RNN type")

            # Dropout on both cell inputs and outputs, with the same keep
            # probability, only when keep_prob is below 1.
            if config.keep_prob < 1.0:
                cell = rnn_cell.DropoutWrapper(
                    cell,
                    output_keep_prob=config.keep_prob,
                    input_keep_prob=config.keep_prob)

            # NOTE(review): `[cell] * n` puts the SAME cell object in every
            # layer. This was the usual idiom in early TensorFlow releases;
            # confirm it is still accepted by the TF version in use.
            if config.num_layer > 1:
                cell = rnn_cell.MultiRNNCell([cell] * config.num_layer,
                                             state_is_tuple=True)

            # sequence_length is supplied so the RNN knows each row's true
            # length; the returned final state is discarded here.
            outputs, _ = tf.nn.dynamic_rnn(
                cell,
                input_embedding,
                dtype=tf.float32,
                sequence_length=self.input_lens,
            )
            # Select the output at time step (input_lens - 1) for every row:
            # build a one-hot mask over the time axis, multiply it into the
            # outputs and sum over axis 1.
            last_outputs = tf.reduce_sum(
                tf.mul(
                    outputs,
                    tf.expand_dims(
                        tf.one_hot(self.input_lens - 1, max_sent_len), -1)), 1)

            # Two heads on top of the last RNN output, both built by self.fnn
            # (defined elsewhere). The `[100]` argument is presumably the
            # hidden-layer sizes - TODO confirm against fnn.
            self.dialog_acts = self.fnn(
                last_outputs, data_feed.feature_size[data_feed.DA_ID], [100],
                "dialog_act_fnn")
            # NOTE(review): "setiment_fnn" looks like a typo for "sentiment",
            # but it is a runtime name string, so it is left unchanged here.
            self.sentiments = self.fnn(
                last_outputs, data_feed.feature_size[data_feed.SENTI_ID],
                [100], "setiment_fnn")

        # Total loss: summed sparse cross-entropy for dialog acts plus summed
        # dense cross-entropy for sentiments, divided by the batch size.
        # NOTE(review): the logits/labels are passed positionally; the argument
        # order of these functions depends on the TF version - verify.
        self.loss = tf.reduce_sum(nn_ops.sparse_softmax_cross_entropy_with_logits(self.dialog_acts, self.da_labels)) \
                    + tf.reduce_sum(nn_ops.softmax_cross_entropy_with_logits(self.sentiments, self.senti_labels))
        self.loss /= tf.to_float(batch_size)

        tf.scalar_summary("entropy_loss", self.loss)
        self.summary_op = tf.merge_all_summaries()

        # List the trainable variables. (The original comment here said
        # "weight decay", but no weight decay is applied in this method.)
        tvars = tf.trainable_variables()
        for v in tvars:
            print("Trainable %s" % v.name)
        # Choose the optimizer from config.op; any unrecognized value falls
        # back to plain gradient descent.
        if config.op == "adam":
            print("Use Adam")
            optimizer = tf.train.AdamOptimizer(self.learning_rate)
        elif config.op == "rmsprop":
            print("Use RMSProp")
            optimizer = tf.train.RMSPropOptimizer(self.learning_rate)
        else:
            print("Use SGD")
            optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)

        # Clip the gradients by global norm before applying them; the norm
        # itself is not kept.
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
                                          config.grad_clip)
        self.train_ops = optimizer.apply_gradients(zip(grads, tvars))
        self.saver = tf.train.Saver(tf.all_variables(),
                                    write_version=tf.train.SaverDef.V2)

        if log_dir is not None:
            train_log_dir = os.path.join(log_dir, "train")
            # NOTE(review): the message prints log_dir, while summaries are
            # actually written to train_log_dir (log_dir/train).
            print("Save summary to %s" % log_dir)
            self.train_summary_writer = tf.train.SummaryWriter(
                train_log_dir, sess.graph)