Esempio n. 1
0
def get_start_end_seq_mask(seq_len=5, length=3):
  '''Builds normalized start/end position targets for all candidate spans.

  One row is produced for every (start, offset) pair with start in
  [0, seq_len) and offset in [0, length); the span end is start + offset.

  :param seq_len: n_ctx
  :param length: ans_avg_len
  :return: float32 tensor of shape [seq_len * length, seq_len] whose rows each
    sum to 2.0, with the mass placed on the start and end positions
  '''
  def _indicator(index):
    # The two prefix masks differ only at position `index`, so their
    # difference is 1.0 there and 0.0 elsewhere. An index >= seq_len gives an
    # all-zero row because both masks are then entirely ones.
    through_index = tf.cast(
        tf.sequence_mask(index + 1, seq_len, dtype=tf.int32), tf.float32)
    before_index = tf.cast(
        tf.sequence_mask(index, seq_len, dtype=tf.int32), tf.float32)
    return through_index - before_index

  # Every start position repeated `length` times: [0, 0, 0, 1, 1, 1, ...]
  per_start = tf.expand_dims(tf.constant(np.array(range(seq_len))), axis=-1)
  per_start = tf.tile(per_start, [1, length])
  starts = tf.concat(tf.unstack(per_start, axis=0), axis=0)

  # Offsets cycling within each start: [0, 1, 2, 0, 1, 2, ...]
  offsets = tf.tile(tf.constant(np.array(range(length))), [seq_len])

  # End positions; these may run past seq_len - 1.
  ends = starts + offsets

  start_mask = _indicator(starts)
  end_mask = _indicator(ends)

  combined = end_mask + start_mask
  normalized = combined / tf.reduce_sum(combined, axis=-1, keepdims=True)
  return 2.0 * normalized
Esempio n. 2
0
def sovle_problem_1():
    """Demo: builds span labels from start/end indices and prints their log loss.

    Each row of ``probs`` is scored with ``tf.losses.log_loss`` against a 0/1
    label that is 1 on the positions from the row's start index through its
    end index, inclusive. The intermediate tensors and the loss are evaluated
    in a session and printed.
    """
    # Per-position scores, one row per example (3 rows, 5 positions).
    probs = tf.constant([
        [0.99, 0.8, 0.7, 0.5, 0.5],
        [0.2, 0.3, 0.6, 0.7, 0.8],
        [0.1, 0.1, 0.1, 0.5, 1]
    ])
    # 1.0 where a score exceeds 3 (all zeros for the values above).
    mask = tf.cast(tf.cast(tf.greater(probs, 3), tf.int32), tf.float32)

    # Ones strictly before each start index (the index itself is excluded).
    start_label = tf.constant(np.array([0, 2, 3]))
    start_label = tf.sequence_mask(start_label, 5, dtype=tf.int32)

    # Ones up to and including each end index.
    end_label = tf.constant(np.array([2, 4, 3]))
    end_label = tf.sequence_mask(end_label + 1, 5, dtype=tf.int32)

    # Subtracting the two prefixes leaves ones exactly on [start, end].
    res = end_label - start_label

    log_loss = tf.losses.log_loss(res, probs)

    # The context manager closes the session once the values are fetched.
    # print() with a single argument behaves the same on Python 2 and 3.
    with tf.Session() as sess:
        print(sess.run([mask, start_label, end_label, res, log_loss]))
Esempio n. 3
0
def pad_with_identity(x, sequence_length, max_sequence_length, identity_values=0):
  """Extends a tensor in time, filling the added positions with identity values.

  Args:
    x: A ``tf.Tensor`` of shape ``[batch_size, max(sequence_length), depth]``.
    sequence_length: The true sequence length of :obj:`x`.
    max_sequence_length: The sequence length up to which the tensor must contain
      :obj:`identity values`.
    identity_values: The identity value.

  Returns:
    A ``tf.Tensor`` of shape ``[batch_size, max(max_sequence_length), depth]``.
  """
  target_length = tf.reduce_max(max_sequence_length)

  # 1 on the real timesteps of each sequence, 0 elsewhere.
  valid = tf.expand_dims(
      tf.sequence_mask(sequence_length, maxlen=target_length, dtype=x.dtype),
      axis=-1)
  # 1 on every timestep below max_sequence_length.
  allowed = tf.expand_dims(
      tf.sequence_mask(max_sequence_length, dtype=x.dtype), axis=-1)

  # Timesteps past the real data that must still carry the identity value.
  fill = allowed * (1.0 - valid)

  padded = pad_in_time(x, target_length - tf.shape(x)[1])
  return padded * valid + (fill * identity_values)
Esempio n. 4
0
def sequence_mask(input_lengths, max_len=None, expand=True):
	"""Build a float32 sequence mask for 'input_lengths'.

	'max_len' defaults to the largest entry of 'input_lengths'. When 'expand'
	is true a trailing axis of size 1 is appended to the mask.
	"""
	if max_len is None:
		max_len = tf.reduce_max(input_lengths)

	mask = tf.sequence_mask(input_lengths, max_len, dtype=tf.float32)
	return tf.expand_dims(mask, axis=-1) if expand else mask
Esempio n. 5
0
def sequence_mask(lengths, r, expand=True):
	'''Build a float32 sequence mask, 3-D when 'expand' is true and 2-D otherwise.

	The mask width is the largest entry of 'lengths' after it has been passed
	through _round_up_tf together with 'r' (presumably rounding it up to a
	multiple of 'r' -- confirm against _round_up_tf).
	'''
	width = _round_up_tf(tf.reduce_max(lengths), tf.convert_to_tensor(r))
	mask = tf.sequence_mask(lengths, maxlen=width, dtype=tf.float32)
	if not expand:
		return mask
	return tf.expand_dims(mask, axis=-1)
Esempio n. 6
0
  def testNormal(self):
    """Checks tf.sequence_mask with an explicit maxlen, then with a dtype only."""
    with self.test_session():
      bool_mask = tf.sequence_mask(tf.constant([1, 3, 2]), 5)
      self.assertAllEqual(bool_mask.get_shape(), [3, 5])
      expected_bool = [[True, False, False, False, False],
                       [True, True, True, False, False],
                       [True, True, False, False, False]]
      self.assertAllEqual(bool_mask.eval(), expected_bool)

      # With maxlen omitted the static width is unknown; the evaluated mask
      # is as wide as the largest length.
      float_mask = tf.sequence_mask(tf.constant([0, 1, 4]), dtype=tf.float32)
      self.assertAllEqual(float_mask.get_shape().as_list(), [3, None])
      expected_float = [[0.0, 0.0, 0.0, 0.0],
                        [1.0, 0.0, 0.0, 0.0],
                        [1.0, 1.0, 1.0, 1.0]]
      self.assertAllEqual(float_mask.eval(), expected_float)
Esempio n. 7
0
def attend(x, sequence_length=None, method="ave", context=None, feature_dim=None, mask_zero=False, maxlen=None,
           epsilon=1e-8, bn=True, training=False, seed=0, reuse=True, name="attend"):
    """Reduces a sequence tensor along its step axis (axis 1).

    Args:
        x: Input tensor; axis 1 is the step axis that gets reduced. The masked
            branches reshape the mask to three axes or read ``tf.shape(x)[2]``,
            so a 3-D input is expected there.
        sequence_length: Valid length per example; used when ``mask_zero`` is set.
        method: One of "ave", "sum", "max" or "attention".
        context: Optional tensor for the "attention" method; it is tiled across
            the steps and concatenated to ``x`` on the last axis before scoring.
        feature_dim: Forwarded to ``attention`` for the "attention" method.
        mask_zero: Whether steps beyond ``sequence_length`` are ignored.
        maxlen: ``maxlen`` argument for ``tf.sequence_mask``.
        epsilon: Added to the valid-step count in the masked average.
        bn: Whether batch normalization is applied to the reduced tensor.
        training: Not used; batch normalization always runs with ``training=False``.
        seed: Initializer seed forwarded to ``attention``.
        reuse: Not used.
        name: Not used.

    Returns:
        The reduced tensor, batch-normalized when ``bn`` is true.

    Raises:
        ValueError: If ``method`` is not one of the supported values.
    """
    # Fail early with a clear message; otherwise an unknown method would fall
    # through every branch and crash later on the unassigned result.
    if method not in ("ave", "sum", "max", "attention"):
        raise ValueError(
            "Unsupported attend method: {!r} "
            "(expected 'ave', 'sum', 'max' or 'attention')".format(method))

    def _step_mask():
        # float32 mask shaped (-1, steps, 1): 1.0 on valid steps, 0.0 elsewhere.
        step_mask = tf.sequence_mask(sequence_length, maxlen)
        step_mask = tf.reshape(step_mask, (-1, tf.shape(x)[1], 1))
        return tf.cast(step_mask, tf.float32)

    if method == "ave":
        if mask_zero:
            mask = _step_mask()
            z = tf.reduce_sum(x * mask, axis=1)
            l = tf.reduce_sum(mask, axis=1)
            # in some cases especially in the early stages of training the sum may be almost zero
            z /= tf.cast(l + epsilon, tf.float32)
        else:
            z = tf.reduce_mean(x, axis=1)
    elif method == "sum":
        if mask_zero:
            z = tf.reduce_sum(x * _step_mask(), axis=1)
        else:
            z = tf.reduce_sum(x, axis=1)
    elif method == "max":
        if mask_zero:
            # Boolean mask tiled across the last axis of x.
            mask = tf.sequence_mask(sequence_length, maxlen)
            mask = tf.expand_dims(mask, axis=-1)
            mask = tf.tile(mask, (1, 1, tf.shape(x)[2]))
            masked_data = tf.where(tf.equal(mask, tf.zeros_like(mask)),
                                   tf.ones_like(x) * -np.inf, x)  # if masked assume value is -inf
            z = tf.reduce_max(masked_data, axis=1)
        else:
            z = tf.reduce_max(x, axis=1)
    else:  # method == "attention"
        if context is not None:
            step_dim = tf.shape(x)[1]
            context = tf.expand_dims(context, axis=1)
            context = tf.tile(context, [1, step_dim, 1])
            y = tf.concat([x, context], axis=-1)
        else:
            y = x
        a = attention(y, feature_dim, sequence_length, mask_zero, maxlen, seed=seed)
        z = tf.reduce_sum(x * a, axis=1)
    if bn:
        # training=False has slightly better performance
        z = tf.layers.BatchNormalization()(z, training=False)
    return z
Esempio n. 8
0
def attention(x, feature_dim, sequence_length, mask_zero=False, maxlen=None, epsilon=1e-8, seed=0):
    """Computes normalized per-step attention weights for ``x``.

    Every step is scored by a single-unit tanh dense layer; the scores are
    exponentiated, optionally masked by ``sequence_length``, and normalized
    across the steps.

    ``feature_dim`` has to be passed explicitly because ``tf.layers.Dense``
    requires a statically defined last input dimension
    (see https://github.com/tensorflow/tensorflow/issues/13348).

    Returns:
        A tensor shaped (-1, step_dim, 1) holding the weights.
    """
    step_dim = tf.shape(x)[1]
    flat_steps = tf.reshape(x, [-1, feature_dim])

    scorer = tf.layers.Dense(1, activation=tf.nn.tanh, kernel_initializer=tf.glorot_uniform_initializer(seed=seed),
                             dtype=tf.float32, bias_initializer=tf.zeros_initializer())
    scores = tf.reshape(scorer(flat_steps), [-1, step_dim])
    weights = tf.exp(scores)

    # Masking happens after the exp; the normalization below rescales the rest.
    if mask_zero:
        valid_steps = tf.cast(tf.sequence_mask(sequence_length, maxlen), tf.float32)
        weights = weights * valid_steps

    # epsilon guards against a near-zero sum, e.g. early in training.
    normalizer = tf.cast(tf.reduce_sum(weights, axis=1, keep_dims=True) + epsilon, tf.float32)
    weights = weights / normalizer

    return tf.expand_dims(weights, axis=-1)
Esempio n. 9
0
 def check_dtypes(lengths_dtype, maxlen_dtype):
   """Asserts the mask for lengths [1, 3, 2] and maxlen 5 under the given dtypes."""
   lengths = tf.constant([1, 3, 2], dtype=lengths_dtype)
   maxlen = tf.constant(5, dtype=maxlen_dtype)
   mask = tf.sequence_mask(lengths, maxlen)
   self.assertAllEqual(mask.get_shape(), [3, 5])
   expected = [[True, False, False, False, False],
               [True, True, True, False, False],
               [True, True, False, False, False]]
   self.assertAllEqual(mask.eval(), expected)
Esempio n. 10
0
def _mask_by_length(t, length):
  """Zero the timesteps of t, 3-D [batch, time, dim], at or past length, 1-D [batch,]."""
  num_timesteps = t.get_shape().as_list()[1]
  keep = tf.cast(tf.sequence_mask(length, maxlen=num_timesteps), tf.float32)
  # The trailing axis lets the (batch, num_timesteps, 1) mask broadcast over dim.
  return t * tf.expand_dims(keep, -1)
    def calculate_outputs(self, x):
        """Build the forward pass and return the length-normalized loss.

        Besides returning the loss, this sets ``self.final_states`` and
        ``self.prediction_tensors``.

        Args:
            x: Input features. They are concatenated with the LSTM output on
                axis 2, so presumably [batch, time, features] -- TODO confirm.

        Returns:
            A scalar: the masked sum of the per-step losses divided by the sum
            of ``self.history_length``.
        """
        h = lstm_layer(x, self.history_length, self.lstm_size, scope='lstm-1')
        # Concatenate the LSTM output with the raw input along axis 2.
        h = tf.concat([h, x], axis=2)
        h_final = time_distributed_dense_layer(h, 50, activation=tf.nn.relu, scope='dense-1')

        n_components = 1
        # Two values per component; split below into ps and mixing_coefs.
        params = time_distributed_dense_layer(h_final, n_components*2, scope='dense-2', activation=None)
        ps, mixing_coefs = tf.split(params, 2, axis=2)

        # this is implemented incorrectly, but it still helped...
        mixing_coefs = tf.nn.softmax(mixing_coefs - tf.reduce_min(mixing_coefs, 2, keep_dims=True))
        ps = tf.nn.sigmoid(ps)

        # Repeat the labels once per component so they line up with ps.
        labels = tf.tile(tf.expand_dims(self.next_is_ordered, 2), (1, 1, n_components))
        losses = tf.reduce_sum(mixing_coefs*log_loss(labels, ps), axis=2)
        # NOTE(review): maxlen is hard-coded to 100; presumably the padded
        # time dimension of the inputs -- confirm against the data pipeline.
        sequence_mask = tf.cast(tf.sequence_mask(self.history_length, maxlen=100), tf.float32)
        avg_loss = tf.reduce_sum(losses*sequence_mask) / tf.cast(tf.reduce_sum(self.history_length), tf.float32)

        # (row index, history_length - 1) pairs pick, for every row, the
        # h_final entry at its last valid step.
        final_temporal_idx = tf.stack([tf.range(tf.shape(self.history_length)[0]), self.history_length - 1], axis=1)
        self.final_states = tf.gather_nd(h_final, final_temporal_idx)

        self.prediction_tensors = {
            'user_ids': self.user_id,
            'product_ids': self.product_id,
            'final_states': self.final_states
        }

        return avg_loss
Esempio n. 12
0
  def get_mention_emb(self, text_emb, text_outputs, mention_starts, mention_ends):
    """Builds one embedding per mention by concatenating several parts.

    The parts, joined along axis 1, are: the ``text_outputs`` row at the
    mention start, the row at the mention end, a width embedding when
    ``config["use_features"]`` is set, and an attention-weighted sum of
    ``text_emb`` over the mention when ``config["model_heads"]`` is set.

    Args:
      text_emb: Gathered by word index to form the head embedding.
      text_outputs: Gathered by ``mention_starts`` / ``mention_ends`` and
        scored to produce ``self.head_scores``.
      mention_starts: Start index of each mention.
      mention_ends: End index of each mention; the width is computed as
        ``1 + mention_ends - mention_starts``, i.e. the end is inclusive.

    Returns:
      The concatenated mention embeddings.
    """
    mention_emb_list = []

    mention_start_emb = tf.gather(text_outputs, mention_starts) # [num_mentions, emb]
    mention_emb_list.append(mention_start_emb)

    mention_end_emb = tf.gather(text_outputs, mention_ends) # [num_mentions, emb]
    mention_emb_list.append(mention_end_emb)

    mention_width = 1 + mention_ends - mention_starts # [num_mentions]
    if self.config["use_features"]:
      # Widths start at 1, so width - 1 is the row of the embedding table.
      mention_width_index = mention_width - 1 # [num_mentions]
      mention_width_emb = tf.gather(tf.get_variable("mention_width_embeddings", [self.config["max_mention_width"], self.config["feature_size"]]), mention_width_index) # [num_mentions, emb]
      mention_width_emb = tf.nn.dropout(mention_width_emb, self.dropout)
      mention_emb_list.append(mention_width_emb)

    if self.config["model_heads"]:
      # Candidate word indices start, start + 1, ..., start + max_mention_width - 1.
      mention_indices = tf.expand_dims(tf.range(self.config["max_mention_width"]), 0) + tf.expand_dims(mention_starts, 1) # [num_mentions, max_mention_width]
      # Clamp the indices from above (presumably util.shape(text_outputs, 0) is the number of words -- confirm).
      mention_indices = tf.minimum(util.shape(text_outputs, 0) - 1, mention_indices) # [num_mentions, max_mention_width]
      mention_text_emb = tf.gather(text_emb, mention_indices) # [num_mentions, max_mention_width, emb]
      self.head_scores = util.projection(text_outputs, 1) # [num_words, 1]
      mention_head_scores = tf.gather(self.head_scores, mention_indices) # [num_mentions, max_mention_width, 1]
      mention_mask = tf.expand_dims(tf.sequence_mask(mention_width, self.config["max_mention_width"], dtype=tf.float32), 2) # [num_mentions, max_mention_width, 1]
      # log(mask) is 0 inside the mention and -inf outside, so the softmax gives zero weight to positions past the mention width.
      mention_attention = tf.nn.softmax(mention_head_scores + tf.log(mention_mask), dim=1) # [num_mentions, max_mention_width, 1]
      mention_head_emb = tf.reduce_sum(mention_attention * mention_text_emb, 1) # [num_mentions, emb]
      mention_emb_list.append(mention_head_emb)

    mention_emb = tf.concat(mention_emb_list, 1) # [num_mentions, emb]
    return mention_emb
Esempio n. 13
0
def _create_position_embedding(embedding_dim, num_positions, lengths, maxlen):
  """Builds masked position embeddings for a batch of sequences.

  Args:
    embedding_dim: Dimensionality of the embeddings. An integer.
    num_positions: How many positions get an encoding, e.g. 100 when inputs
      are at most 100 long. An integer.
    lengths: Length of every input in the batch. An int32 tensor of shape
      `[batch_size]`.
    maxlen: Longest input sequence to produce embeddings for. An int32 tensor.

  Returns:
    A `[batch_size, maxlen, embedding_dim]` tensor with the embedding of each
    position; entries at positions past `lengths` are zero.
  """
  # Constant table with one encoding per position.
  table = tf.constant(
      position_encoding(num_positions, embedding_dim),
      name="position_encoding")

  # Keep the first maxlen positions and repeat them for every batch entry.
  window = table[:maxlen, :]
  num_examples = tf.shape(lengths)[0]
  tiled = tf.tile([window], [num_examples, 1, 1])

  # Zero the padded positions.
  keep = tf.sequence_mask(lengths=lengths, maxlen=maxlen, dtype=tf.float32)
  return tiled * tf.expand_dims(keep, 2)
Esempio n. 14
0
  def reduce_sequence(self, inputs, sequence_lengths):
    """Combines the inputs along ``self.axis`` and returns them with their length.

    ``self.axis`` is normalized against the rank of the first input. Axis 2
    pads the inputs with ``pad_n_with_identity`` and applies ``self.reduce``;
    axis 1 places the inputs one after another inside a tensor whose time
    dimension is the largest summed length.

    Returns:
      A tuple ``(combined tensor, combined length)``.

    Raises:
      ValueError: If the normalized axis is neither 1 nor 2.
    """
    axis = self.axis % inputs[0].shape.ndims

    if axis not in (1, 2):
      raise ValueError("Unsupported concatenation on axis {}".format(axis))

    if axis == 2:
      padded, combined_length = pad_n_with_identity(inputs, sequence_lengths)
      return self.reduce(padded), combined_length

    # axis == 1: pad every input up to the maximum combined length.
    combined_length = tf.add_n(sequence_lengths)
    maxlen = tf.reduce_max(combined_length)
    padded = [pad_in_time(x, maxlen - tf.shape(x)[1]) for x in inputs]

    accumulator = None
    offset = None

    for tensor, length in zip(padded, sequence_lengths):
      # The padding must be zero vectors so the additions below do not mix
      # padding values into real timesteps.
      mask = tf.sequence_mask(length, maxlen=maxlen, dtype=tensor.dtype)
      tensor = tensor * tf.expand_dims(mask, -1)

      if accumulator is not None:
        # Presumably roll_sequence shifts the tensor by `offset` steps -- confirm.
        accumulator += roll_sequence(tensor, offset)
        offset += length
      else:
        accumulator = tensor
        offset = length

    return accumulator, combined_length
Esempio n. 15
0
    def prepare_train_eval(
        self, t_out,
        out_seq_len, labels, lr,
        train_op=None, loss=None
    ):
        """Builds the EstimatorSpec, creating a default loss and train op if needed.

        Args:
            t_out: Logits passed to ``tf.contrib.seq2seq.sequence_loss``.
            out_seq_len: Output lengths used to build the loss weights.
            labels: Targets passed to the sequence loss.
            lr: Learning rate for the default SGD train op.
            train_op: Optional train op; when ``None`` one is created with
                ``tf.contrib.layers.optimize_loss``.
            loss: Optional loss; when ``None`` a length-masked sequence loss
                is created.

        Returns:
            A ``tf.estimator.EstimatorSpec`` for ``self.mode``.
        """
        # Compare against None explicitly: truth-testing a graph-mode
        # tf.Tensor (``not loss``) raises TypeError, which made it impossible
        # to pass in a precomputed loss or a tensor-valued train op.
        if loss is None:
            weights = tf.sequence_mask(
                out_seq_len,
                dtype=t_out.dtype
            )
            loss = tf.contrib.seq2seq.sequence_loss(
                t_out,
                labels,
                weights,
                average_across_batch=self.average_across_batch,
            )

        if train_op is None:
            train_op = tf.contrib.layers.optimize_loss(
                loss,
                tf.train.get_global_step(),
                optimizer='SGD',
                learning_rate=lr,
                summaries=['loss', 'learning_rate']
            )

        return tf.estimator.EstimatorSpec(
            mode=self.mode,
            loss=loss,
            train_op=train_op,
        )
Esempio n. 16
0
    def call(self, inputs, **kwargs):
        """Pools ``keys`` with attention scores computed against ``queries``.

        ``inputs`` is unpacked as ``(queries, keys, keys_length)``. Scores at
        positions beyond ``keys_length`` are replaced by a padding value: a
        large negative number followed by a softmax when
        ``self.weight_normalization`` is set, and zero otherwise.
        """
        queries, keys, keys_length = inputs
        hist_len = keys.get_shape()[1]

        scorer = LocalActivationUnit(
            self.hidden_size, self.activation, 0, 1, False, 1024,)
        scores = tf.transpose(scorer([queries, keys]), (0, 2, 1))

        key_masks = tf.sequence_mask(keys_length, hist_len)

        normalize = self.weight_normalization
        if normalize:
            # Large negative scores vanish under the softmax below.
            paddings = tf.ones_like(scores) * (-2 ** 32 + 1)
        else:
            paddings = tf.zeros_like(scores)

        scores = tf.where(key_masks, scores, paddings)

        if self.weight_normalization:
            scores = tf.nn.softmax(scores)

        return tf.matmul(scores, keys)
Esempio n. 17
0
  def _compute_metrics(self, features, labels, predictions):
    """Builds the evaluation metrics for the predicted tags.

    Accuracy is always reported, weighted by a mask built from the feature
    lengths. For the "bioes" tagging scheme, precision, recall and F1 are
    added, computed over the flags returned by ``flag_bioes_tags``.

    Args:
      features: Model features; their length comes from
        ``self._get_features_length``.
      labels: Dict holding the gold tags under ``"tags"``.
      predictions: Dict holding the predicted tags under ``"tags"``.

    Returns:
      A dict mapping metric names to ``(value, update_op)`` pairs.
    """
    length = self._get_features_length(features)
    weights = tf.sequence_mask(
        length, maxlen=tf.shape(labels["tags"])[1], dtype=tf.float32)

    eval_metric_ops = {}
    eval_metric_ops["accuracy"] = tf.metrics.accuracy(
        labels["tags"], predictions["tags"], weights=weights)

    if self.tagging_scheme in ("bioes",):
      flag_fn = None
      if self.tagging_scheme == "bioes":
        flag_fn = flag_bioes_tags

      gold_flags, predicted_flags = tf.py_func(
          flag_fn,
          [labels["tags"], predictions["tags"], length],
          [tf.bool, tf.bool],
          stateful=False)

      precision_metric = tf.metrics.precision(gold_flags, predicted_flags)
      recall_metric = tf.metrics.recall(gold_flags, predicted_flags)

      precision = precision_metric[0]
      recall = recall_metric[0]
      # precision + recall is 0 when nothing has been flagged correctly;
      # report an F1 of 0 there instead of the NaN that 0 / 0 produces.
      denominator = recall + precision
      f1 = tf.where(
          tf.greater(denominator, 0),
          (2 * precision * recall) / denominator,
          tf.zeros_like(denominator))

      eval_metric_ops["precision"] = precision_metric
      eval_metric_ops["recall"] = recall_metric
      # F1 is derived from the two metrics above, so it needs no update op.
      eval_metric_ops["f1"] = (f1, tf.no_op())

    return eval_metric_ops
def mkMask(input_tensor, maxLen):
    """Builds a length mask with one extra trailing axis of size ``maxLen``.

    Every entry of ``input_tensor`` is treated as a length; the result has the
    shape of ``input_tensor`` followed by ``[maxLen]``.
    """
    output_shape = tf.concat(axis=0, values=[tf.shape(input_tensor), [maxLen]])

    # sequence_mask is applied to the flattened lengths, then the original
    # leading axes are restored.
    flat_lengths = tf.reshape(input_tensor, shape=(-1,))
    return tf.reshape(tf.sequence_mask(flat_lengths, maxlen=maxLen), output_shape)
Esempio n. 19
0
def cross_entropy_sequence_loss(logits,
                                labels,
                                sequence_length,
                                label_smoothing=0.0,
                                average_in_time=False,
                                mode=tf.estimator.ModeKeys.TRAIN):
  """Computes the length-masked cross entropy loss of a batch of sequences.

  Args:
    logits: The unscaled probabilities.
    labels: The true labels.
    sequence_length: The length of each sequence.
    label_smoothing: The label smoothing value.
    average_in_time: If ``True``, the loss normalizer is the token count.
    mode: A ``tf.estimator.ModeKeys`` mode.

  Returns:
    A tuple (cumulated loss, loss normalizer, token-level normalizer). The
    loss normalizer is the batch size only when training without
    :obj:`average_in_time`; otherwise it equals the token-level normalizer.
  """
  batch_size = tf.shape(logits)[0]
  max_time = tf.shape(logits)[1]

  token_losses = _softmax_cross_entropy(logits, labels, label_smoothing, mode)
  weights = tf.sequence_mask(
      sequence_length, maxlen=max_time, dtype=token_losses.dtype)
  loss = tf.reduce_sum(token_losses * weights)
  token_count = tf.reduce_sum(weights)

  normalize_by_batch = (
      not average_in_time and mode == tf.estimator.ModeKeys.TRAIN)
  if normalize_by_batch:
    normalizer = tf.cast(batch_size, loss.dtype)
  else:
    normalizer = token_count

  return loss, normalizer, token_count
Esempio n. 20
0
def attention(queries, keys, keys_length):
  '''Pools keys with attention weights scored against the query.

    queries:     [B, H]
    keys:        [B, T, H]
    keys_length: [B]

    returns:     [B, 1, H]
  '''
  hidden_units = queries.get_shape().as_list()[-1]
  # Repeat the query for every key step: [B, H] -> [B, T, H]
  tiled = tf.tile(queries, [1, tf.shape(keys)[1]])
  tiled = tf.reshape(tiled, [-1, tf.shape(keys)[1], hidden_units])
  features = tf.concat([tiled, keys, tiled-keys, tiled*keys], axis=-1)

  # Three dense layers score every step.
  hidden = tf.layers.dense(features, 80, activation=tf.nn.sigmoid, name='f1_att')
  hidden = tf.layers.dense(hidden, 40, activation=tf.nn.sigmoid, name='f2_att')
  scores = tf.layers.dense(hidden, 1, activation=None, name='f3_att')
  scores = tf.reshape(scores, [-1, 1, tf.shape(keys)[1]])

  # Steps past keys_length get a large negative score so the softmax drops them.
  key_masks = tf.expand_dims(
      tf.sequence_mask(keys_length, tf.shape(keys)[1]), 1)  # [B, 1, T]
  paddings = tf.ones_like(scores) * (-2 ** 32 + 1)
  scores = tf.where(key_masks, scores, paddings)  # [B, 1, T]

  # Scale by the square root of the key width, then normalize.
  scores = scores / (keys.get_shape().as_list()[-1] ** 0.5)
  weights = tf.nn.softmax(scores)  # [B, 1, T]

  # Weighted sum of the keys: [B, 1, H]
  return tf.matmul(weights, keys)
Esempio n. 21
0
  def create_variables_for_optimization(self):
    """Builds the masked policy loss, the entropy bonus and the training op.

    Reads ``self.seq_len``, ``self.num_step``, ``self.logit`` or
    ``self.probs``, ``self.actions_flatten``, ``self.returns_flatten`` and
    ``self.entropy`` (which must already exist; it is masked and reduced
    here). Sets ``self.mask``, ``self.pl_loss``, ``self.entropy``,
    ``self.loss``, ``self.trainable_variables``, ``self.gradients``,
    ``self.clipped_gradients``, ``self.train_op``, ``self.grad_norm`` and
    ``self.var_norm``.

    Raises:
      ValueError: If ``self.loss_function`` is neither "cross_entropy" nor "l2".
    """
    with tf.name_scope("optimization"):
      with tf.name_scope("masker"):
        # Flattened float mask: 1.0 on valid steps, 0.0 on padding.
        self.mask = tf.sequence_mask(self.seq_len, self.num_step)
        self.mask = tf.reshape(tf.cast(self.mask, tf.float32), (-1,))
      if self.loss_function == "cross_entropy":
        self.pl_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                                            logits=self.logit,
                                            labels=self.actions_flatten)
      elif self.loss_function == "l2":
        self.one_hot_actions = tf.one_hot(self.actions_flatten, self.num_actions)
        self.pl_loss = tf.reduce_mean((self.probs - self.one_hot_actions) ** 2,
                                            axis=1)
      else:
        raise ValueError("loss function type is not defined")

      # Masked steps contribute zero but are still counted by reduce_mean.
      self.pl_loss = tf.multiply(self.pl_loss, self.mask)
      self.pl_loss = tf.reduce_mean(tf.multiply(self.pl_loss, self.returns_flatten))

      self.entropy = tf.multiply(self.entropy, self.mask)
      self.entropy = tf.reduce_mean(self.entropy)

      self.loss = self.pl_loss - self.entropy_bonus * self.entropy

      self.trainable_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="policy_network")
      self.gradients = self.optimizer.compute_gradients(self.loss, var_list=self.trainable_variables)
      # compute_gradients yields None for variables the loss does not depend
      # on; tf.clip_by_norm cannot convert None, so those pairs are skipped.
      self.clipped_gradients = [(tf.clip_by_norm(grad, self.max_gradient), var)
                                for grad, var in self.gradients
                                if grad is not None]
      self.train_op = self.optimizer.apply_gradients(self.clipped_gradients,
                                                     self.global_step)
      self.grad_norm = tf.global_norm([grad for grad, var in self.gradients])
      self.var_norm = tf.global_norm(self.trainable_variables)
Esempio n. 22
0
def make_positions(sequence_length, maximum_length=None):
  """Builds 1-based positions for every sequence, with 0 on the padding.

  Position 0 is reserved for padding, so the first real position is 1.

  Args:
    sequence_length: The length of each sequence as a ``tf.Tensor`` of shape
      :math:`[B]`.
    maximum_length: Optional size of the returned time dimension. Otherwise it
      is the maximum of :obj:`sequence_length`.

  Returns:
    The sequence of positions as a ``tf.Tensor`` of shape :math:`[B, T]`.
  """
  if maximum_length is None:
    maximum_length = tf.reduce_max(sequence_length)

  batch_size = tf.shape(sequence_length)[0]

  # [1, 2, ..., T] repeated once per batch entry.
  steps = tf.range(maximum_length) + 1
  tiled = tf.reshape(tf.tile(steps, [batch_size]), [batch_size, -1])

  # Multiplying by the mask turns positions past each length into 0.
  keep = tf.sequence_mask(
      sequence_length, maxlen=maximum_length, dtype=tiled.dtype)
  return tiled * keep
Esempio n. 23
0
    def NLL(self, y, lengths, pis, mus, sigmas, rho, es, eps=1e-8):
        """Negative log-likelihood of ``y`` under a bivariate Gaussian mixture plus a Bernoulli term.

        Args:
            y: Targets; split into three equal parts along axis 2. The first
                two are the values modelled by the Gaussians, the third is a
                flag that is compared against 1.
            lengths: Valid length of each sequence, used to mask axis 1.
            pis: Mixture weights multiplied with the component likelihoods.
            mus: Component means; split in two along axis 2.
            sigmas: Component standard deviations; split in two along axis 2.
            rho: Correlation between the two modelled values.
            es: Probability used where the flag equals 1 (``1 - es`` elsewhere).
            eps: Lower bound applied to the mixture likelihood before the log.

        Returns:
            A tuple ``(sequence_loss, element_loss)``: the NLL averaged over
            the valid steps of each sequence, and averaged over all valid
            steps.
        """
        sigma_1, sigma_2 = tf.split(sigmas, 2, axis=2)
        y_1, y_2, y_3 = tf.split(y, 3, axis=2)
        mu_1, mu_2 = tf.split(mus, 2, axis=2)

        # Bivariate normal density: normalization constant and exponent.
        norm = 1.0 / (2*np.pi*sigma_1*sigma_2 * tf.sqrt(1 - tf.square(rho)))
        Z = tf.square((y_1 - mu_1) / (sigma_1)) + \
            tf.square((y_2 - mu_2) / (sigma_2)) - \
            2*rho*(y_1 - mu_1)*(y_2 - mu_2) / (sigma_1*sigma_2)

        exp = -1.0*Z / (2*(1 - tf.square(rho)))
        gaussian_likelihoods = tf.exp(exp) * norm
        # Mix the components, then clip from below so the log stays finite.
        gmm_likelihood = tf.reduce_sum(pis * gaussian_likelihoods, 2)
        gmm_likelihood = tf.clip_by_value(gmm_likelihood, eps, np.inf)

        # es where the flag is 1, otherwise 1 - es.
        bernoulli_likelihood = tf.squeeze(tf.where(tf.equal(tf.ones_like(y_3), y_3), es, 1 - es))

        nll = -(tf.log(gmm_likelihood) + tf.log(bernoulli_likelihood))
        # A step counts only if it lies within the sequence length and its NLL is not NaN.
        sequence_mask = tf.logical_and(
            tf.sequence_mask(lengths, maxlen=tf.shape(y)[1]),
            tf.logical_not(tf.is_nan(nll)),
        )
        nll = tf.where(sequence_mask, nll, tf.zeros_like(nll))
        num_valid = tf.reduce_sum(tf.cast(sequence_mask, tf.float32), axis=1)

        # tf.maximum(..., 1.0) avoids dividing by zero when no step is valid.
        sequence_loss = tf.reduce_sum(nll, axis=1) / tf.maximum(num_valid, 1.0)
        element_loss = tf.reduce_sum(nll) / tf.maximum(tf.reduce_sum(num_valid), 1.0)
        return sequence_loss, element_loss
Esempio n. 24
0
def crossentropy(logits, targets, sequence_length):
    """ Computes the summed cross entropy loss of a batch of data.

    The sum itself is not averaged; the batch size is returned next to it as
    the weight sum.

    Args:
        logits: The logits Tensor with shape [timesteps, batch_size, vocab_size].
        targets: The gold labels Tensor with shape [timesteps, batch_size].
        sequence_length: The length of `targets`, [batch_size, ]

    Returns: Loss sum and weight sum.
    """
    # [timesteps, batch_size]
    token_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=targets)

    # sequence_mask yields [batch_size, timesteps]; transpose it to match
    # the time-major losses.
    batch_major_mask = tf.sequence_mask(
        lengths=tf.to_int32(sequence_length),
        maxlen=tf.to_int32(tf.shape(targets)[0]),
        dtype=tf.float32)
    time_major_mask = tf.transpose(batch_major_mask, [1, 0])

    loss_sum = tf.reduce_sum(token_losses * time_major_mask)
    return loss_sum, tf.to_float(tf.shape(sequence_length)[0])
Esempio n. 25
0
def smoothing_crossentropy_avgall(logits, targets, sequence_length):
    """ Computes the label-smoothed cross entropy loss of a batch of data.

    Each sequence's loss is divided by its length before the per-sequence
    values are summed; the batch size is returned as the weight sum.

    Args:
        logits: The logits Tensor with shape [timesteps, batch_size, vocab_size].
        targets: The gold labels Tensor with shape [timesteps, batch_size].
        sequence_length: The length of `targets`, [batch_size, ]

    Returns: Loss sum and weight sum.
    """
    vocab_size = logits.get_shape().as_list()[-1]
    soft_targets, normalizing = label_smoothing(targets, vocab_size)
    token_losses = tf.nn.softmax_cross_entropy_with_logits(
        logits=logits, labels=soft_targets) - normalizing

    # sequence_mask yields [batch_size, timesteps]; transpose it to match
    # the time-major losses.
    batch_major_mask = tf.sequence_mask(
        lengths=tf.to_int32(sequence_length),
        maxlen=tf.to_int32(tf.shape(targets)[0]),
        dtype=tf.float32)
    masked_losses = token_losses * tf.transpose(batch_major_mask, [1, 0])

    # Sum over time, then divide each sequence by its own length.
    float_lengths = tf.to_float(sequence_length)
    per_sequence = tf.reduce_sum(masked_losses, axis=0) / float_lengths
    loss_sum = tf.reduce_sum(per_sequence)
    return loss_sum, tf.to_float(tf.shape(sequence_length)[0])
def filter(predictions, actual_lengths):
    """Returns the argmax class per step, with -1 at steps past each length.

    predictions: batch_size * max_time_steps * num_classes
    actual_lengths: list of actual sequence length in a batch
    """
    num_steps = tf.shape(predictions)[1]
    within_length = tf.sequence_mask(actual_lengths, num_steps, dtype=tf.bool)
    classes = tf.argmax(predictions, 2, name='predictions_cls')
    # int64 tensor of -1 that marks the padded steps.
    padding_cls = tf.zeros(shape=tf.shape(classes), dtype=tf.int64) - 1
    return tf.where(within_length, classes, padding_cls, name='filter_predictions_cls')
Esempio n. 27
0
 def softmax(inputs, length, max_length):
     """Softmax over axis 2 that gives zero weight to entries past `length`."""
     logits = tf.cast(inputs, tf.float32)
     # Subtract the per-row maximum before exponentiating.
     row_max = tf.reduce_max(logits, 2, keep_dims=True)
     exps = tf.exp(logits - row_max)
     flat_length = tf.reshape(length, [-1])
     keep = tf.cast(tf.sequence_mask(flat_length, max_length), tf.float32)
     exps *= tf.reshape(keep, tf.shape(exps))
     # The small constant keeps the division defined when a row is fully masked.
     denominator = tf.reduce_sum(exps, reduction_indices=2, keep_dims=True) + 1e-9
     return exps / denominator
Esempio n. 28
0
def BOW_encoder(ids_, ns_, V, embed_dim, hidden_dims, dropout_rate=0,
                is_training=None,
                **unused_kw):
    """Construct a bag-of-words encoder.

    You don't need to define any variables directly in this function, but you
    should:
        - Build the embeddings (using embedding_layer(...))
        - Apply the mask to zero-out padding indices, and sum the embeddings
            for each example
        - Build a stack of hidden layers (using fully_connected_layers(...))

    Note that this function returns the final encoding h_ as well as the masked
    embeddings xs_. The latter is used for L2 regularization, so that we can
    penalize the norm of only those vectors that were actually used for each
    example.

    Args:
        ids_: [batch_size, max_len] Tensor of int32, integer ids
        ns_:  [batch_size] Tensor of int32, (clipped) length of each sequence
        V: (int) vocabulary size
        embed_dim: (int) embedding dimension
        hidden_dims: list(int) dimensions of the output of each layer
        dropout_rate: (float) rate to use for dropout
        is_training: (bool) if true, is in training mode

    Returns: (h_, xs_)
        h_: [batch_size, hidden_dims[-1]] Tensor of float32, the activations of
            the last layer constructed by this function.
        xs_: [batch_size, max_len, embed_dim] Tensor of float32, the per-word
            embeddings as returned by embedding_layer and with the mask applied
            to zero-out the pad indices.
    """
    # NOTE(review): this is an unfinished exercise skeleton. As written, xs_
    # is None, so the xs_.shape access below raises AttributeError, and h_ is
    # never assigned before the return. The marked sections must be filled in.
    assert is_training is not None, "is_training must be explicitly set to True or False"
    # Embedding layer should produce:
    #   xs_: [batch_size, max_len, embed_dim]
    with tf.variable_scope("Embedding_Layer"):
        #### YOUR CODE HERE ####
        xs_ = None  # replace with a call to embedding_layer
        #### END(YOUR CODE) ####

    #### YOUR CODE HERE ####
    # Mask off the padding indices with zeros
    #   mask_: [batch_size, max_len, 1] with values of 0.0 or 1.0
    mask_ = tf.expand_dims(tf.sequence_mask(ns_, xs_.shape[1],
                                            dtype=tf.float32), -1)
    # Multiply xs_ by the mask to zero-out pad indices.


    # Sum embeddings: [batch_size, max_len, embed_dim] -> [batch_size, embed_dim]


    # Build a stack of fully-connected layers


    #### END(YOUR CODE) ####
    return h_, xs_
Esempio n. 29
0
def _mask_by_length(t, length):
  """Zero the timesteps of t, 3-D [batch, time, dim], from index length - 1 on.

  length is 1-D [batch,]. One is subtracted from it so that the perturbation
  does not go on 'eos'.
  """
  num_timesteps = t.get_shape().as_list()[1]
  keep = tf.cast(tf.sequence_mask(length - 1, maxlen=num_timesteps), tf.float32)
  # The trailing axis lets the (batch, num_timesteps, 1) mask broadcast over dim.
  return t * tf.expand_dims(keep, -1)
Esempio n. 30
0
def define_computation_graph(source_vocab_size: int, target_vocab_size: int, batch_size: int):
    """Build a fresh encoder-decoder LSTM training graph.

    The default graph is reset first, so anything built before this call is
    discarded.

    Args:
        source_vocab_size: number of rows of the source embedding matrix.
        target_vocab_size: number of rows of the target embedding matrix and
            width of the output projection / one-hot labels.
        batch_size: fixed batch dimension of all placeholders and of the
            encoder's zero initial state.

    Returns:
        Tuple ``(encoder_inputs, decoder_targets, decoder_inputs, loss,
        train_step, decoder_logits, summary)``.
    """
    tf.reset_default_graph()

    # Placeholders for inputs and outputs: [batch_size, time] int32 ids.
    encoder_inputs = tf.placeholder(shape=(batch_size, None), dtype=tf.int32, name='encoder_inputs')

    decoder_targets = tf.placeholder(shape=(batch_size, None), dtype=tf.int32, name='decoder_targets')
    decoder_inputs = tf.placeholder(shape=(batch_size, None), dtype=tf.int32, name='decoder_inputs')

    with tf.variable_scope("Embeddings"):
        source_embedding = tf.get_variable('source_embedding', [source_vocab_size, C.EMBEDDING_SIZE])
        # The target table must have one row per *target* token id; sizing it
        # with the source vocabulary makes lookups of ids
        # >= source_vocab_size go out of range.
        target_embedding = tf.get_variable('target_embedding', [target_vocab_size, C.EMBEDDING_SIZE])

        encoder_inputs_embedded = tf.nn.embedding_lookup(source_embedding, encoder_inputs)
        decoder_inputs_embedded = tf.nn.embedding_lookup(target_embedding, decoder_inputs)

    with tf.variable_scope("Encoder"):
        encoder_cell = tf.contrib.rnn.LSTMCell(C.HIDDEN_SIZE)
        initial_state = encoder_cell.zero_state(batch_size, tf.float32)

        encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(encoder_cell,
                                                                 encoder_inputs_embedded,
                                                                 initial_state=initial_state,
                                                                 dtype=tf.float32)

    with tf.variable_scope("Decoder"):
        # The decoder starts from the encoder's final state.
        decoder_cell = tf.contrib.rnn.LSTMCell(C.HIDDEN_SIZE)
        decoder_outputs, decoder_final_state = tf.nn.dynamic_rnn(decoder_cell,
                                                                 decoder_inputs_embedded,
                                                                 initial_state=encoder_final_state,
                                                                 dtype=tf.float32)

    with tf.variable_scope("Logits"):
        decoder_logits = tf.contrib.layers.linear(decoder_outputs, target_vocab_size)

    with tf.variable_scope("Loss"):
        one_hot_labels = tf.one_hot(decoder_targets, depth=target_vocab_size, dtype=tf.float32)
        stepwise_cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=one_hot_labels,
            logits=decoder_logits)

        # Mask padded positions.
        # NOTE(review): compute_lengths is defined elsewhere; presumably it
        # returns the unpadded length of each target row -- confirm.
        target_lengths = compute_lengths(decoder_targets)
        # Pin the mask width to the time dimension of the targets. With
        # maxlen=None the width is max(target_lengths), which no longer lines
        # up with stepwise_cross_entropy when every row of the batch is
        # padded.
        target_weights = tf.sequence_mask(lengths=target_lengths,
                                          maxlen=tf.shape(decoder_targets)[1],
                                          dtype=decoder_logits.dtype)
        weighted_cross_entropy = stepwise_cross_entropy * target_weights
        # The mean runs over every position, padded ones (zeroed) included.
        loss = tf.reduce_mean(weighted_cross_entropy)

    with tf.variable_scope('Optimizer'):
        train_step = tf.train.AdamOptimizer(learning_rate=C.LEARNING_RATE).minimize(loss)

    # Logging of cost scalar (@tensorboard)
    tf.summary.scalar('loss', loss)
    summary = tf.summary.merge_all()

    return encoder_inputs, decoder_targets, decoder_inputs, loss, train_step, decoder_logits, summary
    def buildNetwork(self):
        """Build the sampling, reward, policy-gradient and optimisation ops.

        Calls self.getInputs() and self.get_loss_and_rewards(), then sets:
            self.n_corrects   -- taken directly from get_loss_and_rewards()
            self.skip_rate    -- flattened per-sample ratio n_skips / length
            self.n_valids_sum -- sum of the valid-decision counts
            self.loss         -- sum of the sample-averaged ce_loss only
            self.optOp        -- Adam update applying d(self.loss)/d(params)

        The policy-gradient loss and the transfering loss are built (and
        gated by self.is_transfering) but are NOT part of self.loss; the line
        that would add them is commented out.
        """
        with tf.name_scope('inputs'):
            self.getInputs()

        with tf.variable_scope('sampling', reuse=tf.AUTO_REUSE):
            """
			sample from the rnn for nSample times
			"""
            # ce_loss, rewards: [batch_size*n_samples]
            # probs, valid, skip_flag: [batch_size*n_samples, maxSteps]
            # n_corrects: single number, number of correct predictions in batch_size*n_samples

            ce_loss, rewards, predicted_skips, probs, valid, self.n_corrects, skip_flag, transfering_loss = self.get_loss_and_rewards(
            )

            # Un-flatten the per-sample quantities to [batch_size, nSamples]
            ce_loss = tf.reshape(ce_loss,
                                 shape=[self.batch_size, self.n_samples],
                                 name='ce_loss')
            rewards = tf.reshape(rewards,
                                 shape=[self.batch_size, self.n_samples],
                                 name='rewards')
            transfering_loss = tf.reshape(
                transfering_loss,
                shape=[self.batch_size, self.n_samples],
                name='transfering_loss')

            # Un-flatten the per-step quantities to
            # [batch_size, n_samples, maxSteps]
            probs = tf.reshape(
                probs,
                shape=[self.batch_size, self.n_samples, self.args.maxSteps],
                name='probs_ori')
            valid = tf.reshape(
                valid,
                shape=[self.batch_size, self.n_samples, self.args.maxSteps],
                name='valid_ori')
            skip_flag = tf.reshape(
                skip_flag,
                shape=[self.batch_size, self.n_samples, self.args.maxSteps],
                name='skip_flag_ori')
            # NOTE(review): this op reuses the name 'skip_flag_ori' requested
            # just above for skip_flag -- looks like a copy-paste leftover.
            predicted_skips = tf.reshape(
                predicted_skips,
                shape=[self.batch_size, self.n_samples, self.args.maxSteps],
                name='skip_flag_ori')

            # Small offset so that tf.log(probs) below never sees exactly 0.
            probs = tf.add(probs, 1e-5, name='probs')
            # NOTE(review): 'valid_ori' is also the name requested for the
            # reshape above.
            valid = tf.cast(valid, tf.float32, name='valid_ori')

            # mask out steps exceeding the length of each sample
            # [batch_size, n_samples]
            length = tf.reshape(self.length,
                                shape=[self.batch_size, self.n_samples],
                                name='length')

            skip_flag_mask = tf.sequence_mask(lengths=length,
                                              maxlen=self.args.maxSteps,
                                              dtype=tf.float32,
                                              name='skip_flag_mask')
            # [batch_size, n_samples, maxSteps]
            skip_flag = tf.multiply(skip_flag,
                                    skip_flag_mask,
                                    name='skip_flag')
            # [batch_size, n_samples]
            n_skips = tf.reduce_sum(skip_flag, axis=-1, name='n_skips')
            # [batch_size, n_samples]
            skip_rate = tf.divide(n_skips,
                                  tf.cast(length, tf.float32),
                                  name='skip_rate')
            self.skip_rate = tf.reshape(skip_rate, shape=[-1])

            # [batch_size, n_samples]
            # number of valid decisions made in each sample
            # for sentence whose length <= min_read, n_valids would be 0
            n_valids = tf.reduce_sum(valid, axis=-1, name='n_valids')
            self.n_valids_sum = tf.reduce_sum(n_valids, name='n_valids_sum')

        with tf.name_scope('rewards'):
            # [batch_size, n_samples]
            # Add a term proportional (factor self.args.sparse) to the summed
            # predicted skips of each sample to the returned rewards.

            sparse_rewards = tf.reduce_sum(predicted_skips,
                                           axis=-1,
                                           name='sparse_rewards')
            sparse_rewards = tf.multiply(self.args.sparse,
                                         tf.cast(sparse_rewards, tf.float32))
            rewards = tf.add(tf.cast(sparse_rewards, tf.float32),
                             tf.cast(rewards, tf.float32),
                             name='rewards')

        with tf.name_scope('pg_loss'):
            # [batch_size, ]
            # Only the mean is used below; rewards_var is never read.
            rewards_mean, rewards_var = tf.nn.moments(rewards,
                                                      axes=-1,
                                                      name='rewards_moments')

            # [batch_size, 1]
            rewards_mean = tf.expand_dims(rewards_mean, axis=-1)
            # [batch_size, n_samples]
            rewards_mean = tf.tile(rewards_mean,
                                   multiples=[1, self.n_samples],
                                   name='rewards_mean')

            # Centre each reward on the mean over that example's samples.
            # [batch_size, n_samples]
            rewards_norm = tf.subtract(rewards,
                                       rewards_mean,
                                       name='rewards_norm')

            # [batch_size, n_samples, maxSteps]
            rewards_norm_tiled = tf.tile(tf.expand_dims(rewards_norm, axis=-1),
                                         multiples=[1, 1, self.args.maxSteps])
            # mask out steps that are not valid
            # [batch_size, n_samples, maxSteps]
            rewards_norm_tiled = tf.multiply(rewards_norm_tiled,
                                             valid,
                                             name='rewards_norm_tiled')
            # The centred reward acts as a constant weight: no gradient flows
            # back through it.
            rewards_norm_tiled = tf.stop_gradient(rewards_norm_tiled)
            # [batch_size, n_samples, maxSteps]
            pg_loss_ori = tf.multiply(rewards_norm_tiled,
                                      tf.log(probs),
                                      name='pg_loss_ori')

            ## each sampled sample average over its valid steps

            # [batch_size, n_samples]
            pg_loss_sum = tf.reduce_sum(pg_loss_ori,
                                        axis=-1,
                                        name='pg_loss_sum')
            # [batch_size, n_samples]
            # Some n_valids are 0, which would give nan in pg_loss_avg.
            # Replace those counts with 1e10 so the division yields ~0.
            n_valids = tf.where(tf.equal(tf.cast(n_valids, tf.int32), 0),
                                tf.ones_like(n_valids) * 1e10,
                                n_valids,
                                name='n_valids_final')
            pg_loss_avg = tf.divide(pg_loss_sum, n_valids, name='pg_loss_avg')

            # average over samples
            # [batch_size]
            pg_loss = tf.reduce_mean(pg_loss_avg, axis=-1, name='pg')
            # Negate the averaged reward-weighted log-probabilities.
            pg_loss = tf.subtract(0.0, pg_loss, name='pg_loss')
            #pg_loss = tf.Print(pg_loss, data=[tf.reduce_sum(pg_loss)])

        with tf.name_scope('gradients'):
            # average over samples
            # [batch_size]
            ce_loss = tf.reduce_mean(ce_loss, axis=-1, name='ce_loss')
            transfering_loss = tf.reduce_mean(transfering_loss,
                                              axis=-1,
                                              name='transfering_loss')

            # mask out transfering_loss and pg_loss
            is_transfering = tf.cast(self.is_transfering, tf.float32)
            # disable transfering_loss when RL begins
            transfering_loss = tf.multiply(is_transfering, transfering_loss)
            pg_loss = tf.multiply((1.0 - is_transfering), pg_loss)

            trainable_params = tf.trainable_variables()

            # ce_params = []
            # pg_params = []
            #
            # for param in trainable_params:
            #     if param.name == 'sampling/loop/skip_lstm_cell/skip_kernel:0' \
            #             or param.name == 'sampling/loop/skip_lstm_cell/skip_bias:0':
            #         pg_params.append(param)
            #     else:
            #         ce_params.append(param)

            # TODO: should we use gradients from pg_loss for params other than skip_kernel and skip_bias?
            # Yes, lower level nets should also be optimized for prediction of skips

            # add sparse_loss
            # sparse_loss = tf.Print(sparse_loss, data=[tf.reduce_sum(sparse_loss)])

            # When testing the upper bound we only care about ce_loss, so
            # pg_loss and transfering_loss are left out of the objective.
            #self.loss = tf.reduce_sum(ce_loss + pg_loss + transfering_loss, name='loss')
            self.loss = tf.reduce_sum(ce_loss, name='loss')

            gradients_all = tf.gradients(self.loss, trainable_params)

            # gradients_ce = tf.gradients(ce_loss, ce_params)
            # gradients_pg = tf.gradients(pg_loss, pg_params)
            # gradients_sparse = tf.gradients(sparse_loss, trainable_params)

            opt = tf.train.AdamOptimizer(learning_rate=self.args.learningRate,
                                         beta1=0.9,
                                         beta2=0.999,
                                         epsilon=1e-08)

            # all_params = ce_params + pg_params
            # all_gradients = gradients_ce + gradients_pg

            self.optOp = opt.apply_gradients(
                zip(gradients_all, trainable_params))

            print('RL model built!')
Esempio n. 32
0
def build_network(is_training):
    """Wire the encoder to a training decoder and a greedy inference decoder.

    Reads the module-level names `image`, `train_output`, `train_length`,
    `sample_rate` and `target_output`, and calls `encoder_net` / `decode`,
    all of which are defined outside this function.

    Args:
        is_training: forwarded unchanged to `encoder_net`.

    Returns:
        Tuple (loss, train_decode_result, pred_decode_result) where
        train_decode_result is the training decoder's rnn_output without its
        last time step and pred_decode_result is the greedy decoder's
        rnn_output.
    """
    batch = 40  # fixed batch size assumed by the start tokens and the mask

    encoder_features, encoder_state = encoder_net(image, 'encode_features',
                                                  is_training)

    # Embedding matrix: first dimension is the vocabulary size (number of
    # distinct token classes, not the total token count), second dimension is
    # the embedding width.
    embed_matrix = tf.get_variable(name='embed_matrix', shape=[4, 4])
    embedded_targets = embedding_ops.embedding_lookup(embed_matrix,
                                                      train_output)

    go_tokens = tf.zeros([batch], dtype=tf.int64)

    scheduled_helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
        embedded_targets, train_length, embed_matrix, sample_rate)

    # Inference-time helper: takes the argmax of the output logits to get an
    # id and feeds it through the embedding layer to obtain the next input.
    #   start_tokens: int32 vector shaped [batch_size], the start token id of
    #                 every sequence in the batch.
    #   end_token:    int32 scalar, the token id that marks end of decoding.
    # The ids used here (0 and 1) are the indices of GO and EOS.
    greedy_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
        embed_matrix, start_tokens=tf.to_int32(go_tokens), end_token=1)

    train_decoded = decode(scheduled_helper, encoder_features, 'decode',
                           encoder_state)
    # Same scope with reuse=True so both decoders share their variables.
    greedy_decoded = decode(greedy_helper, encoder_features, 'decode',
                            encoder_state, reuse=True)

    train_decode_result = train_decoded[0].rnn_output[:, :-1, :]
    pred_decode_result = greedy_decoded[0].rnn_output

    # Every row of the mask uses the first sequence's length: all but the
    # last of train_length[0] steps are weighted 1.0.
    row_lengths = [train_length[0] - 1] * batch
    step_weights = tf.cast(tf.sequence_mask(row_lengths, train_length[0]),
                           tf.float32)

    sequence_loss = tf.contrib.seq2seq.sequence_loss(
        train_decoded[0].rnn_output, target_output, weights=step_weights)

    return tf.reduce_mean(sequence_loss), train_decode_result, pred_decode_result
def _print_sequence_log(seq_len, roots, tqualities, valid_mask, y_tchord,
                        pred_tchord):
    """Print the aligned label/prediction rows of one sequence.

    Each row is a 7-character left-justified label followed by one
    3-character right-justified cell per time step.
    """

    def names(mapping, values):
        # First key of `mapping` whose value equals each entry of `values`.
        return [[k for k, v in mapping.items() if v == b][0] for b in values]

    def row(label, cells):
        print(label.ljust(7, ' '),
              ''.join([str(cell).rjust(3, ' ') for cell in cells]))

    print('len =', seq_len)
    row('y_root', names(root_dict, roots))
    row('y_tq', names(tquality_dict, tqualities))
    row('valid', ['y' if b else 'n' for b in valid_mask])
    row('y_tc', y_tchord)
    row('pred_tc', pred_tchord)


def train_CRNN():
    """Build the CRNN chord-recognition graph, train it and log progress.

    Configuration comes from the module-level `hp` object. After every
    `n_iterations_per_epoch` steps the accumulated training metrics are
    logged, the whole test set is evaluated, and the model is checkpointed
    whenever `test_score + sq` improves on the best value so far. Training
    stops early once more than `hp.n_in_succession` consecutive evaluations
    bring no improvement.

    Summaries and checkpoints are written below `hp.graph_location`.
    """
    # Local import: paths are joined with os.path.join instead of a
    # hard-coded '\\', which only forms a directory path on Windows.
    import os

    print('Run CRNN chord recognition on %s-%d...' %
          (hp.dataset, hp.test_set_id))

    # Load training and testing data
    train_data, test_data = load_data_symbol(
        dir=hp.dataset + '_preprocessed_data_MIREX_Mm.pickle',
        test_set_id=hp.test_set_id,
        sequence_with_overlap=hp.train_sequence_with_overlap)
    n_train_sequences = train_data['pianoroll'].shape[0]
    n_test_sequences = test_data['pianoroll'].shape[0]
    # hp.n_batches is used as the number of sequences per batch.
    n_iterations_per_epoch = int(math.ceil(n_train_sequences / hp.n_batches))
    print('n_train_sequences =', n_train_sequences)
    print('n_test_sequences =', n_test_sequences)
    print('n_iterations_per_epoch =', n_iterations_per_epoch)
    print(hp)

    with tf.name_scope('placeholder'):
        x_p = tf.placeholder(tf.int32, [None, hp.n_steps, 88],
                             name="pianoroll")
        x_len = tf.placeholder(tf.int32, [None], name="seq_lens")
        y_tc = tf.placeholder(tf.int32, [None, hp.n_steps], name="tchord")
        dropout = tf.placeholder(dtype=tf.float32, name="dropout_rate")
        is_training = tf.placeholder(dtype=tf.bool, name="is_training")
        global_step = tf.placeholder(dtype=tf.int32, name='global_step')

    with tf.name_scope('model'):
        x_in = tf.cast(x_p, tf.float32)
        source_mask = tf.sequence_mask(
            lengths=x_len, maxlen=hp.n_steps,
            dtype=tf.float32)  # [n_batches, n_steps]
        input_embed = crm.CRNN(x_in, x_len, dropout, is_training, hp)

    with tf.variable_scope("output_projection"):
        input_embed = tf.layers.dropout(input_embed,
                                        rate=dropout,
                                        training=is_training)
        chord_logits = tf.layers.dense(input_embed, hp.n_chord_classes)

    with tf.name_scope('loss'):
        # Padded time steps get weight 0 through source_mask.
        loss = tf.losses.softmax_cross_entropy(onehot_labels=tf.one_hot(
            y_tc, hp.n_chord_classes),
                                               logits=chord_logits,
                                               weights=source_mask)
    # Running, count-weighted loss accumulators; reset after every report.
    valid = tf.reduce_sum(source_mask)
    summary_loss = tf.Variable(0.0, trainable=False, dtype=tf.float32)
    summary_valid = tf.Variable(0, trainable=False, dtype=tf.float32)
    update_loss = tf.assign(summary_loss, summary_loss + valid * loss)
    update_valid = tf.assign(summary_valid, summary_valid + valid)
    mean_loss = tf.assign(summary_loss, summary_loss / summary_valid)
    clr_summary_loss = summary_loss.initializer
    clr_summary_valid = summary_valid.initializer
    tf.summary.scalar('Loss_total', summary_loss)

    with tf.name_scope('evaluation'):
        # Score only real (unpadded) steps whose label is below
        # tquality_dict['O'] * 12.
        chord_mask = tf.cast(source_mask, tf.bool)
        chord_mask = tf.logical_and(chord_mask,
                                    tf.less(y_tc, tquality_dict['O'] * 12))
        pred_tc = tf.argmax(chord_logits, axis=2, output_type=tf.int32)
        pred_tc_correct = tf.equal(pred_tc, y_tc)
        pred_tc_correct_mask = tf.boolean_mask(tensor=pred_tc_correct,
                                               mask=chord_mask)
        correct = tf.reduce_sum(tf.cast(pred_tc_correct_mask, tf.float32))
        total = tf.cast(tf.size(pred_tc_correct_mask), tf.float32)
    # summary_count holds the running [correct, total] pair.
    summary_count = tf.Variable([0.0 for _ in range(2)],
                                trainable=False,
                                dtype=tf.float32)
    summary_score = tf.Variable(0.0, trainable=False, dtype=tf.float32)
    update_count = tf.assign(summary_count, summary_count + [correct, total])
    acc_tc = summary_count[0] / summary_count[1]
    compute_score = tf.assign(summary_score, summary_score + acc_tc)
    clr_summary_count = summary_count.initializer
    clr_summary_score = summary_score.initializer
    tf.summary.scalar('Accuracy_tchord', summary_score)

    with tf.name_scope('optimization'):
        # Warm-up learning rate: grows linearly for the first 4000 steps,
        # then decays with the inverse square root of the step.
        warm_up_steps = tf.constant(4000, dtype=tf.float32)
        gstep = tf.cast(global_step, dtype=tf.float32)
        learning_rate = pow(hp.input_embed_size, -0.5) * tf.minimum(
            tf.pow(gstep, -0.5), gstep * tf.pow(warm_up_steps, -1.5))
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                           beta1=0.9,
                                           beta2=0.98,
                                           epsilon=1e-9)
        update_ops = tf.get_collection(
            tf.GraphKeys.UPDATE_OPS
        )  # update moving_mean and moving_variance of batch normalization
        train_op = optimizer.minimize(loss)
        train_op = tf.group([train_op, update_ops])

    # Graph location and summary writers
    print('Saving graph to: %s' % hp.graph_location)
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(
        os.path.join(hp.graph_location, 'train'))
    test_writer = tf.summary.FileWriter(
        os.path.join(hp.graph_location, 'test'))
    train_writer.add_graph(tf.get_default_graph())
    test_writer.add_graph(tf.get_default_graph())
    saver = tf.train.Saver(max_to_keep=1)
    checkpoint_path = os.path.join(
        hp.graph_location, 'CRNN_chord_recognition_' + hp.dataset + '_' +
        str(hp.test_set_id) + '.ckpt')

    clear_accumulators = [
        clr_summary_valid, clr_summary_loss, clr_summary_count,
        clr_summary_score
    ]
    display_len = 64  # number of time steps shown in the console log

    def split_into_batches(indices):
        # Consecutive slices of at most hp.n_batches sequence indices.
        return [
            indices[x:x + hp.n_batches]
            for x in range(0, len(indices), hp.n_batches)
        ]

    # Training
    print('Train the model...')
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        startTime = time.time()  # start time of training
        best_score = [0.0, 0.0]
        in_succession = 0
        best_epoch = 0
        # The first two epochs walk the data in its stored order.
        batch_indices = split_into_batches(range(n_train_sequences))
        for step in range(hp.n_training_steps):
            if step >= 2 * n_iterations_per_epoch and step % n_iterations_per_epoch == 0:
                # Shuffle training data
                batch_indices = split_into_batches(
                    random.sample(range(n_train_sequences),
                                  n_train_sequences))

            selected = batch_indices[step % len(batch_indices)]
            batch = (train_data['pianoroll'][selected],
                     train_data['len'][selected],
                     train_data['tchord'][selected],
                     train_data['root'][selected],
                     train_data['tquality'][selected])

            train_run_list = [
                train_op, update_valid, update_loss, update_count, loss,
                pred_tc, chord_mask
            ]
            train_feed_dict = {
                x_p: batch[0],
                x_len: batch[1],
                y_tc: batch[2],
                dropout: hp.drop,
                is_training: True,
                # 1-based so the warm-up schedule never sees step 0.
                global_step: step + 1
            }
            _, _, _, _, train_loss, train_pred_tc, train_chord_mask = sess.run(
                train_run_list, feed_dict=train_feed_dict)
            if step == 0:
                print('*~ loss %.4f ~*' % (train_loss))

            # Everything below runs once per epoch only.
            if step == 0 or step % n_iterations_per_epoch != 0:
                continue

            # Display training log
            sess.run([mean_loss, compute_score])
            train_summary, train_loss, train_score = sess.run(
                [merged, summary_loss, summary_score])
            sess.run(clear_accumulators)
            train_writer.add_summary(train_summary, step)
            print(
                "---- step %d, epoch %d: train_loss: %.4f, evaluation: tc %.4f ----"
                % (step, step // n_iterations_per_epoch, train_loss,
                   train_score))
            _print_sequence_log(batch[1][0],
                                batch[3][0, :display_len],
                                batch[4][0, :display_len],
                                train_chord_mask[0, :display_len],
                                batch[2][0, :display_len],
                                train_pred_tc[0, :display_len])

            # Testing: the whole test set in a single run. global_step is not
            # fed because none of the fetched ops depends on it.
            test_run_list = [
                update_valid, update_loss, update_count, pred_tc, chord_mask
            ]
            test_feed_dict = {
                x_p: test_data['pianoroll'],
                x_len: test_data['len'],
                y_tc: test_data['tchord'],
                dropout: 0.0,
                is_training: False
            }
            _, _, _, test_pred_tc, test_chord_mask = sess.run(
                test_run_list, feed_dict=test_feed_dict)
            sess.run([mean_loss, compute_score])
            test_summary, test_loss, test_score = sess.run(
                [merged, summary_loss, summary_score])
            sess.run(clear_accumulators)
            test_writer.add_summary(test_summary, step)

            sq = crm.segmentation_quality(test_data['tchord'], test_pred_tc,
                                          test_data['len'])
            print(
                "==== step %d, epoch %d: test_loss: %.4f, evaluation: tc %.4f, sq %.4f ===="
                % (step, step // n_iterations_per_epoch, test_loss,
                   test_score, sq))
            sample_id = random.randint(0, n_test_sequences - 1)
            _print_sequence_log(
                test_data['len'][sample_id],
                test_data['root'][sample_id, :display_len],
                test_data['tquality'][sample_id, :display_len],
                test_chord_mask[sample_id, :display_len],
                test_data['tchord'][sample_id, :display_len],
                test_pred_tc[sample_id, :display_len])

            # Model selection / early stopping on the combined test score.
            if test_score + sq > sum(best_score):
                best_score = [test_score, sq]
                best_epoch = step // n_iterations_per_epoch
                in_succession = 0
                # Save variables of the model
                print('*saving variables...\n')
                saver.save(sess, checkpoint_path)
            else:
                in_succession += 1
                if in_succession > hp.n_in_succession:
                    print('Early stopping.')
                    break

        elapsed_time = time.time() - startTime
        print('\nCRNN chord symbol recognition on %s-%d:' %
              (hp.dataset, hp.test_set_id))
        print('training time = %.2f hr' % (elapsed_time / 3600))
        print('best epoch = ', best_epoch)
        print('best score =', np.round(best_score, 4))
Esempio n. 34
0
 def mask_logits(self, logits, sequence_lengths):
     """Push the logits of padded positions towards minus infinity.

     Positions at or beyond each entry of `sequence_lengths` get -1e32 added
     to their logit; the remaining positions are returned unchanged.
     """
     keep = tf.sequence_mask(sequence_lengths, dtype=tf.float32)
     penalty = -1e32
     # 1.0 where a position is padding, 0.0 where it is real.
     padding = 1 - keep
     return logits + penalty * padding
Esempio n. 35
0
    def compute_loss(self,
                     pred_dict,
                     gt_dict,
                     config,
                     is_eval,
                     is_nn,
                     P_in=None):
        '''
            Build the per-batch loss tensors from predictions and ground truth.

            Input:
                pred_dict - predictions; the keys read here are:
                    - W: BxNxK, segmentation parts (K must be static)
                    - nocs_per_point
                    - gocs_per_point (only when self.is_mixed)
                    - confi_per_point
                    - heatmap_per_point, unitvec_per_point, joint_axis_per_point
                    - index_per_point
                gt_dict - ground truth; the keys read here are:
                    - cls_per_point: BxN, can contain -1 (unknown part)
                    - nocs_per_point, mask_array_per_point
                    - gocs_per_point (only when self.is_mixed)
                    - heatmap_per_point, unitvec_per_point, orient_per_point
                    - joint_cls_mask
                    - index_per_point: BxN
                config - only config.get_nocs_loss() is used (loss type)
                is_eval, is_nn - when both are true, W is passed through
                    nn_filter_W before any loss is computed
                P_in - currently unused; kept for interface compatibility
                    (it was only consumed by the disabled eval statistics)

            Returns: dict with
                - loss_dict: nocs_loss, miou_loss, heatmap_loss, unitvec_loss,
                    orient_loss, index_loss, plus gocs_loss when self.is_mixed
                - matching_indices: int32 output of hungarian_matching(W, I_gt),
                    with gradients stopped
        '''
        W = pred_dict['W']
        # n_max_parts should not be dynamic: fixed number of parts.
        n_max_parts = W.get_shape()[2]

        if is_eval and is_nn:
            # at loss, want W to be binary and filtered (if is from nn)
            W = nn_filter_W(W)

        # note that I_gt can contain -1, indicating part of unknown primitive type
        I_gt = gt_dict['cls_per_point']  # BxN

        matching_indices = tf.stop_gradient(
            tf.py_func(hungarian_matching, [W, I_gt],
                       Tout=tf.int32))  # BxK into K parts
        # The matching is returned to the caller but not used by the mIoU
        # loss below.
        miou_loss = loss.compute_miou_loss(W, I_gt)
        nocs_loss = loss.compute_nocs_loss(
            pred_dict['nocs_per_point'],
            gt_dict['nocs_per_point'],
            pred_dict['confi_per_point'],
            num_parts=n_max_parts,
            mask_array=gt_dict['mask_array_per_point'],
            TYPE_L=config.get_nocs_loss(),
            MULTI_HEAD=True,
            SELF_SU=False)  # todo

        if self.is_mixed:
            gocs_loss = loss.compute_nocs_loss(
                pred_dict['gocs_per_point'],
                gt_dict['gocs_per_point'],
                pred_dict['confi_per_point'],
                num_parts=n_max_parts,
                mask_array=gt_dict['mask_array_per_point'],
                TYPE_L=config.get_nocs_loss(),
                MULTI_HEAD=True,
                SELF_SU=False)  # todo

        # The three vector-valued heads are all weighted by joint_cls_mask.
        heatmap_loss = loss.compute_vect_loss(
            pred_dict['heatmap_per_point'],
            gt_dict['heatmap_per_point'],
            confidence=gt_dict['joint_cls_mask'],
            TYPE_L=config.get_nocs_loss())
        unitvec_loss = loss.compute_vect_loss(
            pred_dict['unitvec_per_point'],
            gt_dict['unitvec_per_point'],
            confidence=gt_dict['joint_cls_mask'],
            TYPE_L=config.get_nocs_loss())
        orient_loss = loss.compute_vect_loss(
            pred_dict['joint_axis_per_point'],
            gt_dict['orient_per_point'],
            confidence=gt_dict['joint_cls_mask'],
            TYPE_L=config.get_nocs_loss())

        J_gt = gt_dict['index_per_point']  # BxN
        inds_pred = pred_dict['index_per_point']
        # Segmentation-style mIoU loss on the per-point index prediction.
        miou_joint_loss = loss.compute_miou_loss(inds_pred, J_gt)
        # here we need to add input GT masks for different array

        loss_dict = {
            'nocs_loss': nocs_loss,
            'miou_loss': miou_loss,
            'heatmap_loss': heatmap_loss,
            'unitvec_loss': unitvec_loss,
            'orient_loss': orient_loss,
            'index_loss': miou_joint_loss
        }

        if self.is_mixed:
            loss_dict['gocs_loss'] = gocs_loss

        # Evaluation statistics are currently disabled. Re-enabling them
        # needs the BxK ground-truth part mask:
        #   n_parts_gt = tf.reduce_max(I_gt, axis=1) + 1
        #   mask_gt = tf.sequence_mask(n_parts_gt, maxlen=n_max_parts)
        # followed by, when is_eval:
        #   result.update(calculate_eval_stats(
        #       W=W, matching_indices=matching_indices, mask_gt=mask_gt,
        #       P_in=P_in, confi_per_point=pred_dict['confi_per_point']))
        return {'loss_dict': loss_dict, 'matching_indices': matching_indices}
Esempio n. 36
0
 def _compute_logits(self, mfcc, mfcc_lens, training):
     """Run ``self.model`` on the features and swap the first two output axes.

     A boolean mask built from ``mfcc_lens`` with ``tf.sequence_mask`` is
     handed to the model together with the ``training`` flag.  The model
     output is transposed with permutation ``[1, 0, 2]`` before returning
     (presumably batch-major -> time-major; confirm against the consumer).
     """
     length_mask = tf.sequence_mask(mfcc_lens)
     model_out = self.model(mfcc, mask=length_mask, training=training)
     return tf.transpose(model_out, [1, 0, 2])
Esempio n. 37
0
    def build_tagging_graph(self, inputs, hidden_layers, channels, num_tags,
                            use_crf, lamd, dropout_emb, dropout_hidden,
                            kernel_size, use_bn, use_wn, active_type):
        """
        Build a deep neural model for sequence tagging.

        Stacks `hidden_layers` 1-D convolution layers (two parallel branches
        `w` and `v` per layer, combined according to `active_type`) on top of
        `inputs`, projects the result to `num_tags` scores per position and
        attaches either a CRF or a masked softmax cross-entropy cost.

        Args:
            inputs: input tensor fed to `tf.nn.conv1d`; presumably
                [batch, time, channels] -- TODO confirm against the caller.
            hidden_layers: number of convolution layers to stack.
            channels: sequence of output channel counts, indexed by layer.
            num_tags: number of output tags.
            use_crf: if truthy use the CRF log-likelihood cost, otherwise
                softmax cross-entropy masked by sequence length.
            lamd: L2 coefficient; the penalty is only added when `lamd > 0`.
            dropout_emb: if truthy, `tf.nn.dropout(inputs, 1 - dropout_emb)`
                is applied while `is_train` is true.
            dropout_hidden: same as `dropout_emb`, applied to every hidden
                layer output.
            kernel_size: width of every convolution filter.
            use_bn: if truthy, batch-normalise both branches of each layer.
            use_wn: if truthy, weight-normalise the convolution filters.
            active_type: one of 'glu', 'relu', 'gtu', 'tanh', 'linear',
                'bilinear'.  Any other value leaves `hidden_output` as it was
                before the layer.

        Returns:
            Tuple `(stag_ids, seq_lengths, is_train, cost, train_cost, scores,
            summaries)`: the three placeholders created here, the cost (with
            the collected UPDATE_OPS attached as control dependencies), the
            cost plus L2 penalty, the un-normalised tag scores and the merged
            summaries.
        """
        stag_ids = tf.placeholder(dtype=INT_TYPE,
                                  shape=[None, None],
                                  name='stag_ids')
        seq_lengths = tf.placeholder(dtype=INT_TYPE,
                                     shape=[None],
                                     name='seq_lengths')

        # Default is not train.
        is_train = tf.placeholder(dtype=tf.bool, shape=[], name='is_train')

        # 1.0 for real positions, 0.0 for padding.
        masks = tf.cast(tf.sequence_mask(seq_lengths), FLOAT_TYPE)

        # Dropout on embedding output.
        if dropout_emb:
            inputs = tf.cond(is_train,
                             lambda: tf.nn.dropout(inputs, 1 - dropout_emb),
                             lambda: inputs)

        hidden_output = inputs
        pre_channels = inputs.get_shape()[-1].value
        for i in xrange(hidden_layers):

            k = kernel_size
            cur_channels = channels[i]
            filter_w = tf.get_variable('filter_w_%d' % i,
                                       shape=[k, pre_channels, cur_channels],
                                       dtype=FLOAT_TYPE)
            filter_v = tf.get_variable('filter_v_%d' % i,
                                       shape=[k, pre_channels, cur_channels],
                                       dtype=FLOAT_TYPE)
            bias_b = tf.get_variable(
                'bias_b_%d' % i,
                shape=[cur_channels],
                initializer=tf.zeros_initializer(dtype=FLOAT_TYPE))
            bias_c = tf.get_variable(
                'bias_c_%d' % i,
                shape=[cur_channels],
                initializer=tf.zeros_initializer(dtype=FLOAT_TYPE))

            # Weight normalization.
            if use_wn:
                epsilon = 1e-12
                g_w = tf.get_variable('g_w_%d' % i,
                                      shape=[k, 1, cur_channels],
                                      dtype=FLOAT_TYPE)
                g_v = tf.get_variable('g_v_%d' % i,
                                      shape=[k, 1, cur_channels],
                                      dtype=FLOAT_TYPE)
                # Perform wn: rescale each filter by its L2 norm over the
                # input-channel axis; epsilon guards against division by zero.
                filter_w = g_w * filter_w / (tf.sqrt(
                    tf.reduce_sum(filter_w**2, 1, keep_dims=True)) + epsilon)
                filter_v = g_v * filter_v / (tf.sqrt(
                    tf.reduce_sum(filter_v**2, 1, keep_dims=True)) + epsilon)

            w = tf.nn.conv1d(hidden_output, filter_w, 1, 'SAME') + bias_b
            v = tf.nn.conv1d(hidden_output, filter_v, 1, 'SAME') + bias_c

            if use_bn:
                # Each branch is normalised from its own pre-activation.
                # (Previously `w` was computed from `v` and `v` from the
                # already-normalised `w`, which discarded the `w` convolution.)
                w = layers.batch_norm(inputs=w,
                                      decay=0.9,
                                      is_training=is_train,
                                      center=True,
                                      scale=True,
                                      scope='BatchNorm_w_%d' % i)
                v = layers.batch_norm(inputs=v,
                                      decay=0.9,
                                      is_training=is_train,
                                      center=True,
                                      scale=True,
                                      scope='BatchNorm_v_%d' % i)

            if active_type == 'glu':
                hidden_output = w * tf.nn.sigmoid(v)
            elif active_type == 'relu':
                hidden_output = tf.nn.relu(w)
            elif active_type == 'gtu':
                hidden_output = tf.tanh(w) * tf.nn.sigmoid(v)
            elif active_type == 'tanh':
                hidden_output = tf.tanh(w)
            elif active_type == 'linear':
                hidden_output = w
            elif active_type == 'bilinear':
                hidden_output = w * v

            # Mask paddings.
            hidden_output = hidden_output * tf.expand_dims(masks, -1)
            # Dropout on hidden output.
            if dropout_hidden:
                hidden_output = tf.cond(
                    is_train,
                    lambda: tf.nn.dropout(hidden_output, 1 - dropout_hidden),
                    lambda: hidden_output)

            pre_channels = cur_channels

        # Un-scaled log probabilities.
        scores = layers.fully_connected(hidden_output, num_tags, tf.identity)

        if use_crf:
            cost, transitions = crf.crf_log_likelihood(
                inputs=scores,
                tag_indices=stag_ids,
                sequence_lengths=seq_lengths)
            cost = -tf.reduce_mean(cost)
        else:
            reshaped_scores = tf.reshape(scores, [-1, num_tags])
            reshaped_stag_ids = tf.reshape(stag_ids, [-1])
            real_distribution = layers.one_hot_encoding(
                reshaped_stag_ids, num_tags)
            # Named arguments: TF 1.x rejects positional calls here, and the
            # positional order differs between releases.
            cost = tf.nn.softmax_cross_entropy_with_logits(
                logits=reshaped_scores, labels=real_distribution)
            # Sum the per-position loss over real (unmasked) positions and
            # divide by the batch size.
            cost = tf.reduce_sum(
                tf.reshape(cost, tf.shape(stag_ids)) * masks) / tf.cast(
                    tf.shape(inputs)[0], FLOAT_TYPE)

        # Calculate L2 penalty.
        l2_penalty = 0
        if lamd > 0:
            for v in tf.trainable_variables():
                # NOTE(review): the bias variables created above are named
                # 'bias_b_%d' / 'bias_c_%d', which this filter does not match,
                # so they are included in the penalty -- confirm intent.
                if '/B:' not in v.name and '/biases:' not in v.name:
                    l2_penalty += lamd * tf.nn.l2_loss(v)
        train_cost = cost + l2_penalty

        # Summary cost.
        tf.summary.scalar('cost', cost)

        summaries = tf.summary.merge_all()

        # Attach collected update ops (e.g. batch-norm moving averages) to
        # `cost`.  `train_cost` was built before this point and does not carry
        # the dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        if update_ops:
            updates = tf.group(*update_ops)
            with tf.control_dependencies([updates]):
                cost = tf.identity(cost)

        return stag_ids, seq_lengths, is_train, cost, train_cost, scores, summaries
Esempio n. 38
0
import tensorflow as tf

# A 1-D `lengths` argument yields a 2-D mask; it is cast to float32 for printing.
a = tf.cast(tf.sequence_mask([1, 2, 3], 5), tf.float32)
# A 2-D `lengths` argument yields a 3-D mask (left as booleans).
b = tf.sequence_mask([[1, 2], [3, 4]])

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Evaluate and print each mask in turn.
    for tensor in (a, b):
        print(sess.run(tensor))
"""
[[1. 0. 0. 0. 0.]
 [1. 1. 0. 0. 0.]
 [1. 1. 1. 0. 0.]]

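 Explanation: maxlen is 5, so there are 5 columns; lengths has three elements [1, 2, 3], so there are three rows, whose first 1, 2 and 3 entries respectively are True; after the cast to float32 these become 1.0 and the remaining entries 0.0.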


[[[ True False False False]
  [ True  True False False]]

 [[ True  True  True False]
  [ True  True  True  True]]]

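  Explanation: maxlen is not given, so it defaults to the largest value in lengths (4), which gives 4 columns. lengths is a 2-D array;
  view it as two 1-D lengths arrays, so the output is the two corresponding 2-D masks stacked together.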
"""
    def __init__(self, batch, config, is_train=True, image_features=None):
        """Load vocabularies, answer masks and image features, then build.

        Args:
            batch: stored on `self.batch`; not otherwise used here.
            config: configuration object.  This method reads `image_dir`,
                `vocab_path`, `tf_record_dir`, `debug`, `vfeat_path` and the
                optional `vlmap_word_weight_dir`.
            is_train: stored on `self.is_train`.
            image_features: optional dict with keys 'features', 'spatials',
                'normal_boxes', 'num_boxes', 'max_box_num' and 'vfeat_dim'.
                When None (and `config.debug` is false) the features are read
                from the HDF5 file at `config.vfeat_path`.
        """
        self.batch = batch
        self.config = config
        self.image_dir = config.image_dir
        self.is_train = is_train

        # word_weight_dir is only for answer accuracy visualization
        self.word_weight_dir = getattr(config, 'vlmap_word_weight_dir', None)
        if self.word_weight_dir is None:
            log.warn('word_weight_dir is None')

        self.losses = {}
        self.report = {}
        self.mid_result = {}
        self.output = {}
        self.heavy_output = {}
        self.vis_image = {}

        # NOTE: pickle executes arbitrary code on load; these paths must point
        # to trusted files.  Context managers make sure the handles are closed.
        with open(config.vocab_path, 'rb') as vocab_file:
            self.vocab = cPickle.load(vocab_file)
        answer_dict_path = os.path.join(config.tf_record_dir, 'answer_dict.pkl')
        with open(answer_dict_path, 'rb') as answer_file:
            self.answer_dict = cPickle.load(answer_file)
        self.num_answer = len(self.answer_dict['vocab'])
        self.num_train_answer = self.answer_dict['num_train_answer']
        # 1.0 for the first `num_train_answer` answer ids, 0.0 for the rest;
        # a leading axis of size 1 is added.
        self.train_answer_mask = tf.expand_dims(tf.sequence_mask(
            self.num_train_answer, maxlen=self.num_answer, dtype=tf.float32),
                                                axis=0)
        self.test_answer_mask = 1.0 - self.train_answer_mask
        self.obj_answer_mask = tf.expand_dims(tf.constant(
            self.answer_dict['is_object'], dtype=tf.float32),
                                              axis=0)
        self.attr_answer_mask = tf.expand_dims(tf.constant(
            self.answer_dict['is_attribute'], dtype=tf.float32),
                                               axis=0)

        self.glove_map = modules.LearnGloVe(self.vocab)
        self.answer_exist_mask = modules.AnswerExistMask(
            self.answer_dict, self.word_weight_dir)

        if self.config.debug:
            self.features, self.spatials, self.normal_boxes, self.num_boxes, \
                self.max_box_num, self.vfeat_dim = get_dummy_data()
        elif image_features is None:
            log.infov('loading image features...')
            with h5py.File(config.vfeat_path, 'r') as f:
                self.features = np.array(f.get('image_features'))
                log.infov('feature done')
                self.spatials = np.array(f.get('spatial_features'))
                log.infov('spatials done')
                self.normal_boxes = np.array(f.get('normal_boxes'))
                log.infov('normal_boxes done')
                self.num_boxes = np.array(f.get('num_boxes'))
                log.infov('num_boxes done')
                # `dataset[()]` replaces `Dataset.value`, which was removed in
                # h5py 3; the indexing form works on old and new versions.
                self.max_box_num = int(f['data_info']['max_box_num'][()])
                self.vfeat_dim = int(f['data_info']['vfeat_dim'][()])
            log.infov('done')
        else:
            self.features = image_features['features']
            self.spatials = image_features['spatials']
            self.normal_boxes = image_features['normal_boxes']
            self.num_boxes = image_features['num_boxes']
            self.max_box_num = image_features['max_box_num']
            self.vfeat_dim = image_features['vfeat_dim']

        self.build()
    def _build_task_termination(self):
        """
        Attach the task-specific head, loss and training op for every task.

        For each task in ``self.config.tasks`` this builds, on top of the
        task's entry in ``self._shared_layers_output``: the configured dense
        hidden layers, a dropout layer, a dense projection onto the task's
        label set, a CRF or softmax loss, and an optimizer step with optional
        global-norm gradient clipping.  Results are stored per task name in
        ``self._projections``, ``self._predictions``,
        ``self._transition_params``, ``self._losses``,
        ``self._gradient_norms`` and ``self._operations_train``.
        """
        logger = logging.getLogger(
            "%s.Network._build_task_termination" % self.config.name)
        logger.debug("Building task termination")

        for task in self.config.tasks:
            task_output = self._shared_layers_output[task.name]
            logger.debug(
                "Building task termination for task %s on top of shared layers",
                task.name)
            logger.debug("Building %d hidden layers", len(task.hidden_layers))

            # Task-private dense layers, numbered from 1 in logs and names.
            for layer_no, layer_cfg in enumerate(task.hidden_layers, start=1):
                assert isinstance(layer_cfg, HiddenLayerConfig)
                logger.debug(
                    "Building %d. hidden layer with %d units and activation %s",
                    layer_no, layer_cfg.units, layer_cfg.activation)

                task_output = tf.compat.v1.layers.dense(
                    task_output,
                    layer_cfg.units,
                    activation=ACTIVATION_MAPPING[layer_cfg.activation],
                    name="hidden_layer-%s-%d" % (task.name, layer_no))

            # The config stores a keep probability; one minus it is passed on.
            drop_rate = 1 - (task.dropout_keep_probability)
            task_output = tf.nn.dropout(task_output, drop_rate)

            # Projection for prediction
            num_classes = len(task.data_reader.get_labels())
            logger.debug(
                "Build projection layer to map network output to classes. There are %d classes",
                num_classes)

            projection = tf.compat.v1.layers.dense(
                task_output,
                num_classes,
                name="projection_layer-%s" % task.name)
            self._projections[task.name] = projection

            # Loss and prediction
            logger.debug("Attaching classifier")
            if task.classifier == CLASSIFIER_CRF:
                # CRF
                logger.debug("CRF classifier")
                # Viterbi decoding produces the prediction, so no prediction
                # node is created here.
                self._predictions[task.name] = None
                crf_scope = "crf_log_likelihood_%s" % task.name
                with tf.compat.v1.variable_scope(crf_scope):
                    log_likelihood, transition_params = \
                        tfa.text.crf_log_likelihood(
                            self._projections[task.name],
                            self._inputs_label[task.name],
                            self._input_sequence_length)
                    self._transition_params[task.name] = transition_params
                self._losses[task.name] = tf.reduce_mean(
                    input_tensor=-log_likelihood)
            else:
                # Softmax
                logger.debug("Softmax classifier")
                best_class = tf.argmax(input=self._projections[task.name],
                                       axis=-1)
                self._predictions[task.name] = tf.cast(best_class, tf.int32)
                # Softmax needs no transition parameters.
                self._transition_params[task.name] = None

                one_hot_labels = tf.one_hot(
                    self._inputs_label[task.name],
                    len(task.data_reader.get_labels()))

                token_losses = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self._projections[task.name],
                    labels=tf.stop_gradient(one_hot_labels),
                    name="softmax_%s" % task.name)
                # Keep only the losses of real (non-padded) positions.
                length_mask = tf.sequence_mask(
                    self._input_sequence_length,
                    name="softmax_mask_%s" % task.name)
                token_losses = tf.boolean_mask(
                    tensor=token_losses,
                    mask=length_mask,
                    name="softmax_mask_layer_%s" % task.name)
                self._losses[task.name] = tf.reduce_mean(
                    input_tensor=token_losses)

            # Optimizer
            logger.debug("Attaching optimizer")
            make_optimizer = OPTIMIZER_MAPPING[self.config.training.optimizer]
            optimizer = make_optimizer(**self.config.training.optimizer_params)

            # Split the (gradient, variable) pairs into two parallel tuples.
            grads_and_vars = optimizer.compute_gradients(
                self._losses[task.name])
            gradients, variables = zip(*grads_and_vars)

            if self.config.training.use_gradient_clipping:
                logger.debug(
                    "Adding node for performing gradient clipping for task %s.",
                    task.name)
                gradients, global_norm = tf.clip_by_global_norm(
                    gradients, self.config.training.clip_norm)
                self._gradient_norms[task.name] = global_norm
            else:
                self._gradient_norms[task.name] = tf.linalg.global_norm(
                    gradients)

            self._operations_train[task.name] = optimizer.apply_gradients(
                list(zip(gradients, variables)))
Esempio n. 41
0
    def __init__(self, word_vocab_enc, word_vocab_dec, options=None, mode='ce_train'):
        """Build the dual-encoder / attention-generator graph for `mode`.

        Two `SeqEncoder`s ('linamr_encoder' and 'src_encoder') are built, the
        decoder state is initialised according to `options.way_init_decoder`,
        and a `CovAttenGen` generator is attached.  For modes 'decode',
        'evaluate_bleu' and 'evaluate' construction stops before the training
        op is created.

        Args:
            word_vocab_enc: vocabulary handed to both encoders.
            word_vocab_dec: vocabulary handed to the generator.
            options: configuration object; it is dereferenced immediately, so
                the default of None is not usable in practice.
            mode: one of 'ce_train', 'rl_train', 'evaluate', 'evaluate_bleu',
                'decode'.

        Raises:
            ValueError: if `options.optimize_type` is neither 'adadelta' nor
                'adam' when the training op is built.
        """
        # here 'mode', whose value can be:
        #  'ce_train',
        #  'rl_train',
        #  'evaluate',
        #  'evaluate_bleu',
        #  'decode'.
        # it is different from 'mode_gen' in generator_utils.py
        # value of 'mode_gen' can be ['ce_loss', 'rl_loss', 'greedy' or 'sample']
        self.mode = mode

        # is_training controls whether to use dropout
        is_training = mode in ('ce_train', )

        self.options = options
        self.word_vocab_enc = word_vocab_enc
        self.word_vocab_dec = word_vocab_dec

        self.create_placeholders(options)

        # encode the input instance
        # encoder.graph_hidden [batch, node_num, vsize]
        # encoder.graph_cell [batch, node_num, vsize]
        with tf.variable_scope('linamr_encoder'):
            self.linamr_encoder = encoder_utils.SeqEncoder(options,
                    word_vocab = word_vocab_enc)
            self.linamr_hidden_dim, self.linamr_hiddens, self.linamr_decinit = \
                    self.linamr_encoder.encode(is_training=is_training)
            self.linamr_words = self.linamr_encoder.in_passage_words
            self.linamr_lengths = self.linamr_encoder.passage_lengths
            self.linamr_mask = self.linamr_encoder.passage_mask

        with tf.variable_scope('src_encoder'):
            self.src_encoder = encoder_utils.SeqEncoder(options,
                    word_vocab=word_vocab_enc)
            self.src_hidden_dim, self.src_hiddens, self.src_decinit = \
                    self.src_encoder.encode(is_training=is_training)
            self.src_words = self.src_encoder.in_passage_words
            self.src_lengths = self.src_encoder.passage_lengths
            self.src_mask = self.src_encoder.passage_mask

        # ============== Choices of initializing decoder state =============
        if options.way_init_decoder == 'src':
            new_c, new_h = self.src_decinit.c, self.src_decinit.h
        elif options.way_init_decoder == 'linamr':
            new_c, new_h = self.linamr_decinit.c, self.linamr_decinit.h
        elif options.way_init_decoder == 'zero':
            # NOTE(review): `self.encoder` is not assigned anywhere in this
            # method (only `linamr_encoder` / `src_encoder` are); unless
            # `create_placeholders` sets it, this branch raises
            # AttributeError -- confirm which encoder's batch size is meant.
            new_c = tf.zeros([self.encoder.batch_size, options.gen_hidden_size])
            new_h = tf.zeros([self.encoder.batch_size, options.gen_hidden_size])
        else:
            assert False, 'way to initial decoder (%s) not supported' % options.way_init_decoder
        self.init_decoder_state = tf.contrib.rnn.LSTMStateTuple(new_c, new_h)

        # prepare src-side input for decoder

        loss_weights = tf.sequence_mask(self.answer_len, options.max_answer_len, dtype=tf.float32) # [batch_size, gen_steps]

        with variable_scope.variable_scope("generator"):
            # create generator
            self.generator = generator_utils.CovAttenGen(self, options, word_vocab_dec, is_training=is_training)
            # calculate encoder_features
            with variable_scope.variable_scope("encoder_feats"):
                self.linamr_features = self.generator.calculate_encoder_features(
                        self.linamr_hiddens, self.linamr_hidden_dim)

            with variable_scope.variable_scope("src_feats"):
                self.src_features = self.generator.calculate_encoder_features(
                        self.src_hiddens, self.src_hidden_dim)

            if mode == 'decode':
                self.context_encoder_t_1 = tf.placeholder(tf.float32,
                        [None, self.linamr_hidden_dim], name='context_encoder_t_1') # [batch_size, encoder_dim]
                self.context_src_t_1 = tf.placeholder(tf.float32,
                        [None, self.src_hidden_dim], name='context_src_t_1') # [batch_size, src_dim]
                if options.use_coverage:
                    self.coverage_t_1 = tf.placeholder(tf.float32, [None, None], name='coverage_t_1') # [batch_size, encoder_dim]
                else:
                    self.coverage_t_1 = None
                self.word_t = tf.placeholder(tf.int32, [None], name='word_t') # [batch_size]

                # `ouput_t` (sic) is kept as-is: it is an attribute name that
                # code outside this method may read.
                (self.state_t, self.context_encoder_t, self.context_src_t,
                        self.coverage_t, self.attn_dist_t, self.ouput_t,
                        self.topk_log_probs, self.topk_ids, self.greedy_prediction, self.multinomial_prediction) = \
                            self.generator.decode_mode(
                        word_vocab_dec, options.beam_size, self.init_decoder_state,
                        self.context_encoder_t_1, self.context_src_t_1, self.coverage_t_1, self.word_t,
                        self.linamr_hiddens, self.linamr_features, self.linamr_mask,
                        self.src_hiddens, self.src_features, self.src_mask)
                # not building training op for this mode
                return
            elif mode == 'evaluate_bleu':
                _, _, self.greedy_words = self.generator.train_mode(word_vocab_dec,
                    self.linamr_hidden_dim, self.linamr_hiddens, self.linamr_features, self.linamr_mask,
                    self.src_hidden_dim, self.src_hiddens, self.src_features, self.src_mask,
                    self.init_decoder_state, self.answer_inp, self.answer_ref, loss_weights, mode_gen='greedy')
                # not building training op for this mode
                return
            elif mode in ('ce_train', 'evaluate', ):
                self.accu, self.loss, _ = self.generator.train_mode(word_vocab_dec,
                    self.linamr_hidden_dim, self.linamr_hiddens, self.linamr_features, self.linamr_mask,
                    self.src_hidden_dim, self.src_hiddens, self.src_features, self.src_mask,
                    self.init_decoder_state, self.answer_inp, self.answer_ref, loss_weights, mode_gen='ce_loss')
                if mode == 'evaluate': return # not building training op for evaluation

        with tf.device('/gpu:1'):
            if options.optimize_type == 'adadelta':
                optimizer = tf.train.AdadeltaOptimizer(learning_rate=options.learning_rate)
            elif options.optimize_type == 'adam':
                optimizer = tf.train.AdamOptimizer(learning_rate=options.learning_rate)
            else:
                # Fail here with a clear message instead of a NameError on
                # `optimizer` further down.
                raise ValueError('optimize_type (%s) not supported' % options.optimize_type)
            # `in` instead of dict.has_key(), which only exists on Python 2.
            clipper = 50 if "max_gradient_norm" not in options.__dict__ else options.max_gradient_norm
            print("MAX gradient norm {}".format(clipper))
            tvars = tf.trainable_variables()
            if options.lambda_l2>0.0:
                # L2 penalty over variables with more than one dimension.
                l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
                self.loss = self.loss + options.lambda_l2 * l2_loss
            grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), clipper)
            self.train_op = optimizer.apply_gradients(zip(grads, tvars))

            extra_train_ops = []
            train_ops = [self.train_op] + extra_train_ops
            self.train_op = tf.group(*train_ops)
Esempio n. 42
0
    def add_decoder_op(self):
        """Build the tag decoder: training loss, label scoring and inference.

        Encoder states are flattened to one row per real word with
        `tf.boolean_mask`, and each row initialises an LSTM decoder
        (optionally wrapped with Luong or Bahdanau attention over the word's
        analyses).  Three decoders sharing the same cell and projection layer
        are built:

        * a teacher-forced training decoder  -> `self.loss`
        * a teacher-forced scoring decoder   -> `self.labels_scores`,
          `self.labels_max_scores`, `self.labels_max_ids`
        * a greedy inference decoder         -> `self.decoder_logits`,
          `self.labels_pred`, `self.labels_pred_lengths`

        Raises:
            ValueError: if `config.attention_mechanism` or `config.trainer`
                has an unsupported value.
        """
        # reshape inputs to a list of words
        input_mask = tf.sequence_mask(self.sequence_lengths)
        encoder_output_h, encoder_output_c = self.encoder_output
        decoder_input_h = tf.boolean_mask(encoder_output_h, input_mask)
        decoder_input_c = tf.boolean_mask(encoder_output_c, input_mask)
        initial_state = tf.contrib.rnn.LSTMStateTuple(h=decoder_input_h,
                                                      c=decoder_input_c)

        # After masking, the leading dimension counts words.
        batch_size = tf.shape(decoder_input_h)[0]
        projection_layer = tf.layers.Dense(self.config.ntags,
                                           use_bias=True,
                                           name="decoder_proj")

        decoder_cell = tf.contrib.rnn.LSTMCell(
            num_units=2 * self.config.hidden_size_lstm
        )  # num_units = encoder backward and forward hidden states concatenated

        if (self.config.analysis_embeddings == "attention_tag"
                or self.config.analysis_embeddings == "attention_category"):
            self.logger.warning("Using attention %s" %
                                self.config.analysis_embeddings)
            # shape: [words X analysis-number X attention-embedding-size]
            analysis_attention_embeddings = tf.boolean_mask(
                self.analysis_attention_embeddings, input_mask)
            analysis_lengths = tf.boolean_mask(self.analysis_lengths,
                                               input_mask)
            # shape: [words]

            if self.config.attention_mechanism == 'luong':
                attention_mechanism = tf.contrib.seq2seq.LuongAttention(
                    num_units=2 * self.config.hidden_size_lstm,
                    memory=analysis_attention_embeddings,
                    memory_sequence_length=analysis_lengths,
                    scale=False)
            elif self.config.attention_mechanism == 'bahdanau':
                attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                    num_units=2 * self.config.hidden_size_lstm,
                    memory=analysis_attention_embeddings,
                    memory_sequence_length=analysis_lengths)
            else:
                # The message used to read the misspelt attribute
                # `attention_mechnism`, which raised AttributeError instead of
                # this ValueError.
                raise ValueError("Invalid attention mechanism '%s'" %
                                 self.config.attention_mechanism)

            decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
                decoder_cell,
                attention_mechanism,
                attention_layer_size=2 * self.config.hidden_size_lstm)
            # Start from the wrapper's zero state but keep the encoder-derived
            # LSTM state as the cell state.
            initial_state = decoder_cell.zero_state(
                dtype=tf.float32,
                batch_size=batch_size).clone(cell_state=initial_state)

        start_tokens = tf.tile([self.sos_id], [batch_size])

        # Decoder inputs: prepend the 'sos' token and drop the last tag, so
        # the input at step t is the target at step t-1.
        tag_ids_train = tf.concat(
            [tf.expand_dims(start_tokens, 1), self.tag_ids[:, :-1]], 1)
        tags_train_embedded = tf.nn.embedding_lookup(self.tag_embeddings,
                                                     tag_ids_train)
        # The config value is passed on as `1 - value`, i.e. it is treated as
        # a keep probability.
        tags_train_embedded = tf.layers.dropout(
            tags_train_embedded,
            rate=1 - self.config.tag_embeddings_dropout,
            training=self.training_phase)

        # Training
        if self.config.trainer == "basic":
            train_helper = tf.contrib.seq2seq.TrainingHelper(
                inputs=tags_train_embedded,
                # `tag-length` covers <sos-token, actual tags, eos-token>
                sequence_length=self.tag_lengths,
            )
        elif self.config.trainer == "scheduled":
            train_helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
                inputs=tags_train_embedded,
                # `tag-length` covers <sos-token, actual tags, eos-token>
                sequence_length=self.tag_lengths,
                embedding=lambda ids: tf.nn.embedding_lookup(
                    self.tag_embeddings, ids),
                sampling_probability=self.config.
                scheduled_trainer_sampling_prob)
        else:
            raise ValueError("Invalid trainer specified: '%s'" %
                             self.config.trainer)

        train_decoder = tf.contrib.seq2seq.BasicDecoder(
            decoder_cell,
            train_helper,
            initial_state=initial_state,
            output_layer=projection_layer)

        decoder_outputs, final_state, decoder_sequence_lengths = tf.contrib.seq2seq.dynamic_decode(
            train_decoder, impute_finished=False)
        # logits = decoder_outputs.rnn_output
        logits = decoder_outputs[0]
        logits = tf.verify_tensor_all_finite(logits, "Logits not finite")

        # from padded training tags extracts actual-tags + eos-token:
        weights = tf.to_float(tf.not_equal(tag_ids_train, self.eos_id))
        # NOTE(review): this compares the 0/1 float mask computed above with
        # `pad_id`, not the tag ids.  With pad_id == 0 it is a no-op, so
        # padded positions after <eos> keep weight 1.  The intent was probably
        # to also zero positions whose tag is padding; left unchanged because
        # the correct fix depends on how sos/eos/pad ids relate -- confirm.
        weights = tf.to_float(tf.not_equal(weights, self.pad_id))
        loss = tf.contrib.seq2seq.sequence_loss(logits=logits,
                                                targets=self.tag_ids,
                                                weights=weights,
                                                name="sequence_loss",
                                                average_across_timesteps=False)
        self.loss = tf.reduce_sum(loss)

        # Scoring

        # 1. Score given labels
        scoring_helper = tf.contrib.seq2seq.TrainingHelper(
            inputs=tags_train_embedded, sequence_length=self.tag_lengths)
        scoring_decoder = tf.contrib.seq2seq.BasicDecoder(
            decoder_cell,
            scoring_helper,
            initial_state=initial_state,
            output_layer=projection_layer)
        scoring_outputs, _, scoring_sequence_lengths = tf.contrib.seq2seq.dynamic_decode(
            scoring_decoder)
        scoring_logits = scoring_outputs.rnn_output
        scoring_logits = tf.verify_tensor_all_finite(
            scoring_logits, "Scoring logits not finite")
        logits_flat = tf.reshape(scoring_logits,
                                 [-1, tf.shape(scoring_logits)[2]])
        softmax_scores_flat = tf.nn.softmax(logits_flat, dim=-1)
        tag_ids_train_flat = tf.reshape(self.tag_ids, [-1])
        # (row, gold tag id) pairs used to pick the probability of each gold
        # tag out of the flattened softmax.
        indices = tf.concat([
            tf.expand_dims(tf.range(0,
                                    tf.shape(tag_ids_train_flat)[0]), 1),
            tf.expand_dims(tag_ids_train_flat, 1)
        ],
                            axis=1)
        tag_softmax_scores_flat = tf.gather_nd(softmax_scores_flat, indices)
        tag_softmax_scores = tf.reshape(tag_softmax_scores_flat,
                                        [batch_size, -1])
        tag_mask = tf.sequence_mask(self.tag_lengths,
                                    tf.shape(tag_softmax_scores)[1])
        # Set padded positions to probability 1 so their log contributes 0.
        tag_softmax_scores = tf.multiply(tag_softmax_scores,
                                         tf.cast(tag_mask, tf.float32))
        tag_softmax_scores += tf.cast(tf.logical_not(tag_mask), tf.float32)
        # exp(-(sum of log-probabilities) / length) per sequence.
        scores = np.e**-tf.div(
            tf.reduce_sum(tf.log(tag_softmax_scores), axis=-1),
            tf.cast(self.tag_lengths, tf.float32))
        self.labels_scores = scores

        # 2. Score best labels
        max_tag_softmax_scores = tf.reduce_max(tf.nn.softmax(scoring_logits,
                                                             dim=-1),
                                               axis=-1)
        max_tag_mask = tf.sequence_mask(self.tag_lengths,
                                        tf.shape(max_tag_softmax_scores)[1])
        max_tag_softmax_scores = tf.multiply(max_tag_softmax_scores,
                                             tf.cast(max_tag_mask, tf.float32))
        max_tag_softmax_scores += tf.cast(tf.logical_not(max_tag_mask),
                                          tf.float32)
        max_scores = np.e**-tf.div(
            tf.reduce_sum(tf.log(max_tag_softmax_scores), axis=-1),
            tf.cast(self.tag_lengths, tf.float32))
        self.labels_max_scores = max_scores
        self.labels_max_ids = tf.argmax(scoring_logits, axis=-1)

        # Inference
        infer_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
            embedding=self.tag_embeddings,
            start_tokens=start_tokens,
            end_token=self.eos_id)

        infer_decoder = tf.contrib.seq2seq.BasicDecoder(
            decoder_cell,
            infer_helper,
            initial_state=initial_state,
            output_layer=projection_layer)

        final_outputs, final_state, final_sequence_lengths = tf.contrib.seq2seq.dynamic_decode(
            infer_decoder,
            maximum_iterations=self.config.decoder_maximum_iterations,
            impute_finished=True)

        decoder_logits = final_outputs.rnn_output
        decoder_logits = tf.verify_tensor_all_finite(
            decoder_logits, "Decoder Logits not finite")
        # Sanity checks evaluated whenever the logits are fetched: rank 3,
        # non-zero sum, and argmax agreeing with the decoder's sample ids.
        with tf.control_dependencies([
                tf.assert_rank(decoder_logits, 3),
                tf.assert_none_equal(tf.reduce_sum(decoder_logits), 0.),
                tf.assert_equal(
                    tf.cast(tf.argmax(decoder_logits, axis=-1), tf.int32),
                    final_outputs.sample_id)
        ]):
            decoder_logits = tf.identity(decoder_logits)

        self.decoder_logits = decoder_logits
        self.labels_pred = final_outputs.sample_id
        self.labels_pred_lengths = final_sequence_lengths
Esempio n. 43
0
																	  lr, 
																	  target_sequence_length, 
																	  max_target_sequence_length, 
																	  source_sequence_length,
																	  len(da.source_letter_to_int),
																	  len(da.target_letter_to_int),
																	  encoding_embedding_size, 
																	  decoding_embedding_size, 
																	  rnn_size, 
																	  num_layers,
																	  batch_size)	
	
	# Re-export the decoder outputs under fixed op names ('logits' / 'predictions').
	training_logits = tf.identity(training_decoder_output.rnn_output, 'logits')
	predicting_logits = tf.identity(predicting_decoder_output.sample_id, name='predictions')
	
	# Float mask built from the target lengths; passed as the weights of the sequence loss below.
	masks = tf.sequence_mask(target_sequence_length, max_target_sequence_length, dtype=tf.float32, name='masks')

	with tf.name_scope("optimization"):
		
		# Loss function: sequence loss of the training logits against the targets, weighted by the mask
		cost = tf.contrib.seq2seq.sequence_loss(
			training_logits,
			targets,
			masks)

		# Optimizer
		optimizer = tf.train.AdamOptimizer(lr)

		# Gradient Clipping: every gradient is clipped element-wise to [-5, 5];
		# variables whose gradient is None are dropped from the list.
		gradients = optimizer.compute_gradients(cost)
		capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
Esempio n. 44
0
    def build_training_graph(self, input_tensors):
        """Assemble the training graph.

        Creates the three embedding tables, encodes the contexts, runs the
        decoder, builds a length-masked cross-entropy loss and attaches an
        optimizer. A ``tf.train.Saver`` is stored on ``self.saver``.

        Args:
            input_tensors: mapping indexed by the ``reader.*_KEY`` constants
                read below.

        Returns:
            A ``(train_op, loss)`` tuple.
        """
        target_index = input_tensors[reader.TARGET_INDEX_KEY]
        target_lengths = input_tensors[reader.TARGET_LENGTH_KEY]
        path_source_indices = input_tensors[reader.PATH_SOURCE_INDICES_KEY]
        node_indices = input_tensors[reader.NODE_INDICES_KEY]
        path_target_indices = input_tensors[reader.PATH_TARGET_INDICES_KEY]
        valid_context_mask = input_tensors[reader.VALID_CONTEXT_MASK_KEY]
        path_source_lengths = input_tensors[reader.PATH_SOURCE_LENGTHS_KEY]
        path_lengths = input_tensors[reader.PATH_LENGTHS_KEY]
        path_target_lengths = input_tensors[reader.PATH_TARGET_LENGTHS_KEY]

        def embedding_table(name, num_rows):
            # One float32 table of EMBEDDINGS_SIZE columns, each with its own
            # uniform FAN_OUT variance-scaling initializer.
            return tf.get_variable(
                name,
                shape=(num_rows, self.config.EMBEDDINGS_SIZE),
                dtype=tf.float32,
                initializer=tf.contrib.layers.variance_scaling_initializer(
                    factor=1.0, mode='FAN_OUT', uniform=True))

        with tf.variable_scope('model'):
            subtoken_vocab = embedding_table('SUBTOKENS_VOCAB', self.subtoken_vocab_size)
            target_words_vocab = embedding_table('TARGET_WORDS_VOCAB', self.target_vocab_size)
            nodes_vocab = embedding_table('NODES_VOCAB', self.nodes_vocab_size)

            # (batch, max_contexts, decoder_size)
            batched_contexts = self.compute_contexts(
                subtoken_vocab=subtoken_vocab,
                nodes_vocab=nodes_vocab,
                source_input=path_source_indices,
                nodes_input=node_indices,
                target_input=path_target_indices,
                valid_mask=valid_context_mask,
                path_source_lengths=path_source_lengths,
                path_lengths=path_lengths,
                path_target_lengths=path_target_lengths)

            batch_size = tf.shape(target_index)[0]
            outputs, _ = self.decode_outputs(
                target_words_vocab=target_words_vocab,
                target_input=target_index,
                batch_size=batch_size,
                batched_contexts=batched_contexts,
                valid_mask=valid_context_mask)
            step = tf.Variable(0, trainable=False)

            # (batch, max_output_length, dim * 2 + rnn_size)
            logits = outputs.rnn_output

            crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=target_index, logits=logits)
            # Keep the first target_lengths + 1 positions of each row and
            # zero out the rest before summing.
            target_words_nonzero = tf.sequence_mask(
                target_lengths + 1,
                maxlen=self.config.MAX_TARGET_PARTS + 1,
                dtype=tf.float32)
            masked_total = tf.reduce_sum(crossent * target_words_nonzero)
            loss = masked_total / tf.to_float(batch_size)

            if not self.config.USE_MOMENTUM:
                # Adam on gradients clipped to a global norm of 5.
                params = tf.trainable_variables()
                gradients = tf.gradients(loss, params)
                clipped_gradients, _ = tf.clip_by_global_norm(gradients, clip_norm=5)
                optimizer = tf.train.AdamOptimizer()
                train_op = optimizer.apply_gradients(zip(clipped_gradients, params))
            else:
                # Nesterov momentum with a staircase-decayed learning rate.
                learning_rate = tf.train.exponential_decay(
                    learning_rate=0.01,
                    global_step=step * self.config.BATCH_SIZE,
                    decay_steps=self.num_training_examples,
                    decay_rate=0.95,
                    staircase=True)
                optimizer = tf.train.MomentumOptimizer(
                    learning_rate, momentum=0.95, use_nesterov=True)
                train_op = optimizer.minimize(loss, global_step=step)

            self.saver = tf.train.Saver(max_to_keep=10)

        return train_op, loss
Esempio n. 45
0
def train_attention_modified():
    """Build the modified attention seq2seq graph and train it.

    The first ``300 * batch_size`` examples are held out; only the first
    batch drawn from that held-out slice is used for the validation loss.
    Every ``display_step`` batches the training and validation losses are
    printed, and a checkpoint is written at the end of every epoch.

    Relies on module-level data and hyper-parameters (``keywords_int``,
    ``pretexts_int``, ``curlines_int``, ``word2id``, ``batch_size``,
    ``epochs``, ``learning_rate``, ...).
    """
    # Build the graph.
    train_graph = tf.Graph()
    with train_graph.as_default():
        # Model inputs.
        (input_keywords_ids, input_pretexts_ids, targets, lr,
         target_sequence_length, max_target_sequence_length,
         input_keywords_length, input_pretexts_length) = get_inputs_modified()
        training_decoder_output, predict_output = seq2seq_model_modified(
            input_keywords_ids, input_pretexts_ids, targets, lr,
            target_sequence_length, max_target_sequence_length,
            input_keywords_length, input_pretexts_length, len(word2id),
            len(word2id), encoding_embedding_size, decoding_embedding_size,
            rnn_size, num_layers)
        training_logits = tf.identity(training_decoder_output.rnn_output,
                                      'logits')
        # Not used during training; kept so the op named 'predictions'
        # exists in the saved graph (presumably looked up at inference time).
        tf.identity(predict_output.sample_id, name='predictions')
        masks = tf.sequence_mask(target_sequence_length,
                                 max_target_sequence_length,
                                 dtype=tf.float32,
                                 name='masks')
        with tf.name_scope("optimization"):
            # Loss function
            cost = tf.contrib.seq2seq.sequence_loss(training_logits, targets,
                                                    masks)
            # Optimizer
            optimizer = tf.train.AdamOptimizer(lr)
            # Gradient clipping: element-wise to [-5, 5], skipping variables
            # that have no gradient.
            gradients = optimizer.compute_gradients(cost)
            capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var)
                                for grad, var in gradients if grad is not None]
            train_op = optimizer.apply_gradients(capped_gradients)

    # Split the data set into train and validation parts.
    holdout = 300 * batch_size
    train_keywords = keywords_int[holdout:]
    train_pretexts = pretexts_int[holdout:]
    train_target = curlines_int[holdout:]
    valid_keywords = keywords_int[:holdout]
    valid_pretexts = pretexts_int[:holdout]
    valid_target = curlines_int[:holdout]
    # Only the first batch of the held-out slice is used for validation.
    (valid_targets_batch, valid_keywords_batch, valid_pretexts_batch,
     valid_targets_lengths, valid_keywords_lengths,
     valid_pretexts_length) = next(
         getbatches_modified(valid_target, valid_keywords, valid_pretexts,
                             batch_size, word2id['<PAD>']))

    display_step = 50  # print the losses every 50 batches
    checkpoint_path = './model/trained_model_attention_qijue_epoch'
    with tf.Session(graph=train_graph) as sess:
        sess.run(tf.global_variables_initializer())
        # Build the Saver once: constructing it inside the epoch loop added a
        # fresh set of save/restore ops to the graph on every epoch.
        # max_to_keep=None keeps every epoch's checkpoint on disk, as the
        # per-epoch Savers did.
        saver = tf.train.Saver(max_to_keep=None)
        for epoch_i in range(1, epochs + 1):
            for batch_i, (targets_batch, keywords_batch, pretexts_batch,
                          targets_lengths, batch_keywords_lengths,
                          batch_pretexts_lengths) in enumerate(
                              getbatches_modified(train_target, train_keywords,
                                                  train_pretexts, batch_size,
                                                  word2id['<PAD>'])):
                _, loss = sess.run(
                    [train_op, cost], {
                        input_keywords_ids: keywords_batch,
                        input_pretexts_ids: pretexts_batch,
                        targets: targets_batch,
                        lr: learning_rate,
                        target_sequence_length: targets_lengths,
                        input_pretexts_length: batch_pretexts_lengths,
                        input_keywords_length: batch_keywords_lengths
                    })

                if batch_i % display_step == 0:
                    # Compute the validation loss.
                    validation_loss = sess.run(
                        cost, {
                            input_keywords_ids: valid_keywords_batch,
                            input_pretexts_ids: valid_pretexts_batch,
                            targets: valid_targets_batch,
                            lr: learning_rate,
                            target_sequence_length: valid_targets_lengths,
                            input_keywords_length: valid_keywords_lengths,
                            input_pretexts_length: valid_pretexts_length
                        })

                    print(
                        'Epoch {:>3}/{} Batch {:>4}/{} - Training Loss: {:>6.3f}  - Validation loss: {:>6.3f}'
                        .format(epoch_i, epochs, batch_i,
                                len(train_target) // batch_size, loss,
                                validation_loss))
            # One checkpoint per epoch, suffixed with the epoch number.
            saver.save(sess, checkpoint_path + str(epoch_i) + '.ckpt')
            print('Model Trained and Saved')
Esempio n. 46
0
  def score(self, features_file, predictions_file, checkpoint_path=None, output_file=None):
    """Scores existing predictions.

    Args:
      features_file: The input file.
      predictions_file: The predictions file to score.
      checkpoint_path: Path of a specific checkpoint to use, or of a directory
        in which the latest checkpoint is looked up. If ``None``, the latest
        checkpoint of the configured model directory is used.
      output_file: The file where the scores are saved. Otherwise, they will be
        printed on the standard output.

    Raises:
      ValueError: if no checkpoint is found or if the model is neither a
        sequence to sequence model nor a language model.
    """
    if not isinstance(self._model, (models.LanguageModel, models.SequenceToSequence)):
      raise ValueError("scoring only works for sequence to sequence or language models")

    # Track which directory is searched so that the error message names the
    # directory that was actually inspected.
    checkpoint_dir = None
    if checkpoint_path is None:
      checkpoint_dir = self._config["model_dir"]
    elif tf.gfile.IsDirectory(checkpoint_path):
      checkpoint_dir = checkpoint_path
    if checkpoint_dir is not None:
      checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)
    if checkpoint_path is None:
      raise ValueError("could not find a trained model in %s" % checkpoint_dir)

    model = copy.deepcopy(self._model)
    with tf.Graph().as_default():
      dataset = model.examples_inputter.make_evaluation_dataset(
          features_file,
          predictions_file,
          self._config["score"]["batch_size"],
          num_threads=self._config["score"].get("num_threads"),
          prefetch_buffer_size=self._config["score"].get("prefetch_buffer_size"))
      iterator = dataset.make_initializable_iterator()
      features, labels = iterator.get_next()
      labels["alignment"] = None  # Add alignment key to force the model to return attention.
      outputs, _ = model(
          features,
          labels,
          self._config["params"],
          tf.estimator.ModeKeys.EVAL)

      # Per-token cross entropy, zeroed beyond each sequence length and summed
      # over the time axis to give one score per example.
      cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
          logits=outputs["logits"], labels=labels["ids_out"])
      weights = tf.sequence_mask(labels["length"], dtype=cross_entropy.dtype)
      masked_cross_entropy = cross_entropy * weights
      scores = tf.reduce_sum(masked_cross_entropy, axis=1)
      results = {
          "cross_entropy": cross_entropy,
          "score": scores,
          "tokens": labels["tokens"],
          "length": labels["length"] - 1  # -1 for the special token.
      }
      if "attention" in outputs:
        results["attention"] = outputs["attention"]

      output_tokenizer = (
          self._model.labels_inputter.tokenizer if not self._model.unsupervised
          else self._model.features_inputter.tokenizer)
      with_token_level = self._config["score"].get("with_token_level")
      alignment_type = self._config["score"].get("with_alignments")

      if output_file:
        stream = io.open(output_file, encoding="utf-8", mode="w")
      else:
        stream = sys.stdout

      # try/finally so that the output file is closed even when the session
      # or the formatting raises.
      try:
        with tf.train.MonitoredSession(
            session_creator=tf.train.ChiefSessionCreator(
                checkpoint_filename_with_path=checkpoint_path,
                config=self._session_config)) as sess:
          sess.run(iterator.initializer)
          while not sess.should_stop():
            for batch in misc.extract_batches(sess.run(results)):
              length = batch["length"]
              sentence = output_tokenizer.detokenize(batch["tokens"][:length])
              token_level_scores = None
              attention = None
              if with_token_level:
                token_level_scores = batch["cross_entropy"][:length]
              if "attention" in batch:
                attention = batch["attention"][:length]
              sentence = format_translation_output(
                  sentence,
                  score=batch["score"],
                  token_level_scores=token_level_scores,
                  attention=attention,
                  alignment_type=alignment_type)
              misc.print_bytes(tf.compat.as_bytes(sentence), stream=stream)
      finally:
        if output_file:
          stream.close()
Esempio n. 47
0
    def call(self, inputs, mask=None, training=None, **kwargs):
        """Apply multi-head attention of the queries over the keys.

        Args:
            inputs: when ``self.supports_masking`` is true, a
                ``(queries, keys)`` pair whose masks come from ``mask``;
                otherwise a ``(queries, keys, query_lengths, key_lengths)``
                4-tuple, where the lengths are turned into masks of width
                ``self.seq_len_max`` and axis 1 of the result is squeezed
                away.
            mask: ``(query_mask, key_mask)`` pair, only read when
                ``self.supports_masking`` is true.
            training: forwarded to ``self.dropout``.
            **kwargs: ignored.

        Returns:
            The attention output averaged over axis 1, with that axis kept
            (size 1).
        """
        if self.supports_masking:
            queries, keys = inputs
            query_masks, key_masks = mask
            query_masks = tf.cast(query_masks, tf.float32)
            key_masks = tf.cast(key_masks, tf.float32)
        else:
            queries, keys, query_masks, key_masks = inputs

            query_masks = tf.sequence_mask(query_masks,
                                           self.seq_len_max,
                                           dtype=tf.float32)
            key_masks = tf.sequence_mask(key_masks,
                                         self.seq_len_max,
                                         dtype=tf.float32)
            query_masks = tf.squeeze(query_masks, axis=1)
            key_masks = tf.squeeze(key_masks, axis=1)

        if self.use_positional_encoding:
            queries = positional_encoding(queries)
            # Encode the keys themselves; encoding `queries` here would
            # silently replace the key input with the queries.
            keys = positional_encoding(keys)

        querys = tf.tensordot(queries, self.W_Query,
                              axes=(-1, 0))  # None T_q D*head_num
        # Keys and values are both projected from the raw key input; the
        # value projection must not be fed the already key-projected tensor.
        projected_keys = tf.tensordot(keys, self.W_key, axes=(-1, 0))
        values = tf.tensordot(keys, self.W_Value, axes=(-1, 0))

        # head_num*None T_q D
        querys = tf.concat(tf.split(querys, self.head_num, axis=2), axis=0)
        keys = tf.concat(tf.split(projected_keys, self.head_num, axis=2),
                         axis=0)
        values = tf.concat(tf.split(values, self.head_num, axis=2), axis=0)

        # head_num*None T_q T_k
        outputs = tf.matmul(querys, keys, transpose_b=True)

        # Scale by the square root of the per-head key width.
        outputs = outputs / (keys.get_shape().as_list()[-1]**0.5)

        key_masks = tf.tile(key_masks, [self.head_num, 1])

        # (h*N, T_q, T_k)
        key_masks = tf.tile(tf.expand_dims(key_masks, 1),
                            [1, tf.shape(queries)[1], 1])

        # Large negative score so masked keys vanish after the softmax.
        paddings = tf.ones_like(outputs) * (-2**32 + 1)

        # (h*N, T_q, T_k)
        outputs = tf.where(
            tf.equal(key_masks, 1),
            outputs,
            paddings,
        )
        if self.blinding:
            # Give the diagonal scores the same large negative value.
            outputs = tf.matrix_set_diag(
                outputs,
                tf.ones_like(outputs)[:, :, 0] * (-2**32 + 1))

        # Subtract the row maximum before the softmax.
        outputs -= tf.reduce_max(outputs, axis=-1, keep_dims=True)
        outputs = tf.nn.softmax(outputs)
        query_masks = tf.tile(query_masks, [self.head_num, 1])  # (h*N, T_q)
        # (h*N, T_q, T_k)
        query_masks = tf.tile(tf.expand_dims(query_masks, -1),
                              [1, 1, tf.shape(keys)[1]])

        # Zero the attention rows of masked-out query positions.
        outputs *= query_masks

        outputs = self.dropout(outputs, training=training)
        # Weighted sum
        # ( h*N, T_q, C/h)
        result = tf.matmul(outputs, values)
        # Move the heads back from the batch axis to the feature axis.
        result = tf.concat(tf.split(result, self.head_num, axis=0), axis=2)

        if self.use_res:
            result += queries
        if self.use_layer_norm:
            result = self.ln(result)

        if self.use_feed_forward:
            fw1 = tf.nn.relu(tf.tensordot(result, self.fw1, axes=[-1, 0]))
            fw1 = self.dropout(fw1, training=training)
            fw2 = tf.tensordot(fw1, self.fw2, axes=[-1, 0])
            if self.use_res:
                result += fw2
            if self.use_layer_norm:
                result = self.ln(result)

        return tf.reduce_mean(result, axis=1, keep_dims=True)
Esempio n. 48
0
    def _create_network(self):
        """Build the whole graph and start a session.

        Creates the input placeholders, the encoder/decoder LSTM cells, the
        softmax and fully-connected variables (discriminator ``dis*``,
        predictor ``cly*``, generator ``gen*``), the losses, six train ops
        (``self.opt1`` .. ``self.opt6``), the discriminator weight-clipping
        ops, and finally a ``tf.Session`` with initialised variables and a
        ``tf.train.Saver``.
        """
        self.X = tf.placeholder(tf.int32, [self.batch_size, None],
                                name="X")  # input smiles
        self.Y = tf.placeholder(tf.int32, [self.batch_size, None],
                                name="Y")  # reconstructed smiles
        self.S = tf.placeholder(tf.float32,
                                [self.batch_size, self.sample_size],
                                name="S")  # seed
        self.L = tf.placeholder(tf.int32, [self.batch_size],
                                "L")  # actual length of SMILES
        self.N = tf.placeholder(tf.float32,
                                [self.batch_size, self.latent_size],
                                "N")  # randomness on latent vectors
        self.P = tf.placeholder(tf.float32,
                                [self.batch_size, self.property_task],
                                "P")  # properties
        mol_onehot = tf.one_hot(tf.cast(self.X, tf.int32), self.vocab_size)
        mol_onehot = tf.cast(mol_onehot, tf.float32)

        # Output widths of the fully-connected stacks.
        self.prefn = [self.latent_size, self.latent_size, self.property_task]
        self.disfn = [self.latent_size, self.latent_size, 1]
        self.genfn = [self.latent_size, self.latent_size, self.latent_size]

        decoded_rnn_size = [self.latent_size]
        encoded_rnn_size = [self.latent_size]
        with tf.variable_scope('decode'):
            decode_cell = [
                tf.nn.rnn_cell.LSTMCell(size) for size in decoded_rnn_size
            ]
            self.decoder = tf.nn.rnn_cell.MultiRNNCell(decode_cell)

        with tf.variable_scope('encode'):
            encode_cell = [
                tf.nn.rnn_cell.LSTMCell(size) for size in encoded_rnn_size
            ]
            self.encoder = tf.nn.rnn_cell.MultiRNNCell(encode_cell)
        self.initial_state = self.decoder.zero_state(self.batch_size,
                                                     tf.float32)

        self.weights = {}
        self.biases = {}

        self.weights['softmax'] = tf.get_variable(
            "softmaxw",
            initializer=tf.contrib.layers.xavier_initializer(),
            shape=[decoded_rnn_size[-1], self.vocab_size])
        self.biases['softmax'] = tf.get_variable(
            "softmaxb",
            initializer=tf.contrib.layers.xavier_initializer(),
            shape=[self.vocab_size])

        def add_dense_stack(prefix, input_size, layer_sizes):
            # Register '<prefix>fw<k>' / '<prefix>fb<k>' (k starting at 1) for
            # a chain of dense layers: Xavier-initialised weights, zero biases.
            fan_in = input_size
            for layer_no, fan_out in enumerate(layer_sizes, start=1):
                w_name = prefix + 'fw' + str(layer_no)
                self.weights[w_name] = tf.get_variable(
                    w_name,
                    initializer=tf.contrib.layers.xavier_initializer(),
                    shape=[fan_in, fan_out])
                b_name = prefix + 'fb' + str(layer_no)
                self.biases[b_name] = tf.get_variable(
                    b_name,
                    initializer=tf.zeros_initializer(),
                    shape=[fan_out])
                fan_in = fan_out

        add_dense_stack('dis', self.latent_size, self.disfn)
        add_dense_stack('cly', self.latent_size, self.prefn)
        add_dense_stack('gen', self.sample_size, self.genfn)

        self.mol_encoded0 = self.total_encoder(mol_onehot)

        self.mol_encoded = tf.nn.l2_normalize(self.mol_encoded0, dim=-1)
        self.latent_vector = self.generator(self.S)
        d_real_logits = self.discriminator(self.mol_encoded)
        d_fake_logits = self.discriminator(self.latent_vector, reuse=True)

        predicted_property = self.predictor(self.mol_encoded)

        # The noise is added only after the discriminator and the predictor
        # have consumed the clean encoding; the decoder sees the noisy one.
        self.mol_encoded += self.N

        self.mol_decoded_softmax, mol_decoded_logits = self.total_decoder(
            self.mol_encoded, mol_onehot, self.P)

        # Per-position loss weights: 1.0 inside each sequence length, else 0.0.
        weights = tf.sequence_mask(self.L, tf.shape(self.X)[1])
        weights = tf.cast(weights, tf.int32)
        weights = tf.cast(weights, tf.float32)
        self.reconstr_loss = tf.reduce_mean(
            tf.contrib.seq2seq.sequence_loss(logits=mol_decoded_logits,
                                             targets=self.Y,
                                             weights=weights))

        self.g_loss = -tf.reduce_mean(d_fake_logits)
        self.en_loss = (tf.reduce_mean(d_real_logits))

        self.d_loss = (-tf.reduce_mean(d_real_logits) +
                       tf.reduce_mean(d_fake_logits))

        # The encoder maximises the property error that the predictor
        # minimises.
        self.en_classified_loss = -tf.reduce_mean(
            tf.square(predicted_property - self.P))  # need to be modified
        self.classified_loss = tf.reduce_mean(
            tf.square(predicted_property - self.P))  # need to be modified

        # Loss
        self.lr = tf.Variable(0.0, trainable=False)

        # Partition the trainable variables by substring of their names.
        tvars = tf.trainable_variables()
        ae_list = [
            var for var in tvars if 'decode' in var.name
            or 'encode' in var.name or 'softmax' in var.name
        ]
        en_list = [var for var in tvars if 'encode' in var.name]
        gen_list = [var for var in tvars if 'gen' in var.name]
        dis_list = [var for var in tvars if 'dis' in var.name]
        pre_list = [var for var in tvars if 'cly' in var.name]

        # Print the number of parameters in each group.
        for group in (ae_list, en_list, dis_list, gen_list, pre_list, tvars):
            print(np.sum([np.prod(v.shape) for v in group]))

        optimizer1 = tf.train.GradientDescentOptimizer(1.0)
        optimizer2 = tf.train.AdamOptimizer(1e-5)
        optimizer3 = tf.train.AdamOptimizer(2e-6)
        self.opt1 = optimizer1.minimize(self.reconstr_loss, var_list=ae_list)
        self.opt2 = optimizer1.minimize(self.en_loss, var_list=en_list)

        self.opt3 = optimizer2.minimize(self.g_loss, var_list=gen_list)
        self.opt4 = optimizer3.minimize(self.d_loss, var_list=dis_list)
        self.opt5 = optimizer1.minimize(self.en_classified_loss,
                                        var_list=en_list)
        self.opt6 = optimizer1.minimize(self.classified_loss,
                                        var_list=pre_list)
        # Ops that clip every discriminator variable to [-0.01, 0.01].
        self.clip_dis = [
            p.assign(tf.clip_by_value(p, -0.01, 0.01)) for p in dis_list
        ]

        self.mol_pred = tf.argmax(self.mol_decoded_softmax, axis=2)

        # A single session: creating one before the init op and another after
        # it left the first session open and unreachable.
        init = tf.global_variables_initializer()
        self.sess = tf.Session()
        self.sess.run(init)
        self.saver = tf.train.Saver(max_to_keep=None)
        print("Network Ready")
Esempio n. 49
0
def sequence_sampled_softmax_cross_entropy(targets, train_logits,
                                           decoder_weights, decoder_biases,
                                           num_classes, **loss):
    """Sequence loss that uses a sampled softmax as its per-step loss.

    ``targets`` and ``train_logits`` are zero-padded along axis 1 to a common
    length before the loss is computed, and steps beyond each target's
    length (as given by ``sequence_length_2D``) get zero weight.

    Args:
        targets: target ids; axis 1 is the time axis.
        train_logits: decoder outputs; axis 1 is the time axis.
        decoder_weights: projection weights, transposed before being handed
            to ``tf.nn.sampled_softmax_loss``.
        decoder_biases: projection biases.
        num_classes: number of classes.
        **loss: sampling configuration; the keys read are ``sampler``,
            ``negative_samples``, ``unique``, ``class_counts`` and
            ``distortion``.

    Returns:
        The loss averaged across timesteps but not across the batch.
    """
    target_steps = tf.shape(targets)[1]
    targets_sequence_length = sequence_length_2D(tf.cast(targets, tf.int64))
    logit_steps = tf.shape(train_logits)[1]

    # Whichever tensor is shorter along the time axis is padded with zeros;
    # the other pad amount clamps to 0.
    extra_logit_steps = tf.maximum(0, target_steps - logit_steps)
    extra_target_steps = tf.maximum(0, logit_steps - target_steps)

    padded_logits = tf.pad(
        train_logits, [[0, 0], [0, extra_logit_steps], [0, 0]]
    )
    padded_targets = tf.pad(targets, [[0, 0], [0, extra_target_steps]])

    flat_targets = tf.cast(tf.reshape(padded_targets, [-1, 1]), tf.int64)
    sampled_values = sample_values_from_classes(
        flat_targets,
        loss["sampler"],
        num_classes,
        loss["negative_samples"],
        loss["unique"],
        loss["class_counts"],
        loss["distortion"],
    )

    if loss["sampler"] == "fixed_unigram":
        # Rebuild the sampler tuple with EPSILON added to the true expected
        # counts so that zero entries in that tensor are smoothed away.
        smoothed_true_counts = tf.add(
            sampled_values.true_expected_count, EPSILON
        )
        sampled_values = FixedUnigramCandidateSampler(
            sampled_values.sampled_candidates,
            smoothed_true_counts,
            sampled_values.sampled_expected_count,
        )

    def _sampled_loss(labels, logits):
        # Passed to sequence_loss as softmax_loss_function below.
        label_column = tf.reshape(tf.cast(labels, tf.int64), [-1, 1])
        step_loss = tf.nn.sampled_softmax_loss(
            weights=tf.transpose(decoder_weights),
            biases=decoder_biases,
            labels=label_column,
            inputs=tf.cast(logits, tf.float32),
            num_sampled=loss["negative_samples"],
            num_classes=num_classes,
            sampled_values=sampled_values,
        )
        return tf.cast(step_loss, tf.float32)

    # 1.0 for steps inside each target's length, 0.0 for the padding.
    step_weights = tf.sequence_mask(
        targets_sequence_length,
        tf.shape(padded_targets)[1],
        dtype=tf.float32,
    )

    return tfa.seq2seq.sequence_loss(
        padded_logits,
        padded_targets,
        step_weights,
        average_across_timesteps=True,
        average_across_batch=False,
        softmax_loss_function=_sampled_loss,
    )
Esempio n. 50
0
def model_fn(features, labels, mode, params):
    """Estimator ``model_fn`` for the ``fastvc`` model (TRAIN and EVAL only).

    Builds the model, computes mask-weighted absolute-difference losses on
    the mel outputs before/after refinement plus an absolute-difference loss
    between the two code tensors, and wraps the result in an
    ``EstimatorSpec``.

    Args:
        features: mapping that provides ``'v'`` (multiplied by 3 before use),
            ``'mel'`` and ``'mel_length'`` (column 0 is taken as the length).
        labels: not referenced; the targets are ``features['mel']``.
        mode: a ``tf.estimator.ModeKeys`` value.
        params: not referenced.

    Returns:
        ``tf.estimator.EstimatorSpec`` carrying the total loss, plus the
        training op when ``mode`` is TRAIN.

    Raises:
        ValueError: if ``mode`` is neither TRAIN nor EVAL. Previously such a
            mode fell through both branches and failed with an
            ``UnboundLocalError`` on ``return``.
    """
    supported_modes = (tf.estimator.ModeKeys.TRAIN,
                       tf.estimator.ModeKeys.EVAL)
    if mode not in supported_modes:
        raise ValueError(
            'model_fn only supports TRAIN and EVAL modes, got %r' % (mode,))

    vectors = features['v'] * 3
    mels = features['mel']
    mels_len = features['mel_length'][:, 0]
    dim_neck = 32
    bottleneck = 512

    # Work on a shallow copy so the shared module-level config dict is not
    # mutated every time the estimator calls this function.
    base_config = dict(malaya_speech.config.fastspeech_config)
    base_config['encoder_hidden_size'] = bottleneck + 80
    base_config['decoder_hidden_size'] = bottleneck + dim_neck
    config = fastspeech.Config(vocab_size=1, **base_config)

    model = fastvc.model.Model(dim_neck, config, dim_speaker=bottleneck)
    _, mel_before, mel_after, codes = model(
        mels, vectors, vectors, mels_len)
    codes_ = model.call_second(mel_after, vectors, mels_len)

    loss_f = tf.losses.absolute_difference
    max_length = tf.cast(tf.reduce_max(mels_len), tf.int32)
    mask = tf.sequence_mask(lengths=mels_len,
                            maxlen=max_length,
                            dtype=tf.float32)
    # Add a trailing axis so the per-step mask broadcasts over the last
    # dimension of the mel tensors when used as loss weights.
    mask = tf.expand_dims(mask, axis=-1)
    mel_loss_before = loss_f(labels=mels, predictions=mel_before, weights=mask)
    mel_loss_after = loss_f(labels=mels, predictions=mel_after, weights=mask)
    g_loss_cd = tf.losses.absolute_difference(codes, codes_)
    loss = mel_loss_before + mel_loss_after + g_loss_cd

    # Named identity ops expose the losses under stable tensor names.
    tf.identity(loss, 'total_loss')
    tf.identity(mel_loss_before, 'mel_loss_before')
    tf.identity(mel_loss_after, 'mel_loss_after')
    tf.identity(g_loss_cd, 'g_loss_cd')

    tf.summary.scalar('total_loss', loss)
    tf.summary.scalar('mel_loss_before', mel_loss_before)
    tf.summary.scalar('mel_loss_after', mel_loss_after)
    tf.summary.scalar('g_loss_cd', g_loss_cd)

    # Called for its side effect only: makes sure the global step exists.
    tf.train.get_or_create_global_step()

    if mode == tf.estimator.ModeKeys.TRAIN:
        # `total_steps` is resolved from the enclosing module scope.
        train_op = train.optimizer.adamw.create_optimizer(
            loss,
            init_lr=0.001,
            num_train_steps=total_steps,
            num_warmup_steps=int(0.1 * total_steps),
            end_learning_rate=0.00005,
            weight_decay_rate=0.001,
            beta_1=0.9,
            beta_2=0.98,
            epsilon=1e-6,
            clip_norm=1.0,
        )
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op)

    return tf.estimator.EstimatorSpec(mode=tf.estimator.ModeKeys.EVAL,
                                      loss=loss)
    def __init__(self,
                 reversed_dict,
                 article_max_len,
                 summary_max_len,
                 config,
                 forward_only=False):
        """Build the graph of an attention-based encoder/decoder model.

        Args:
            reversed_dict: sized collection; ``len(reversed_dict)`` is the
                vocabulary size, and the object is also handed to
                ``get_init_embedding`` when GloVe initialisation is requested.
            article_max_len: static width of the ``X`` placeholder.
            summary_max_len: static width of the decoder placeholders; also
                the beam-search ``maximum_iterations`` and the length the
                training logits are padded to.
            config: mapping read for 'embedding_size', 'num_hidden',
                'num_layers', 'learning_rate' and 'beam_width'; 'keep_prob'
                and 'glove' are read only when ``forward_only`` is false.
            forward_only: when false, build the teacher-forced decoder, the
                loss and the optimizer update; when true, build a beam-search
                decoder and expose ``self.prediction`` instead.
        """
        self.vocabulary_size = len(reversed_dict)
        self.embedding_size = config['embedding_size']
        self.num_hidden = config['num_hidden']
        self.num_layers = config['num_layers']
        self.learning_rate = config['learning_rate']
        self.beam_width = config['beam_width']
        if not forward_only:
            self.keep_prob = config['keep_prob']
        else:
            self.keep_prob = 1.0
        self.cell = tf.nn.rnn_cell.BasicLSTMCell
        # Projection from decoder outputs to vocabulary-sized logits; shared
        # by the training branch (explicit call) and the beam-search branch
        # (passed as output_layer).
        with tf.variable_scope("decoder/projection"):
            self.projection_layer = tf.layers.Dense(self.vocabulary_size,
                                                    use_bias=False)

        # Graph inputs. batch_size is fed as a scalar rather than derived
        # from the shape of X.
        self.batch_size = tf.placeholder(tf.int32, (), name="batch_size")
        self.X = tf.placeholder(tf.int32, [None, article_max_len])
        self.X_len = tf.placeholder(tf.int32, [None])
        self.decoder_input = tf.placeholder(tf.int32, [None, summary_max_len])
        self.decoder_len = tf.placeholder(tf.int32, [None])
        self.decoder_target = tf.placeholder(tf.int32, [None, summary_max_len])
        self.global_step = tf.Variable(0, trainable=False)

        with tf.name_scope("embedding"):
            # Pre-trained initial values are only loaded for training; in
            # forward_only mode the variable gets a random initializer.
            if not forward_only and config['glove']:
                init_embeddings = tf.constant(get_init_embedding(
                    reversed_dict, self.embedding_size),
                                              dtype=tf.float32)
            else:
                init_embeddings = tf.random_uniform(
                    [self.vocabulary_size, self.embedding_size], -1.0, 1.0)
            self.embeddings = tf.get_variable("embeddings",
                                              initializer=init_embeddings)
            # The transposes swap the first two axes of the looked-up
            # embeddings, matching the time_major=True consumers below.
            self.encoder_emb_inp = tf.transpose(tf.nn.embedding_lookup(
                self.embeddings, self.X),
                                                perm=[1, 0, 2])
            self.decoder_emb_inp = tf.transpose(tf.nn.embedding_lookup(
                self.embeddings, self.decoder_input),
                                                perm=[1, 0, 2])

        with tf.name_scope("encoder"):
            fw_cells = [
                self.cell(self.num_hidden) for _ in range(self.num_layers)
            ]
            bw_cells = [
                self.cell(self.num_hidden) for _ in range(self.num_layers)
            ]
            # NOTE(review): DropoutWrapper is given only the cell, and
            # self.keep_prob is not referenced anywhere else in this method;
            # confirm whether dropout was meant to be applied here.
            fw_cells = [rnn.DropoutWrapper(cell) for cell in fw_cells]
            bw_cells = [rnn.DropoutWrapper(cell) for cell in bw_cells]

            encoder_outputs, encoder_state_fw, encoder_state_bw = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
                fw_cells,
                bw_cells,
                self.encoder_emb_inp,
                sequence_length=self.X_len,
                time_major=True,
                dtype=tf.float32)
            self.encoder_output = tf.concat(encoder_outputs, 2)
            # Forward and backward c/h states are concatenated along axis 1
            # to form the decoder's initial LSTM state.
            # NOTE(review): only index 0 of the returned state tuples is used
            # (presumably the first stacked layer); confirm this is intended.
            encoder_state_c = tf.concat(
                (encoder_state_fw[0].c, encoder_state_bw[0].c), 1)
            encoder_state_h = tf.concat(
                (encoder_state_fw[0].h, encoder_state_bw[0].h), 1)
            self.encoder_state = rnn.LSTMStateTuple(c=encoder_state_c,
                                                    h=encoder_state_h)

        with tf.name_scope("decoder"), tf.variable_scope(
                "decoder") as decoder_scope:
            # 2 * num_hidden units so the cell state size matches the
            # concatenated forward/backward encoder state.
            decoder_cell = self.cell(self.num_hidden * 2)

            if not forward_only:
                # Training branch: teacher forcing over decoder_input.
                # The encoder output is transposed back before being used as
                # attention memory.
                attention_states = tf.transpose(self.encoder_output, [1, 0, 2])
                attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                    self.num_hidden * 2,
                    attention_states,
                    memory_sequence_length=self.X_len,
                    normalize=True)
                decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
                    decoder_cell,
                    attention_mechanism,
                    attention_layer_size=self.num_hidden * 2)
                # Start from the wrapper's zero state, then replace the inner
                # cell state with the encoder's final state.
                initial_state = decoder_cell.zero_state(
                    dtype=tf.float32, batch_size=self.batch_size)
                initial_state = initial_state.clone(
                    cell_state=self.encoder_state)
                helper = tf.contrib.seq2seq.TrainingHelper(
                    self.decoder_emb_inp, self.decoder_len, time_major=True)
                decoder = tf.contrib.seq2seq.BasicDecoder(
                    decoder_cell, helper, initial_state)
                outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder, output_time_major=True, scope=decoder_scope)
                self.decoder_output = outputs.rnn_output
                self.logits = tf.transpose(self.projection_layer(
                    self.decoder_output),
                                           perm=[1, 0, 2])
                # Zero-pad the logits along axis 1 up to summary_max_len so
                # they line up with decoder_target's static width in the loss.
                self.logits_reshape = tf.concat([
                    self.logits,
                    tf.zeros([
                        self.batch_size, summary_max_len -
                        tf.shape(self.logits)[1], self.vocabulary_size
                    ])
                ],
                                                axis=1)
            else:
                # Inference branch: beam search. Encoder output, final state
                # and source lengths are each tiled beam_width times.
                tiled_encoder_output = tf.contrib.seq2seq.tile_batch(
                    tf.transpose(self.encoder_output, perm=[1, 0, 2]),
                    multiplier=self.beam_width)
                tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(
                    self.encoder_state, multiplier=self.beam_width)
                tiled_seq_len = tf.contrib.seq2seq.tile_batch(
                    self.X_len, multiplier=self.beam_width)
                attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                    self.num_hidden * 2,
                    tiled_encoder_output,
                    memory_sequence_length=tiled_seq_len,
                    normalize=True)
                decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
                    decoder_cell,
                    attention_mechanism,
                    attention_layer_size=self.num_hidden * 2)
                initial_state = decoder_cell.zero_state(
                    dtype=tf.float32,
                    batch_size=self.batch_size * self.beam_width)
                initial_state = initial_state.clone(
                    cell_state=tiled_encoder_final_state)
                # Token ids 2 and 3 are hard-coded as start/end tokens
                # (assumed to match the vocabulary's special tokens; confirm).
                decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                    cell=decoder_cell,
                    embedding=self.embeddings,
                    start_tokens=tf.fill([self.batch_size], tf.constant(2)),
                    end_token=tf.constant(3),
                    initial_state=initial_state,
                    beam_width=self.beam_width,
                    output_layer=self.projection_layer)
                outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder,
                    output_time_major=True,
                    maximum_iterations=summary_max_len,
                    scope=decoder_scope)
                # Reorder the axes of predicted_ids with perm [1, 2, 0].
                self.prediction = tf.transpose(outputs.predicted_ids,
                                               perm=[1, 2, 0])

        with tf.name_scope("loss"):
            if not forward_only:
                crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.logits_reshape, labels=self.decoder_target)
                # Mask out positions beyond each target's length; the summed
                # loss is divided by the batch size (not the token count).
                weights = tf.sequence_mask(self.decoder_len,
                                           summary_max_len,
                                           dtype=tf.float32)
                self.loss = tf.reduce_sum(crossent * weights /
                                          tf.to_float(self.batch_size))

                # Adam step on gradients clipped to a global norm of 5.0.
                params = tf.trainable_variables()
                gradients = tf.gradients(self.loss, params)
                clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
                optimizer = tf.train.AdamOptimizer(self.learning_rate)
                self.update = optimizer.apply_gradients(
                    zip(clipped_gradients, params),
                    global_step=self.global_step)
Esempio n. 52
0
    def build_graph(self):
        """Assemble placeholders, encoder, attention decoder, loss and train op.

        The decoder uses a teacher-forcing ``TrainingHelper`` when
        ``self.mode == 'train'`` and a ``GreedyEmbeddingHelper`` otherwise;
        everything else (cells, attention, output projection, loss,
        optimizer) is built the same way in both modes. Results are exposed
        through ``self.inputs``, ``self.targets``, ``self.learning_rate``,
        the sequence-length placeholders, ``self.loss`` and ``self.train_op``.
        """
        cfg = self.config

        with tf.variable_scope('input'):
            self.inputs = tf.placeholder(tf.int32, [None, None], name='inputs')
            self.targets = tf.placeholder(tf.int32, [None, None], name='targets')
            self.learning_rate = tf.placeholder(tf.float32, name='learning_rate')
            self.target_sequence_length = tf.placeholder(
                tf.int32, (None,), name='target_sequence_length')
            self.max_target_sequence_length = tf.reduce_max(
                self.target_sequence_length, name='max_target_length')
            self.source_sequence_length = tf.placeholder(
                tf.int32, (None,), name='source_sequence_length')

        with tf.variable_scope('encoder'):
            source_vocab_size = len(self.source_letter_to_int)
            embedded_source = tf.contrib.layers.embed_sequence(
                self.inputs, source_vocab_size, cfg.encoding_embedding_size)
            encoder_layers = [
                self.get_lstm_cell(cfg.rnn_size) for _ in range(cfg.rnn_layers)
            ]
            stacked_encoder = tf.contrib.rnn.MultiRNNCell(encoder_layers)
            encoder_output, encoder_state = tf.nn.dynamic_rnn(
                stacked_encoder,
                embedded_source,
                sequence_length=self.source_sequence_length,
                dtype=tf.float32)

        with tf.variable_scope('decoder'):
            # Step 1: embed the (pre-processed) decoder inputs.
            decoder_input = self.process_decoder_input(
                self.targets, self.target_letter_to_int, cfg.batch_size)
            target_vocab_size = len(self.target_letter_to_int)
            embedding_init = tf.random_uniform(
                [target_vocab_size, cfg.decoding_embedding_size])
            decoder_embeddings = tf.Variable(embedding_init)
            decoder_embed_input = tf.nn.embedding_lookup(decoder_embeddings,
                                                         decoder_input)

            # Step 2: multi-layer LSTM wrapped once with Luong attention over
            # the encoder outputs (used as-is, without transposing).
            num_units = cfg.rnn_size
            attention_mechanism = tf.contrib.seq2seq.LuongAttention(
                num_units,
                encoder_output,
                memory_sequence_length=self.source_sequence_length)
            decoder_layers = [
                self.get_lstm_cell(cfg.rnn_size) for _ in range(cfg.rnn_layers)
            ]
            decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
                tf.contrib.rnn.MultiRNNCell(decoder_layers),
                attention_mechanism,
                attention_layer_size=num_units)
            # Zero attention state whose inner cell state is replaced by the
            # encoder's final state.
            initial_state = decoder_cell.zero_state(
                cfg.batch_size, dtype=tf.float32).clone(cell_state=encoder_state)

            # Step 3: dense projection to the target vocabulary.
            kernel_init = tf.truncated_normal_initializer(mean=0.0, stddev=0.1)
            output_layer = Dense(target_vocab_size, kernel_initializer=kernel_init)

            # Only the helper differs between training and prediction.
            if self.mode == 'train':
                helper = tf.contrib.seq2seq.TrainingHelper(
                    inputs=decoder_embed_input,
                    sequence_length=self.target_sequence_length,
                    time_major=False)
            else:
                go_token = tf.constant([self.target_letter_to_int[GO]],
                                       dtype=tf.int32)
                start_tokens = tf.tile(go_token,
                                       [cfg.batch_size],
                                       name='start_tokens')
                helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                    decoder_embeddings,
                    start_tokens,
                    self.target_letter_to_int[EOS])

            decoder = tf.contrib.seq2seq.BasicDecoder(
                decoder_cell, helper, initial_state, output_layer)
            decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder,
                impute_finished=True,
                maximum_iterations=self.max_target_sequence_length)

        with tf.variable_scope('loss'):
            training_logits = tf.identity(decoder_output.rnn_output, 'logits')
            # Created only so the sampled ids are reachable by op name.
            tf.identity(decoder_output.sample_id, name='predictions')
            masks = tf.sequence_mask(self.target_sequence_length,
                                     self.max_target_sequence_length,
                                     dtype=tf.float32,
                                     name='masks')
            self.loss = tf.contrib.seq2seq.sequence_loss(
                training_logits, self.targets, masks)
            tf.summary.scalar("loss", self.loss)

        with tf.name_scope('optimize'):
            # Adam on gradients clipped to a global norm of 5.
            training_variables = tf.trainable_variables()
            raw_grads = tf.gradients(self.loss, training_variables)
            grads, _ = tf.clip_by_global_norm(raw_grads, 5)
            optimizer = tf.train.AdamOptimizer(self.learning_rate)
            self.train_op = optimizer.apply_gradients(
                zip(grads, training_variables), name='train_op')
def train_HT():
    """Build, train and periodically evaluate the HT chord-recognition model.

    Takes no arguments and returns nothing: all settings come from the
    module-level ``hp`` object, data is loaded via ``load_data_symbol``, and
    the network itself is created by ``crm.HTv2``. Progress is printed,
    summaries are written under ``hp.graph_location``, and a checkpoint is
    saved whenever the test score (chord accuracy + segmentation quality)
    improves. Training stops early after more than ``hp.n_in_succession``
    evaluations without improvement.
    """
    print('Run HT chord recognition on %s-%d...' %
          (hp.dataset, hp.test_set_id))

    # Load training and testing data
    train_data, test_data = load_data_symbol(
        dir=hp.dataset + '_preprocessed_data_MIREX_Mm.pickle',
        test_set_id=hp.test_set_id,
        sequence_with_overlap=hp.train_sequence_with_overlap)
    n_train_sequences = train_data['pianoroll'].shape[0]
    n_test_sequences = test_data['pianoroll'].shape[0]
    # hp.n_batches is used as the number of sequences per batch (see the
    # slicing of `indices` below), so this is the number of batches per epoch.
    n_iterations_per_epoch = int(math.ceil(n_train_sequences / hp.n_batches))
    print('n_train_sequences =', n_train_sequences)
    print('n_test_sequences =', n_test_sequences)
    print('n_iterations_per_epoch =', n_iterations_per_epoch)
    print(hp)

    with tf.name_scope('placeholder'):
        x_p = tf.placeholder(tf.int32, [None, hp.n_steps, 88],
                             name="pianoroll")
        x_len = tf.placeholder(tf.int32, [None], name="seq_lens")
        y_tc = tf.placeholder(tf.int32, [None, hp.n_steps], name="tchord")
        y_cc = tf.placeholder(tf.int32, [None, hp.n_steps],
                              name="chord_change")
        dropout = tf.placeholder(dtype=tf.float32, name="dropout_rate")
        is_training = tf.placeholder(dtype=tf.bool, name="is_training")
        # The step counter is fed from the Python loop rather than kept as a
        # graph variable; it only drives the learning-rate schedule.
        global_step = tf.placeholder(dtype=tf.int32, name='global_step')
        slope = tf.placeholder(dtype=tf.float32, name='annealing_slope')

    with tf.name_scope('model'):
        x_in = tf.cast(x_p, tf.float32)
        source_mask = tf.sequence_mask(
            lengths=x_len, maxlen=hp.n_steps,
            dtype=tf.float32)  # [n_batches, n_steps]
        # Inputs and targets share the same lengths, hence the same mask.
        target_mask = source_mask
        # chord_change_logits, dec_input_embed, enc_weights, dec_weights = crm.HT(x_in, source_mask, target_mask, slope, dropout, is_training, hp)
        chord_change_logits, dec_input_embed, enc_weights, dec_weights, _, _ = crm.HTv2(
            x_in, source_mask, target_mask, slope, dropout, is_training, hp)

    with tf.variable_scope("output_projection"):
        dec_input_embed = tf.layers.dropout(dec_input_embed,
                                            rate=dropout,
                                            training=is_training)
        chord_logits = tf.layers.dense(dec_input_embed,
                                       hp.n_chord_classes,
                                       name='output_dense')

    with tf.name_scope('loss'):
        # Chord change
        # Binary loss on slope-scaled logits, weighted 1.5x in the total.
        loss_cc = 1.5 * tf.losses.sigmoid_cross_entropy(
            multi_class_labels=tf.cast(y_cc, tf.float32),
            logits=slope * chord_change_logits,
            weights=source_mask)

        # Chord symbol
        loss_tc = tf.losses.softmax_cross_entropy(onehot_labels=tf.one_hot(
            y_tc, hp.n_chord_classes),
                                                  logits=chord_logits,
                                                  weights=target_mask)

        # Total loss
        loss = loss_cc + loss_tc
    # Running loss statistics. Each batch loss is weighted by the number of
    # unmasked positions (`valid`) so that `mean_loss` can later divide the
    # accumulated sums by the accumulated count.
    valid = tf.reduce_sum(target_mask)
    summary_loss = tf.Variable([0.0, 0.0, 0.0],
                               trainable=False,
                               dtype=tf.float32)
    summary_valid = tf.Variable(0, trainable=False, dtype=tf.float32)
    update_loss = tf.assign(summary_loss,
                            summary_loss + valid * [loss, loss_cc, loss_tc])
    update_valid = tf.assign(summary_valid, summary_valid + valid)
    mean_loss = tf.assign(summary_loss, summary_loss / summary_valid)
    # Running the initializers resets the accumulators to zero.
    clr_summary_loss = summary_loss.initializer
    clr_summary_valid = summary_valid.initializer
    tf.summary.scalar('Loss_total', summary_loss[0])
    tf.summary.scalar('Loss_chord_change', summary_loss[1])
    tf.summary.scalar('Loss_chord', summary_loss[2])

    with tf.name_scope('evaluation'):
        # Chord accuracy is measured only on unpadded positions whose label
        # is below tquality_dict['O'] * 12.
        chord_mask = tf.cast(target_mask, tf.bool)
        chord_mask = tf.logical_and(chord_mask,
                                    tf.less(y_tc, tquality_dict['O'] * 12))

        # Chord change
        pred_cc = tf.cast(tf.round(tf.sigmoid(slope * chord_change_logits)),
                          tf.int32)
        pred_cc_mask = tf.boolean_mask(pred_cc, tf.cast(source_mask, tf.bool))
        y_cc_mask = tf.boolean_mask(y_cc, tf.cast(source_mask, tf.bool))
        TP_cc, FP_cc, FN_cc = compute_pre_PRF(pred_cc_mask, y_cc_mask)

        # Chord
        pred_tc = tf.argmax(chord_logits, axis=2, output_type=tf.int32)
        pred_tc_correct = tf.equal(pred_tc, y_tc)
        pred_tc_correct_mask = tf.boolean_mask(tensor=pred_tc_correct,
                                               mask=chord_mask)
        correct = tf.reduce_sum(tf.cast(pred_tc_correct_mask, tf.float32))
        total = tf.cast(tf.size(pred_tc_correct_mask), tf.float32)
    # summary_count accumulates [correct, total, TP_cc, FP_cc, FN_cc];
    # summary_score holds [accuracy, precision, recall, F1] derived from it.
    summary_count = tf.Variable([0.0 for _ in range(5)],
                                trainable=False,
                                dtype=tf.float32)
    summary_score = tf.Variable([0.0 for _ in range(4)],
                                trainable=False,
                                dtype=tf.float32)
    update_count = tf.assign(
        summary_count, summary_count + [correct, total, TP_cc, FP_cc, FN_cc])
    acc_tc = summary_count[0] / summary_count[1]
    P_cc, R_cc, F1_cc = comput_PRF_with_pre(summary_count[2], summary_count[3],
                                            summary_count[4])
    update_score = tf.assign(summary_score, summary_score + [
        acc_tc,
        P_cc,
        R_cc,
        F1_cc,
    ])
    clr_summary_count = summary_count.initializer
    clr_summary_score = summary_score.initializer
    tf.summary.scalar('Accuracy_tchord', summary_score[0])
    tf.summary.scalar('Precision_chord_change', summary_score[1])
    tf.summary.scalar('Recall_chord_change', summary_score[2])
    tf.summary.scalar('F1_chord_change', summary_score[3])

    with tf.name_scope('optimization'):
        # Apply warm-up learning rate
        # lr = d^-0.5 * min(step^-0.5, step * warm_up_steps^-1.5), with
        # d = hp.input_embed_size and 4000 warm-up steps.
        warm_up_steps = tf.constant(4000, dtype=tf.float32)
        gstep = tf.cast(global_step, dtype=tf.float32)
        learning_rate = pow(hp.input_embed_size, -0.5) * tf.minimum(
            tf.pow(gstep, -0.5), gstep * tf.pow(warm_up_steps, -1.5))
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                           beta1=0.9,
                                           beta2=0.98,
                                           epsilon=1e-9)
        train_op = optimizer.minimize(loss)

    # Graph location and summary writers
    # NOTE(review): the '\\' separator is hard-coded here and in the
    # checkpoint path below; this looks Windows-specific -- confirm the
    # intended behaviour on other platforms.
    print('Saving graph to: %s' % hp.graph_location)
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(hp.graph_location + '\\train')
    test_writer = tf.summary.FileWriter(hp.graph_location + '\\test')
    train_writer.add_graph(tf.get_default_graph())
    test_writer.add_graph(tf.get_default_graph())
    saver = tf.train.Saver(max_to_keep=1)

    # Training
    print('Train the model...')
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        startTime = time.time()
        # best_score layout: the four summary_score entries followed by sq.
        best_score = [0.0 for _ in range(5)]
        in_succession = 0
        best_epoch = 0
        annealing_slope = 1.0
        best_slope = 0.0
        for step in range(hp.n_training_steps):
            # Training
            # The first epoch iterates the data in its stored order.
            if step == 0:
                indices = range(n_train_sequences)
                batch_indices = [
                    indices[x:x + hp.n_batches]
                    for x in range(0, len(indices), hp.n_batches)
                ]

            # The annealing slope is rescaled once at every epoch boundary.
            if step > 0 and step % n_iterations_per_epoch == 0:
                annealing_slope *= hp.annealing_rate

            if step >= n_iterations_per_epoch and step % n_iterations_per_epoch == 0:
                # Shuffle training data
                indices = random.sample(range(n_train_sequences),
                                        n_train_sequences)
                batch_indices = [
                    indices[x:x + hp.n_batches]
                    for x in range(0, len(indices), hp.n_batches)
                ]

            # batch = (pianoroll, len, chord_change, tchord, root, tquality)
            # for the current mini-batch; root and tquality are only used
            # for the printed log.
            batch = (
                train_data['pianoroll'][batch_indices[step %
                                                      len(batch_indices)]],
                train_data['len'][batch_indices[step % len(batch_indices)]],
                train_data['label']['chord_change'][batch_indices[
                    step % len(batch_indices)]],
                train_data['tchord'][batch_indices[step % len(batch_indices)]],
                train_data['root'][batch_indices[step % len(batch_indices)]],
                train_data['tquality'][batch_indices[step %
                                                     len(batch_indices)]])

            train_run_list = [
                train_op, update_valid, update_loss, update_count, loss,
                loss_cc, loss_tc, pred_cc, pred_tc, chord_mask, enc_weights,
                dec_weights
            ]
            # global_step is fed 1-based so the learning-rate schedule never
            # evaluates step^-0.5 at zero.
            train_feed_fict = {
                x_p: batch[0],
                x_len: batch[1],
                y_cc: batch[2],
                y_tc: batch[3],
                dropout: hp.drop,
                is_training: True,
                global_step: step + 1,
                slope: annealing_slope
            }
            _, _, _, _, train_loss, train_loss_cc, train_loss_tc, train_pred_cc, train_pred_tc, train_chord_mask, enc_w, dec_w = sess.run(
                train_run_list, feed_dict=train_feed_fict)
            if step == 0:
                print('*~ loss_cc %.4f, loss_tc %.4f ~*' %
                      (train_loss_cc, train_loss_tc))

            # Display training log & Testing
            # This runs after the training step above, so the statistics
            # reported here already include the first batch of the new epoch.
            if step > 0 and step % n_iterations_per_epoch == 0:
                # Normalise the accumulated losses and derive the scores,
                # read them out, then reset all accumulators.
                sess.run([mean_loss, update_score])
                train_summary, train_loss, train_score = sess.run(
                    [merged, summary_loss, summary_score])
                sess.run([
                    clr_summary_valid, clr_summary_loss, clr_summary_count,
                    clr_summary_score
                ])
                train_writer.add_summary(train_summary, step)
                print(
                    "---- step %d, epoch %d: train_loss: total %.4f, cc %.4f, tc %.4f, evaluation: tc %.4f, cc (P %.4f, R %.4f, F1 %.4f) ----"
                    % (step, step // n_iterations_per_epoch, train_loss[0],
                       train_loss[1], train_loss[2], train_score[0],
                       train_score[1], train_score[2], train_score[3]))
                print('enc_w =', enc_w, 'dec_w =', dec_w)
                # Print the first `display_len` frames of the first sequence
                # in the batch; label ids are mapped back to names by
                # reverse lookup in root_dict / tquality_dict.
                display_len = 64
                print('len =', batch[1][0])
                print(
                    'y_root'.ljust(7, ' '),
                    ''.join([[k for k, v in root_dict.items()
                              if v == b][0].rjust(3, ' ')
                             for b in batch[4][0, :display_len]]))
                print(
                    'y_tq'.ljust(7, ' '),
                    ''.join([[k for k, v in tquality_dict.items()
                              if v == b][0].rjust(3, ' ')
                             for b in batch[5][0, :display_len]]))
                print(
                    'valid'.ljust(7, ' '), ''.join([
                        'y'.rjust(3, ' ') if b else 'n'.rjust(3, ' ')
                        for b in train_chord_mask[0, :display_len]
                    ]))
                print(
                    'y_cc'.ljust(7, ' '), ''.join([
                        str(b).rjust(3, ' ') for b in batch[2][0, :display_len]
                    ]))
                print(
                    'pred_cc'.ljust(7, ' '), ''.join([
                        str(b).rjust(3, ' ')
                        for b in train_pred_cc[0, :display_len]
                    ]))
                print(
                    'y_tc'.ljust(7, ' '), ''.join([
                        str(b).rjust(3, ' ') for b in batch[3][0, :display_len]
                    ]))
                print(
                    'pred_tc'.ljust(7, ' '), ''.join([
                        str(b).rjust(3, ' ')
                        for b in train_pred_tc[0, :display_len]
                    ]))

                # Testing
                # The whole test set is evaluated in a single run with
                # dropout disabled. train_op is not fetched, so global_step
                # is not fed.
                test_run_list = [
                    update_valid, update_loss, update_count, pred_cc, pred_tc,
                    chord_mask
                ]
                test_feed_fict = {
                    x_p: test_data['pianoroll'],
                    x_len: test_data['len'],
                    y_cc: test_data['label']['chord_change'],
                    y_tc: test_data['tchord'],
                    dropout: 0.0,
                    is_training: False,
                    slope: annealing_slope
                }
                _, _, _, test_pred_cc, test_pred_tc, test_chord_mask = sess.run(
                    test_run_list, feed_dict=test_feed_fict)
                sess.run([mean_loss, update_score])
                test_summary, test_loss, test_score = sess.run(
                    [merged, summary_loss, summary_score])
                sess.run([
                    clr_summary_valid, clr_summary_loss, clr_summary_count,
                    clr_summary_score
                ])
                test_writer.add_summary(test_summary, step)

                sq = crm.segmentation_quality(test_data['tchord'],
                                              test_pred_tc, test_data['len'])
                print(
                    "==== step %d, epoch %d: test_loss: total %.4f, cc %.4f, tc %.4f, evaluation: tc %.4f, cc (P %.4f, R %.4f, F1 %.4f), sq %.4f ===="
                    % (step, step // n_iterations_per_epoch, test_loss[0],
                       test_loss[1], test_loss[2], test_score[0],
                       test_score[1], test_score[2], test_score[3], sq))
                # Show one randomly chosen test sequence.
                sample_id = random.randint(0, n_test_sequences - 1)
                print('len =', test_data['len'][sample_id])
                print(
                    'y_root'.ljust(7, ' '), ''.join(
                        [[k for k, v in root_dict.items()
                          if v == b][0].rjust(3, ' ')
                         for b in test_data['root'][sample_id, :display_len]]))
                print(
                    'y_tq'.ljust(7, ' '), ''.join([
                        [k for k, v in tquality_dict.items()
                         if v == b][0].rjust(3, ' ')
                        for b in test_data['tquality'][sample_id, :display_len]
                    ]))
                print(
                    'valid'.ljust(7, ' '), ''.join([
                        'y'.rjust(3, ' ') if b else 'n'.rjust(3, ' ')
                        for b in test_chord_mask[sample_id, :display_len]
                    ]))
                print(
                    'y_cc'.ljust(7, ' '), ''.join([
                        str(b).rjust(3, ' ') for b in test_data['label']
                        ['chord_change'][sample_id, :display_len]
                    ]))
                print(
                    'pred_cc'.ljust(7, ' '), ''.join([
                        str(b).rjust(3, ' ')
                        for b in test_pred_cc[sample_id, :display_len]
                    ]))
                print(
                    'y_tc'.ljust(7, ' '), ''.join([
                        str(b).rjust(3, ' ')
                        for b in test_data['tchord'][sample_id, :display_len]
                    ]))
                print(
                    'pred_tc'.ljust(7, ' '), ''.join([
                        str(b).rjust(3, ' ')
                        for b in test_pred_tc[sample_id, :display_len]
                    ]))

                # Model selection: chord accuracy plus segmentation quality
                # must beat the best sum seen so far.
                if step > 0 and (test_score[0] + sq) > (best_score[0] +
                                                        best_score[-1]):
                    best_score = np.concatenate([test_score, [sq]], axis=0)
                    best_epoch = step // n_iterations_per_epoch
                    best_slope = annealing_slope
                    in_succession = 0

                    # Save variables of the model
                    print('*saving variables...\n')
                    saver.save(
                        sess, hp.graph_location + '\\HT_chord_recognition_' +
                        hp.dataset + '_' + str(hp.test_set_id) + '.ckpt')
                else:
                    # Count evaluations without improvement and stop once
                    # the count exceeds hp.n_in_succession.
                    in_succession += 1
                    if in_succession > hp.n_in_succession:
                        print('Early stopping.')
                        break

        # saver.save(sess, hp.graph_location + '\\HT_chord_recognition_train_model.ckpt')
        elapsed_time = time.time() - startTime
        print('\nHT chord symbol recognition on %s-%d:' %
              (hp.dataset, hp.test_set_id))
        print('training time = %.2f hr' % (elapsed_time / 3600))
        print('best epoch = ', best_epoch)
        print('best score =', np.round(best_score, 4))
        print('best slope =', best_slope)
Esempio n. 54
0
def model_fn(features, labels, mode, params):
    """Estimator model function: GloVe embeddings -> BiLSTM -> CRF tagger.

    Args:
        features: either a ``(words, nwords)`` pair or, when serving, a dict
            with the keys ``'words'`` and ``'nwords'``.
        labels: tag strings, looked up in the ``params['tags']`` vocabulary
            (unused in PREDICT mode).
        mode: one of the ``tf.estimator.ModeKeys`` values.
        params: dict read for the keys ``'dropout'``, ``'words'``,
            ``'num_oov_buckets'``, ``'tags'``, ``'glove'``, ``'dim'`` and
            ``'lstm_size'``.

    Returns:
        A ``tf.estimator.EstimatorSpec`` for PREDICT, EVAL and TRAIN modes;
        ``None`` for any other mode value.
    """
    # Serving hands the features over as a dict; unpack it to the pair form.
    if isinstance(features, dict):
        features = features['words'], features['nwords']

    # Hyper-parameters, inputs and vocabularies.
    drop_rate = params['dropout']
    tokens, lengths = features
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    word_table = tf.contrib.lookup.index_table_from_file(
        params['words'], num_oov_buckets=params['num_oov_buckets'])

    # Line numbers of every tag other than 'O'; they are handed to the
    # precision / recall / f1 metrics below.
    positive_ids = []
    with Path(params['tags']).open() as tag_file:
        for position, line in enumerate(tag_file):
            if line.strip() != 'O':
                positive_ids.append(position)
        num_tags = len(positive_ids) + 1

    # Frozen embedding table: the pretrained matrix plus one all-zero row.
    token_ids = word_table.lookup(tokens)
    pretrained = np.load(params['glove'])['embeddings']
    zero_row = [[0.] * params['dim']]
    embedding_table = tf.Variable(np.vstack([pretrained, zero_row]),
                                  dtype=tf.float32,
                                  trainable=False)
    embedded = tf.layers.dropout(
        tf.nn.embedding_lookup(embedding_table, token_ids),
        rate=drop_rate,
        training=is_training)

    # Bidirectional LSTM; the first two axes are swapped before the fused
    # cells and swapped back afterwards.
    time_major_inputs = tf.transpose(embedded, perm=[1, 0, 2])
    forward_cell = tf.contrib.rnn.LSTMBlockFusedCell(params['lstm_size'])
    backward_cell = tf.contrib.rnn.LSTMBlockFusedCell(params['lstm_size'])
    backward_cell = tf.contrib.rnn.TimeReversedFusedRNN(backward_cell)
    forward_out, _ = forward_cell(time_major_inputs,
                                  dtype=tf.float32,
                                  sequence_length=lengths)
    backward_out, _ = backward_cell(time_major_inputs,
                                    dtype=tf.float32,
                                    sequence_length=lengths)
    hidden = tf.transpose(tf.concat([forward_out, backward_out], axis=-1),
                          perm=[1, 0, 2])
    hidden = tf.layers.dropout(hidden, rate=drop_rate, training=is_training)

    # CRF layer: per-token scores plus a learned tag-transition matrix.
    logits = tf.layers.dense(hidden, num_tags)
    transitions = tf.get_variable("crf", [num_tags, num_tags],
                                  dtype=tf.float32)
    pred_ids, _ = tf.contrib.crf.crf_decode(logits, transitions, lengths)

    if mode == tf.estimator.ModeKeys.PREDICT:
        # Map the decoded ids back to their tag strings.
        id_to_tag = tf.contrib.lookup.index_to_string_table_from_file(
            params['tags'])
        return tf.estimator.EstimatorSpec(
            mode,
            predictions={
                'pred_ids': pred_ids,
                'tags': id_to_tag.lookup(tf.to_int64(pred_ids)),
            })

    # Loss: mean negative CRF log-likelihood of the gold tag sequence.
    tag_table = tf.contrib.lookup.index_table_from_file(params['tags'])
    gold_ids = tag_table.lookup(labels)
    log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
        logits, gold_ids, lengths, transitions)
    loss = tf.reduce_mean(-log_likelihood)

    # Metrics, weighted by the sequence-length mask.
    token_mask = tf.sequence_mask(lengths)
    metrics = {
        'acc': tf.metrics.accuracy(gold_ids, pred_ids, token_mask),
        'precision': precision(gold_ids, pred_ids, num_tags, positive_ids,
                               token_mask),
        'recall': recall(gold_ids, pred_ids, num_tags, positive_ids,
                         token_mask),
        'f1': f1(gold_ids, pred_ids, num_tags, positive_ids, token_mask),
    }
    for name, metric in metrics.items():
        # Index 1 of each metric entry is what gets written to the summary.
        tf.summary.scalar(name, metric[1])

    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode,
                                          loss=loss,
                                          eval_metric_ops=metrics)

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer()
        train_op = optimizer.minimize(
            loss, global_step=tf.train.get_or_create_global_step())
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

    return None
Esempio n. 55
0
    def build_decoder(self):
        """Build the decoder sub-graph inside the 'decoder' variable scope.

        Always creates the decoder cell and initial state (via
        ``self.build_decoder_cell()``), the decoder embeddings (the encoder
        embeddings are reused when ``self.share_embedding`` is set), an input
        projection layer and ``self.output_layer``.

        Depending on ``self.mode``:

        * ``'train'``: decodes with a ``TrainingHelper`` / ``BasicDecoder``,
          projects the outputs with ``self.output_layer`` and sets
          ``self.decoder_inputs_embedded``, ``self.final_state``,
          ``self.decoder_logits_train``, ``self.masks``,
          ``self.decoder_pred_train``, ``self.train_entropy``,
          ``self.train_entropy_rewards``, ``self.loss_without_rewards``,
          ``self.loss_rewards`` and ``self.loss``; a ``'loss'`` scalar
          summary is registered as well.
        * ``'decode'``: builds a greedy or beam-search decoder (chosen by
          ``self.use_beamsearch_decode``) and sets
          ``self.decoder_outputs_decode``, ``self.final_state``,
          ``self.decoder_pred_decode`` and, for beam search only,
          ``self.beam_prob``.

        Any other mode value builds nothing beyond the shared layers.
        """
        with tf.variable_scope('decoder') as decoder_scope:
            # Building decoder_cell and decoder_initial_state
            (self.decoder_cell,
             self.decoder_initial_state) = self.build_decoder_cell()

            # Decoder embeddings: reuse the encoder table when sharing is
            # enabled, otherwise create a separate variable.
            if self.share_embedding:
                self.decoder_embeddings = self.encoder_embeddings
            else:
                with tf.device(_get_embed_device(self.target_vocab_size)):
                    self.decoder_embeddings = tf.get_variable(
                        name='embeddings',
                        shape=(self.target_vocab_size, self.embedding_size),
                        initializer=self.initializer,
                        dtype=tf.float32)

            # On Using Very Large Target Vocabulary
            # for Neural Machine Translation
            # https://arxiv.org/pdf/1412.2007v2.pdf

            # Input projection layer to feed embedded inputs to the cell
            # ** Essential when use_residual=True to match input/output dims
            # The projection width is doubled when self.bidirectional is set
            # (presumably to match a concatenated forward/backward encoder
            # state -- confirm against build_decoder_cell).
            hidden_units = self.hidden_units
            if self.bidirectional:
                hidden_units *= 2

            input_layer = layers.Dense(hidden_units,
                                       dtype=tf.float32,
                                       use_bias=False,
                                       name='input_projection')

            # Projects decoder outputs to target_vocab_size logits.
            self.output_layer = layers.Dense(self.target_vocab_size,
                                             dtype=tf.float32,
                                             use_bias=False,
                                             name='output_projection')

            if self.mode == 'train':
                # decoder_inputs_embedded:
                # [batch_size, max_time_step + 1, embedding_size]
                self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                    params=self.decoder_embeddings,
                    ids=self.decoder_inputs_train)

                # Embedded inputs having gone through input projection layer
                self.decoder_inputs_embedded = input_layer(
                    self.decoder_inputs_embedded)

                # Helper to feed inputs for training:
                # read inputs from dense ground truth vectors
                inputs = self.decoder_inputs_embedded
                if self.time_major:
                    inputs = tf.transpose(inputs, (1, 0, 2))

                training_helper = seq2seq.TrainingHelper(
                    inputs=inputs,
                    sequence_length=self.decoder_inputs_length_train,
                    time_major=self.time_major,
                    name='training_helper')

                # During training the output_layer is NOT applied inside the
                # decoder, because that would run the projection at every
                # time step, which is slow.
                # Note: for this trick to work, the `scope` argument of
                # dynamic_decode must be set.
                training_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=training_helper,
                    initial_state=self.decoder_initial_state,
                    # output_layer=self.output_layer
                )

                # Maximum decoder time_steps in current batch
                max_decoder_length = tf.reduce_max(
                    self.decoder_inputs_length_train)

                # decoder_outputs_train: BasicDecoderOutput
                #     namedtuple(rnn_outputs, sample_id)
                # decoder_outputs_train.rnn_output:
                #     if output_time_major=False:
                #         [batch_size, max_time_step + 1, num_decoder_symbols]
                #     if output_time_major=True:
                #         [max_time_step + 1, batch_size, num_decoder_symbols]
                # decoder_outputs_train.sample_id: [batch_size], tf.int32

                (
                    outputs,
                    self.final_state,  # contain attention
                    _  # self.final_sequence_lengths
                ) = seq2seq.dynamic_decode(
                    decoder=training_decoder,
                    output_time_major=self.time_major,
                    impute_finished=True,
                    maximum_iterations=max_decoder_length,
                    parallel_iterations=self.parallel_iterations,
                    swap_memory=True,
                    scope=decoder_scope)

                # More efficient to do the projection
                # on the batch-time-concatenated tensor
                # logits_train:
                # [batch_size, max_time_step + 1, num_decoder_symbols]
                # During training, output_layer is applied to all decoder
                # outputs in a single call.
                # The official NMT library reports a 10~20% speed-up from
                # this; the original author observed an even larger gain.
                self.decoder_logits_train = self.output_layer(
                    outputs.rnn_output)

                # masks: masking for valid and padded time steps,
                # [batch_size, max_time_step + 1]
                self.masks = tf.sequence_mask(
                    lengths=self.decoder_inputs_length_train,
                    maxlen=max_decoder_length,
                    dtype=tf.float32,
                    name='masks')

                # Computes per word average cross-entropy over a batch
                # Internally calls
                # 'nn_ops.sparse_softmax_cross_entropy_with_logits' by default

                # Work on a batch-major copy of the logits from here on; the
                # attribute self.decoder_logits_train itself is left as is.
                decoder_logits_train = self.decoder_logits_train
                if self.time_major:
                    decoder_logits_train = tf.transpose(
                        decoder_logits_train, (1, 0, 2))

                self.decoder_pred_train = tf.argmax(decoder_logits_train,
                                                    axis=-1,
                                                    name='decoder_pred_train')

                # The variables below are used for reinforcement-learning
                # training.
                self.train_entropy = \
                    tf.nn.sparse_softmax_cross_entropy_with_logits(
                        labels=self.decoder_targets_train,
                        logits=decoder_logits_train)
                # self.train_entropy *= self.masks
                # print(self.train_entropy.shape)
                # Per-step cross entropy scaled by self.rewards, then masked.
                self.train_entropy_rewards = tf.multiply(
                    self.train_entropy, self.rewards)
                # print('self.train_entropy_rewards.shape', self.train_entropy_rewards.shape)
                self.train_entropy_rewards *= self.masks

                # https://github.com/tensorflow/tensorflow/blob/r1.5/tensorflow/contrib/seq2seq/python/ops/loss.py
                # if average_across_timesteps and average_across_batch:
                #   crossent = math_ops.reduce_sum(crossent)
                #   total_size = math_ops.reduce_sum(weights)
                #   total_size += 1e-12  # to avoid division by 0 for all-0 weights
                #   crossent /= total_size

                # NOTE(review): self.train_entropy is summed WITHOUT applying
                # self.masks (the masking line above is commented out), while
                # the divisor below is the mask total -- confirm this is
                # intended.
                self.loss_without_rewards = tf.reduce_sum(self.train_entropy)
                self.loss_rewards = tf.reduce_sum(self.train_entropy_rewards)

                # Normalise both sums by the mask total; the epsilon avoids a
                # division by zero when the mask is all zeros.
                total_size = tf.reduce_sum(self.masks)
                total_size += 1e-12
                self.loss_without_rewards /= total_size
                self.loss_rewards /= total_size

                self.loss = seq2seq.sequence_loss(
                    logits=decoder_logits_train,
                    targets=self.decoder_targets_train,
                    weights=self.masks,
                    average_across_timesteps=True,
                    average_across_batch=True,
                )

                # Training summary for the current batch_loss
                tf.summary.scalar('loss', self.loss)

            elif self.mode == 'decode':
                # Inference mode (not training).

                start_tokens = tf.fill([self.batch_size], WordSequence.START)
                end_token = WordSequence.END

                def embed_and_input_proj(inputs):
                    """Embed token ids and apply the input projection layer.
                    """
                    return input_layer(
                        tf.nn.embedding_lookup(self.decoder_embeddings,
                                               inputs))

                if not self.use_beamsearch_decode:
                    # Helper to feed inputs for greedy decoding:
                    # uses the argmax of the output
                    decoding_helper = seq2seq.GreedyEmbeddingHelper(
                        start_tokens=start_tokens,
                        end_token=end_token,
                        embedding=embed_and_input_proj)
                    # Basic decoder performs greedy decoding at each time step
                    # print("building greedy decoder..")
                    inference_decoder = seq2seq.BasicDecoder(
                        cell=self.decoder_cell,
                        helper=decoding_helper,
                        initial_state=self.decoder_initial_state,
                        output_layer=self.output_layer)
                else:
                    # Beamsearch is used to approximately
                    # find the most likely translation
                    # print("building beamsearch decoder..")
                    inference_decoder = BeamSearchDecoder(
                        cell=self.decoder_cell,
                        embedding=embed_and_input_proj,
                        start_tokens=start_tokens,
                        end_token=end_token,
                        initial_state=self.decoder_initial_state,
                        beam_width=self.beam_width,
                        output_layer=self.output_layer,
                    )

                # For GreedyDecoder, return
                # decoder_outputs_decode: BasicDecoderOutput instance
                #     namedtuple(rnn_outputs, sample_id)
                # decoder_outputs_decode.rnn_output:
                # if output_time_major=False:
                #     [batch_size, max_time_step, num_decoder_symbols]
                # if output_time_major=True
                #     [max_time_step, batch_size, num_decoder_symbols]
                # decoder_outputs_decode.sample_id:
                # if output_time_major=False
                #     [batch_size, max_time_step], tf.int32
                # if output_time_major=True
                #     [max_time_step, batch_size], tf.int32

                # For BeamSearchDecoder, return
                # decoder_outputs_decode: FinalBeamSearchDecoderOutput instance
                #     namedtuple(predicted_ids, beam_search_decoder_output)
                # decoder_outputs_decode.predicted_ids:
                # if output_time_major=False:
                #     [batch_size, max_time_step, beam_width]
                # if output_time_major=True
                #     [max_time_step, batch_size, beam_width]
                # decoder_outputs_decode.beam_search_decoder_output:
                #     BeamSearchDecoderOutput instance
                #     namedtuple(scores, predicted_ids, parent_ids)

                # One possible choice of maximum length mentioned in the
                # official documentation:
                # maximum_iterations = tf.round(tf.reduce_max(source_sequence_length) * 2)
                # https://www.tensorflow.org/tutorials/seq2seq

                if self.max_decode_step is not None:
                    max_decode_step = self.max_decode_step
                else:
                    # Default: decode up to 4x the longest input length in
                    # the batch.
                    max_decode_step = tf.round(
                        tf.reduce_max(self.encoder_inputs_length) * 4)

                (
                    self.decoder_outputs_decode,
                    self.final_state,
                    _  # self.decoder_outputs_length_decode
                ) = (
                    seq2seq.dynamic_decode(
                        decoder=inference_decoder,
                        output_time_major=self.time_major,
                        # impute_finished=True,  # error occurs
                        maximum_iterations=max_decode_step,
                        parallel_iterations=self.parallel_iterations,
                        swap_memory=True,
                        scope=decoder_scope))

                if not self.use_beamsearch_decode:
                    # decoder_outputs_decode.sample_id:
                    #     [batch_size, max_time_step]
                    # Or use argmax to find decoder symbols to emit:
                    # self.decoder_pred_decode = tf.argmax(
                    #     self.decoder_outputs_decode.rnn_output,
                    #     axis=-1, name='decoder_pred_decode')

                    # Here, we use expand_dims to be compatible with
                    # the result of the beamsearch decoder
                    # decoder_pred_decode:
                    #     [batch_size, max_time_step, 1] (output_major=False)

                    # self.decoder_pred_decode = tf.expand_dims(
                    #     self.decoder_outputs_decode.sample_id,
                    #     -1
                    # )

                    # The expand_dims variant above is disabled; the greedy
                    # prediction is the raw sample_id tensor.
                    dod = self.decoder_outputs_decode
                    self.decoder_pred_decode = dod.sample_id

                    # Bring time-major output back to batch-major.
                    if self.time_major:
                        self.decoder_pred_decode = tf.transpose(
                            self.decoder_pred_decode, (1, 0))

                else:
                    # Use beam search to approximately
                    # find the most likely translation
                    # decoder_pred_decode:
                    # [batch_size, max_time_step, beam_width] (output_major=False)
                    self.decoder_pred_decode = \
                        self.decoder_outputs_decode.predicted_ids

                    # Bring time-major output back to batch-major.
                    if self.time_major:
                        self.decoder_pred_decode = tf.transpose(
                            self.decoder_pred_decode, (1, 0, 2))

                    # Swap the last two axes; per the shape comment above
                    # this gives [batch_size, beam_width, max_time_step].
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, perm=[0, 2, 1])
                    dod = self.decoder_outputs_decode
                    self.beam_prob = dod.beam_search_decoder_output.scores
Esempio n. 56
0
    def build(self):
        """Assemble the language-model graph and store its handles on self.

        Creates the input placeholders, an embedding matrix, a multi-layer
        RNN encoder with a vocabulary projection, the sequence loss, a
        ``tf.train.Saver`` over variables whose name starts with "lm", and
        an Adam training op.

        Sets ``self.enc_inputs``, ``self.targets``, ``self.inp_lens``,
        ``self.drop_out``, ``self.model_saver``, ``self.train_output`` and
        ``self.eval_output``. Returns None.
        """
        print("Building the language model ... ")

        n_vocab = self.vocab_size
        hidden_dim = self.state_size
        n_layers = self.enc_layers

        with tf.name_scope("placeholders"):
            token_ids = tf.placeholder(tf.int32, [None, None], "enc_inputs")
            target_ids = tf.placeholder(tf.int32, [None, None], "targets")
            seq_lens = tf.placeholder(tf.int32, [None], "inp_lens")
            self.drop_out = tf.placeholder(tf.float32, (), "drop_out")

            self.enc_inputs = token_ids
            self.inp_lens = seq_lens
            self.targets = target_ids

        input_shape = tf.shape(token_ids)
        # Not used further down; kept so the same graph ops get created.
        batch_size = input_shape[0]
        max_len = tf.shape(token_ids)[1]

        with tf.variable_scope("embeddings"):
            embedding_matrix = tf.get_variable("embedding_matrix",
                                               [n_vocab, hidden_dim])
            embedded = tf.nn.embedding_lookup(embedding_matrix, token_ids)

        with tf.variable_scope("encoder"):
            # TODO: residual LSTM, layer normalization
            layer_cells = []
            for layer_idx in range(n_layers):
                layer_cells.append(
                    create_cell("enc-%d" % layer_idx, hidden_dim,
                                self.drop_out))
            stacked_cell = tf.nn.rnn_cell.MultiRNNCell(layer_cells)
            rnn_outputs, _ = tf.nn.dynamic_rnn(stacked_cell,
                                               embedded,
                                               sequence_length=seq_lens,
                                               dtype=tf.float32)

            # Project every encoder output to vocabulary-sized logits.
            projection = tf.layers.Dense(n_vocab, name="enc_proj")
            logits = projection(rnn_outputs)

            # Loss weights come from the length mask of each sequence.
            length_mask = tf.sequence_mask(seq_lens, max_len,
                                           dtype=tf.float32)
            loss = tf.contrib.seq2seq.sequence_loss(logits, target_ids,
                                                    length_mask)

            # Collect the variables BEFORE the optimizer is created, so the
            # optimizer's own variables are not part of the list.
            lm_variables = []
            for candidate in slim.get_variables_to_restore():
                if candidate.name.startswith("lm"):
                    lm_variables.append(candidate)
            print("lm model, variable list:")
            for variable in lm_variables:
                print("  %s" % variable.name)
            self.model_saver = tf.train.Saver(lm_variables, max_to_keep=10)

            adam = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
            train_op = adam.minimize(loss)

            # Perplexity is exp(loss); each dict gets its own exp op.
            self.train_output = {
                "train_op": train_op,
                "loss": loss,
                "ppl": tf.exp(loss)
            }
            self.eval_output = {"loss": loss, "ppl": tf.exp(loss)}
    def get_loss_and_rewards(self):
        """Unroll the skip-LSTM once and build the loss and reward tensors.

        Runs a dropout-wrapped ``SkipLSTMCell`` over ``self.embedded`` for
        ``self.args.maxSteps`` steps, classifies each sequence from the
        output at position ``self.length - 1`` and derives per-sequence
        rewards and a loss on the predicted skip logits.

        Side effects: sets ``self.predictions``, ``self.corrects``,
        ``self.wrongs``, ``self.predicted_inference_skips``,
        ``self.correct_predicted_inference_skips`` and ``self.v0``.

        :return: tuple ``(ce_loss, rewards, n_skips, probs, valid,
            n_corrects, skip_flag, transfering_loss)``
        """
        with tf.name_scope('lstm'):
            with tf.variable_scope('cell', reuse=False):

                def get_cell(hiddenSize, dropOutRate):
                    # Build one SkipLSTMCell wrapped with dropout; the same
                    # value is used as input and output keep probability.
                    print('Not using ACL style jumping!')
                    cell = SkipLSTMCell(num_units=hiddenSize,
                                        state_is_tuple=True,
                                        min_read=self.args.minRead,
                                        max_skip=self.args.maxSkip,
                                        is_training=self.is_training,
                                        is_transfering=self.is_transfering)
                    cell = tf.contrib.rnn.DropoutWrapper(
                        cell,
                        input_keep_prob=dropOutRate,
                        output_keep_prob=dropOutRate)
                    return cell

                # https://stackoverflow.com/questions/47371608/cannot-stack-lstm-with-multirnncell-and-dynamic-rnn
                cell = get_cell(self.args.hiddenSize, self.dropOutRate)

            state = self.init_state()

            # Per-time-step values collected while unrolling the cell.
            outputs = []
            skips_remain = []
            n_skips = []
            probs = []
            valid = []

            predicted_logits = []

            with tf.variable_scope("loop", reuse=tf.AUTO_REUSE):
                for time_step in range(self.args.maxSteps):
                    # state:
                    # "c", "h", "r", "s", "n", "probs", "valid"
                    (cell_output, state) = cell(self.embedded[:, time_step, :],
                                                state)
                    # n: number of steps skipped
                    # p: corresponding probs of n
                    # v: if is valid for computing reward
                    # all of shape [batch_size]
                    # predicted_logits_: [batch_size*n_samples*(max_skips+1)]
                    (c, h, r, s, n, p, v, _, predicted_logits_) = state
                    skips_remain.append(s)
                    n_skips.append(n)
                    probs.append(p)
                    valid.append(v)
                    outputs.append(cell_output)
                    # Feed column time_step + 1 of self.induced_skips into
                    # the state used at the next step.
                    # NOTE(review): on the last iteration this slices column
                    # maxSteps -- confirm induced_skips has that column.
                    induced_n = tf.slice(self.induced_skips,
                                         begin=[0, time_step + 1],
                                         size=[-1, 1],
                                         name='induced_n' + str(time_step + 1))
                    induced_n = tf.reshape(induced_n, shape=[-1])
                    state = SkipLSTMStateTuple(c, h, r, s, n, p, v, induced_n,
                                               predicted_logits_)
                    #predicted_logits_ = tf.reshape(predicted_logits_, shape=[self.batch_size*self.n_samples, self.args.maxSkip+1])
                    predicted_logits.append(predicted_logits_)

            # [maxSteps, batch_size]
            # Shift skips_remain by one step: prepend zeros and drop the last
            # entry, so position t holds the value recorded at step t - 1.
            skips_remain.insert(
                0,
                tf.zeros(shape=[self.batch_size * self.n_samples],
                         dtype=tf.int32))
            skips_remain = skips_remain[0:-1]
            skips_remain = tf.stack(skips_remain)
            n_skips = tf.stack(n_skips)
            probs = tf.stack(probs)
            valid = tf.stack(valid)
            # [max_steps, batch_size*n_samples, max_skips+1]
            predicted_logits = tf.stack(predicted_logits)

            # [batch_size, maxSteps]
            # 1.0 wherever the shifted skips_remain value is positive.
            skip_flag = tf.cast(tf.greater(tf.transpose(skips_remain, [1, 0]),
                                           0),
                                tf.float32,
                                name='skip_flag')
            #skip_flag = tf.Print(skip_flag, data=[skip_flag], summarize=100, message='skp_flag')
            n_skips = tf.transpose(n_skips, [1, 0], name='n_skips')
            probs = tf.transpose(probs, [1, 0], name='probs')
            valid = tf.transpose(valid, [1, 0], name='valid')

            # [batch_size*n_samples, max_steps, max_skips+1]
            predicted_logits = tf.transpose(predicted_logits, [1, 0, 2],
                                            name='predicted_logits')

            # [maxSteps, batchSize, hiddenSize]
            outputs = tf.stack(outputs)
            # [batchSize, maxSteps, hiddenSize]
            outputs = tf.transpose(outputs, [1, 0, 2], name='outputs')

            # [batchSize, maxSteps]
            # One-hot at index length - 1, used to pick one output per row.
            last_relevant_mask = tf.one_hot(indices=self.length - 1,
                                            depth=self.args.maxSteps,
                                            name='last_relevant',
                                            dtype=tf.int32)
            # [batchSize, hiddenSize]
            last_relevant_outputs = tf.boolean_mask(
                outputs, last_relevant_mask, name='last_relevant_outputs')

        with tf.name_scope('output'):
            # Linear classifier applied to the selected outputs.
            weights = tf.get_variable(
                name='weights',
                shape=[self.args.hiddenSize, self.args.numClasses],
                initializer=self.initializer)

            biases = tf.get_variable(name='biases',
                                     shape=[self.args.numClasses],
                                     initializer=self.initializer)
            # [batchSize, numClasses]
            logits = tf.nn.xw_plus_b(x=last_relevant_outputs,
                                     weights=weights,
                                     biases=biases)

        with tf.name_scope('rewards'):
            # [batch_size]
            self.predictions = tf.argmax(logits,
                                         axis=-1,
                                         name='predictions',
                                         output_type=tf.int32)
            # [batch_size]
            self.corrects = tf.equal(self.predictions,
                                     self.labels,
                                     name='corrects')
            self.wrongs = tf.logical_not(self.corrects, name='wrongs')

            # single number
            n_corrects = tf.reduce_sum(tf.cast(self.corrects, tf.int32),
                                       name='n_corrects')

            # [batch_size], with elements 1 or -1, 1 for corrects and -1 for wrongs
            rewards = tf.subtract(tf.cast(self.corrects, tf.float32),
                                  tf.cast(self.wrongs, tf.float32),
                                  name='rewards')
            # rewards = tf.Print(rewards, data=[rewards], message='rewards')
        with tf.name_scope('ce_loss'):
            # [batch_size*n_samples]
            # Per-example cross entropy; no reduction is applied here.
            ce_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=self.labels, name='loss')

        with tf.name_scope('transfering_loss'):
            # mask out steps exceeding the length of each sample
            # [batch_size, n_samples]
            valid = tf.cast(valid, tf.float32)
            valid = tf.reshape(
                valid, [self.batch_size, self.n_samples, self.args.maxSteps])
            length = tf.reshape(self.length,
                                shape=[self.batch_size, self.n_samples],
                                name='length')
            # [batch_size, n_samples, maxSteps]
            # note that a jump decision made at the last word is not valid
            # and, in our current mechanism, a sentence with length <= min_read does not have valid predictions
            valid_mask = tf.sequence_mask(lengths=length - 1,
                                          maxlen=self.args.maxSteps,
                                          dtype=tf.float32,
                                          name='valid_mask')
            valid = tf.multiply(valid, valid_mask, name='valid')
            valid = tf.reshape(valid, shape=[-1, self.args.maxSteps])
            # predicted_logits: [batch_size*n_samples, max_steps, max_skips+1]
            # induced_skips: [batch_size*n_samples, max_steps]
            # valid: [batch_size*n_samples, max_steps]

            # transfering_loss: [batch_size*n_samples]
            # Loss of the predicted skip logits against self.induced_skips,
            # weighted by `valid`; averaged over time steps, not over batch.
            transfering_loss = tf.contrib.seq2seq.sequence_loss(
                logits=predicted_logits,
                targets=self.induced_skips,
                weights=valid,
                average_across_timesteps=True,
                average_across_batch=False)
            # [batch_size*n_samples, max_steps]
            self.predicted_inference_skips = tf.argmax(
                predicted_logits,
                axis=-1,
                name='predicted_inference_skips',
                output_type=tf.int32)
            # 1.0 where the arg-max skip equals the induced skip, zeroed on
            # steps that `valid` masks out.
            self.correct_predicted_inference_skips = tf.cast(
                tf.equal(self.predicted_inference_skips, self.induced_skips),
                tf.float32)
            self.correct_predicted_inference_skips = tf.multiply(
                self.correct_predicted_inference_skips,
                valid,
                name='correct_predicted_inference_skips')

        # skip_flag is also stored on the instance as self.v0.
        self.v0 = skip_flag
        return ce_loss, rewards, n_skips, probs, valid, n_corrects, skip_flag, transfering_loss
Esempio n. 58
0
  def __init__(self, user_count, item_count, cate_count, cate_list):

    self.u = tf.placeholder(tf.int32, [None,]) # [B]
    self.i = tf.placeholder(tf.int32, [None,]) # [B], item feature list, dim: N*M
    self.j = tf.placeholder(tf.int32, [None,]) # [B]
    self.y = tf.placeholder(tf.float32, [None,]) # [B]
    self.hist_i = tf.placeholder(tf.int32, [None, None]) # [B, T]
    self.sl = tf.placeholder(tf.int32, [None,]) # [B]
    self.lr = tf.placeholder(tf.float64, []) # learning rate

    hidden_units = 128

    user_emb_w = tf.get_variable("user_emb_w", [user_count, hidden_units])
    item_emb_w = tf.get_variable("item_emb_w", [item_count, hidden_units // 2])
    item_b = tf.get_variable("item_b", [item_count],
                             initializer=tf.constant_initializer(0.0))
    cate_emb_w = tf.get_variable("cate_emb_w", [cate_count, hidden_units // 2])
    cate_list = tf.convert_to_tensor(cate_list, dtype=tf.int64)

    u_emb = tf.nn.embedding_lookup(user_emb_w, self.u)

    ic = tf.gather(cate_list, self.i)
    i_emb = tf.concat(values = [
        tf.nn.embedding_lookup(item_emb_w, self.i),
        tf.nn.embedding_lookup(cate_emb_w, ic),
        ], axis=1)
    i_b = tf.gather(item_b, self.i)

    jc = tf.gather(cate_list, self.j)
    j_emb = tf.concat([
        tf.nn.embedding_lookup(item_emb_w, self.j),
        tf.nn.embedding_lookup(cate_emb_w, jc),
        ], axis=1)
    j_b = tf.gather(item_b, self.j)

    hc = tf.gather(cate_list, self.hist_i)
    h_emb = tf.concat([
        tf.nn.embedding_lookup(item_emb_w, self.hist_i),
        tf.nn.embedding_lookup(cate_emb_w, hc),
        ], axis=2)

    #-- sum begin -------
    mask = tf.sequence_mask(self.sl, tf.shape(h_emb)[1], dtype=tf.float32) # [B, T]
    mask = tf.expand_dims(mask, -1) # [B, T, 1]
    mask = tf.tile(mask, [1, 1, tf.shape(h_emb)[2]]) # [B, T, H]
    h_emb *= mask # [B, T, H]
    hist = h_emb
    hist = tf.reduce_sum(hist, 1) 
    hist = tf.div(hist, tf.cast(tf.tile(tf.expand_dims(self.sl,1), [1,128]), tf.float32))
    print(h_emb.get_shape().as_list())
    #-- sum end ---------
    
    hist = tf.layers.batch_normalization(inputs = hist)
    hist = tf.reshape(hist, [-1, hidden_units])
    hist = tf.layers.dense(hist, hidden_units)

    u_emb = hist
    #-- fcn begin -------
    din_i = tf.concat([u_emb, i_emb], axis=-1)
    din_i = tf.layers.batch_normalization(inputs=din_i, name='b1')
    d_layer_1_i = tf.layers.dense(din_i, 80, activation=tf.nn.sigmoid, name='f1')
    d_layer_2_i = tf.layers.dense(d_layer_1_i, 40, activation=tf.nn.sigmoid, name='f2')
    d_layer_3_i = tf.layers.dense(d_layer_2_i, 1, activation=None, name='f3')
    din_j = tf.concat([u_emb, j_emb], axis=-1)
    din_j = tf.layers.batch_normalization(inputs=din_j, name='b1', reuse=True)
    d_layer_1_j = tf.layers.dense(din_j, 80, activation=tf.nn.sigmoid, name='f1', reuse=True)
    d_layer_2_j = tf.layers.dense(d_layer_1_j, 40, activation=tf.nn.sigmoid, name='f2', reuse=True)
    d_layer_3_j = tf.layers.dense(d_layer_2_j, 1, activation=None, name='f3', reuse=True)
    d_layer_3_i = tf.reshape(d_layer_3_i, [-1])
    d_layer_3_j = tf.reshape(d_layer_3_j, [-1])
    x = i_b - j_b + d_layer_3_i - d_layer_3_j # [B]
    self.logits = i_b + d_layer_3_i
    u_emb_all = tf.expand_dims(u_emb, 1)
    u_emb_all = tf.tile(u_emb_all, [1, item_count, 1])
    # logits for all item:
    all_emb = tf.concat([
        item_emb_w,
        tf.nn.embedding_lookup(cate_emb_w, cate_list)
        ], axis=1)
    all_emb = tf.expand_dims(all_emb, 0)
    all_emb = tf.tile(all_emb, [512, 1, 1])
    din_all = tf.concat([u_emb_all, all_emb], axis=-1)
    din_all = tf.layers.batch_normalization(inputs=din_all, name='b1', reuse=True)
    d_layer_1_all = tf.layers.dense(din_all, 80, activation=tf.nn.sigmoid, name='f1', reuse=True)
    d_layer_2_all = tf.layers.dense(d_layer_1_all, 40, activation=tf.nn.sigmoid, name='f2', reuse=True)
    d_layer_3_all = tf.layers.dense(d_layer_2_all, 1, activation=None, name='f3', reuse=True)
    d_layer_3_all = tf.reshape(d_layer_3_all, [-1, item_count])
    self.logits_all = tf.sigmoid(item_b + d_layer_3_all)
    #-- fcn end -------

    
    self.mf_auc = tf.reduce_mean(tf.to_float(x > 0))
    self.score_i = tf.sigmoid(i_b + d_layer_3_i)
    self.score_j = tf.sigmoid(j_b + d_layer_3_j)
    self.score_i = tf.reshape(self.score_i, [-1, 1])
    self.score_j = tf.reshape(self.score_j, [-1, 1])
    self.p_and_n = tf.concat([self.score_i, self.score_j], axis=-1)
    print(self.p_and_n.get_shape().as_list())


    # Step variable
    self.global_step = tf.Variable(0, trainable=False, name='global_step')
    self.global_epoch_step = \
        tf.Variable(0, trainable=False, name='global_epoch_step')
    self.global_epoch_step_op = \
        tf.assign(self.global_epoch_step, self.global_epoch_step+1)

    self.loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=self.logits,
            labels=self.y)
        )

    trainable_params = tf.trainable_variables()
    self.opt = tf.train.GradientDescentOptimizer(learning_rate=self.lr)
    gradients = tf.gradients(self.loss, trainable_params)
    clip_gradients, _ = tf.clip_by_global_norm(gradients, 5)
    self.train_op = self.opt.apply_gradients(
        zip(clip_gradients, trainable_params), global_step=self.global_step)
Esempio n. 59
0
    def __call__(self, decoder_inp, seq_len,
                 encoder_hidden_states, seq_len_inp):
        """Build the attention decoder graph and return its emitted outputs.

        The decoder is driven by ``tf.nn.raw_rnn`` over ``self.cell``. At each
        step a separate cell (``lm_cell``) consumes the current token input,
        and its output is combined with the attention context vector to form
        the input that ``raw_rnn`` feeds to ``self.cell``.

        Args:
            decoder_inp: Raw decoder input; only passed to
                ``self.prepare_decoder_input`` here.
            seq_len: Per-example decoder lengths; cast to int32 and compared
                with the current time step to mark finished sequences.
            encoder_hidden_states: Encoder outputs attended over. Axis 1 is
                read as the attention length and axis 2 as the attention
                size (presumably [batch, attn_length, attn_size] -- confirm).
            seq_len_inp: Per-example encoder lengths used to build the
                attention mask.

        Returns:
            The result of ``concat()`` on the emit TensorArray produced by
            ``tf.nn.raw_rnn`` (see the review note at the end of the method
            about its shape).
        """
        # First prepare the decoder input - Embed the input and obtain the
        # relevant loop function
        params = self.params

        # Scope name is "rnn_decoder", suffixed with "_<self.scope>" when set.
        scope = "rnn_decoder" + ("" if self.scope is None else "_" + self.scope)

        with tf.variable_scope(scope):
            decoder_inputs, loop_function = self.prepare_decoder_input(decoder_inp)
            # Second cell, sized by params.lm_hidden_size, stepped manually
            # inside the loop function below.
            lm_cell = self.get_cell(hidden_size=params.lm_hidden_size)

        # TensorArray is used to do dynamic looping over decoder input
        inputs_ta = tf.TensorArray(size=params.max_output,
                                   dtype=tf.float32)
        # unstack splits along axis 0, so decoder_inputs is treated as
        # time-major: axis 0 = time, axis 1 = batch, axis 2 = embedding.
        inputs_ta = inputs_ta.unstack(decoder_inputs)

        batch_size = tf.shape(decoder_inputs)[1]
        attn_length = tf.shape(encoder_hidden_states)[1]
        emb_size = decoder_inputs.get_shape()[2].value
        attn_size = encoder_hidden_states.get_shape()[2].value

        # Attention variables
        # Float mask built from the encoder lengths. No maxlen is given, so
        # its width is max(seq_len_inp).
        # NOTE(review): this must match attn_length for the multiply in
        # attention() to line up -- confirm.
        attn_mask = tf.sequence_mask(tf.cast(seq_len_inp, tf.int32), dtype=tf.float32)

        # Zero-initialised context vector [batch, attn_size] and attention
        # weights [batch, attn_length, 1, 1] used as the initial attention state.
        batch_attn_size = tf.stack([batch_size, attn_size])
        attn = tf.zeros(batch_attn_size, dtype=tf.float32)
        batch_alpha_size = tf.stack([batch_size, attn_length, 1, 1])
        alpha = tf.zeros(batch_alpha_size, dtype=tf.float32)


        with tf.variable_scope(scope):
            # Calculate the W*h_enc component
            # A 1x1 conv over the expanded encoder states applies the same
            # projection (attn_size -> attention_vec_size) at every position.
            hidden = tf.expand_dims(encoder_hidden_states, 2)
            W_attn = tf.get_variable(
                "AttnW", [1, 1, attn_size, params.attention_vec_size])
            hidden_features = tf.nn.conv2d(hidden, W_attn, [1, 1, 1, 1], "SAME")
            v = tf.get_variable("AttnV", [params.attention_vec_size])

            def raw_loop_function(time, cell_output, state, loop_state):
                """Loop function handed to ``tf.nn.raw_rnn``.

                Returns the 5-tuple ``(elements_finished, next_input,
                next_state, output, loop_state)``. ``loop_state`` carries the
                ``lm_cell`` state together with the attention state
                ``(context_vec, alpha)``.
                """
                def attention(query, prev_alpha):
                    """Compute masked attention over ``hidden`` for ``query``.

                    Returns a tuple ``(context_vec, alpha)``.
                    NOTE(review): ``prev_alpha`` is accepted but never read.
                    """
                    with tf.variable_scope("Attention"):
                        # Project the query to attention_vec_size and shape it
                        # so it broadcasts against hidden_features.
                        y = _linear(query, params.attention_vec_size, True)
                        y = tf.reshape(y, [-1, 1, 1, params.attention_vec_size])
                        # Score per encoder position: v . tanh(W*h_enc + y).
                        s = tf.reduce_sum(
                            v * tf.tanh(hidden_features + y), [2, 3])

                        # Softmax over all positions, zero out the masked
                        # ones, then renormalise so the kept weights sum to 1.
                        alpha = tf.nn.softmax(s) * attn_mask
                        sum_vec = tf.reduce_sum(alpha, reduction_indices=[1], keepdims=True)
                        norm_term = tf.tile(sum_vec, tf.stack([1, tf.shape(alpha)[1]]))
                        alpha = alpha / norm_term

                        # Restore the two trailing singleton axes so alpha
                        # broadcasts against hidden, then take the weighted
                        # sum over positions to get the context vector.
                        alpha = tf.expand_dims(alpha, 2)
                        alpha = tf.expand_dims(alpha, 3)
                        context_vec = tf.reduce_sum(alpha * hidden, [1, 2])
                    return tuple([context_vec, alpha])

                # A sequence is finished once the step index reaches its
                # length; `finished` is true only when every sequence is done.
                elements_finished = (time >= tf.cast(seq_len, tf.int32))
                finished = tf.reduce_all(elements_finished)


                # raw_rnn makes its first call with cell_output=None to
                # obtain the initial state, input and emit structure.
                if cell_output is None:
                    next_state = self.cell.zero_state(batch_size, dtype=tf.float32)

                    # This output is not used but is just used to tell the shape
                    # without the batch dimension
                    # Check here - https://www.tensorflow.org/api_docs/python/tf/nn/raw_rnn
                    output = tf.zeros((self.params.vocab_size))
                    lm_input = inputs_ta.read(time)
                    # Initial attention state is the all-zero context/weights
                    # created outside the loop function.
                    attn_state = tuple([attn, alpha])
                    lm_state = lm_cell.zero_state(batch_size, dtype=tf.float32)
                else:
                    # The state of self.cell is passed through unchanged.
                    next_state = state
                    #loop_state = attention(cell_output, loop_state[1])
                    lm_state, attn_state = loop_state
                    # Attend using the current decoder state as the query.
                    attn_state = attention(self.get_state(state), attn_state[1])

                    # Combine decoder state and context vector, then project
                    # to vocabulary size to get this step's emitted output.
                    with tf.variable_scope("AttnProjection"):
                        proj_output = _linear([self.get_state(state), attn_state[0]],
                                              self.params.hidden_size_dec, True)
                    if params.ind_softmax:
                        # Don't share parameters with LM model
                        with tf.variable_scope("OutputProjection2"):
                            output = _linear([proj_output], self.params.vocab_size, True)
                    else:
                        with tf.variable_scope("OutputProjection"):
                            output = _linear([proj_output], self.params.vocab_size, True)


                    # Choose the token input for the next lm_cell step.
                    if not self.isTraining:
                        # Inference: always feed back the model's own output.
                        # NOTE(review): this calls loop_function without a
                        # None check, unlike the training branch -- confirm
                        # it is always set when isTraining is False.
                        lm_input = loop_function(output)
                    else:
                        if loop_function is not None:
                            # Scheduled sampling: a single scalar draw per
                            # step (shared by the whole batch) picks the
                            # ground-truth input with probability
                            # 1 - samp_prob, otherwise loop_function(output).
                            random_prob = tf.random_uniform([])
                            lm_input = tf.cond(
                                finished,
                                lambda: tf.zeros([batch_size, emb_size], dtype=tf.float32),
                                lambda: tf.cond(tf.less(random_prob, 1 - params.samp_prob),
                                                lambda: inputs_ta.read(time),
                                                lambda: loop_function(output))
                            )
                        else:
                            # Teacher forcing: read the ground-truth input;
                            # zeros are fed once all sequences are finished
                            # (presumably to avoid reading past the end of
                            # inputs_ta -- confirm).
                            lm_input = tf.cond(
                                finished,
                                lambda: tf.zeros([batch_size, emb_size], dtype=tf.float32),
                                lambda: inputs_ta.read(time)
                            )

                # Common calculations
                # Step lm_cell on the chosen input; project its output to
                # hidden_size_dec when the two hidden sizes differ.
                lm_output, next_lm_state = lm_cell(lm_input, lm_state)
                if params.lm_hidden_size != params.hidden_size_dec:
                    with tf.variable_scope("SimpleProjection", reuse=tf.AUTO_REUSE):
                        lm_output = _linear([lm_output], params.hidden_size_dec, True)

                # Merge input and previous attentions into one vector of the right size.
                input_size = lm_input.get_shape().with_rank(2)[1]
                if input_size.value is None:
                    raise ValueError("Could not infer input size from input")
                with tf.variable_scope("InputProjection", reuse=tf.AUTO_REUSE):
                    next_input = _linear([lm_output, attn_state[0]], input_size, True)

                # Carry the lm_cell state and attention state to the next step.
                loop_state = tuple([next_lm_state, attn_state])

                return (elements_finished, next_input, next_state, output, loop_state)

            # outputs is a TensorArray with T=max(sequence_length) entries
            # of shape Bx|V|
            outputs, state, _ = tf.nn.raw_rnn(self.cell, raw_loop_function)

        # Concatenate the emitted outputs across timesteps.
        # NOTE(review): TensorArray.concat joins elements along their first
        # axis, so the result looks like (T*B) x |V| rather than T x B x |V|
        # (stack() would give the latter) -- confirm what callers expect.
        outputs = outputs.concat()

        return outputs
Esempio n. 60
0
def din_fcn_attention(query,
                      rnn_output,
                      keys_len,
                      scope_name,
                      stag='null',
                      mode='SUM',
                      softmax_stag=1,
                      time_major=False,
                      return_alphas=False,
                      for_cnn=False):
    """Score every step of ``rnn_output`` against ``query`` with a small MLP.

    The query is projected to the feature size of ``rnn_output``, passed
    through ``prelu``, repeated along the time axis and combined with the
    keys (concat of query, key, difference and product). Three dense layers
    (80 -> 40 -> 1) turn that into one score per step.

    Args:
        query: Query tensor fed to the first dense layer (assumed to be
            [B, Dq] so that the tile/reshape below yields [B, T, D]).
        rnn_output: Keys to attend over. A tuple is concatenated on axis 2,
            a rank-2 tensor gets a time axis of size 1 inserted, and a
            time-major tensor is transposed to [B, T, D].
        keys_len: Valid lengths used to build the [B, T] sequence mask.
        scope_name: Prefix for the dense layer names and the prelu scope.
        stag: Suffix appended to every dense layer name.
        mode: 'SUM' returns the weighted sum ([B, 1, H]); any other value
            returns the keys scaled step-wise by their scores.
        softmax_stag: When truthy, scores are passed through a softmax.
        time_major: Set when ``rnn_output`` is laid out as (T, B, D).
        return_alphas: When true, return ``(output, scores)``.
        for_cnn: When true, the length mask is not applied to the scores.

    Returns:
        ``output`` or, if ``return_alphas`` is set, ``(output, scores)``.
    """
    keys = rnn_output
    if isinstance(keys, tuple):
        # Bi-RNN case: join forward and backward outputs on the feature axis.
        keys = tf.concat(keys, 2)
    if len(keys.get_shape().as_list()) == 2:
        keys = tf.expand_dims(keys, 1)
    if time_major:
        # (T, B, D) -> (B, T, D)
        keys = array_ops.transpose(keys, [1, 0, 2])

    # D: static size of the last axis of the keys.
    feature_dim = keys.get_shape().as_list()[-1]

    projected = tf.layers.dense(query,
                                feature_dim,
                                activation=None,
                                name=scope_name + '_f1' + stag)
    projected = prelu(projected, scope=scope_name)
    # Repeat the projected query once per time step and give it the shape
    # of the keys.
    tiled_query = tf.reshape(
        tf.tile(projected, [1, tf.shape(keys)[1]]),
        tf.shape(keys))

    hidden = tf.concat(
        [tiled_query, keys, tiled_query - keys, tiled_query * keys],
        axis=-1)
    # Scoring MLP; layer names match '<scope>f{1,2,3}_att<stag>'.
    mlp_spec = (
        (80, tf.nn.sigmoid, 'f1_att'),
        (40, tf.nn.sigmoid, 'f2_att'),
        (1, None, 'f3_att'),
    )
    for units, act, suffix in mlp_spec:
        hidden = tf.layers.dense(hidden,
                                 units,
                                 activation=act,
                                 name=scope_name + suffix + stag)

    scores = tf.reshape(hidden, [-1, 1, tf.shape(keys)[1]])  # [B, 1, T]

    # Positions beyond keys_len get a very large negative score so they
    # vanish after the softmax.
    valid = tf.expand_dims(
        tf.sequence_mask(keys_len, tf.shape(keys)[1]), 1)  # [B, 1, T]
    fill = tf.ones_like(scores) * (-2**32 + 1)
    if not for_cnn:
        scores = tf.where(valid, scores, fill)  # [B, 1, T]

    if softmax_stag:
        scores = tf.nn.softmax(scores)  # [B, 1, T]

    if mode == 'SUM':
        # Weighted sum over time.
        output = tf.matmul(scores, keys)  # [B, 1, H]
    else:
        # Keep every step, scaled by its own score.
        scores = tf.reshape(scores, [-1, tf.shape(keys)[1]])
        output = keys * tf.expand_dims(scores, -1)
        output = tf.reshape(output, tf.shape(keys))

    return (output, scores) if return_alphas else output