def _testDynamicDecodeRNN(self,
                              time_major,
                              has_attention,
                              with_alignment_history=False):
        encoder_sequence_length = np.array([3, 2, 3, 1, 1])
        decoder_sequence_length = np.array([2, 0, 1, 2, 3])
        batch_size = 5
        decoder_max_time = 4
        input_depth = 7
        cell_depth = 9
        attention_depth = 6
        vocab_size = 20
        end_token = vocab_size - 1
        start_token = 0
        embedding_dim = 50
        max_out = max(decoder_sequence_length)
        output_layer = layers_core.Dense(vocab_size,
                                         use_bias=True,
                                         activation=None)
        beam_width = 3

        with self.cached_session() as sess:
            batch_size_tensor = constant_op.constant(batch_size)
            embedding = np.random.randn(vocab_size,
                                        embedding_dim).astype(np.float32)
            cell = rnn_cell.LSTMCell(cell_depth)
            initial_state = cell.zero_state(batch_size, dtypes.float32)
            coverage_penalty_weight = 0.0
            if has_attention:
                coverage_penalty_weight = 0.2
                inputs = array_ops.placeholder_with_default(
                    np.random.randn(batch_size, decoder_max_time,
                                    input_depth).astype(np.float32),
                    shape=(None, None, input_depth))
                tiled_inputs = beam_search_decoder.tile_batch(
                    inputs, multiplier=beam_width)
                tiled_sequence_length = beam_search_decoder.tile_batch(
                    encoder_sequence_length, multiplier=beam_width)
                attention_mechanism = attention_wrapper.BahdanauAttention(
                    num_units=attention_depth,
                    memory=tiled_inputs,
                    memory_sequence_length=tiled_sequence_length)
                initial_state = beam_search_decoder.tile_batch(
                    initial_state, multiplier=beam_width)
                cell = attention_wrapper.AttentionWrapper(
                    cell=cell,
                    attention_mechanism=attention_mechanism,
                    attention_layer_size=attention_depth,
                    alignment_history=with_alignment_history)
            cell_state = cell.zero_state(dtype=dtypes.float32,
                                         batch_size=batch_size_tensor *
                                         beam_width)
            if has_attention:
                cell_state = cell_state.clone(cell_state=initial_state)
            bsd = beam_search_decoder.BeamSearchDecoder(
                cell=cell,
                embedding=embedding,
                start_tokens=array_ops.fill([batch_size_tensor], start_token),
                end_token=end_token,
                initial_state=cell_state,
                beam_width=beam_width,
                output_layer=output_layer,
                length_penalty_weight=0.0,
                coverage_penalty_weight=coverage_penalty_weight)

            final_outputs, final_state, final_sequence_lengths = (
                decoder.dynamic_decode(bsd,
                                       output_time_major=time_major,
                                       maximum_iterations=max_out))

            def _t(shape):
                if time_major:
                    return (shape[1], shape[0]) + shape[2:]
                return shape

            self.assertIsInstance(
                final_outputs,
                beam_search_decoder.FinalBeamSearchDecoderOutput)
            self.assertIsInstance(final_state,
                                  beam_search_decoder.BeamSearchDecoderState)

            beam_search_decoder_output = final_outputs.beam_search_decoder_output
            self.assertEqual(
                _t((batch_size, None, beam_width)),
                tuple(beam_search_decoder_output.scores.get_shape().as_list()))
            self.assertEqual(
                _t((batch_size, None, beam_width)),
                tuple(final_outputs.predicted_ids.get_shape().as_list()))

            sess.run(variables.global_variables_initializer())
            sess_results = sess.run({
                'final_outputs':
                final_outputs,
                'final_state':
                final_state,
                'final_sequence_lengths':
                final_sequence_lengths
            })

            max_sequence_length = np.max(
                sess_results['final_sequence_lengths'])

            # A smoke test
            self.assertEqual(
                _t((batch_size, max_sequence_length, beam_width)),
                sess_results['final_outputs'].beam_search_decoder_output.
                scores.shape)
            self.assertEqual(
                _t((batch_size, max_sequence_length, beam_width)),
                sess_results['final_outputs'].beam_search_decoder_output.
                predicted_ids.shape)
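The test above relies on beam_search_decoder.tile_batch to expand the encoder memory to the beam-expanded batch. A minimal NumPy sketch of the same effect (illustrative only, not the TensorFlow implementation):

import numpy as np

# tile_batch repeats every batch entry `multiplier` times along axis 0,
# e.g. [a, b] -> [a, a, a, b, b, b] for multiplier=3, so the attention memory
# lines up with the flattened (batch * beam) decoder state.
memory = np.array([[1., 2.], [3., 4.]])          # batch_size = 2
beam_width = 3
tiled = np.repeat(memory, repeats=beam_width, axis=0)
print(tiled.shape)                               # (6, 2)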
Example #2
 def __init__(self):
   super(Dense, self).__init__()
   self.first = self.track_layer(core.Dense(1, use_bias=False))
Example #3
    def __init__(self, training, tokenized_data, batch_input, scope=None):
        """
        Create the model.

        Args:
            training: A boolean value to indicate whether this model will be used for training.
            tokenized_data: The data object containing all information required for the model.
            batch_input: The batched input object providing the source/target sequences.
            scope: scope of the model.
        """
        self.training = training
        self.batch_input = batch_input
        self.vocab_table = tokenized_data.vocab_table
        self.vocab_size = tokenized_data.vocab_size
        self.reverse_vocab_table = tokenized_data.reverse_vocab_table

        hparams = tokenized_data.hparams
        self.hparams = hparams

        self.num_layers = hparams.num_layers
        self.num_gpus = hparams.num_gpus
        self.time_major = hparams.time_major

        # Initializer
        initializer = model_helper.get_initializer(hparams.init_op,
                                                   hparams.random_seed,
                                                   hparams.init_weight)
        tf.get_variable_scope().set_initializer(initializer)

        # Embeddings
        self.embedding = (model_helper.create_embbeding(
            vocab_size=self.vocab_size,
            embed_size=hparams.num_units,
            scope=scope))
        # This batch_size might vary from batch to batch due to bucketing and/or
        # reaching the end of the training set. Treat it as the size of the current batch.
        self.batch_size = tf.size(self.batch_input.source_sequence_length)

        # Projection
        with tf.variable_scope(scope or "build_network"):
            with tf.variable_scope("decoder/output_projection"):
                self.output_layer = layers_core.Dense(self.vocab_size,
                                                      use_bias=False,
                                                      name="output_projection")

        # Training or inference graph
        print("# Building graph for the model ...")
        res = self.build_graph(hparams, scope=scope)

        if training:
            self.train_loss = res[1]
            self.word_count = tf.reduce_sum(self.batch_input.source_sequence_length) + \
                              tf.reduce_sum(self.batch_input.target_sequence_length)
            # Count the number of predicted words for computing perplexity.
            self.predict_count = tf.reduce_sum(
                self.batch_input.target_sequence_length)
        else:
            self.infer_logits, _, self.final_context_state, self.sample_id = res
            self.sample_words = self.reverse_vocab_table.lookup(
                tf.to_int64(self.sample_id))

        self.global_step = tf.Variable(0, trainable=False)

        params = tf.trainable_variables()

        # Gradients update operation for training the model.
        if training:
            self.learning_rate = tf.placeholder(tf.float32,
                                                shape=[],
                                                name='learning_rate')
            opt = tf.train.AdamOptimizer(self.learning_rate)

            gradients = tf.gradients(self.train_loss,
                                     params,
                                     colocate_gradients_with_ops=hparams.
                                     colocate_gradients_with_ops)

            clipped_gradients, gradient_norm_summary = model_helper.gradient_clip(
                gradients, max_gradient_norm=hparams.max_gradient_norm)

            self.update = opt.apply_gradients(zip(clipped_gradients, params),
                                              global_step=self.global_step)

            # Summary
            self.train_summary = tf.summary.merge([
                tf.summary.scalar("learning_rate", self.learning_rate),
                tf.summary.scalar("train_loss", self.train_loss),
            ] + gradient_norm_summary)

        if not training:
            self.infer_summary = tf.no_op()

        # Saver
        self.saver = tf.train.Saver(tf.global_variables())

        # Print trainable variables
        if training:
            print("# Trainable variables:")
            for param in params:
                print("  {}, {}, {}".format(param.name, str(param.get_shape()),
                                            param.op.device))
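The predict_count computed above is typically combined with the accumulated training loss to report perplexity. A self-contained sketch of that computation (the numbers below are made up):

import math

# Per-batch summed cross-entropy losses and per-batch predicted-word counts,
# e.g. accumulated from sess.run([model.train_loss, model.predict_count], ...).
batch_losses = [42.3, 39.8, 40.1]
batch_predict_counts = [20, 19, 20]
total_loss = sum(batch_losses)
total_words = sum(batch_predict_counts)
perplexity = math.exp(min(total_loss / total_words, 100.0))   # clip to avoid overflow
print(round(perplexity, 2))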
Example #4
 def __init__(self, name=None):
   super(Parent, self).__init__(name=name)
   self.first = self.track_layer(first)
   self.second = self.track_layer(core.Dense(1, use_bias=False))
Example #5
 def __init__(self, name=None):
   super(Compatible, self).__init__(name=name)
   self.first = self.track_layer(core.Dense(1, use_bias=False))
Example #6
 def __init__(self):
   super(FirstNetwork, self).__init__()
   self.first = self.track_layer(shared_layer)
   self.second = self.track_layer(core.Dense(1, use_bias=False))
Example #7
 def __init__(self, use_layer, name=None):
   super(User, self).__init__(name=name)
   self.first = self.track_layer(use_layer)
   self.second = self.track_layer(core.Dense(
       1, name="second_layer", use_bias=False))
Example #8
 def __init__(self, rnn_cell, num_dims, num_hidden):
     self._num_dims = num_dims
     self._rnn_cell = rnn_cell
     self._fc_layer = tf_layers_core.Dense(units=num_dims + num_hidden)
     self._nade = Nade(num_dims, num_hidden)
Example #9
def lstm_decoder_embedding(H, y, W_emb, opt, prefix = '', add_go = False, feed_previous=False, is_reuse= None, is_fed_h = True, is_sampling = False, is_softargmax = False, beam_width=None):
    #y  len* batch * [0,V]   H batch * h
    biasInit = tf.constant_initializer(0.001, dtype=tf.float32)
    #y = [tf.squeeze(y[:,i]) for i in xrange(y.get_shape()[1])]
    if add_go:
        y = tf.concat([tf.ones([opt.batch_size,1],dtype=tf.int32), y],1)

    y = tf.unstack(y, axis=1)  # 1, . , .
    # make the size of hidden unit to be n_hid
    if not opt.additive_noise_lambda:
        H = layers.fully_connected(H, num_outputs = opt.n_hid, biases_initializer=biasInit, activation_fn = None, scope = prefix + 'lstm_decoder', reuse = is_reuse)
    H0 = tf.squeeze(H)
    H1 = (H0, tf.zeros_like(H0))  # initialize H and C #

    y_input = [tf.concat([tf.nn.embedding_lookup(W_emb, features),H0],1) for features in y] if is_fed_h   \
               else [tf.nn.embedding_lookup(W_emb, features) for features in y]
    with tf.variable_scope(prefix + 'lstm_decoder', reuse=True):
        cell = tf.contrib.rnn.LSTMCell(opt.n_hid)
    with tf.variable_scope(prefix + 'lstm_decoder', reuse=is_reuse):
        weightInit = tf.random_uniform_initializer(-0.001, 0.001)
        W = tf.get_variable('W', [opt.n_hid, opt.embed_size], initializer = weightInit)
        b = tf.get_variable('b', [opt.n_words], initializer = tf.random_uniform_initializer(-0.001, 0.001))
        W_new = tf.matmul(W, W_emb, transpose_b=True) # h* V

        out_proj = (W_new,b) if feed_previous else None
        decoder_res = rnn_decoder_custom_embedding(emb_inp = y_input, initial_state = H1, cell = cell, embedding = W_emb, opt = opt, feed_previous = feed_previous, output_projection=out_proj, num_symbols = opt.n_words, is_fed_h = is_fed_h, is_softargmax = is_softargmax, is_sampling = is_sampling)
        outputs = decoder_res[0]

        if beam_width:
            #cell = rnn_cell.LSTMCell(cell_depth)
            #batch_size_tensor = constant_op.constant(opt.batch_size)
            initial_state = cell.zero_state(opt.batch_size* beam_width, tf.float32) #beam_search_decoder.tile_batch(H0, multiplier=beam_width)
            output_layer = layers_core.Dense(opt.n_words, use_bias=True, kernel_initializer = W_new, bias_initializer = b, activation=None)
            bsd = beam_search_decoder.BeamSearchDecoder(
                cell=cell,
                embedding=W_emb,
                start_tokens=array_ops.fill([opt.batch_size], dp.GO_ID), # go is 1
                end_token=dp.EOS_ID,
                initial_state=initial_state,
                beam_width=beam_width,
                output_layer=output_layer,
                length_penalty_weight=0.0)
            #pdb.set_trace()
            final_outputs, final_state, final_sequence_lengths = (
                decoder.dynamic_decode(bsd, output_time_major=False, maximum_iterations=opt.maxlen))
            beam_search_decoder_output = final_outputs.beam_search_decoder_output
            #print beam_search_decoder_output.get_shape()

    logits = [nn_ops.xw_plus_b(out, W_new, b) for out in outputs]  # hidden units to prob logits: out B*h  W: h*E  Wemb V*E
    if is_sampling:
        syn_sents = decoder_res[2]
        loss = sequence_loss(logits[:-1], syn_sents, [tf.cast(tf.ones_like(yy),tf.float32) for yy in syn_sents])
        #loss = sequence_loss(logits[:-1], syn_sents, [tf.cast(tf.not_equal(yy,dp.PAD_ID),tf.float32) for yy in syn_sents])
        #loss = sequence_loss(logits[:-1], syn_sents, [tf.concat([tf.ones([1]), tf.cast(tf.not_equal(yy,dp.PAD_ID),tf.float32)],0) for yy in syn_sents[:-1]]) # use one more pad after EOS
        syn_sents = tf.stack(syn_sents,1)
    else:
        syn_sents = [math_ops.argmax(l, 1) for l in logits]
        syn_sents = tf.stack(syn_sents,1)
        loss = sequence_loss(logits[:-1], y[1:], [tf.cast(tf.ones_like(yy),tf.float32) for yy in y[1:]])
        #loss = sequence_loss(logits[:-1], y[1:], [tf.cast(tf.not_equal(yy,dp.PAD_ID),tf.float32) for yy in y[:-1]]) # use one more pad after EOS

    #outputs, _ = embedding_rnn_decoder(decoder_inputs = y, initial_state = H, cell = tf.contrib.rnn.BasicLSTMCell, num_symbols = opt.n_words, embedding_size = opt.embed_size, scope = prefix + 'lstm_decoder')

    # outputs : batch * len
    return loss, syn_sents, logits
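The per-time-step input list in the function above comes from tf.unstack on the [batch, len] id matrix. A tiny runnable sketch of that step (TF1-style, matching the example):

import tensorflow as tf

# A [batch, len] id matrix becomes a Python list of `len` tensors of shape
# [batch], one per decoding time step.
y = tf.constant([[1, 2, 3],
                 [4, 5, 6]])            # batch=2, len=3
steps = tf.unstack(y, axis=1)           # list of 3 tensors, each shape (2,)
with tf.Session() as sess:
    print([t.tolist() for t in sess.run(steps)])   # [[1, 4], [2, 5], [3, 6]]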
Example #10
def customized_slim_fully_connected(
        inputs,
        num_outputs,
        activation_fn=nn.relu,
        normalizer_fn=None,
        normalizer_params=None,
        weights_initializer=initializers.xavier_initializer(),
        weights_regularizer=None,
        biases_initializer=init_ops.zeros_initializer(),
        biases_regularizer=None,
        reuse=None,
        variables_collections=None,
        outputs_collections=None,
        trainable=True,
        scope=None,
        task_id=1):
    """Adds a sparse fully connected layer. The weight matrix is masked.

  `fully_connected` creates a variable called `weights`, representing a fully
  connected weight matrix, which is multiplied by the `inputs` to produce a
  `Tensor` of hidden units. If a `normalizer_fn` is provided (such as
  `batch_norm`), it is then applied. Otherwise, if `normalizer_fn` is
  None and a `biases_initializer` is provided then a `biases` variable would be
  created and added to the hidden units. Finally, if `activation_fn` is not `None`,
  it is applied to the hidden units as well.

  Note that if `inputs` has a rank greater than 2, then `inputs` is flattened
  prior to the initial matrix multiply by `weights`.

  Args:
    inputs: A tensor of at least rank 2 and static value for the last dimension;
      i.e. `[batch_size, depth]`, `[None, None, None, channels]`.
    num_outputs: Integer or long, the number of output units in the layer.
    activation_fn: Activation function. The default value is a ReLU function.
      Explicitly set it to None to skip it and maintain a linear activation.
    normalizer_fn: Normalization function to use instead of `biases`. If
      `normalizer_fn` is provided then `biases_initializer` and
      `biases_regularizer` are ignored and `biases` are not created nor added.
      Defaults to None, i.e. no normalizer function is applied.
    normalizer_params: Normalization function parameters.
    weights_initializer: An initializer for the weights.
    weights_regularizer: Optional regularizer for the weights.
    biases_initializer: An initializer for the biases. If None skip biases.
    biases_regularizer: Optional regularizer for the biases.
    reuse: Whether or not the layer and its variables should be reused. To be
      able to reuse the layer scope must be given.
    variables_collections: Optional list of collections for all the variables or
      a dictionary containing a different list of collections per variable.
    outputs_collections: Collection to add the outputs.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    scope: Optional scope for variable_scope.
    task_id: Integer task id; the normalizer function (if any) is applied inside
      a `task_{task_id}` variable scope to support multi-task setups.

  Returns:
     The tensor variable representing the result of the series of operations.

  Raises:
    ValueError: If x has rank less than 2 or if its last dimension is not set.
  """
    if not isinstance(num_outputs, six.integer_types):
        raise ValueError('num_outputs should be int or long, got %s.' %
                         (num_outputs, ))

    layer_variable_getter = _build_variable_getter({
        'bias': 'biases',
        'kernel': 'weights'
    })

    with variable_scope.variable_scope(
            scope,
            'FC', [inputs],
            reuse=reuse,
            custom_getter=layer_variable_getter) as sc:
        inputs = ops.convert_to_tensor(inputs)
        layer = core_layers.Dense(units=num_outputs,
                                  activation=None,
                                  use_bias=not normalizer_fn
                                  and biases_initializer,
                                  kernel_initializer=weights_initializer,
                                  bias_initializer=biases_initializer,
                                  kernel_regularizer=weights_regularizer,
                                  bias_regularizer=biases_regularizer,
                                  activity_regularizer=None,
                                  trainable=trainable,
                                  name=sc.name,
                                  dtype=inputs.dtype.base_dtype,
                                  _scope=sc,
                                  _reuse=reuse)
        outputs = layer.apply(inputs)

        # Add variables to collections.
        _add_variable_to_collections(layer.kernel, variables_collections,
                                     'weights')
        if layer.bias is not None:
            _add_variable_to_collections(layer.bias, variables_collections,
                                         'biases')

        # Apply normalizer function / layer.
        if normalizer_fn is not None:
            if not normalizer_params:
                normalizer_params = {}
            with tf.variable_scope('task_{}'.format(
                    task_id)):  # scope the normalizer variables per task for multi-task setups
                outputs = normalizer_fn(outputs, **normalizer_params)

            # outputs = normalizer_fn(outputs, **normalizer_params)

        if activation_fn is not None:
            outputs = activation_fn(outputs)

        return utils.collect_named_outputs(outputs_collections,
                                           sc.original_name_scope, outputs)
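A hypothetical call to the layer defined above (sizes and scope names are made up; customized_slim_fully_connected is assumed to be in scope):

import tensorflow as tf

inputs = tf.random_normal([32, 128])                 # [batch_size, depth]
hidden = customized_slim_fully_connected(
    inputs,
    num_outputs=64,
    activation_fn=tf.nn.relu,
    scope='fc1',
    task_id=1)
logits = customized_slim_fully_connected(
    hidden,
    num_outputs=10,
    activation_fn=None,                              # linear output layer
    scope='fc2',
    task_id=1)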
Example #11
    def __init__(self, hparams, mode):
        self.vocab_size = hparams.to_vocab_size
        self.emb_dim = hparams.emb_dim
        self.num_units = hparams.units
        self.num_layers = hparams.num_layers
        self.learning_rate = tf.Variable(float(hparams.learning_rate),
                                         trainable=False)
        self.clip_value = hparams.clip_value
        self.max_seq_length = 50

        if mode != tf.contrib.learn.ModeKeys.INFER:
            self.decoder_input_ids = tf.placeholder(dtype=tf.int32,
                                                    shape=[None, None])
            self.decoder_input_length = tf.placeholder(dtype=tf.int32,
                                                       shape=[None])
            self.initial_state = tf.placeholder(dtype=tf.float32,
                                                shape=[None, None, None])
            self.batch_size = tf.size(self.decoder_input_length)
        else:
            self.batch_size = 1

        with tf.variable_scope("embedding") as scope:
            self.embeddings = tf.Variable(
                self.init_matrix([self.vocab_size, self.emb_dim]))

        with tf.variable_scope("projection") as scope:
            self.output_layer = layers_core.Dense(self.vocab_size)

        with tf.variable_scope("decoder") as scope:
            if self.num_layers > 1:
                decoder_cell = tf.contrib.rnn.MultiRNNCell([
                    tf.contrib.rnn.BasicLSTMCell(self.num_units)
                    for _ in range(self.num_layers)
                ])
            else:
                decoder_cell = tf.contrib.rnn.BasicLSTMCell(self.num_units)
            if mode != tf.contrib.learn.ModeKeys.INFER:
                initial_state = self.initial_state
                with tf.device("/cpu:0"):
                    decoder_inputs = tf.nn.embedding_lookup(
                        self.embeddings, self.decoder_input_ids)
                helper = tf.contrib.seq2seq.TrainingHelper(
                    decoder_inputs, self.decoder_input_length)
                my_decoder = tf.contrib.seq2seq.BasicDecoder(
                    cell=decoder_cell,
                    helper=helper,
                    initial_state=initial_state,
                    output_layer=self.output_layer)
                decoder_outputs, decoder_state, decoder_output_len = tf.contrib.seq2seq.dynamic_decode(
                    my_decoder,
                    maximum_iterations=self.max_seq_length * 2,
                    swap_memory=True,
                )
                self.sample_id = decoder_outputs.sample_id
                self.logits = decoder_outputs.rnn_output
            else:
                helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                    self.embeddings, [hparams.GO_ID], hparams.EOS_ID)
                initial_state = self.initial_state
                my_decoder = tf.contrib.seq2seq.BasicDecoder(
                    cell=decoder_cell,
                    helper=helper,
                    initial_state=initial_state,
                    output_layer=self.output_layer)
                decoder_outputs, decoder_state, decoder_output_len = tf.contrib.seq2seq.dynamic_decode(
                    my_decoder,
                    maximum_iterations=self.max_seq_length * 2,
                    swap_memory=True)
                self.sample_id = tf.unstack(decoder_outputs.sample_id, axis=0)

        with tf.variable_scope("rollout") as scope:
            self.given_decoder_inputs_ids = tf.placeholder(dtype=tf.int32,
                                                           shape=[None, None])
            self.given_decoder_length = tf.placeholder(dtype=tf.int32,
                                                       shape=[None])
            self.given_next_ids = tf.placeholder(dtype=tf.int32, shape=[None])
            initial_state = self.initial_state
            with tf.device("/cpu:0"):
                given_decoder_inputs = tf.nn.embedding_lookup(
                    self.embeddings, self.given_decoder_inputs_ids)
            helper1 = tf.contrib.seq2seq.TrainingHelper(
                given_decoder_inputs, self.given_decoder_length)
            my_decoder1 = tf.contrib.seq2seq.BasicDecoder(
                cell=decoder_cell,
                helper=helper1,
                initial_state=initial_state,
                output_layer=self.output_layer)
            decoder1_outputs, decoder1_state, decoder1_output_len = tf.contrib.seq2seq.dynamic_decode(
                my_decoder1,
                maximum_iterations=self.max_seq_length * 2,
                swap_memory=True)

            helper2 = tf.contrib.seq2seq.SampleEmbeddingHelper(
                self.embeddings, self.given_next_ids, hparams.EOS_ID)
            my_decoder2 = tf.contrib.seq2seq.BasicDecoder(
                cell=decoder_cell,
                helper=helper2,
                initial_state=decoder1_state,
                output_layer=self.output_layer)
            decoder2_outputs, decoder2_state, decoder2_output_len = tf.contrib.seq2seq.dynamic_decode(
                my_decoder2,
                maximum_iterations=self.max_seq_length * 2,
                swap_memory=True)

        if mode != tf.contrib.learn.ModeKeys.INFER:
            self.targets = tf.placeholder(dtype=tf.int32, shape=[None, None])
            self.target_weights = tf.placeholder(dtype=tf.float32,
                                                 shape=[None, None])
            self.rewards = tf.placeholder(dtype=tf.float32, shape=[None, None])
            with tf.variable_scope("loss") as scope:
                crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=self.targets, logits=self.logits)
                self.pretrain_loss = tf.reduce_sum(
                    crossent * self.target_weights) / tf.to_float(
                        self.batch_size)
                self.train_loss = tf.reduce_sum(
                    crossent * self.rewards) / tf.to_float(self.batch_size)

            if mode == tf.contrib.learn.ModeKeys.TRAIN:
                self.pretrain_global_step = tf.Variable(0, trainable=False)
                self.train_global_step = tf.Variable(0, trainable=False)
                with tf.variable_scope("train_op") as scope:
                    optimizer = tf.train.AdamOptimizer(self.learning_rate)

                    pretrain_gradients, pretrain_v = zip(
                        *optimizer.compute_gradients(self.pretrain_loss))
                    pretrain_gradients, _ = tf.clip_by_global_norm(
                        pretrain_gradients, self.clip_value)
                    self.pretrain_train_op = optimizer.apply_gradients(
                        zip(pretrain_gradients, pretrain_v),
                        global_step=self.pretrain_global_step)

                    train_gradients, train_v = zip(
                        *optimizer.compute_gradients(self.train_loss))
                    train_gradients, _ = tf.clip_by_global_norm(
                        train_gradients, self.clip_value)
                    self.train_op = optimizer.apply_gradients(
                        zip(train_gradients, train_v),
                        global_step=self.train_global_step)

        self.saver = tf.train.Saver(tf.global_variables())
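The target_weights placeholder fed into the losses above is usually a 0/1 padding mask. A short sketch of how it is typically built (not part of the class itself):

import tensorflow as tf

# Mask out padded target positions so they do not contribute to the loss.
lengths = tf.constant([3, 1])                        # true target lengths
weights = tf.sequence_mask(lengths, maxlen=4, dtype=tf.float32)
with tf.Session() as sess:
    print(sess.run(weights))
    # [[1. 1. 1. 0.]
    #  [1. 0. 0. 0.]]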
Example #12
 def _build_projection(self):
     with tf.variable_scope("decoder/output_projection"):
         self.output_layer = layers_core.Dense(Config.data.vocab_size,
                                               use_bias=False,
                                               name="output_projection")
Example #13
  def __init__(self,
               hparams,
               mode,
               iterator,
               source_vocab_table,
               target_vocab_table,
               reverse_target_vocab_table=None,
               scope=None,
               extra_args=None):
    """Create the model.

    Args:
      hparams: Hyperparameter configurations.
      mode: TRAIN | EVAL | INFER
      iterator: Dataset Iterator that feeds data.
      source_vocab_table: Lookup table mapping source words to ids.
      target_vocab_table: Lookup table mapping target words to ids.
      reverse_target_vocab_table: Lookup table mapping ids to target words. Only
        required in INFER mode. Defaults to None.
      scope: scope of the model.
      extra_args: model_helper.ExtraArgs, for passing customizable functions.
    """
    self.supports_monolingual = False
    self.has_KL = False
    self.mode = mode
    self.src_vocab_table = source_vocab_table
    self.tgt_vocab_table = target_vocab_table

    self.src_vocab_size = hparams.src_vocab_size
    self.tgt_vocab_size = hparams.tgt_vocab_size
    self.num_gpus = hparams.num_gpus
    self.time_major = hparams.time_major

    # extra_args: to make it flexible for adding external customizable code
    self.single_cell_fn = None
    if extra_args:
      self.single_cell_fn = extra_args.single_cell_fn

    # Set num layers
    self.num_encoder_layers = hparams.num_encoder_layers
    self.num_decoder_layers = hparams.num_decoder_layers
    assert self.num_encoder_layers
    assert self.num_decoder_layers

    # Set num residual layers
    if hasattr(hparams, "num_residual_layers"):  # for compatibility with common_test_utils
      self.num_encoder_residual_layers = hparams.num_residual_layers
      self.num_decoder_residual_layers = hparams.num_residual_layers
    else:
      self.num_encoder_residual_layers = hparams.num_encoder_residual_layers
      self.num_decoder_residual_layers = hparams.num_decoder_residual_layers

    # Initializer
    # initializer = model_helper.get_initializer(
    #     hparams.init_op, hparams.random_seed, hparams.init_weight)
    # tf.get_variable_scope().set_initializer(initializer)

    # Embeddings
    self.init_embeddings(hparams, scope)

    assert isinstance(iterator, iterator_utils.BatchedInput)
    self._parse_iterator(iterator, hparams, scope=scope)

    # Projection
    with tf.variable_scope(scope or "build_network"):
      with tf.variable_scope("decoder/output_projection"):
        self.output_layer = layers_core.Dense(
            hparams.tgt_vocab_size, use_bias=False, name="output_projection")

    ## Train graph
    res = self.build_graph(hparams, scope=scope)

    self._logits = res[0]
    if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
      self.train_loss = res[1]
      self.word_count = tf.reduce_sum(
          self.source_sequence_length) + tf.reduce_sum(
              self.target_sequence_length)
    elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
      self.eval_loss = res[1]
    elif self.mode == tf.contrib.learn.ModeKeys.INFER:
      self.infer_logits, _, self.final_context_state, self.sample_id = res
      self.sample_words = reverse_target_vocab_table.lookup(
          tf.to_int64(self.sample_id))

    if self.mode != tf.contrib.learn.ModeKeys.INFER:
      ## Count the number of predicted words for computing ppl.
      self.predict_count = tf.reduce_sum(
          self.target_sequence_length)

    self.global_step = tf.Variable(0, trainable=False)
    params = tf.trainable_variables()

    # Gradients and SGD update operation for training the model.
    # Arrange for the embedding vars to appear at the beginning.
    if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
      self.learning_rate = tf.constant(hparams.learning_rate)
      # warm-up
      self.learning_rate = self._get_learning_rate_warmup(hparams)
      # decay
      self.learning_rate = self._get_learning_rate_decay(hparams)

      # Optimizer
      if hparams.optimizer == "sgd":
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        tf.summary.scalar("lr", self.learning_rate)
      elif hparams.optimizer == "adam":
        opt = tf.train.AdamOptimizer(self.learning_rate)

      # Gradients
      gradients = tf.gradients(
          self.train_loss,
          params,
          colocate_gradients_with_ops=hparams.colocate_gradients_with_ops)

      clipped_grads, grad_norm_summary, grad_norm = model_helper.gradient_clip(
          gradients, max_gradient_norm=hparams.max_gradient_norm)
      self.grad_norm = grad_norm

      self.update = opt.apply_gradients(
          zip(clipped_grads, params), global_step=self.global_step)

      # Summary
      self._lr_summary = tf.summary.scalar("lr", self.learning_rate)
      self.train_summary = tf.summary.merge([
          self._lr_summary,
          tf.summary.scalar("train_loss", self.train_loss),
      ] + grad_norm_summary)
      self._grad_norm_summary = grad_norm_summary

    if self.mode == tf.contrib.learn.ModeKeys.INFER:
      self.infer_summary = self._get_infer_summary(hparams)

    # Saver
    self.saver = tf.train.Saver(
        tf.global_variables(), max_to_keep=hparams.num_keep_ckpts)

    # Print trainable variables
    utils.print_out("# Trainable variables")
    for param in params:
      utils.print_out("  %s, %s, %s" % (param.name, str(param.get_shape()),
                                        param.op.device))
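The warmup and decay helpers called above are not shown. One common warmup-then-decay schedule, as a generic sketch (not necessarily what _get_learning_rate_warmup / _get_learning_rate_decay implement in this model):

import tensorflow as tf

global_step = tf.train.get_or_create_global_step()
base_lr = tf.constant(1e-3)
warmup_steps = 1000.0
step = tf.to_float(global_step)
warmup_lr = base_lr * step / warmup_steps            # linear ramp-up
decayed_lr = tf.train.exponential_decay(
    base_lr, global_step, decay_steps=10000, decay_rate=0.5, staircase=True)
learning_rate = tf.where(step < warmup_steps, warmup_lr, decayed_lr)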
Example #14
                                                         dtype=tf.float32)
del encoder_outputs

start_tokens = np.array([0]).repeat(BATCH_SIZE)
end_token = -1

decoder_embedding = tf.Variable(tf.truncated_normal(
    shape=[ONEHOT_SIZE, ONEHOT_SIZE], stddev=0.1),
                                name='decoder_embedding')
from tensorflow.python.layers import core
import tensorflow.contrib.layers as layers

output_layer = core.Dense(
    ONEHOT_SIZE,
    activation=tf.nn.relu,
    use_bias=True,
    name="output_projection",
    kernel_initializer=layers.variance_scaling_initializer(factor=1.0,
                                                           uniform=True,
                                                           seed=1))

decoder_cell = tf.contrib.rnn.LSTMCell(HIDDEN_SIZE)
helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(decoder_embedding,
                                                  start_tokens, end_token)

decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell,
                                          helper,
                                          encoder_final_state,
                                          output_layer=output_layer)

outputs, final_context_state, _ = tf.contrib.seq2seq.dynamic_decode(
    decoder=decoder, maximum_iterations=TIMESTEPS_OUT, swap_memory=True)
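Running the greedy decode above (a sketch; assumes the graph from this example, including encoder_final_state, has been built):

import tensorflow as tf

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    decoded_ids = sess.run(outputs.sample_id)        # shape [BATCH_SIZE, <= TIMESTEPS_OUT]
    print(decoded_ids.shape)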
Example #15
 def testNoEagerActivityRegularizer(self):
     with context.eager_mode():
         with self.assertRaisesRegexp(ValueError, 'activity_regularizer'):
             core_layers.Dense(
                 1, activity_regularizer=lambda *args, **kwargs: 0.)
Example #16
def masked_dense(inputs,
                 units,
                 num_blocks=None,
                 exclusive=False,
                 kernel_initializer=None,
                 reuse=None,
                 name=None,
                 *args,
                 **kwargs):
    """A autoregressively masked dense layer. Analogous to `tf.layers.dense`.

  See [1] for detailed explanation.

  [1]: "MADE: Masked Autoencoder for Distribution Estimation."
       Mathieu Germain, Karol Gregor, Iain Murray, Hugo Larochelle. ICML. 2015.
       https://arxiv.org/abs/1502.03509

  Arguments:
    inputs: Tensor input.
    units: Python `int` scalar representing the dimensionality of the output
      space.
    num_blocks: Python `int` scalar representing the number of blocks for the
      MADE masks.
    exclusive: Python `bool` scalar representing whether to zero the diagonal of
      the mask, used for the first layer of a MADE.
    kernel_initializer: Initializer function for the weight matrix.
      If `None` (default), weights are initialized using
      `tf.glorot_normal_initializer`.
    reuse: Python `bool` scalar representing whether to reuse the weights of a
      previous layer by the same name.
    name: Python `str` used to describe ops managed by this function.
    *args: `tf.layers.dense` arguments.
    **kwargs: `tf.layers.dense` keyword arguments.

  Returns:
    Output tensor.

  Raises:
    NotImplementedError: if rightmost dimension of `inputs` is unknown prior to
      graph execution.
  """
    # TODO(b/67594795): Better support of dynamic shape.
    input_depth = inputs.shape.with_rank_at_least(1)[-1].value
    if input_depth is None:
        raise NotImplementedError(
            "Rightmost dimension must be known prior to graph execution.")

    mask = _gen_mask(num_blocks, input_depth, units,
                     MASK_EXCLUSIVE if exclusive else MASK_INCLUSIVE).T

    if kernel_initializer is None:
        kernel_initializer = init_ops.glorot_normal_initializer()

    def masked_initializer(shape, dtype=None, partition_info=None):
        return mask * kernel_initializer(shape, dtype, partition_info)

    with ops.name_scope(name, "masked_dense", [inputs, units, num_blocks]):
        layer = layers.Dense(units,
                             kernel_initializer=masked_initializer,
                             kernel_constraint=lambda x: mask * x,
                             name=name,
                             dtype=inputs.dtype.base_dtype,
                             _scope=name,
                             _reuse=reuse,
                             *args,
                             **kwargs)
        return layer.apply(inputs)
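A hypothetical use of masked_dense above (assumes the function and its helpers such as _gen_mask are in scope): stack one exclusive and one inclusive MADE layer.

import tensorflow as tf

x = tf.random_normal([8, 4])                  # batch of 4-dimensional inputs
h = masked_dense(x, units=16, num_blocks=4, exclusive=True,
                 activation=tf.nn.relu, name="made_hidden")
out = masked_dense(h, units=4, num_blocks=4, name="made_out")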
Example #17
        decoder_cell = cell_list[0]
    else:
        decoder_cell = tf.contrib.rnn.MultiRNNCell(cell_list)
    
    # Helper
    
    # attention
    attention_mechanism = tf.contrib.seq2seq.LuongAttention(
        attention_hidden_size, encoder_outputs,
        memory_sequence_length=x_real_len,scale=True)
    decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
        decoder_cell, attention_mechanism,
        attention_layer_size=attention_output_size)
    
    
    projection_layer = layers_core.Dense(
        target_vocat_size, use_bias=False)
    
    
    
    # Dynamic decoding
    with tf.variable_scope("decode_layer"):
        helper = tf.contrib.seq2seq.TrainingHelper(
            decoder_emb_inp,sequence_length= y_len)
        decoder = tf.contrib.seq2seq.BasicDecoder(
            decoder_cell, helper, initial_state = decoder_cell.zero_state(dtype=tf.float32,batch_size=batch_size),
            output_layer=projection_layer)
       
        outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder)
        logits = outputs.rnn_output

        target_weights = tf.sequence_mask(
Example #18
    def __init__(self,
                 mode,
                 num_turns,
                 iterator,
                 params,
                 rev_vocab_table=None,
                 scope=None,
                 log_trainables=True):

        log.print_out("# creating %s graph ..." % mode)
        dtype = tf.float32

        self.mode = mode
        self.num_turns = num_turns - 1

        self.device_manager = DeviceManager()
        self.round_robin = RoundRobin(self.device_manager)
        self.num_gpus = min(params.num_gpus, self.device_manager.num_available_gpus())
        log.print_out("# number of gpus %d" % self.num_gpus)

        self.iterator = iterator

        with tf.variable_scope(scope or 'hred_graph', dtype=dtype):
            self.init_embeddings(params.vocab_file, params.vocab_h5, scope=scope)

            encoder_keep_prob, decoder_keep_prob = self.get_keep_probs(mode, params)  # this is for dropout
            if mode == tf.contrib.learn.ModeKeys.TRAIN:
                context_keep_prob = 1.0 - params.context_dropout_rate
            else:
                context_keep_prob = 1.0

            with tf.variable_scope(scope or "build_network"):
                with tf.variable_scope("decoder/output_projection"):
                    self.output_layer = layers_core.Dense(params.vocab_size, use_bias=False, name="output_projection")

            # self.sources = [tf.placeholder(tf.int32, shape=(None, None), name="src%d" % t) for t in
            #                 range(self.num_turns)]
            # self.source_sequence_lengths = [tf.placeholder(tf.int32, shape=(None,), name="src_len%d" % t)
            #                                 for t in range(self.num_turns)]
            self.batch_size = tf.size(self.iterator.source_sequence_lengths[0])

            # self.target_inputs = [tf.placeholder(tf.int32, shape=(None, None), name="tgt_input%d" % t)
            #                       for t in range(self.num_turns)]
            # self.target_outputs = [tf.placeholder(tf.int32, shape=(None, None), name="tgt_output%d" % t)
            #                        for t in range(self.num_turns)]
            # self.target_sequence_lengths = [tf.placeholder(tf.int32, shape=(None,), name="tgt_len%d" % t)
            #                                 for t in range(self.num_turns)]

            devices = self.round_robin.assign(3, base=self.num_gpus - 1)
            encoder_results, context_initial_state = self.__build_encoder(params,
                                                                          encoder_keep_prob, None)
            context_state = self.__build_context(params, encoder_results, context_initial_state,
                                                 context_keep_prob, devices[1])

            self.global_step = tf.Variable(0, trainable=False)
            self.use_scheduled_sampling = False
            if mode == tf.contrib.learn.ModeKeys.TRAIN:
                self.sampling_probability = tf.constant(params.scheduled_sampling_prob)  # this is for scheduled sampling
                self.sampling_probability = self._get_sampling_probability(params, self.global_step,
                                                                           self.sampling_probability)
                self.use_scheduled_sampling = params.scheduled_sampling_prob > 0
            elif mode == tf.contrib.learn.ModeKeys.EVAL:
                self.sampling_probability = tf.constant(0.0)

            logits, sample_id, _ = self.__build_decoder(params, mode, context_state,
                                                        decoder_keep_prob, devices[2])

            if mode != tf.contrib.learn.ModeKeys.INFER:
                with tf.device(self.device_manager.tail_gpu()):
                    loss = self.__compute_loss(logits)
            else:
                loss = None

            if mode == tf.contrib.learn.ModeKeys.TRAIN:
                self.train_loss = loss
                self.word_count = sum(
                    [tf.reduce_sum(self.iterator.source_sequence_lengths[t]) for t in range(self.num_turns)]) + \
                                  tf.reduce_sum(
                                      self.iterator.target_sequence_length)  # to compute the speed of the training
            elif mode == tf.contrib.learn.ModeKeys.EVAL:
                self.eval_loss = loss
            elif mode == tf.contrib.learn.ModeKeys.INFER:
                self.sample_words = rev_vocab_table.lookup(tf.to_int64(sample_id))

            if mode != tf.contrib.learn.ModeKeys.INFER:
                ## Count the number of predicted words for computing ppl.
                self.predict_count = tf.reduce_sum(self.iterator.target_sequence_length)

            trainables = tf.trainable_variables()

            if mode == tf.contrib.learn.ModeKeys.TRAIN:
                self.learning_rate = tf.constant(params.learning_rate)
                # decay
                self.learning_rate = self._get_learning_rate_decay(params, self.global_step, self.learning_rate)

                # Optimizer
                if params.optimizer.lower() == "sgd":
                    opt = tf.train.GradientDescentOptimizer(self.learning_rate)
                    tf.summary.scalar("lr", self.learning_rate)
                elif params.optimizer.lower() == "adam":
                    opt = tf.train.AdamOptimizer(self.learning_rate)
                    tf.summary.scalar("lr", self.learning_rate)
                else:
                    raise ValueError('Unknown optimizer: ' + params.optimizer)

                # Gradients
                gradients = tf.gradients(
                    self.train_loss,
                    trainables,
                    colocate_gradients_with_ops=True)

                clipped_grads, grad_norm = tf.clip_by_global_norm(gradients, params.max_gradient_norm)
                grad_norm_summary = [tf.summary.scalar("grad_norm", grad_norm)]
                grad_norm_summary.append(
                    tf.summary.scalar("clipped_gradient", tf.global_norm(clipped_grads)))

                self.grad_norm = grad_norm

                self.update = opt.apply_gradients(
                    zip(clipped_grads, trainables), global_step=self.global_step)

                # Summary
                self.train_summary = tf.summary.merge([
                                                          tf.summary.scalar("lr", self.learning_rate),
                                                          tf.summary.scalar("train_loss", self.train_loss),
                                                      ] + grad_norm_summary)

            if mode == tf.contrib.learn.ModeKeys.INFER:
                self.infer_logits, self.sample_id = logits, sample_id
                self.infer_summary = tf.no_op()

            # Saver
            self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=3)

            # Print trainable variables
            if log_trainables:
                log.print_out("# Trainable variables")
                for trainable in trainables:
                    log.print_out("  %s, %s, %s" % (trainable.name, str(trainable.get_shape()),
                                                    trainable.op.device))
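The decoder build (__build_decoder) that consumes sampling_probability is not shown above. Scheduled sampling in tf.contrib.seq2seq is typically wired with ScheduledEmbeddingTrainingHelper; a sketch with made-up shapes:

import tensorflow as tf

batch, max_time, embed_dim, vocab = 4, 7, 32, 100
embedding = tf.get_variable("emb", [vocab, embed_dim])
decoder_emb_inp = tf.random_normal([batch, max_time, embed_dim])
lengths = tf.fill([batch], max_time)
helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
    inputs=decoder_emb_inp,
    sequence_length=lengths,
    embedding=embedding,
    sampling_probability=tf.constant(0.25))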
Example #19
 def __init__(self, name=None):
   super(Owner, self).__init__(name=name)
   self.first = self.track_layer(core.Dense(
       1, name="first_layer", use_bias=False))
Example #20
    def _test_minimize_loss_graph(self, task_type, task_id, num_gpus):
        d, master_target, sess_config = self._get_test_objects(
            task_type, task_id, num_gpus)
        if task_type:
            # Multi-worker
            assert hasattr(d.extended,
                           '_cluster_spec') and d.extended._cluster_spec
            num_workers = len(d.extended._cluster_spec.as_dict().get(WORKER))
            if CHIEF in d.extended._cluster_spec.as_dict():
                num_workers += 1
        else:
            # local
            num_workers = 1

        with ops.Graph().as_default(), \
             self.cached_session(target=master_target,
                                 config=sess_config) as sess, \
             d.scope():
            l = core.Dense(1, use_bias=False)

            def loss_fn(x):
                y = array_ops.reshape(l(x), []) - constant_op.constant(1.)
                return y * y

            # TODO(yuefengz, apassos): eager.backprop.implicit_grad is not safe for
            # multiple graphs (b/111216820).
            def grad_fn(x):
                loss = loss_fn(x)
                var_list = (variables.trainable_variables() +
                            ops.get_collection(
                                ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
                grads = gradients.gradients(loss, var_list)
                ret = list(zip(grads, var_list))
                return ret

            def update(v, g):
                return v.assign_sub(0.05 * g, use_locking=True)

            one = d.broadcast(constant_op.constant([[1.]]))

            def step():
                """Perform one optimization step."""
                # Run forward & backward to get gradients, variables list.
                g_v = d.call_for_each_replica(grad_fn, args=(one, ))
                # Update the variables using the gradients and the update() function.
                before_list = []
                after_list = []
                for g, v in g_v:
                    fetched = d.read_var(v)
                    before_list.append(fetched)
                    with ops.control_dependencies([fetched]):
                        # TODO(yuefengz): support non-Mirrored variable as destinations.
                        g = d.extended.reduce_to(reduce_util.ReduceOp.SUM,
                                                 g,
                                                 destinations=v)
                        with ops.control_dependencies(
                                d.update(v, update, g, grouped=False)):
                            after_list.append(d.read_var(v))
                return before_list, after_list

            before_out, after_out = step()

            if context.num_gpus() < d.extended._num_gpus_per_worker:
                return True

            if (not task_type or multi_worker_util.is_chief(
                    d.extended._cluster_spec, task_type, task_id)):
                variables.global_variables_initializer().run()

            # Workers wait for the chief worker to finish initializing variables.
            self._init_condition.acquire()
            self._init_reached += 1
            while self._init_reached != num_workers:
                self._init_condition.wait()
            self._init_condition.notify_all()
            self._init_condition.release()

            for i in range(10):
                b, a = sess.run((before_out, after_out))
                if i == 0:
                    before, = b
                after, = a

            error_before = abs(before - 1)
            error_after = abs(after - 1)
            # Error should go down
            self.assertLess(error_after, error_before)
            return error_after < error_before
Example #21
 def __init__(self, name=None):
   super(LikeUserButNotSharing, self).__init__(name=name)
   self.first = self.track_layer(core.Dense(
       1, name="first_layer", use_bias=False))
   self.second = self.track_layer(core.Dense(
       1, name="second_layer", use_bias=False))
Example #22
    def build_mlp(self, hparams):
        hidden_word = []
        with tf.variable_scope("MLP_words") as scope:
            attention_W = layers_core.Dense(hparams.hidden_size,
                                            activation=tf.nn.relu,
                                            use_bias=False,
                                            name="attention_W")
            attention_V = layers_core.Dense(1,
                                            use_bias=False,
                                            name="attention_V")
            for q in [self.q1, self.q2]:
                weight = tf.nn.softmax(
                    tf.reduce_sum(
                        attention_V(attention_W(q['word_decoder_output'])),
                        -1))
                mask = tf.sequence_mask(q['words_len'],
                                        tf.shape(weight)[-1],
                                        dtype=tf.float32)
                weight = weight * mask
                weight = weight / (tf.reduce_sum(weight, -1)[:, None] +
                                   0.000001)
                context_hidden = tf.reduce_sum(
                    q['word_decoder_output'] * weight[:, :, None], 1)
                q['word_rep'] = context_hidden
        hidden_word = [
            self.q1['word_rep'], self.q2['word_rep'],
            self.q1['word_rep'] * self.q2['word_rep']
        ]

        hidden_word.append(self.q1['words_num'])

        with tf.variable_scope("MLP_chars") as scope:
            attention_W = layers_core.Dense(hparams.hidden_size,
                                            activation=tf.nn.relu,
                                            use_bias=False,
                                            name="attention_W")
            attention_V = layers_core.Dense(1,
                                            use_bias=False,
                                            name="attention_V")
            for q in [self.q1, self.q2]:
                weight = tf.nn.softmax(
                    tf.reduce_sum(
                        attention_V(attention_W(q['char_decoder_output'])),
                        -1))
                mask = tf.sequence_mask(q['chars_len'],
                                        tf.shape(weight)[-1],
                                        dtype=tf.float32)
                weight = weight * mask
                weight = weight / (tf.reduce_sum(weight, -1)[:, None] +
                                   0.000001)
                context_hidden = tf.reduce_sum(
                    q['char_decoder_output'] * weight[:, :, None], 1)
                q['char_rep'] = context_hidden
        hidden_char = [
            self.q1['char_rep'], self.q2['char_rep'],
            self.q1['char_rep'] * self.q2['char_rep']
        ]

        hidden_char.append(self.q1['chars_num'])

        with tf.variable_scope("MLP_words") as scope:
            layer_W = layers_core.Dense(hparams.hidden_size,
                                        activation=tf.nn.tanh,
                                        use_bias=False,
                                        name="ff_layer")
            hidden_word = tf.concat(hidden_word, -1)
            logits = layer_W(hidden_word)
            if hparams.dropout > 0.0 and self.mode == tf.contrib.learn.ModeKeys.TRAIN:
                logits = tf.nn.dropout(logits, 1 - hparams.dropout)
            layer_W = layers_core.Dense(1,
                                        use_bias=False,
                                        name="ff_layer_output")
            logits_word = layer_W(logits)[:, 0]
        with tf.variable_scope("MLP_chars") as scope:
            layer_W = layers_core.Dense(hparams.hidden_size,
                                        activation=tf.nn.tanh,
                                        use_bias=False,
                                        name="ff_layer")
            hidden_char = tf.concat(hidden_char, -1)
            logits = layer_W(hidden_char)
            if hparams.dropout > 0.0 and self.mode == tf.contrib.learn.ModeKeys.TRAIN:
                logits = tf.nn.dropout(logits, 1 - hparams.dropout)
            layer_W = layers_core.Dense(1,
                                        use_bias=False,
                                        name="ff_layer_output")
            logits_char = layer_W(logits)[:, 0]
        logits = logits_word + logits_char
        return logits
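A standalone sketch of the masked attention pooling used twice in build_mlp above: per-token scores -> softmax -> zero out padded positions -> renormalize -> weighted sum over time (shapes here are made up):

import tensorflow as tf

scores = tf.random_normal([2, 5])                    # [batch, time] attention logits
states = tf.random_normal([2, 5, 8])                 # [batch, time, hidden]
lengths = tf.constant([3, 5])
weight = tf.nn.softmax(scores)
mask = tf.sequence_mask(lengths, tf.shape(weight)[-1], dtype=tf.float32)
weight = weight * mask
weight = weight / (tf.reduce_sum(weight, -1)[:, None] + 1e-6)
pooled = tf.reduce_sum(states * weight[:, :, None], 1)   # [batch, hidden]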
Example #23
 def __init__(self, name=None):
   super(MyNetwork, self).__init__(name=name)
   self.l1 = self.track_layer(core.Dense(1, use_bias=False))
Example #24
    def _test_minimize_loss_graph(self, task_type, task_id, num_gpus):
        d, master_target = self._get_test_object(task_type, task_id, num_gpus)
        with ops.Graph().as_default(), \
             self.test_session(config=self._sess_config,
                               target=master_target) as sess, \
             d.scope():
            l = core.Dense(1,
                           use_bias=False,
                           name='gpu_%d' % d._num_gpus_per_worker)

            def loss_fn(x):
                y = array_ops.reshape(l(x), []) - constant_op.constant(1.)
                return y * y

            # TODO(yuefengz, apassos): eager.backprop.implicit_grad is not safe for
            # multiple graphs (b/111216820).
            def grad_fn(x):
                loss = loss_fn(x)
                var_list = (variables.trainable_variables() +
                            ops.get_collection(
                                ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
                grads = gradients.gradients(loss, var_list)
                ret = list(zip(grads, var_list))
                return ret

            def update(v, g):
                return v.assign_sub(0.05 * g, use_locking=True)

            one = d.broadcast(constant_op.constant([[1.]]))

            def step():
                """Perform one optimization step."""
                # Run forward & backward to get gradients, variables list.
                g_v = d.call_for_each_tower(grad_fn, one)
                # Update the variables using the gradients and the update() function.
                before_list = []
                after_list = []
                for g, v in g_v:
                    fetched = d.read_var(v)
                    before_list.append(fetched)
                    with ops.control_dependencies([fetched]):
                        # TODO(yuefengz): support non-Mirrored variable as destinations.
                        g = d.reduce(variable_scope.VariableAggregation.SUM,
                                     g,
                                     destinations=v)
                        with ops.control_dependencies(
                                d.unwrap(d.update(v, update, g))):
                            after_list.append(d.read_var(v))
                return before_list, after_list

            before_out, after_out = step()

            if context.num_gpus() < d._num_gpus_per_worker:
                return True

            sess.run(variables.global_variables_initializer(),
                     options=self._run_options)

            for i in range(10):
                b, a = sess.run((before_out, after_out),
                                options=self._run_options)
                if i == 0:
                    before, = b
                after, = a

            error_before = abs(before - 1)
            error_after = abs(after - 1)
            # Error should go down
            self.assertLess(error_after, error_before)
            return error_after < error_before
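Stripped of the DistributionStrategy machinery, the test above is simply fitting a single bias-free Dense(1) so that its output on x = [[1.]] approaches 1. A minimal single-device sketch of that loop (an illustration, not the test's actual code path):

import tensorflow as tf

x = tf.constant([[1.]])
l = tf.layers.Dense(1, use_bias=False)
loss = tf.square(tf.reshape(l(x), []) - 1.)
train_op = tf.train.GradientDescentOptimizer(0.05).minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    error_before = sess.run(loss)
    for _ in range(10):
        sess.run(train_op)
    error_after = sess.run(loss)
    # The error should go down over the ten steps.
    assert error_after <= error_before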
Example #25
    def __init__(self):
        super(ParentNetwork, self).__init__()
        self.first = self.track_layer(
            core.Dense(1, use_bias=False, name="explicit_name"))
    def _testStepWithScheduledOutputTrainingHelper(self, sampling_probability,
                                                   use_next_input_layer,
                                                   use_auxiliary_inputs):
        sequence_length = [3, 4, 3, 1, 0]
        batch_size = 5
        max_time = 8
        input_depth = 7
        cell_depth = input_depth
        if use_next_input_layer:
            cell_depth = 6
        if use_auxiliary_inputs:
            auxiliary_input_depth = 4
            auxiliary_inputs = np.random.randn(
                batch_size, max_time, auxiliary_input_depth).astype(np.float32)
        else:
            auxiliary_inputs = None

        with self.test_session() as sess:
            inputs = np.random.randn(batch_size, max_time,
                                     input_depth).astype(np.float32)
            cell = core_rnn_cell.LSTMCell(cell_depth)
            sampling_probability = constant_op.constant(sampling_probability)

            next_input_layer = None
            if use_next_input_layer:
                next_input_layer = layers_core.Dense(input_depth,
                                                     use_bias=False)

            helper = helper_py.ScheduledOutputTrainingHelper(
                inputs=inputs,
                sequence_length=sequence_length,
                sampling_probability=sampling_probability,
                time_major=False,
                next_input_layer=next_input_layer,
                auxiliary_inputs=auxiliary_inputs)

            my_decoder = basic_decoder.BasicDecoder(
                cell=cell,
                helper=helper,
                initial_state=cell.zero_state(dtype=dtypes.float32,
                                              batch_size=batch_size))

            output_size = my_decoder.output_size
            output_dtype = my_decoder.output_dtype
            self.assertEqual(
                basic_decoder.BasicDecoderOutput(cell_depth,
                                                 tensor_shape.TensorShape([])),
                output_size)
            self.assertEqual(
                basic_decoder.BasicDecoderOutput(dtypes.float32, dtypes.int32),
                output_dtype)

            (first_finished, first_inputs,
             first_state) = my_decoder.initialize()
            (step_outputs, step_state, step_next_inputs,
             step_finished) = my_decoder.step(constant_op.constant(0),
                                              first_inputs, first_state)

            if use_next_input_layer:
                output_after_next_input_layer = next_input_layer(
                    step_outputs.rnn_output)

            batch_size_t = my_decoder.batch_size

            self.assertTrue(
                isinstance(first_state, core_rnn_cell.LSTMStateTuple))
            self.assertTrue(
                isinstance(step_state, core_rnn_cell.LSTMStateTuple))
            self.assertTrue(
                isinstance(step_outputs, basic_decoder.BasicDecoderOutput))
            self.assertEqual((batch_size, cell_depth),
                             step_outputs[0].get_shape())
            self.assertEqual((batch_size, ), step_outputs[1].get_shape())
            self.assertEqual((batch_size, cell_depth),
                             first_state[0].get_shape())
            self.assertEqual((batch_size, cell_depth),
                             first_state[1].get_shape())
            self.assertEqual((batch_size, cell_depth),
                             step_state[0].get_shape())
            self.assertEqual((batch_size, cell_depth),
                             step_state[1].get_shape())

            sess.run(variables.global_variables_initializer())

            fetches = {
                "batch_size": batch_size_t,
                "first_finished": first_finished,
                "first_inputs": first_inputs,
                "first_state": first_state,
                "step_outputs": step_outputs,
                "step_state": step_state,
                "step_next_inputs": step_next_inputs,
                "step_finished": step_finished
            }
            if use_next_input_layer:
                fetches[
                    "output_after_next_input_layer"] = output_after_next_input_layer

            sess_results = sess.run(fetches)

            self.assertAllEqual([False, False, False, False, True],
                                sess_results["first_finished"])
            self.assertAllEqual([False, False, False, True, True],
                                sess_results["step_finished"])

            sample_ids = sess_results["step_outputs"].sample_id
            batch_where_not_sampling = np.where(np.logical_not(sample_ids))
            batch_where_sampling = np.where(sample_ids)

            auxiliary_inputs_to_concat = (
                auxiliary_inputs[:, 1] if use_auxiliary_inputs else np.array(
                    []).reshape(batch_size, 0).astype(np.float32))

            expected_next_sampling_inputs = np.concatenate(
                (sess_results["output_after_next_input_layer"]
                 [batch_where_sampling] if use_next_input_layer else
                 sess_results["step_outputs"].rnn_output[batch_where_sampling],
                 auxiliary_inputs_to_concat[batch_where_sampling]),
                axis=-1)
            self.assertAllClose(
                sess_results["step_next_inputs"][batch_where_sampling],
                expected_next_sampling_inputs)

            self.assertAllClose(
                sess_results["step_next_inputs"][batch_where_not_sampling],
                np.concatenate(
                    (np.squeeze(inputs[batch_where_not_sampling, 1], axis=0),
                     auxiliary_inputs_to_concat[batch_where_not_sampling]),
                    axis=-1))
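For reference, a minimal sketch of scheduled output sampling via the public contrib API; only the arguments shown here are relied on, and anything else about the signature should be treated as version-dependent. With probability sampling_probability the cell's own output is fed back as the next input instead of the ground-truth time step.

import numpy as np
import tensorflow as tf

batch_size, max_time, depth = 5, 8, 7
inputs = tf.constant(
    np.random.randn(batch_size, max_time, depth).astype(np.float32))
# Cell depth matches input depth so outputs can be fed back without an extra
# projection layer.
cell = tf.nn.rnn_cell.LSTMCell(depth)
helper = tf.contrib.seq2seq.ScheduledOutputTrainingHelper(
    inputs=inputs,
    sequence_length=[3, 4, 3, 1, 0],
    sampling_probability=0.5)
decoder = tf.contrib.seq2seq.BasicDecoder(
    cell, helper, initial_state=cell.zero_state(batch_size, tf.float32))
outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder)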
Example #27
    def __init__(self):
        super(NetworkWithLayerChildren, self).__init__()
        self.first = self.track_layer(core.Dense(1, use_bias=False))
        self.second = self.track_layer(core.Dense(1, use_bias=False))
    def _testStepWithTrainingHelper(self, use_output_layer):
        sequence_length = [3, 4, 3, 1, 0]
        batch_size = 5
        max_time = 8
        input_depth = 7
        cell_depth = 10
        output_layer_depth = 3

        with self.test_session() as sess:
            inputs = np.random.randn(batch_size, max_time,
                                     input_depth).astype(np.float32)
            cell = core_rnn_cell.LSTMCell(cell_depth)
            helper = helper_py.TrainingHelper(inputs,
                                              sequence_length,
                                              time_major=False)
            if use_output_layer:
                output_layer = layers_core.Dense(output_layer_depth,
                                                 use_bias=False)
                expected_output_depth = output_layer_depth
            else:
                output_layer = None
                expected_output_depth = cell_depth
            my_decoder = basic_decoder.BasicDecoder(
                cell=cell,
                helper=helper,
                initial_state=cell.zero_state(dtype=dtypes.float32,
                                              batch_size=batch_size),
                output_layer=output_layer)
            output_size = my_decoder.output_size
            output_dtype = my_decoder.output_dtype
            self.assertEqual(
                basic_decoder.BasicDecoderOutput(expected_output_depth,
                                                 tensor_shape.TensorShape([])),
                output_size)
            self.assertEqual(
                basic_decoder.BasicDecoderOutput(dtypes.float32, dtypes.int32),
                output_dtype)

            (first_finished, first_inputs,
             first_state) = my_decoder.initialize()
            (step_outputs, step_state, step_next_inputs,
             step_finished) = my_decoder.step(constant_op.constant(0),
                                              first_inputs, first_state)
            batch_size_t = my_decoder.batch_size

            self.assertTrue(
                isinstance(first_state, core_rnn_cell.LSTMStateTuple))
            self.assertTrue(
                isinstance(step_state, core_rnn_cell.LSTMStateTuple))
            self.assertTrue(
                isinstance(step_outputs, basic_decoder.BasicDecoderOutput))
            self.assertEqual((batch_size, expected_output_depth),
                             step_outputs[0].get_shape())
            self.assertEqual((batch_size, ), step_outputs[1].get_shape())
            self.assertEqual((batch_size, cell_depth),
                             first_state[0].get_shape())
            self.assertEqual((batch_size, cell_depth),
                             first_state[1].get_shape())
            self.assertEqual((batch_size, cell_depth),
                             step_state[0].get_shape())
            self.assertEqual((batch_size, cell_depth),
                             step_state[1].get_shape())

            if use_output_layer:
                # The output layer was accessed
                self.assertEqual(len(output_layer.variables), 1)

            sess.run(variables.global_variables_initializer())
            sess_results = sess.run({
                "batch_size": batch_size_t,
                "first_finished": first_finished,
                "first_inputs": first_inputs,
                "first_state": first_state,
                "step_outputs": step_outputs,
                "step_state": step_state,
                "step_next_inputs": step_next_inputs,
                "step_finished": step_finished
            })

            self.assertAllEqual([False, False, False, False, True],
                                sess_results["first_finished"])
            self.assertAllEqual([False, False, False, True, True],
                                sess_results["step_finished"])
            self.assertAllEqual(
                np.argmax(sess_results["step_outputs"].rnn_output, -1),
                sess_results["step_outputs"].sample_id)
Example #29
    def __init__(self,
                 hparams,
                 mode,
                 iterator,
                 source_vocab_table,
                 target_vocab_table,
                 reverse_target_vocab_table=None,
                 scope=None,
                 extra_args=None):
        """Create the model.

        Args:
          hparams: Hyperparameter configurations.
          mode: TRAIN | EVAL | INFER
          iterator: Dataset Iterator that feeds data.
          source_vocab_table: Lookup table mapping source words to ids.
          target_vocab_table: Lookup table mapping target words to ids.
          reverse_target_vocab_table: Lookup table mapping ids to target words. Only
            required in INFER mode. Defaults to None.
          scope: scope of the model.
          extra_args: model_helper.ExtraArgs, for passing customizable functions.

        """
        assert isinstance(iterator, iterator_utils.BatchedInput)
        self.iterator = iterator
        self.mode = mode
        self.src_vocab_table = source_vocab_table
        self.tgt_vocab_table = target_vocab_table

        self.src_vocab_size = hparams.src_vocab_size
        self.tgt_vocab_size = hparams.tgt_vocab_size
        self.num_layers = hparams.num_layers
        self.num_gpus = hparams.num_gpus
        self.time_major = hparams.time_major

        # extra_args: allows externally defined functions (e.g. single_cell_fn)
        # to be plugged into the model.
        self.single_cell_fn = None
        if extra_args:
            self.single_cell_fn = extra_args.single_cell_fn

        # Initializer
        initializer = model_helper.get_initializer(
            hparams.init_op, hparams.random_seed, hparams.init_weight)
        tf.get_variable_scope().set_initializer(initializer)

        # Embeddings
        # TODO(zsy): embeddings on ps
        if hparams.job_name:
            ps_spec = hparams.ps_hosts.split(",")
            worker_spec = hparams.worker_hosts.split(",")

            cluster = tf.train.ClusterSpec({
                "ps": ps_spec,
                "worker": worker_spec})
            with tf.device(tf.train.replica_device_setter(cluster=cluster)):
                self.init_embeddings(hparams, scope)
        else:
            self.init_embeddings(hparams, scope)
        self.batch_size = tf.size(self.iterator.source_sequence_length)

        # Projection
        # TODO(zsy): `cluster` is only defined when hparams.job_name is set,
        # so guard the replica_device_setter the same way as the embeddings.
        projection_device = (tf.train.replica_device_setter(cluster=cluster)
                             if hparams.job_name else None)
        with tf.device(projection_device):
            with tf.variable_scope(scope or "build_network"):
                with tf.variable_scope("decoder/output_projection"):
                    self.output_layer = layers_core.Dense(
                        hparams.tgt_vocab_size, use_bias=False,
                        name="output_projection")

        # Train graph
        res = self.build_graph(hparams, scope=scope)

        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            self.train_loss = res[1]
            self.word_count = tf.reduce_sum(
                self.iterator.source_sequence_length) + tf.reduce_sum(
                self.iterator.target_sequence_length)
        elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
            self.eval_loss = res[1]
        elif self.mode == tf.contrib.learn.ModeKeys.INFER:
            self.infer_logits, _, self.final_context_state, self.sample_id = res
            self.sample_words = reverse_target_vocab_table.lookup(
                tf.to_int64(self.sample_id))

        if self.mode != tf.contrib.learn.ModeKeys.INFER:
            # Count the number of predicted words for computing ppl.
            self.predict_count = tf.reduce_sum(
                self.iterator.target_sequence_length)

        # TODO(zsy): same device-placement guard as the projection above.
        with tf.device(tf.train.replica_device_setter(cluster=cluster)
                       if hparams.job_name else None):
            self.global_step = tf.Variable(0, trainable=False)
        params = tf.trainable_variables()

        # Gradients and SGD update operation for training the model.
        # Arrange for the embedding vars to appear at the beginning.
        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            self.learning_rate = tf.constant(hparams.learning_rate)
            # warm-up
            self.learning_rate = self._get_learning_rate_warmup(hparams)
            # decay
            self.learning_rate = self._get_learning_rate_decay(hparams)

            # Optimizer
            if hparams.optimizer == "sgd":
                opt = tf.train.GradientDescentOptimizer(self.learning_rate)
                tf.summary.scalar("lr", self.learning_rate)
            elif hparams.optimizer == "adam":
                opt = tf.train.AdamOptimizer(self.learning_rate)

            # TODO(zsy): SyncReplicasOptimizer
            if hparams.sync_replicas:
                worker_spec = hparams.worker_hosts.split(",")
                if hparams.replicas_to_aggregate:
                    replicas_to_aggregate = hparams.replicas_to_aggregate
                else:
                    replicas_to_aggregate = len(worker_spec)

                opt = tf.train.SyncReplicasOptimizer(
                    opt,
                    replicas_to_aggregate=replicas_to_aggregate,
                    total_num_replicas=len(worker_spec))

            # Gradients
            gradients = tf.gradients(
                self.train_loss,
                params,
                colocate_gradients_with_ops=hparams.colocate_gradients_with_ops)

            clipped_grads, grad_norm_summary, grad_norm = model_helper.gradient_clip(
                gradients, max_gradient_norm=hparams.max_gradient_norm)
            self.grad_norm = grad_norm

            self.update = opt.apply_gradients(
                zip(clipped_grads, params), global_step=self.global_step)

            # TODO(zsy): SyncReplicasOptimizer init op
            if hparams.sync_replicas:
                self.local_init_op = opt.local_step_init_op
                self.chief_local_init_op = opt.chief_init_op
                self.ready_for_local_init_op = opt.ready_for_local_init_op
                self.chief_queue_runner = opt.get_chief_queue_runner()
                self.sync_init_op = opt.get_init_tokens_op()

            # Summary
            self.train_summary = tf.summary.merge(
                [tf.summary.scalar("lr", self.learning_rate),
                 tf.summary.scalar("train_loss", self.train_loss)] +
                grad_norm_summary)

        if self.mode == tf.contrib.learn.ModeKeys.INFER:
            self.infer_summary = self._get_infer_summary(hparams)

        # Saver
        self.saver = tf.train.Saver(
            tf.global_variables(), max_to_keep=hparams.num_keep_ckpts)

        # Print trainable variables
        utils.print_out("# Trainable variables")
        for param in params:
            utils.print_out("  %s, %s, %s" % (param.name, str(param.get_shape()),
                                              param.op.device))

        self.init_op = tf.group(tf.global_variables_initializer(), tf.tables_initializer())
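The training branch above follows the standard TF 1.x pattern: compute the gradients of the loss, clip them by global norm, and apply them while incrementing global_step. model_helper.gradient_clip is not shown in this example, so the sketch below calls tf.clip_by_global_norm directly, which is an assumption about what that helper wraps; the loss and variable are placeholders.

import tensorflow as tf

x = tf.constant([[1., 2.]])
w = tf.get_variable("w", shape=[2, 1])
loss = tf.reduce_sum(tf.square(tf.matmul(x, w)))

global_step = tf.Variable(0, trainable=False)
params = tf.trainable_variables()
gradients = tf.gradients(loss, params)
clipped_grads, grad_norm = tf.clip_by_global_norm(gradients, clip_norm=5.0)
opt = tf.train.GradientDescentOptimizer(1.0)
update = opt.apply_gradients(zip(clipped_grads, params),
                             global_step=global_step)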
Example #30
    def testCallTensorDot(self):
        dense = core_layers.Dense(2, activation=nn_ops.relu, name='my_dense')
        inputs = random_ops.random_uniform((5, 4, 3), seed=1)
        outputs = dense(inputs)
        self.assertListEqual([5, 4, 2], outputs.get_shape().as_list())
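For inputs with rank greater than 2, Dense contracts only the last axis with its kernel, leaving the leading dimensions untouched. A small sketch of that behavior; the explicit tensordot is an illustrative re-computation of what the layer does internally, not code from the test above.

import tensorflow as tf

x = tf.random_uniform((5, 4, 3))
dense = tf.layers.Dense(2, activation=tf.nn.relu)
y = dense(x)  # shape (5, 4, 2): only the last axis is transformed
y_ref = tf.nn.relu(
    tf.tensordot(x, dense.kernel, axes=[[2], [0]]) + dense.bias)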