def _testDynamicDecodeRNN(self, time_major, has_attention,
                          with_alignment_history=False):
    encoder_sequence_length = np.array([3, 2, 3, 1, 1])
    decoder_sequence_length = np.array([2, 0, 1, 2, 3])
    batch_size = 5
    decoder_max_time = 4
    input_depth = 7
    cell_depth = 9
    attention_depth = 6
    vocab_size = 20
    end_token = vocab_size - 1
    start_token = 0
    embedding_dim = 50
    max_out = max(decoder_sequence_length)
    output_layer = layers_core.Dense(vocab_size, use_bias=True, activation=None)
    beam_width = 3

    with self.cached_session() as sess:
        batch_size_tensor = constant_op.constant(batch_size)
        embedding = np.random.randn(vocab_size, embedding_dim).astype(np.float32)
        cell = rnn_cell.LSTMCell(cell_depth)
        initial_state = cell.zero_state(batch_size, dtypes.float32)
        coverage_penalty_weight = 0.0
        if has_attention:
            coverage_penalty_weight = 0.2
            inputs = array_ops.placeholder_with_default(
                np.random.randn(batch_size, decoder_max_time,
                                input_depth).astype(np.float32),
                shape=(None, None, input_depth))
            tiled_inputs = beam_search_decoder.tile_batch(
                inputs, multiplier=beam_width)
            tiled_sequence_length = beam_search_decoder.tile_batch(
                encoder_sequence_length, multiplier=beam_width)
            attention_mechanism = attention_wrapper.BahdanauAttention(
                num_units=attention_depth,
                memory=tiled_inputs,
                memory_sequence_length=tiled_sequence_length)
            initial_state = beam_search_decoder.tile_batch(
                initial_state, multiplier=beam_width)
            cell = attention_wrapper.AttentionWrapper(
                cell=cell,
                attention_mechanism=attention_mechanism,
                attention_layer_size=attention_depth,
                alignment_history=with_alignment_history)
        cell_state = cell.zero_state(
            dtype=dtypes.float32, batch_size=batch_size_tensor * beam_width)
        if has_attention:
            cell_state = cell_state.clone(cell_state=initial_state)
        bsd = beam_search_decoder.BeamSearchDecoder(
            cell=cell,
            embedding=embedding,
            start_tokens=array_ops.fill([batch_size_tensor], start_token),
            end_token=end_token,
            initial_state=cell_state,
            beam_width=beam_width,
            output_layer=output_layer,
            length_penalty_weight=0.0,
            coverage_penalty_weight=coverage_penalty_weight)
        final_outputs, final_state, final_sequence_lengths = (
            decoder.dynamic_decode(
                bsd, output_time_major=time_major, maximum_iterations=max_out))

        def _t(shape):
            if time_major:
                return (shape[1], shape[0]) + shape[2:]
            return shape

        self.assertIsInstance(
            final_outputs, beam_search_decoder.FinalBeamSearchDecoderOutput)
        self.assertIsInstance(
            final_state, beam_search_decoder.BeamSearchDecoderState)

        beam_search_decoder_output = final_outputs.beam_search_decoder_output
        self.assertEqual(
            _t((batch_size, None, beam_width)),
            tuple(beam_search_decoder_output.scores.get_shape().as_list()))
        self.assertEqual(
            _t((batch_size, None, beam_width)),
            tuple(final_outputs.predicted_ids.get_shape().as_list()))

        sess.run(variables.global_variables_initializer())
        sess_results = sess.run({
            'final_outputs': final_outputs,
            'final_state': final_state,
            'final_sequence_lengths': final_sequence_lengths
        })

        max_sequence_length = np.max(sess_results['final_sequence_lengths'])

        # A smoke test
        self.assertEqual(
            _t((batch_size, max_sequence_length, beam_width)),
            sess_results['final_outputs'].beam_search_decoder_output.scores.shape)
        self.assertEqual(
            _t((batch_size, max_sequence_length, beam_width)),
            sess_results['final_outputs'].beam_search_decoder_output.predicted_ids.shape)
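Distilled from the test above, a standalone sketch of the beam-search wiring it exercises: tile the attention memory and lengths by the beam width, wrap the cell with attention, then decode with `BeamSearchDecoder`. The names `memory`, `memory_lengths`, `embedding`, and the sizes are illustrative assumptions, not taken from the source.

```python
# Sketch of the beam-search setup exercised above (tf.contrib.seq2seq, TF 1.x).
# `memory`, `memory_lengths`, `embedding`, and the sizes are illustrative.
beam_width = 3
tiled_memory = tf.contrib.seq2seq.tile_batch(memory, multiplier=beam_width)
tiled_lengths = tf.contrib.seq2seq.tile_batch(memory_lengths, multiplier=beam_width)

attention = tf.contrib.seq2seq.BahdanauAttention(
    num_units=attention_depth, memory=tiled_memory,
    memory_sequence_length=tiled_lengths)
cell = tf.contrib.seq2seq.AttentionWrapper(
    tf.nn.rnn_cell.LSTMCell(cell_depth), attention,
    attention_layer_size=attention_depth)

decoder = tf.contrib.seq2seq.BeamSearchDecoder(
    cell=cell, embedding=embedding,
    start_tokens=tf.fill([batch_size], start_token), end_token=end_token,
    initial_state=cell.zero_state(batch_size * beam_width, tf.float32),
    beam_width=beam_width,
    output_layer=tf.layers.Dense(vocab_size))
outputs, state, lengths = tf.contrib.seq2seq.dynamic_decode(
    decoder, maximum_iterations=max_out)
```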
def __init__(self):
    super(Dense, self).__init__()
    self.first = self.track_layer(core.Dense(1, use_bias=False))
def __init__(self, training, tokenized_data, batch_input, scope=None): """ Create the model. Args: training: A boolean value to indicate whether this model will be used for training. tokenized_data: The data object containing all information required for the model. scope: scope of the model. """ self.training = training self.batch_input = batch_input self.vocab_table = tokenized_data.vocab_table self.vocab_size = tokenized_data.vocab_size self.reverse_vocab_table = tokenized_data.reverse_vocab_table hparams = tokenized_data.hparams self.hparams = hparams self.num_layers = hparams.num_layers self.num_gpus = hparams.num_gpus self.time_major = hparams.time_major # Initializer initializer = model_helper.get_initializer(hparams.init_op, hparams.random_seed, hparams.init_weight) tf.get_variable_scope().set_initializer(initializer) # Embeddings self.embedding = (model_helper.create_embbeding( vocab_size=self.vocab_size, embed_size=hparams.num_units, scope=scope)) # This batch_size might vary among each batch instance due to the bucketing and/or reach # the end of the training set. Treat it as size_of_the_batch. self.batch_size = tf.size(self.batch_input.source_sequence_length) # Projection with tf.variable_scope(scope or "build_network"): with tf.variable_scope("decoder/output_projection"): self.output_layer = layers_core.Dense(self.vocab_size, use_bias=False, name="output_projection") # Training or inference graph print("# Building graph for the model ...") res = self.build_graph(hparams, scope=scope) if training: self.train_loss = res[1] self.word_count = tf.reduce_sum(self.batch_input.source_sequence_length) + \ tf.reduce_sum(self.batch_input.target_sequence_length) # Count the number of predicted words for compute perplexity. self.predict_count = tf.reduce_sum( self.batch_input.target_sequence_length) else: self.infer_logits, _, self.final_context_state, self.sample_id = res self.sample_words = self.reverse_vocab_table.lookup( tf.to_int64(self.sample_id)) self.global_step = tf.Variable(0, trainable=False) params = tf.trainable_variables() # Gradients update operation for training the model. if training: self.learning_rate = tf.placeholder(tf.float32, shape=[], name='learning_rate') opt = tf.train.AdamOptimizer(self.learning_rate) gradients = tf.gradients(self.train_loss, params, colocate_gradients_with_ops=hparams. colocate_gradients_with_ops) clipped_gradients, gradient_norm_summary = model_helper.gradient_clip( gradients, max_gradient_norm=hparams.max_gradient_norm) self.update = opt.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step) # Summary self.train_summary = tf.summary.merge([ tf.summary.scalar("learning_rate", self.learning_rate), tf.summary.scalar("train_loss", self.train_loss), ] + gradient_norm_summary) if not training: self.infer_summary = tf.no_op() # Saver self.saver = tf.train.Saver(tf.global_variables()) # Print trainable variables if training: print("# Trainable variables:") for param in params: print(" {}, {}, {}".format(param.name, str(param.get_shape()), param.op.device))
def __init__(self, name=None):
    super(Parent, self).__init__(name=name)
    self.first = self.track_layer(first)
    self.second = self.track_layer(core.Dense(1, use_bias=False))
def __init__(self, name=None):
    super(Compatible, self).__init__(name=name)
    self.first = self.track_layer(core.Dense(1, use_bias=False))
def __init__(self):
    super(FirstNetwork, self).__init__()
    self.first = self.track_layer(shared_layer)
    self.second = self.track_layer(core.Dense(1, use_bias=False))
def __init__(self, use_layer, name=None):
    super(User, self).__init__(name=name)
    self.first = self.track_layer(use_layer)
    self.second = self.track_layer(core.Dense(
        1, name="second_layer", use_bias=False))
def __init__(self, rnn_cell, num_dims, num_hidden):
    self._num_dims = num_dims
    self._rnn_cell = rnn_cell
    self._fc_layer = tf_layers_core.Dense(units=num_dims + num_hidden)
    self._nade = Nade(num_dims, num_hidden)
def lstm_decoder_embedding(H, y, W_emb, opt, prefix = '', add_go = False, feed_previous=False, is_reuse= None, is_fed_h = True, is_sampling = False, is_softargmax = False, beam_width=None): #y len* batch * [0,V] H batch * h biasInit = tf.constant_initializer(0.001, dtype=tf.float32) #y = [tf.squeeze(y[:,i]) for i in xrange(y.get_shape()[1])] if add_go: y = tf.concat([tf.ones([opt.batch_size,1],dtype=tf.int32), y],1) y = tf.unstack(y, axis=1) # 1, . , . # make the size of hidden unit to be n_hid if not opt.additive_noise_lambda: H = layers.fully_connected(H, num_outputs = opt.n_hid, biases_initializer=biasInit, activation_fn = None, scope = prefix + 'lstm_decoder', reuse = is_reuse) H0 = tf.squeeze(H) H1 = (H0, tf.zeros_like(H0)) # initialize H and C # y_input = [tf.concat([tf.nn.embedding_lookup(W_emb, features),H0],1) for features in y] if is_fed_h \ else [tf.nn.embedding_lookup(W_emb, features) for features in y] with tf.variable_scope(prefix + 'lstm_decoder', reuse=True): cell = tf.contrib.rnn.LSTMCell(opt.n_hid) with tf.variable_scope(prefix + 'lstm_decoder', reuse=is_reuse): weightInit = tf.random_uniform_initializer(-0.001, 0.001) W = tf.get_variable('W', [opt.n_hid, opt.embed_size], initializer = weightInit) b = tf.get_variable('b', [opt.n_words], initializer = tf.random_uniform_initializer(-0.001, 0.001)) W_new = tf.matmul(W, W_emb, transpose_b=True) # h* V out_proj = (W_new,b) if feed_previous else None decoder_res = rnn_decoder_custom_embedding(emb_inp = y_input, initial_state = H1, cell = cell, embedding = W_emb, opt = opt, feed_previous = feed_previous, output_projection=out_proj, num_symbols = opt.n_words, is_fed_h = is_fed_h, is_softargmax = is_softargmax, is_sampling = is_sampling) outputs = decoder_res[0] if beam_width: #cell = rnn_cell.LSTMCell(cell_depth) #batch_size_tensor = constant_op.constant(opt.batch_size) initial_state = cell.zero_state(opt.batch_size* beam_width, tf.float32) #beam_search_decoder.tile_batch(H0, multiplier=beam_width) output_layer = layers_core.Dense(opt.n_words, use_bias=True, kernel_initializer = W_new, bias_initializer = b, activation=None) bsd = beam_search_decoder.BeamSearchDecoder( cell=cell, embedding=W_emb, start_tokens=array_ops.fill([opt.batch_size], dp.GO_ID), # go is 1 end_token=dp.EOS_ID, initial_state=initial_state, beam_width=beam_width, output_layer=output_layer, length_penalty_weight=0.0) #pdb.set_trace() final_outputs, final_state, final_sequence_lengths = ( decoder.dynamic_decode(bsd, output_time_major=False, maximum_iterations=opt.maxlen)) beam_search_decoder_output = final_outputs.beam_search_decoder_output #print beam_search_decoder_output.get_shape() logits = [nn_ops.xw_plus_b(out, W_new, b) for out in outputs] # hidden units to prob logits: out B*h W: h*E Wemb V*E if is_sampling: syn_sents = decoder_res[2] loss = sequence_loss(logits[:-1], syn_sents, [tf.cast(tf.ones_like(yy),tf.float32) for yy in syn_sents]) #loss = sequence_loss(logits[:-1], syn_sents, [tf.cast(tf.not_equal(yy,dp.PAD_ID),tf.float32) for yy in syn_sents]) #loss = sequence_loss(logits[:-1], syn_sents, [tf.concat([tf.ones([1]), tf.cast(tf.not_equal(yy,dp.PAD_ID),tf.float32)],0) for yy in syn_sents[:-1]]) # use one more pad after EOS syn_sents = tf.stack(syn_sents,1) else: syn_sents = [math_ops.argmax(l, 1) for l in logits] syn_sents = tf.stack(syn_sents,1) loss = sequence_loss(logits[:-1], y[1:], [tf.cast(tf.ones_like(yy),tf.float32) for yy in y[1:]]) #loss = sequence_loss(logits[:-1], y[1:], [tf.cast(tf.not_equal(yy,dp.PAD_ID),tf.float32) for yy in y[:-1]]) # 
use one more pad after EOS #outputs, _ = embedding_rnn_decoder(decoder_inputs = y, initial_state = H, cell = tf.contrib.rnn.BasicLSTMCell, num_symbols = opt.n_words, embedding_size = opt.embed_size, scope = prefix + 'lstm_decoder') # outputs : batch * len return loss, syn_sents, logits
def customized_slim_fully_connected( inputs, num_outputs, activation_fn=nn.relu, normalizer_fn=None, normalizer_params=None, weights_initializer=initializers.xavier_initializer(), weights_regularizer=None, biases_initializer=init_ops.zeros_initializer(), biases_regularizer=None, reuse=None, variables_collections=None, outputs_collections=None, trainable=True, scope=None, task_id=1): """Adds a sparse fully connected layer. The weight matrix is masked. `fully_connected` creates a variable called `weights`, representing a fully connected weight matrix, which is multiplied by the `inputs` to produce a `Tensor` of hidden units. If a `normalizer_fn` is provided (such as `batch_norm`), it is then applied. Otherwise, if `normalizer_fn` is None and a `biases_initializer` is provided then a `biases` variable would be created and added the hidden units. Finally, if `activation_fn` is not `None`, it is applied to the hidden units as well. Note: that if `inputs` have a rank greater than 2, then `inputs` is flattened prior to the initial matrix multiply by `weights`. Args: inputs: A tensor of at least rank 2 and static value for the last dimension; i.e. `[batch_size, depth]`, `[None, None, None, channels]`. num_outputs: Integer or long, the number of output units in the layer. activation_fn: Activation function. The default value is a ReLU function. Explicitly set it to None to skip it and maintain a linear activation. normalizer_fn: Normalization function to use instead of `biases`. If `normalizer_fn` is provided then `biases_initializer` and `biases_regularizer` are ignored and `biases` are not created nor added. default set to None for no normalizer function normalizer_params: Normalization function parameters. weights_initializer: An initializer for the weights. weights_regularizer: Optional regularizer for the weights. biases_initializer: An initializer for the biases. If None skip biases. biases_regularizer: Optional regularizer for the biases. reuse: Whether or not the layer and its variables should be reused. To be able to reuse the layer scope must be given. variables_collections: Optional list of collections for all the variables or a dictionary containing a different list of collections per variable. outputs_collections: Collection to add the outputs. trainable: If `True` also add variables to the graph collection `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). scope: Optional scope for variable_scope. Returns: The tensor variable representing the result of the series of operations. Raises: ValueError: If x has rank less than 2 or if its last dimension is not set. """ if not isinstance(num_outputs, six.integer_types): raise ValueError('num_outputs should be int or long, got %s.' % (num_outputs, )) layer_variable_getter = _build_variable_getter({ 'bias': 'biases', 'kernel': 'weights' }) with variable_scope.variable_scope( scope, 'FC', [inputs], reuse=reuse, custom_getter=layer_variable_getter) as sc: inputs = ops.convert_to_tensor(inputs) layer = core_layers.Dense(units=num_outputs, activation=None, use_bias=not normalizer_fn and biases_initializer, kernel_initializer=weights_initializer, bias_initializer=biases_initializer, kernel_regularizer=weights_regularizer, bias_regularizer=biases_regularizer, activity_regularizer=None, trainable=trainable, name=sc.name, dtype=inputs.dtype.base_dtype, _scope=sc, _reuse=reuse) outputs = layer.apply(inputs) # Add variables to collections. 
_add_variable_to_collections(layer.kernel, variables_collections, 'weights') if layer.bias is not None: _add_variable_to_collections(layer.bias, variables_collections, 'biases') # Apply normalizer function / layer. if normalizer_fn is not None: if not normalizer_params: normalizer_params = {} with tf.variable_scope('task_{}'.format( task_id)): # Because there are multi-task problems outputs = normalizer_fn(outputs, **normalizer_params) # outputs = normalizer_fn(outputs, **normalizer_params) if activation_fn is not None: outputs = activation_fn(outputs) return utils.collect_named_outputs(outputs_collections, sc.original_name_scope, outputs)
def __init__(self, hparams, mode): self.vocab_size = hparams.to_vocab_size self.emb_dim = hparams.emb_dim self.num_units = hparams.units self.num_layers = hparams.num_layers self.learning_rate = tf.Variable(float(hparams.learning_rate), trainable=False) self.clip_value = hparams.clip_value self.max_seq_length = 50 if mode != tf.contrib.learn.ModeKeys.INFER: self.decoder_input_ids = tf.placeholder(dtype=tf.int32, shape=[None, None]) self.decoder_input_length = tf.placeholder(dtype=tf.int32, shape=[None]) self.initial_state = tf.placeholder(dtype=tf.float32, shape=[None, None, None]) self.batch_size = tf.size(self.decoder_input_length) else: self.batch_size = 1 with tf.variable_scope("embedding") as scope: self.embeddings = tf.Variable( self.init_matrix([self.vocab_size, self.emb_dim])) with tf.variable_scope("projection") as scope: self.output_layer = layers_core.Dense(self.vocab_size) with tf.variable_scope("decoder") as scope: if self.num_layers > 1: decoder_cell = tf.contrib.rnn.MultiRNNCell([ tf.contrib.rnn.BasicLSTMCell(self.num_units) for _ in range(self.num_layers) ]) else: decoder_cell = tf.contrib.rnn.BasicLSTMCell(self.num_units) if mode != tf.contrib.learn.ModeKeys.INFER: initial_state = self.initial_state with tf.device("/cpu:0"): decoder_inputs = tf.nn.embedding_lookup( self.embeddings, self.decoder_input_ids) helper = tf.contrib.seq2seq.TrainingHelper( decoder_inputs, self.decoder_input_length) my_decoder = tf.contrib.seq2seq.BasicDecoder( cell=decoder_cell, helper=helper, initial_state=initial_state, output_layer=self.output_layer) decoder_outputs, decoder_state, decoder_output_len = tf.contrib.seq2seq.dynamic_decode( my_decoder, maximum_iterations=self.max_seq_length * 2, swap_memory=True, ) self.sample_id = decoder_outputs.sample_id self.logits = decoder_outputs.rnn_output else: helper = tf.contrib.seq2seq.GreedyEmbeddingHelper( self.embeddings, [hparams.GO_ID], hparams.EOS_ID) initial_state = self.initial_state my_decoder = tf.contrib.seq2seq.BasicDecoder( cell=decoder_cell, helper=helper, initial_state=initial_state, output_layer=self.output_layer) decoder_outputs, decoder_state, decoder_output_len = tf.contrib.seq2seq.dynamic_decode( my_decoder, maximum_iterations=self.max_seq_length * 2, swap_memory=True) self.sample_id = tf.unstack(decoder_outputs.sample_id, axis=0) with tf.variable_scope("rollout") as scope: self.given_decoder_inputs_ids = tf.placeholder(dtype=tf.int32, shape=[None, None]) self.given_decoder_length = tf.placeholder(dtype=tf.int32, shape=[None, None]) self.given_next_ids = tf.placeholder(dtype=tf.int32, shape=[None]) initial_state = self.initial_state with tf.device("/cpu:0"): given_decoder_inputs = tf.nn.embedding_lookup( self.embeddings, self.given_decoder_input_ids) helper1 = tf.contrib.seq2seq.TrainingHelper( decoder_inputs, self.decoder_input_length) my_decoder1 = tf.contrib.seq2seq.BasicDecoder( cell=decoder_cell, helper=helper, initial_state=initial_state, output_layer=self.output_layer) decoder1_outputs, decoder1_state, decoder1_output_len = tf.contrib.seq2seq.dynamic_decode( my_decoder1, maximum_iterations=self.max_seq_length * 2, swap_memory=True) helper2 = tf.contrib.seq2seq.SampleEmbeddingHelper( self.embeddings, self.given_next_ids, hparams.EOS_ID) my_decoder2 = tf.contrib.seq2seq.BasicDecoder( cell=decoder_cell, helper=helper, initial_state=decoder1_state, output_layer=self.output_layer) decoder2_outputs, decoder2_state, decoder2_output_len = tf.contrib.seq2seq.dynamic_decode( my_decoder2, maximum_iterations=self.max_seq_length * 2, 
swap_memory=True) if mode != tf.contrib.learn.ModeKeys.INFER: self.targets = tf.placeholder(dtype=tf.int32, shape=[None, None]) self.target_weights = tf.placeholder(dtype=tf.float32, shape=[None, None]) self.rewards = tf.placeholder(dtype=tf.float32, shape=[None, None]) with tf.variable_scope("loss") as scope: crossent = tf.nn.sparse_softmax_cross_entropy_with_logits( labels=self.targets, logits=self.logits) self.pretrain_loss = tf.reduce_sum( crossent * self.target_weights) / tf.to_float( self.batch_size) self.train_loss = tf.reduce_sum( crossent * self.rewards) / tf.to_float(self.batch_size) if mode == tf.contrib.learn.ModeKeys.TRAIN: self.pretrain_global_step = tf.Variable(0, trainable=False) self.train_global_step = tf.Variable(0, trainable=False) with tf.variable_scope("train_op") as scope: optimizer = tf.train.AdamOptimizer(self.learning_rate) pretrain_gradients, pretrain_v = zip( *optimizer.compute_gradients(self.pretrain_loss)) pretrain_gradients, _ = tf.clip_by_global_norm( pretrain_gradients, self.clip_value) self.pretrain_train_op = optimizer.apply_gradients( zip(pretrain_gradients, pretrain_v), global_step=self.pretrain_global_step) train_gradients, train_v = zip( *optimizer.compute_gradients(self.train_loss)) train_gradients, _ = tf.clip_by_global_norm( train_gradients, self.clip_value) self.train_op = optimizer.apply_gradients( zip(train_gradients, train_v), global_step=self.train_global_step) self.saver = tf.train.Saver(tf.global_variables())
def _build_projection(self):
    with tf.variable_scope("decoder/output_projection"):
        self.output_layer = layers_core.Dense(
            Config.data.vocab_size, use_bias=False, name="output_projection")
def __init__(self, hparams, mode, iterator, source_vocab_table, target_vocab_table, reverse_target_vocab_table=None, scope=None, extra_args=None): """Create the model. Args: hparams: Hyperparameter configurations. mode: TRAIN | EVAL | INFER iterator: Dataset Iterator that feeds data. source_vocab_table: Lookup table mapping source words to ids. target_vocab_table: Lookup table mapping target words to ids. reverse_target_vocab_table: Lookup table mapping ids to target words. Only required in INFER mode. Defaults to None. scope: scope of the model. extra_args: model_helper.ExtraArgs, for passing customizable functions. """ self.supports_monolingual = False self.has_KL = False self.mode = mode self.src_vocab_table = source_vocab_table self.tgt_vocab_table = target_vocab_table self.src_vocab_size = hparams.src_vocab_size self.tgt_vocab_size = hparams.tgt_vocab_size self.num_gpus = hparams.num_gpus self.time_major = hparams.time_major # extra_args: to make it flexible for adding external customizable code self.single_cell_fn = None if extra_args: self.single_cell_fn = extra_args.single_cell_fn # Set num layers self.num_encoder_layers = hparams.num_encoder_layers self.num_decoder_layers = hparams.num_decoder_layers assert self.num_encoder_layers assert self.num_decoder_layers # Set num residual layers if hasattr(hparams, "num_residual_layers"): # compatible common_test_utils self.num_encoder_residual_layers = hparams.num_residual_layers self.num_decoder_residual_layers = hparams.num_residual_layers else: self.num_encoder_residual_layers = hparams.num_encoder_residual_layers self.num_decoder_residual_layers = hparams.num_decoder_residual_layers # Initializer # initializer = model_helper.get_initializer( # hparams.init_op, hparams.random_seed, hparams.init_weight) # tf.get_variable_scope().set_initializer(initializer) # Embeddings self.init_embeddings(hparams, scope) assert isinstance(iterator, iterator_utils.BatchedInput) self._parse_iterator(iterator, hparams, scope=scope) # Projection with tf.variable_scope(scope or "build_network"): with tf.variable_scope("decoder/output_projection"): self.output_layer = layers_core.Dense( hparams.tgt_vocab_size, use_bias=False, name="output_projection") ## Train graph res = self.build_graph(hparams, scope=scope) self._logits = res[0] if self.mode == tf.contrib.learn.ModeKeys.TRAIN: self.train_loss = res[1] self.word_count = tf.reduce_sum( self.source_sequence_length) + tf.reduce_sum( self.target_sequence_length) elif self.mode == tf.contrib.learn.ModeKeys.EVAL: self.eval_loss = res[1] elif self.mode == tf.contrib.learn.ModeKeys.INFER: self.infer_logits, _, self.final_context_state, self.sample_id = res self.sample_words = reverse_target_vocab_table.lookup( tf.to_int64(self.sample_id)) if self.mode != tf.contrib.learn.ModeKeys.INFER: ## Count the number of predicted words for compute ppl. self.predict_count = tf.reduce_sum( self.target_sequence_length) self.global_step = tf.Variable(0, trainable=False) params = tf.trainable_variables() # Gradients and SGD update operation for training the model. # Arrage for the embedding vars to appear at the beginning. 
if self.mode == tf.contrib.learn.ModeKeys.TRAIN: self.learning_rate = tf.constant(hparams.learning_rate) # warm-up self.learning_rate = self._get_learning_rate_warmup(hparams) # decay self.learning_rate = self._get_learning_rate_decay(hparams) # Optimizer if hparams.optimizer == "sgd": opt = tf.train.GradientDescentOptimizer(self.learning_rate) tf.summary.scalar("lr", self.learning_rate) elif hparams.optimizer == "adam": opt = tf.train.AdamOptimizer(self.learning_rate) # Gradients gradients = tf.gradients( self.train_loss, params, colocate_gradients_with_ops=hparams.colocate_gradients_with_ops) clipped_grads, grad_norm_summary, grad_norm = model_helper.gradient_clip( gradients, max_gradient_norm=hparams.max_gradient_norm) self.grad_norm = grad_norm self.update = opt.apply_gradients( zip(clipped_grads, params), global_step=self.global_step) # Summary self._lr_summary = tf.summary.scalar("lr", self.learning_rate), self.train_summary = tf.summary.merge([ self._lr_summary, tf.summary.scalar("train_loss", self.train_loss), ] + grad_norm_summary) self._grad_norm_summary = grad_norm_summary if self.mode == tf.contrib.learn.ModeKeys.INFER: self.infer_summary = self._get_infer_summary(hparams) # Saver self.saver = tf.train.Saver( tf.global_variables(), max_to_keep=hparams.num_keep_ckpts) # Print trainable variables utils.print_out("# Trainable variables") for param in params: utils.print_out(" %s, %s, %s" % (param.name, str(param.get_shape()), param.op.device))
    dtype=tf.float32)
del encoder_outputs

start_tokens = np.array([0]).repeat(BATCH_SIZE)
end_token = -1
decoder_embedding = tf.Variable(
    tf.truncated_normal(shape=[ONEHOT_SIZE, ONEHOT_SIZE], stddev=0.1),
    name='decoder_embedding')

from tensorflow.python.layers import core
import tensorflow.contrib.layers as layers

output_layer = core.Dense(
    ONEHOT_SIZE,
    activation=tf.nn.relu,
    use_bias=True,
    name="output_projection",
    kernel_initializer=layers.variance_scaling_initializer(
        factor=1.0, uniform=True, seed=1))

decoder_cell = tf.contrib.rnn.LSTMCell(HIDDEN_SIZE)
helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
    decoder_embedding, start_tokens, end_token)
decoder = tf.contrib.seq2seq.BasicDecoder(
    decoder_cell, helper, encoder_final_state, output_layer=output_layer)
outputs, final_context_state = tf.contrib.seq2seq.dynamic_decode(
    decoder=decoder, maximum_iterations=TIMESTEPS_OUT, swap_memory=True)
def testNoEagerActivityRegularizer(self):
    with context.eager_mode():
        with self.assertRaisesRegexp(ValueError, 'activity_regularizer'):
            core_layers.Dense(
                1, activity_regularizer=lambda *args, **kwargs: 0.)
def masked_dense(inputs,
                 units,
                 num_blocks=None,
                 exclusive=False,
                 kernel_initializer=None,
                 reuse=None,
                 name=None,
                 *args,
                 **kwargs):
    """An autoregressively masked dense layer. Analogous to `tf.layers.dense`.

    See [1] for detailed explanation.

    [1]: "MADE: Masked Autoencoder for Distribution Estimation."
         Mathieu Germain, Karol Gregor, Iain Murray, Hugo Larochelle. ICML. 2015.
         https://arxiv.org/abs/1502.03509

    Arguments:
      inputs: Tensor input.
      units: Python `int` scalar representing the dimensionality of the output
        space.
      num_blocks: Python `int` scalar representing the number of blocks for the
        MADE masks.
      exclusive: Python `bool` scalar representing whether to zero the diagonal
        of the mask, used for the first layer of a MADE.
      kernel_initializer: Initializer function for the weight matrix. If `None`
        (default), weights are initialized using the
        `tf.glorot_random_initializer`.
      reuse: Python `bool` scalar representing whether to reuse the weights of a
        previous layer by the same name.
      name: Python `str` used to describe ops managed by this function.
      *args: `tf.layers.dense` arguments.
      **kwargs: `tf.layers.dense` keyword arguments.

    Returns:
      Output tensor.

    Raises:
      NotImplementedError: if rightmost dimension of `inputs` is unknown prior
        to graph execution.
    """
    # TODO(b/67594795): Better support of dynamic shape.
    input_depth = inputs.shape.with_rank_at_least(1)[-1].value
    if input_depth is None:
        raise NotImplementedError(
            "Rightmost dimension must be known prior to graph execution.")

    mask = _gen_mask(num_blocks, input_depth, units,
                     MASK_EXCLUSIVE if exclusive else MASK_INCLUSIVE).T

    if kernel_initializer is None:
        kernel_initializer = init_ops.glorot_normal_initializer()

    def masked_initializer(shape, dtype=None, partition_info=None):
        return mask * kernel_initializer(shape, dtype, partition_info)

    with ops.name_scope(name, "masked_dense", [inputs, units, num_blocks]):
        layer = layers.Dense(
            units,
            kernel_initializer=masked_initializer,
            kernel_constraint=lambda x: mask * x,
            name=name,
            dtype=inputs.dtype.base_dtype,
            _scope=name,
            _reuse=reuse,
            *args,
            **kwargs)
        return layer.apply(inputs)
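A minimal usage sketch of `masked_dense` as defined above, stacking an exclusive first MADE layer and an inclusive output layer. The placeholder shape, layer sizes, and scope names are illustrative assumptions, not taken from the source.

```python
# Hypothetical usage sketch of masked_dense (shapes chosen for illustration).
import tensorflow as tf

x = tf.placeholder(tf.float32, shape=[None, 8])   # 8 autoregressive dimensions
# First MADE layer: exclusive mask so unit i cannot see input dimension i.
h = masked_dense(x, units=16, num_blocks=8, exclusive=True, name="made_hidden")
# Output layer: inclusive mask preserves the autoregressive ordering.
logits = masked_dense(h, units=8, num_blocks=8, name="made_out")
```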
    decoder_cell = cell_list[0]
else:
    decoder_cell = tf.contrib.rnn.MultiRNNCell(cell_list)

# Helper
# attention
attention_mechanism = tf.contrib.seq2seq.LuongAttention(
    attention_hidden_size, encoder_outputs,
    memory_sequence_length=x_real_len, scale=True)
decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
    decoder_cell, attention_mechanism,
    attention_layer_size=attention_output_size)
projection_layer = layers_core.Dense(target_vocat_size, use_bias=False)

# Dynamic decoding
with tf.variable_scope("decode_layer"):
    helper = tf.contrib.seq2seq.TrainingHelper(
        decoder_emb_inp, sequence_length=y_len)
    decoder = tf.contrib.seq2seq.BasicDecoder(
        decoder_cell, helper,
        initial_state=decoder_cell.zero_state(dtype=tf.float32,
                                              batch_size=batch_size),
        output_layer=projection_layer)
    outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder)
    logits = outputs.rnn_output
    target_weights = tf.sequence_mask(
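The truncated `tf.sequence_mask` call above typically produces per-token weights for a masked cross-entropy loss. A generic sketch of that pattern follows; `y_out`, `y_len`, and `max_len` are assumed names, not taken from the source.

```python
# Generic masked sequence-loss pattern (assumed continuation, for illustration).
target_weights = tf.sequence_mask(y_len, max_len, dtype=tf.float32)
crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=y_out, logits=logits)
loss = tf.reduce_sum(crossent * target_weights) / tf.to_float(batch_size)
```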
def __init__(self, mode, num_turns, iterator, params, rev_vocab_table=None, scope=None, log_trainables=True): log.print_out("# creating %s graph ..." % mode) dtype = tf.float32 self.mode = mode self.num_turns = num_turns - 1 self.device_manager = DeviceManager() self.round_robin = RoundRobin(self.device_manager) self.num_gpus = min(params.num_gpus, self.device_manager.num_available_gpus()) log.print_out("# number of gpus %d" % self.num_gpus) self.iterator = iterator with tf.variable_scope(scope or 'hred_graph', dtype=dtype): self.init_embeddings(params.vocab_file, params.vocab_h5, scope=scope) encoder_keep_prob, decoder_keep_prob = self.get_keep_probs(mode, params) # this is for dropout if mode == tf.contrib.learn.ModeKeys.TRAIN: context_keep_prob = 1.0 - params.context_dropout_rate else: context_keep_prob = 1.0 with tf.variable_scope(scope or "build_network"): with tf.variable_scope("decoder/output_projection"): self.output_layer = layers_core.Dense(params.vocab_size, use_bias=False, name="output_projection") # self.sources = [tf.placeholder(tf.int32, shape=(None, None), name="src%d" % t) for t in # range(self.num_turns)] # self.source_sequence_lengths = [tf.placeholder(tf.int32, shape=(None,), name="src_len%d" % t) # for t in range(self.num_turns)] self.batch_size = tf.size(self.iterator.source_sequence_lengths[0]) # self.target_inputs = [tf.placeholder(tf.int32, shape=(None, None), name="tgt_input%d" % t) # for t in range(self.num_turns)] # self.target_outputs = [tf.placeholder(tf.int32, shape=(None, None), name="tgt_output%d" % t) # for t in range(self.num_turns)] # self.target_sequence_lengths = [tf.placeholder(tf.int32, shape=(None,), name="tgt_len%d" % t) # for t in range(self.num_turns)] devices = self.round_robin.assign(3, base=self.num_gpus - 1) encoder_results, context_initial_state = self.__build_encoder(params, encoder_keep_prob, None) context_state = self.__build_context(params, encoder_results, context_initial_state, context_keep_prob, devices[1]) self.global_step = tf.Variable(0, trainable=False) self.use_scheduled_sampling = False if mode == tf.contrib.learn.ModeKeys.TRAIN: self.sampling_probability = tf.constant(params.scheduled_sampling_prob) # this is for scheduled sampling self.sampling_probability = self._get_sampling_probability(params, self.global_step, self.sampling_probability) self.use_scheduled_sampling = params.scheduled_sampling_prob > 0 elif mode == tf.contrib.learn.ModeKeys.EVAL: self.sampling_probability = tf.constant(0.0) logits, sample_id, _ = self.__build_decoder(params, mode, context_state, decoder_keep_prob, devices[2]) if mode != tf.contrib.learn.ModeKeys.INFER: with tf.device(self.device_manager.tail_gpu()): loss = self.__compute_loss(logits) else: loss = None if mode == tf.contrib.learn.ModeKeys.TRAIN: self.train_loss = loss self.word_count = sum( [tf.reduce_sum(self.iterator.source_sequence_lengths[t]) for t in range(self.num_turns)]) + \ tf.reduce_sum( self.iterator.target_sequence_length) # to compute the speed of the training elif mode == tf.contrib.learn.ModeKeys.EVAL: self.eval_loss = loss elif mode == tf.contrib.learn.ModeKeys.INFER: self.sample_words = rev_vocab_table.lookup(tf.to_int64(sample_id)) if mode != tf.contrib.learn.ModeKeys.INFER: ## Count the number of predicted words for compute ppl. 
self.predict_count = tf.reduce_sum(self.iterator.target_sequence_length) trainables = tf.trainable_variables() if mode == tf.contrib.learn.ModeKeys.TRAIN: self.learning_rate = tf.constant(params.learning_rate) # decay self.learning_rate = self._get_learning_rate_decay(params, self.global_step, self.learning_rate) # Optimizer if params.optimizer.lower() == "sgd": opt = tf.train.GradientDescentOptimizer(self.learning_rate) tf.summary.scalar("lr", self.learning_rate) elif params.optimizer.lower() == "adam": opt = tf.train.AdamOptimizer(self.learning_rate) tf.summary.scalar("lr", self.learning_rate) else: raise ValueError('Unknown optimizer: ' + params.optimizer) # Gradients gradients = tf.gradients( self.train_loss, trainables, colocate_gradients_with_ops=True) clipped_grads, grad_norm = tf.clip_by_global_norm(gradients, params.max_gradient_norm) grad_norm_summary = [tf.summary.scalar("grad_norm", grad_norm)] grad_norm_summary.append( tf.summary.scalar("clipped_gradient", tf.global_norm(clipped_grads))) self.grad_norm = grad_norm self.update = opt.apply_gradients( zip(clipped_grads, trainables), global_step=self.global_step) # Summary self.train_summary = tf.summary.merge([ tf.summary.scalar("lr", self.learning_rate), tf.summary.scalar("train_loss", self.train_loss), ] + grad_norm_summary) if mode == tf.contrib.learn.ModeKeys.INFER: self.infer_logits, self.sample_id = logits, sample_id self.infer_summary = tf.no_op() # Saver self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=3) # Print trainable variables if log_trainables: log.print_out("# Trainable variables") for trainable in trainables: log.print_out(" %s, %s, %s" % (trainable.name, str(trainable.get_shape()), trainable.op.device))
def __init__(self, name=None):
    super(Owner, self).__init__(name=name)
    self.first = self.track_layer(core.Dense(
        1, name="first_layer", use_bias=False))
def _test_minimize_loss_graph(self, task_type, task_id, num_gpus): d, master_target, sess_config = self._get_test_objects( task_type, task_id, num_gpus) if task_type: # Multi-worker assert hasattr(d.extended, '_cluster_spec') and d.extended._cluster_spec num_workers = len(d.extended._cluster_spec.as_dict().get(WORKER)) if CHIEF in d.extended._cluster_spec.as_dict(): num_workers += 1 else: # local num_workers = 1 with ops.Graph().as_default(), \ self.cached_session(target=master_target, config=sess_config) as sess, \ d.scope(): l = core.Dense(1, use_bias=False) def loss_fn(x): y = array_ops.reshape(l(x), []) - constant_op.constant(1.) return y * y # TODO(yuefengz, apassos): eager.backprop.implicit_grad is not safe for # multiple graphs (b/111216820). def grad_fn(x): loss = loss_fn(x) var_list = (variables.trainable_variables() + ops.get_collection( ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)) grads = gradients.gradients(loss, var_list) ret = list(zip(grads, var_list)) return ret def update(v, g): return v.assign_sub(0.05 * g, use_locking=True) one = d.broadcast(constant_op.constant([[1.]])) def step(): """Perform one optimization step.""" # Run forward & backward to get gradients, variables list. g_v = d.call_for_each_replica(grad_fn, args=(one, )) # Update the variables using the gradients and the update() function. before_list = [] after_list = [] for g, v in g_v: fetched = d.read_var(v) before_list.append(fetched) with ops.control_dependencies([fetched]): # TODO(yuefengz): support non-Mirrored variable as destinations. g = d.extended.reduce_to(reduce_util.ReduceOp.SUM, g, destinations=v) with ops.control_dependencies( d.update(v, update, g, grouped=False)): after_list.append(d.read_var(v)) return before_list, after_list before_out, after_out = step() if context.num_gpus() < d.extended._num_gpus_per_worker: return True if (not task_type or multi_worker_util.is_chief( d.extended._cluster_spec, task_type, task_id)): variables.global_variables_initializer().run() # Workers waiting for chief worker's initializing variables. self._init_condition.acquire() self._init_reached += 1 while self._init_reached != num_workers: self._init_condition.wait() self._init_condition.notify_all() self._init_condition.release() for i in range(10): b, a = sess.run((before_out, after_out)) if i == 0: before, = b after, = a error_before = abs(before - 1) error_after = abs(after - 1) # Error should go down self.assertLess(error_after, error_before) return error_after < error_before
def __init__(self, name=None):
    super(LikeUserButNotSharing, self).__init__(name=name)
    self.first = self.track_layer(core.Dense(
        1, name="first_layer", use_bias=False))
    self.second = self.track_layer(core.Dense(
        1, name="second_layer", use_bias=False))
def build_mlp(self, hparams):
    hidden_word = []
    with tf.variable_scope("MLP_words") as scope:
        attention_W = layers_core.Dense(hparams.hidden_size, activation=tf.nn.relu,
                                        use_bias=False, name="attention_W")
        attention_V = layers_core.Dense(1, use_bias=False, name="attention_V")
        for q in [self.q1, self.q2]:
            weight = tf.nn.softmax(
                tf.reduce_sum(
                    attention_V(attention_W(q['word_decoder_output'])), -1))
            mask = tf.sequence_mask(q['words_len'], tf.shape(weight)[-1],
                                    dtype=tf.float32)
            weight = weight * mask
            weight = weight / (tf.reduce_sum(weight, -1)[:, None] + 0.000001)
            context_hidden = tf.reduce_sum(
                q['word_decoder_output'] * weight[:, :, None], 1)
            q['word_rep'] = context_hidden
        hidden_word = [
            self.q1['word_rep'], self.q2['word_rep'],
            self.q1['word_rep'] * self.q2['word_rep']
        ]
        hidden_word.append(self.q1['words_num'])

    with tf.variable_scope("MLP_chars") as scope:
        attention_W = layers_core.Dense(hparams.hidden_size, activation=tf.nn.relu,
                                        use_bias=False, name="attention_W")
        attention_V = layers_core.Dense(1, use_bias=False, name="attention_V")
        for q in [self.q1, self.q2]:
            weight = tf.nn.softmax(
                tf.reduce_sum(
                    attention_V(attention_W(q['char_decoder_output'])), -1))
            mask = tf.sequence_mask(q['chars_len'], tf.shape(weight)[-1],
                                    dtype=tf.float32)
            weight = weight * mask
            weight = weight / (tf.reduce_sum(weight, -1)[:, None] + 0.000001)
            context_hidden = tf.reduce_sum(
                q['char_decoder_output'] * weight[:, :, None], 1)
            q['char_rep'] = context_hidden
        hidden_char = [
            self.q1['char_rep'], self.q2['char_rep'],
            self.q1['char_rep'] * self.q2['char_rep']
        ]
        hidden_char.append(self.q1['chars_num'])

    with tf.variable_scope("MLP_words") as scope:
        layer_W = layers_core.Dense(hparams.hidden_size, activation=tf.nn.tanh,
                                    use_bias=False, name="ff_layer")
        hidden_word = tf.concat(hidden_word, -1)
        logits = layer_W(hidden_word)
        if hparams.dropout > 0.0 and self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            logits = tf.nn.dropout(logits, 1 - hparams.dropout)
        layer_W = layers_core.Dense(1, use_bias=False, name="ff_layer_output")
        logits_word = layer_W(logits)[:, 0]

    with tf.variable_scope("MLP_chars") as scope:
        layer_W = layers_core.Dense(hparams.hidden_size, activation=tf.nn.tanh,
                                    use_bias=False, name="ff_layer")
        hidden_char = tf.concat(hidden_char, -1)
        logits = layer_W(hidden_char)
        if hparams.dropout > 0.0 and self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            logits = tf.nn.dropout(logits, 1 - hparams.dropout)
        layer_W = layers_core.Dense(1, use_bias=False, name="ff_layer_output")
        logits_char = layer_W(logits)[:, 0]

    logits = logits_word + logits_char
    return logits
def __init__(self, name=None):
    super(MyNetwork, self).__init__(name=name)
    self.l1 = self.track_layer(core.Dense(1, use_bias=False))
def _test_minimize_loss_graph(self, task_type, task_id, num_gpus): d, master_target = self._get_test_object(task_type, task_id, num_gpus) with ops.Graph().as_default(), \ self.test_session(config=self._sess_config, target=master_target) as sess, \ d.scope(): l = core.Dense(1, use_bias=False, name='gpu_%d' % d._num_gpus_per_worker) def loss_fn(x): y = array_ops.reshape(l(x), []) - constant_op.constant(1.) return y * y # TODO(yuefengz, apassos): eager.backprop.implicit_grad is not safe for # multiple graphs (b/111216820). def grad_fn(x): loss = loss_fn(x) var_list = (variables.trainable_variables() + ops.get_collection( ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)) grads = gradients.gradients(loss, var_list) ret = list(zip(grads, var_list)) return ret def update(v, g): return v.assign_sub(0.05 * g, use_locking=True) one = d.broadcast(constant_op.constant([[1.]])) def step(): """Perform one optimization step.""" # Run forward & backward to get gradients, variables list. g_v = d.call_for_each_tower(grad_fn, one) # Update the variables using the gradients and the update() function. before_list = [] after_list = [] for g, v in g_v: fetched = d.read_var(v) before_list.append(fetched) with ops.control_dependencies([fetched]): # TODO(yuefengz): support non-Mirrored variable as destinations. g = d.reduce(variable_scope.VariableAggregation.SUM, g, destinations=v) with ops.control_dependencies( d.unwrap(d.update(v, update, g))): after_list.append(d.read_var(v)) return before_list, after_list before_out, after_out = step() if context.num_gpus() < d._num_gpus_per_worker: return True sess.run(variables.global_variables_initializer(), options=self._run_options) for i in range(10): b, a = sess.run((before_out, after_out), options=self._run_options) if i == 0: before, = b after, = a error_before = abs(before - 1) error_after = abs(after - 1) # Error should go down self.assertLess(error_after, error_before) return error_after < error_before
def __init__(self):
    super(ParentNetwork, self).__init__()
    self.first = self.track_layer(
        core.Dense(1, use_bias=False, name="explicit_name"))
def _testStepWithScheduledOutputTrainingHelper(self, sampling_probability, use_next_input_layer, use_auxiliary_inputs): sequence_length = [3, 4, 3, 1, 0] batch_size = 5 max_time = 8 input_depth = 7 cell_depth = input_depth if use_next_input_layer: cell_depth = 6 if use_auxiliary_inputs: auxiliary_input_depth = 4 auxiliary_inputs = np.random.randn( batch_size, max_time, auxiliary_input_depth).astype(np.float32) else: auxiliary_inputs = None with self.test_session() as sess: inputs = np.random.randn(batch_size, max_time, input_depth).astype(np.float32) cell = core_rnn_cell.LSTMCell(cell_depth) sampling_probability = constant_op.constant(sampling_probability) next_input_layer = None if use_next_input_layer: next_input_layer = layers_core.Dense(input_depth, use_bias=False) helper = helper_py.ScheduledOutputTrainingHelper( inputs=inputs, sequence_length=sequence_length, sampling_probability=sampling_probability, time_major=False, next_input_layer=next_input_layer, auxiliary_inputs=auxiliary_inputs) my_decoder = basic_decoder.BasicDecoder( cell=cell, helper=helper, initial_state=cell.zero_state(dtype=dtypes.float32, batch_size=batch_size)) output_size = my_decoder.output_size output_dtype = my_decoder.output_dtype self.assertEqual( basic_decoder.BasicDecoderOutput(cell_depth, tensor_shape.TensorShape([])), output_size) self.assertEqual( basic_decoder.BasicDecoderOutput(dtypes.float32, dtypes.int32), output_dtype) (first_finished, first_inputs, first_state) = my_decoder.initialize() (step_outputs, step_state, step_next_inputs, step_finished) = my_decoder.step(constant_op.constant(0), first_inputs, first_state) if use_next_input_layer: output_after_next_input_layer = next_input_layer( step_outputs.rnn_output) batch_size_t = my_decoder.batch_size self.assertTrue( isinstance(first_state, core_rnn_cell.LSTMStateTuple)) self.assertTrue( isinstance(step_state, core_rnn_cell.LSTMStateTuple)) self.assertTrue( isinstance(step_outputs, basic_decoder.BasicDecoderOutput)) self.assertEqual((batch_size, cell_depth), step_outputs[0].get_shape()) self.assertEqual((batch_size, ), step_outputs[1].get_shape()) self.assertEqual((batch_size, cell_depth), first_state[0].get_shape()) self.assertEqual((batch_size, cell_depth), first_state[1].get_shape()) self.assertEqual((batch_size, cell_depth), step_state[0].get_shape()) self.assertEqual((batch_size, cell_depth), step_state[1].get_shape()) sess.run(variables.global_variables_initializer()) fetches = { "batch_size": batch_size_t, "first_finished": first_finished, "first_inputs": first_inputs, "first_state": first_state, "step_outputs": step_outputs, "step_state": step_state, "step_next_inputs": step_next_inputs, "step_finished": step_finished } if use_next_input_layer: fetches[ "output_after_next_input_layer"] = output_after_next_input_layer sess_results = sess.run(fetches) self.assertAllEqual([False, False, False, False, True], sess_results["first_finished"]) self.assertAllEqual([False, False, False, True, True], sess_results["step_finished"]) sample_ids = sess_results["step_outputs"].sample_id batch_where_not_sampling = np.where(np.logical_not(sample_ids)) batch_where_sampling = np.where(sample_ids) auxiliary_inputs_to_concat = ( auxiliary_inputs[:, 1] if use_auxiliary_inputs else np.array( []).reshape(batch_size, 0).astype(np.float32)) expected_next_sampling_inputs = np.concatenate( (sess_results["output_after_next_input_layer"] [batch_where_sampling] if use_next_input_layer else sess_results["step_outputs"].rnn_output[batch_where_sampling], 
auxiliary_inputs_to_concat[batch_where_sampling]), axis=-1) self.assertAllClose( sess_results["step_next_inputs"][batch_where_sampling], expected_next_sampling_inputs) self.assertAllClose( sess_results["step_next_inputs"][batch_where_not_sampling], np.concatenate( (np.squeeze(inputs[batch_where_not_sampling, 1], axis=0), auxiliary_inputs_to_concat[batch_where_not_sampling]), axis=-1))
def __init__(self):
    super(NetworkWithLayerChildren, self).__init__()
    self.first = self.track_layer(core.Dense(1, use_bias=False))
    self.second = self.track_layer(core.Dense(1, use_bias=False))
def _testStepWithTrainingHelper(self, use_output_layer):
    sequence_length = [3, 4, 3, 1, 0]
    batch_size = 5
    max_time = 8
    input_depth = 7
    cell_depth = 10
    output_layer_depth = 3

    with self.test_session() as sess:
        inputs = np.random.randn(batch_size, max_time,
                                 input_depth).astype(np.float32)
        cell = core_rnn_cell.LSTMCell(cell_depth)
        helper = helper_py.TrainingHelper(inputs, sequence_length,
                                          time_major=False)
        if use_output_layer:
            output_layer = layers_core.Dense(output_layer_depth, use_bias=False)
            expected_output_depth = output_layer_depth
        else:
            output_layer = None
            expected_output_depth = cell_depth
        my_decoder = basic_decoder.BasicDecoder(
            cell=cell,
            helper=helper,
            initial_state=cell.zero_state(dtype=dtypes.float32,
                                          batch_size=batch_size),
            output_layer=output_layer)
        output_size = my_decoder.output_size
        output_dtype = my_decoder.output_dtype
        self.assertEqual(
            basic_decoder.BasicDecoderOutput(expected_output_depth,
                                             tensor_shape.TensorShape([])),
            output_size)
        self.assertEqual(
            basic_decoder.BasicDecoderOutput(dtypes.float32, dtypes.int32),
            output_dtype)

        (first_finished, first_inputs, first_state) = my_decoder.initialize()
        (step_outputs, step_state, step_next_inputs,
         step_finished) = my_decoder.step(constant_op.constant(0), first_inputs,
                                          first_state)
        batch_size_t = my_decoder.batch_size

        self.assertTrue(isinstance(first_state, core_rnn_cell.LSTMStateTuple))
        self.assertTrue(isinstance(step_state, core_rnn_cell.LSTMStateTuple))
        self.assertTrue(isinstance(step_outputs, basic_decoder.BasicDecoderOutput))
        self.assertEqual((batch_size, expected_output_depth),
                         step_outputs[0].get_shape())
        self.assertEqual((batch_size,), step_outputs[1].get_shape())
        self.assertEqual((batch_size, cell_depth), first_state[0].get_shape())
        self.assertEqual((batch_size, cell_depth), first_state[1].get_shape())
        self.assertEqual((batch_size, cell_depth), step_state[0].get_shape())
        self.assertEqual((batch_size, cell_depth), step_state[1].get_shape())

        if use_output_layer:
            # The output layer was accessed
            self.assertEqual(len(output_layer.variables), 1)

        sess.run(variables.global_variables_initializer())
        sess_results = sess.run({
            "batch_size": batch_size_t,
            "first_finished": first_finished,
            "first_inputs": first_inputs,
            "first_state": first_state,
            "step_outputs": step_outputs,
            "step_state": step_state,
            "step_next_inputs": step_next_inputs,
            "step_finished": step_finished
        })

        self.assertAllEqual([False, False, False, False, True],
                            sess_results["first_finished"])
        self.assertAllEqual([False, False, False, True, True],
                            sess_results["step_finished"])
        self.assertAllEqual(
            np.argmax(sess_results["step_outputs"].rnn_output, -1),
            sess_results["step_outputs"].sample_id)
def __init__(self, hparams, mode, iterator, source_vocab_table, target_vocab_table, reverse_target_vocab_table=None, scope=None, extra_args=None): """Create the model. Args: hparams: Hyperparameter configurations. mode: TRAIN | EVAL | INFER iterator: Dataset Iterator that feeds data. source_vocab_table: Lookup table mapping source words to ids. target_vocab_table: Lookup table mapping target words to ids. reverse_target_vocab_table: Lookup table mapping ids to target words. Only required in INFER mode. Defaults to None. scope: scope of the model. extra_args: model_helper.ExtraArgs, for passing customizable functions. """ assert isinstance(iterator, iterator_utils.BatchedInput) self.iterator = iterator self.mode = mode self.src_vocab_table = source_vocab_table self.tgt_vocab_table = target_vocab_table self.src_vocab_size = hparams.src_vocab_size self.tgt_vocab_size = hparams.tgt_vocab_size self.num_layers = hparams.num_layers self.num_gpus = hparams.num_gpus self.time_major = hparams.time_major # extra_args: to make it flexible for adding external customizable code self.single_cell_fn = None if extra_args: self.single_cell_fn = extra_args.single_cell_fn # Initializer initializer = model_helper.get_initializer( hparams.init_op, hparams.random_seed, hparams.init_weight) tf.get_variable_scope().set_initializer(initializer) # Embeddings # TODO(zsy): embeddings on ps if hparams.job_name: ps_spec = hparams.ps_hosts.split(",") worker_spec = hparams.worker_hosts.split(",") cluster = tf.train.ClusterSpec({ "ps": ps_spec, "worker": worker_spec}) with tf.device(tf.train.replica_device_setter(cluster=cluster)): self.init_embeddings(hparams, scope) else: self.init_embeddings(hparams, scope) self.batch_size = tf.size(self.iterator.source_sequence_length) # Projection #TODO(zsy) with tf.device(tf.train.replica_device_setter(cluster=cluster)): with tf.variable_scope(scope or "build_network"): with tf.variable_scope("decoder/output_projection"): self.output_layer = layers_core.Dense( hparams.tgt_vocab_size, use_bias=False, name="output_projection") # Train graph res = self.build_graph(hparams, scope=scope) if self.mode == tf.contrib.learn.ModeKeys.TRAIN: self.train_loss = res[1] self.word_count = tf.reduce_sum( self.iterator.source_sequence_length) + tf.reduce_sum( self.iterator.target_sequence_length) elif self.mode == tf.contrib.learn.ModeKeys.EVAL: self.eval_loss = res[1] elif self.mode == tf.contrib.learn.ModeKeys.INFER: self.infer_logits, _, self.final_context_state, self.sample_id = res self.sample_words = reverse_target_vocab_table.lookup( tf.to_int64(self.sample_id)) if self.mode != tf.contrib.learn.ModeKeys.INFER: # Count the number of predicted words for compute ppl. self.predict_count = tf.reduce_sum( self.iterator.target_sequence_length) # TODO(zsy) with tf.device(tf.train.replica_device_setter(cluster=cluster)): self.global_step = tf.Variable(0, trainable=False) params = tf.trainable_variables() # Gradients and SGD update operation for training the model. # Arrage for the embedding vars to appear at the beginning. 
if self.mode == tf.contrib.learn.ModeKeys.TRAIN: self.learning_rate = tf.constant(hparams.learning_rate) # warm-up self.learning_rate = self._get_learning_rate_warmup(hparams) # decay self.learning_rate = self._get_learning_rate_decay(hparams) # Optimizer if hparams.optimizer == "sgd": opt = tf.train.GradientDescentOptimizer(self.learning_rate) tf.summary.scalar("lr", self.learning_rate) elif hparams.optimizer == "adam": opt = tf.train.AdamOptimizer(self.learning_rate) # TODO(zsy): SyncReplicasOptimizer if hparams.sync_replicas: worker_spec = hparams.worker_hosts.split(",") if hparams.replicas_to_aggregate: replicas_to_aggregate = hparams.replicas_to_aggregate else: replicas_to_aggregate = len(worker_spec) opt = tf.train.SyncReplicasOptimizer(opt, replicas_to_aggregate=replicas_to_aggregate, total_num_replicas=len(worker_spec)) # Gradients gradients = tf.gradients( self.train_loss, params, colocate_gradients_with_ops=hparams.colocate_gradients_with_ops) clipped_grads, grad_norm_summary, grad_norm = model_helper.gradient_clip( gradients, max_gradient_norm=hparams.max_gradient_norm) self.grad_norm = grad_norm self.update = opt.apply_gradients( zip(clipped_grads, params), global_step=self.global_step) # TODO(zsy): SyncReplicasOptimizer init op if hparams.sync_replicas: self.local_init_op = opt.local_step_init_op self.chief_local_init_op = opt.chief_init_op self.ready_for_local_init_op = opt.ready_for_local_init_op self.chief_queue_runner = opt.get_chief_queue_runner() self.sync_init_op = opt.get_init_tokens_op() # Summary self.train_summary = tf.summary.merge([ tf.summary.scalar("lr", self.learning_rate), tf.summary.scalar("train_loss", self.train_loss), ] + grad_norm_summary) if self.mode == tf.contrib.learn.ModeKeys.INFER: self.infer_summary = self._get_infer_summary(hparams) # Saver self.saver = tf.train.Saver( tf.global_variables(), max_to_keep=hparams.num_keep_ckpts) # Print trainable variables utils.print_out("# Trainable variables") for param in params: utils.print_out(" %s, %s, %s" % (param.name, str(param.get_shape()), param.op.device)) self.init_op = tf.group(tf.global_variables_initializer(), tf.tables_initializer())
def testCallTensorDot(self):
    dense = core_layers.Dense(2, activation=nn_ops.relu, name='my_dense')
    inputs = random_ops.random_uniform((5, 4, 3), seed=1)
    outputs = dense(inputs)
    self.assertListEqual([5, 4, 2], outputs.get_shape().as_list())
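As the test above asserts, `Dense` applied to a rank-3 input contracts only the last axis and keeps the leading dimensions. A hand-written sketch of the equivalent computation follows, using only the public TF 1.x API; shapes are illustrative assumptions.

```python
# Sketch: Dense on a rank-3 input is equivalent to a matmul over the last axis.
import tensorflow as tf

x = tf.random_uniform((5, 4, 3))
dense = tf.layers.Dense(2, use_bias=False)
y = dense(x)                                    # shape (5, 4, 2)

# Equivalent computation by flattening the leading axes first.
y_manual = tf.reshape(
    tf.matmul(tf.reshape(x, [-1, 3]), dense.kernel), [5, 4, 2])
```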