def get_cell(n_hidden):
    logging.info("Constructing cell of size={}".format(n_hidden))
    if use_lstm:
        logging.info("Using LSTM cells")
        if initializer:
            cell = rnn_cell.LSTMCell(n_hidden, initializer=initializer)
        else:
            # to use peephole connections, cell clipping or a projection layer, use LSTMCell
            cell = rnn_cell.BasicLSTMCell(n_hidden)
    else:
        logging.info("Using GRU cells")
        cell = rnn_cell.GRUCell(n_hidden)
    if not forward_only and use_lstm and keep_prob < 1:
        logging.info("Adding dropout wrapper around lstm cells")
        cell = rnn_cell.DropoutWrapper(cell, output_keep_prob=keep_prob)
    if encoder == "bidirectional":
        logging.info("Bidirectional model")
        if init_backward:
            logging.info("Use backward encoder state to initialize decoder state")
        cell = BidirectionalRNNCell([cell] * 2)
    elif encoder == "bow":
        logging.info("BOW model")
        if num_layers > 1:
            logging.info("Model with %d layers for the decoder" % num_layers)
            cell = BOWCell(rnn_cell.MultiRNNCell([cell] * num_layers))
        else:
            # Wrap the locally constructed cell (was BOWCell(single_cell), but
            # single_cell is not defined inside this helper).
            cell = BOWCell(cell)
    elif num_layers > 1:
        logging.info("Model with %d layers" % num_layers)
        cell = rnn_cell.MultiRNNCell([cell] * num_layers)
    return cell
def __init__(self, vocab_size, buckets_or_sentence_length, size,
             num_layers, max_gradient_norm, batch_size, learning_rate,
             learning_rate_decay_factor, model_type, use_lstm=True,
             num_samples=512, forward_only=False):
    """Create the model.

    This constructor can be used to create an embedded or embedded-attention,
    bucketed or non-bucketed model made of single- or multi-layer RNN cells.

    Args:
      vocab_size: size of the vocabulary.
      buckets_or_sentence_length:
        if using buckets: a list of pairs (I, O), where I specifies the maximum
          input length that will be processed in that bucket, and O specifies the
          maximum output length. Training instances that have inputs longer than I
          or outputs longer than O will be pushed to the next bucket and padded
          accordingly. We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
        otherwise: the maximum number of words per sentence.
      size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training;
        the model construction is independent of batch_size, so it can be
        changed after initialization if this is convenient, e.g., for decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when needed.
      model_type: 'embedding_attention' to build the attention model; otherwise
        a plain embedding seq2seq model is built.
      use_lstm: if true, we use LSTM cells instead of GRU cells.
      num_samples: number of samples for sampled softmax.
      forward_only: if set, we do not construct the backward pass in the model.
    """
    # Determine whether we are using buckets or not:
    if type(buckets_or_sentence_length) == list:
        self.buckets = buckets_or_sentence_length
    else:
        self.max_sentence_length = buckets_or_sentence_length

    self.vocab_size = vocab_size
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # Summary variables. NOTE: added these.
    # self.summary_op_learning_rate = tf.scalar_summary('learning rate', self.learning_rate)
    # self.summary_op_global_step = tf.scalar_summary('global step', self.global_step)

    # If we use sampled softmax, we need an output projection.
    output_projection = None
    softmax_loss_function = None
    # Sampled softmax only makes sense if we sample less than vocabulary size.
    if num_samples > 0 and num_samples < self.vocab_size:
        with tf.device("/cpu:0"):
            w = tf.get_variable("proj_w", [size, self.vocab_size])
            w_t = tf.transpose(w)
            b = tf.get_variable("proj_b", [self.vocab_size])
        output_projection = (w, b)

        def sampled_loss(inputs, labels):
            with tf.device("/cpu:0"):
                labels = tf.reshape(labels, [-1, 1])
                return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels,
                                                  num_samples, self.vocab_size)

        softmax_loss_function = sampled_loss

    # Create the internal multi-layer cell for our RNN.
    single_cell = rnn_cell.GRUCell(size)
    if use_lstm:
        single_cell = rnn_cell.BasicLSTMCell(size)
    cell = single_cell
    if num_layers > 1:
        cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)

    # The seq2seq function: we use embedding for the input and attention (if applicable).
    if model_type == 'embedding_attention':
        def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
            return seq2seq.embedding_attention_seq2seq(
                encoder_inputs, decoder_inputs, cell, vocab_size, vocab_size,
                output_projection=output_projection, feed_previous=do_decode)
    else:
        # Just build the embedding model; unknown model types should probably
        # raise an error instead.
        def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
            return seq2seq.embedding_rnn_seq2seq(
                encoder_inputs, decoder_inputs, cell, vocab_size, vocab_size,
                output_projection=output_projection, feed_previous=do_decode)

    # Feeds for inputs.
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []

    # NOTE: If the model is not bucketed, these try blocks throw an
    # AttributeError and we fall back to building a non-bucketed model.
    try:
        encoder_range = self.buckets[-1][0]
        decoder_range = self.buckets[-1][1]
    except AttributeError:
        encoder_range, decoder_range = self.max_sentence_length, self.max_sentence_length

    for i in xrange(encoder_range):  # Last bucket is the biggest one.
        self.encoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i)))
    for i in xrange(decoder_range + 1):
        self.decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None], name="decoder{0}".format(i)))
        self.target_weights.append(
            tf.placeholder(tf.float32, shape=[None], name="weight{0}".format(i)))

    # Our targets are decoder inputs shifted by one.
    targets = [self.decoder_inputs[i + 1]
               for i in xrange(len(self.decoder_inputs) - 1)]

    # Training outputs and losses.
    try:
        if forward_only:
            self.outputs, self.losses = seq2seq.model_with_buckets(
                self.encoder_inputs, self.decoder_inputs, targets,
                self.target_weights, self.buckets, self.vocab_size,
                lambda x, y: seq2seq_f(x, y, True),
                softmax_loss_function=softmax_loss_function)
            # If we use output projection, we need to project outputs for decoding.
            if output_projection is not None:
                for b in xrange(len(self.buckets)):
                    self.outputs[b] = [
                        tf.nn.xw_plus_b(output, output_projection[0], output_projection[1])
                        for output in self.outputs[b]
                    ]
        else:
            self.outputs, self.losses = seq2seq.model_with_buckets(
                self.encoder_inputs, self.decoder_inputs, targets,
                self.target_weights, self.buckets, self.vocab_size,
                lambda x, y: seq2seq_f(x, y, False),
                softmax_loss_function=softmax_loss_function)
    except AttributeError:
        if forward_only:
            self.outputs, self.states = seq2seq_f(self.encoder_inputs,
                                                  self.decoder_inputs[:-1], True)
            self.losses = seq2seq.sequence_loss(
                self.outputs, targets, self.target_weights[:-1], self.vocab_size,
                softmax_loss_function=softmax_loss_function)
            # Project outputs for decoding.
            if output_projection is not None:
                self.outputs = [
                    tf.nn.xw_plus_b(output, output_projection[0], output_projection[1])
                    for output in self.outputs
                ]
        else:
            self.outputs, self.states = seq2seq_f(self.encoder_inputs,
                                                  self.decoder_inputs[:-1], False)
            self.losses = seq2seq.sequence_loss(
                self.outputs, targets, self.target_weights[:-1], self.vocab_size,
                softmax_loss_function=softmax_loss_function)

    # Gradients and SGD update operation for training the model.
    params = tf.trainable_variables()
    self.params = params  # Hold onto this for Woz
    if not forward_only:
        self.gradient_norms = []
        self.updates = []
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        try:
            for b in xrange(len(self.buckets)):
                gradients = tf.gradients(self.losses[b], params)
                clipped_gradients, norm = tf.clip_by_global_norm(gradients,
                                                                 max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(
                    opt.apply_gradients(zip(clipped_gradients, params),
                                        global_step=self.global_step))
        except AttributeError:
            gradients = tf.gradients(self.losses, params)
            clipped_gradients, norm = tf.clip_by_global_norm(gradients,
                                                             max_gradient_norm)
            self.gradient_norms = norm
            self.updates = opt.apply_gradients(zip(clipped_gradients, params),
                                               global_step=self.global_step)

    self.saver = tf.train.Saver(tf.all_variables())
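
# The constructor above clips gradients with tf.clip_by_global_norm before
# applying them. The helper below is an illustrative NumPy sketch (not used by
# the model) of the same rule: every gradient is scaled by
# max_norm / max(global_norm, max_norm), which leaves the gradients unchanged
# when their global L2 norm is already below the threshold.
def _sketch_clip_by_global_norm(gradients, max_norm):
    """Rescale a list of NumPy gradient arrays so their global L2 norm is <= max_norm."""
    import numpy as np
    global_norm = np.sqrt(sum(np.sum(np.square(g)) for g in gradients))
    # Same scale factor as tf.clip_by_global_norm; the returned norm is the
    # pre-clipping global norm, as in TensorFlow.
    scale = max_norm / max(global_norm, max_norm)
    return [g * scale for g in gradients], global_norm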
def __init__(self, enc_out, target_vocab_size, buckets, embedding_size, hidden_size,
             num_layers, batch_size, use_lstm=False, num_samples=512,
             encoder="reverse", use_src_mask=False, maxout_layer=False,
             init_backward=False, variable_prefix=None, init_const=False,
             use_bow_mask=False, initializer=None):
    super(TFSeq2SeqSingleStepDecodingGraph, self).__init__(buckets, batch_size)
    self.target_vocab_size = target_vocab_size
    self.num_heads = 1

    # If we use sampled softmax, we need an output projection.
    output_projection = None
    softmax_loss_function = None
    # Sampled softmax only makes sense if we sample less than vocabulary size.
    if num_samples > 0 and num_samples < self.target_vocab_size:
        with variable_scope.variable_scope(variable_scope.get_variable_scope(),
                                           reuse=True), tf.device("/cpu:0"):
            w = tf.get_variable("proj_w", [hidden_size, self.target_vocab_size])
            w_t = tf.transpose(w)
            b = tf.get_variable("proj_b", [self.target_vocab_size])
        output_projection = (w, b)

        def sampled_loss(inputs, labels):
            with tf.device("/cpu:0"):
                labels = tf.reshape(labels, [-1, 1])
                return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels,
                                                  num_samples, self.target_vocab_size)

        softmax_loss_function = sampled_loss
    else:
        logging.info("Using maxout_layer=%d and full softmax loss" % maxout_layer)

    # Create the internal multi-layer cell for our RNN.
    if use_lstm:
        logging.info("Using LSTM cells of size={}".format(hidden_size))
        if initializer:
            single_cell = rnn_cell.LSTMCell(hidden_size, initializer=initializer)
        else:
            # NOTE: to use peephole connections, cell clipping or a projection layer, use LSTMCell instead
            single_cell = rnn_cell.BasicLSTMCell(hidden_size)
    else:
        logging.info("Using GRU cells of size={}".format(hidden_size))
        single_cell = rnn_cell.GRUCell(hidden_size)
    cell = single_cell
    if encoder == "bidirectional":
        logging.info("Bidirectional model")
        if init_backward:
            logging.info("Use backward encoder state to initialize decoder state")
        cell = BidirectionalRNNCell([single_cell] * 2)
    elif encoder == "bow":
        logging.info("BOW model")
        if num_layers > 1:
            logging.info("Model with %d layers for the decoder" % num_layers)
            cell = BOWCell(rnn_cell.MultiRNNCell([single_cell] * num_layers))
        else:
            cell = BOWCell(single_cell)
    elif num_layers > 1:
        logging.info("Model with %d layers" % num_layers)
        cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)

    # List of placeholders deeper within the decoder (i.e. bucket dependent)
    self.enc_hidden = []
    self.enc_hidden_features = []
    self.enc_v = []
    self.dec_attns = []

    # Placeholder for last state
    if encoder == "bidirectional":
        if cell._cells[0]._state_is_tuple:
            dec_state_c = tf.placeholder(dtypes.float32,
                                         shape=[None, cell.fw_state_size / 2],
                                         name="dec_state_c")
            dec_state_h = tf.placeholder(dtypes.float32,
                                         shape=[None, cell.fw_state_size / 2],
                                         name="dec_state_h")
            self.dec_state = rnn_cell.LSTMStateTuple(dec_state_c, dec_state_h)
        else:
            self.dec_state = tf.placeholder(dtypes.float32,
                                            shape=[None, cell.fw_state_size],
                                            name="dec_state")
    elif encoder == "reverse" or encoder == "bow":
        if cell._state_is_tuple:
            dec_state_c = tf.placeholder(dtypes.float32,
                                         shape=[None, cell.state_size / 2],
                                         name="dec_state_c")
            dec_state_h = tf.placeholder(dtypes.float32,
                                         shape=[None, cell.state_size / 2],
                                         name="dec_state_h")
            self.dec_state = rnn_cell.LSTMStateTuple(dec_state_c, dec_state_h)
        else:
            self.dec_state = tf.placeholder(dtypes.float32,
                                            shape=[None, cell.state_size],
                                            name="dec_state")

    if use_src_mask:
        logging.info("Using source mask for decoder")
        self.src_mask = tf.placeholder(dtypes.float32, shape=[None, None],
                                       name="src_mask")
    else:
        self.src_mask = None

    if use_bow_mask:
        logging.info("Using bow mask for output layer")
        self.bow_mask = tf.placeholder(dtypes.float32, shape=[None, None],
                                       name="bow_mask")
    else:
        self.bow_mask = None

    # Placeholder to indicate whether we're at the start of the target sentence.
    self.start = tf.placeholder(tf.bool, name="start")

    # The seq2seq function: we use embedding for the input and attention.
    scope = None
    if variable_prefix is not None:
        scope = variable_prefix + "/embedding_attention_seq2seq"
        logging.info("Using variable scope {}".format(scope))

    def seq2seq_f(bucket_enc_out, decoder_input):
        return self._tf_dec_embedding_attention_seq2seq(
            bucket_enc_out, decoder_input, self.dec_state, cell,
            target_vocab_size, embedding_size,
            output_projection=output_projection, encoder=encoder,
            src_mask=self.src_mask, maxout_layer=maxout_layer,
            init_backward=init_backward, start=self.start, scope=scope,
            init_const=init_const, bow_mask=self.bow_mask)

    self.dec_decoder_input = tf.placeholder(tf.int32, shape=[None],
                                            name="dec_decoder_input")
    self.outputs = self._tf_dec_model_with_buckets(enc_out, self.dec_decoder_input,
                                                   buckets, seq2seq_f)

    # If we use output projection, we need to project outputs for decoding.
    if output_projection is not None:
        # self.outputs contains outputs, new_attns, new_state in flattened list
        for b in xrange(len(buckets)):
            output = self.outputs[b][0]
            # The standard implementation uses the following code here to get the
            # previous output (_extract_argmax_and_embed):
            #   output = tf.nn.xw_plus_b(output, output_projection[0], output_projection[1])
            # However, we have to normalize during decoding using a softmax (and then
            # take a log to produce logprobs), as described in nn.py,
            # def sampled_softmax_loss:
            #   "This operation is for training only. It is generally an underestimate
            #    of the full softmax loss. At inference time, you can compute full
            #    softmax probabilities with the expression
            #    `tf.nn.softmax(tf.matmul(inputs, weights) + biases)`."
            # Note: tf.matmul(i, w) + b does the same as tf.nn.xw_plus_b(i, w, b).
            output = tf.log(tf.nn.softmax(
                tf.nn.xw_plus_b(output, output_projection[0], output_projection[1])))
            self.outputs[b][0] = output
    else:
        logging.info("Apply full softmax")
        for b in xrange(len(buckets)):
            self.outputs[b][0] = tf.log(tf.nn.softmax(self.outputs[b][0]))

    # for update_buckets
    self.enc_out = enc_out
    self.seq2seq_f = seq2seq_f
    self.output_projection = output_projection
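
# The decoder above projects each output and then applies log(softmax(.)) to get
# log-probabilities at inference time. The helper below is an illustrative NumPy
# sketch (not part of the graph) of that computation for a single decoder output
# vector; the numerically stable form subtracts the maximum logit before
# exponentiating, which does not change the result.
def _sketch_logprobs_from_output(output, proj_w, proj_b):
    """Return log p(y | output) over the target vocabulary,
    like tf.log(tf.nn.softmax(tf.nn.xw_plus_b(output, proj_w, proj_b)))."""
    import numpy as np
    logits = np.dot(output, proj_w) + proj_b   # same as tf.nn.xw_plus_b
    logits = logits - np.max(logits)           # stabilize exp()
    log_z = np.log(np.sum(np.exp(logits)))     # log partition function
    return logits - log_z                      # log-softmax = logprobs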
def __init__(self, source_vocab_size, buckets, embedding_size, hidden_size,
             num_layers, batch_size, use_lstm=False, num_samples=512,
             encoder="reverse", use_sequence_length=False, init_backward=False,
             variable_prefix=None, initializer=None):
    super(TFSeq2SeqEncodingGraph, self).__init__(buckets, batch_size)
    self.source_vocab_size = source_vocab_size
    self.num_heads = 1

    # Create the internal multi-layer cell for our RNN.
    if use_lstm:
        logging.info("Using LSTM cells of size={}".format(hidden_size))
        if initializer:
            single_cell = rnn_cell.LSTMCell(hidden_size, initializer=initializer)
        else:
            # NOTE: to use peephole connections, cell clipping or a projection layer, use LSTMCell instead
            single_cell = rnn_cell.BasicLSTMCell(hidden_size)
    else:
        logging.info("Using GRU cells of size={}".format(hidden_size))
        single_cell = rnn_cell.GRUCell(hidden_size)
    cell = single_cell
    if encoder == "bidirectional":
        logging.info("Bidirectional model")
        if init_backward:
            logging.info("Use backward encoder state to initialize decoder state")
        cell = BidirectionalRNNCell([single_cell] * 2)
    elif encoder == "bow":
        logging.info("BOW model")
        if num_layers > 1:
            logging.info("Model with %d layers for the decoder" % num_layers)
            cell = BOWCell(rnn_cell.MultiRNNCell([single_cell] * num_layers))
        else:
            cell = BOWCell(single_cell)
    elif num_layers > 1:
        logging.info("Model with %d layers" % num_layers)
        cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)

    # The seq2seq function: we use embedding for the input and attention.
    scope = None
    if variable_prefix is not None:
        scope = variable_prefix + "/embedding_attention_seq2seq"
        logging.info("Using variable scope {}".format(scope))

    def seq2seq_f(encoder_inputs, bucket_length):
        return self._tf_enc_embedding_attention_seq2seq(
            encoder_inputs, cell, source_vocab_size, embedding_size,
            encoder=encoder, sequence_length=self.sequence_length,
            bucket_length=bucket_length, init_backward=init_backward,
            bow_emb_size=hidden_size, scope=scope)

    # Feeds for inputs.
    self.encoder_inputs = []
    self.sequence_lengths = []
    for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
        self.encoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i)))

    if use_sequence_length:
        logging.info("Using sequence length for encoder")
        self.sequence_length = tf.placeholder(tf.int32, shape=[None], name="seq_len")
    else:
        self.sequence_length = None

    self.outputs = self._tf_enc_model_with_buckets(self.encoder_inputs, buckets,
                                                   seq2seq_f)

    # for update_buckets
    self.seq2seq_f = seq2seq_f
def __init__(self, vocab_size, max_sentence_length, size, num_layers,
             max_gradient_norm, batch_size, learning_rate,
             learning_rate_decay_factor, model_type, num_samples=512,
             forward_only=False):
    # NOTE: model_type is moved before the keyword arguments; the original
    # signature placed it after them, which is a syntax error in Python.
    self.vocab_size = vocab_size
    self.batch_size = batch_size
    self.max_sentence_length = max_sentence_length
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # output projection for sampled softmax:
    output_projection = None
    softmax_loss_function = None
    if num_samples > 0 and num_samples < self.vocab_size:
        with tf.device("/cpu:0"):
            w = tf.get_variable("proj_w", [size, self.vocab_size])
            w_t = tf.transpose(w)
            b = tf.get_variable("proj_b", [self.vocab_size])
        output_projection = (w, b)

        def sampled_loss(inputs, labels):
            with tf.device("/cpu:0"):
                labels = tf.reshape(labels, [-1, 1])
                return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels,
                                                  num_samples, self.vocab_size)

        softmax_loss_function = sampled_loss

    # single LSTM cell creation, used to build the hidden layers
    single_cell = rnn_cell.BasicLSTMCell(size)
    cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)

    if model_type == 'embedding_attention':
        def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
            return seq2seq.embedding_attention_seq2seq(
                encoder_inputs, decoder_inputs, cell, vocab_size, vocab_size,
                output_projection=output_projection, feed_previous=do_decode)
    else:
        def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
            return seq2seq.embedding_rnn_seq2seq(
                encoder_inputs, decoder_inputs, cell, vocab_size, vocab_size,
                output_projection=output_projection, feed_previous=do_decode)

    # feeds for inputs are limited to max_sentence_length
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    for i in xrange(max_sentence_length):
        self.encoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i)))
    for i in xrange(max_sentence_length + 1):
        self.decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None], name="decoder{0}".format(i)))
        self.target_weights.append(
            tf.placeholder(tf.float32, shape=[None], name="weight{0}".format(i)))

    # targets are decoder inputs shifted by one
    targets = [self.decoder_inputs[i + 1]
               for i in xrange(len(self.decoder_inputs) - 1)]

    # training outputs and losses
    if forward_only:
        self.outputs, self.states = seq2seq_f(self.encoder_inputs,
                                              self.decoder_inputs[:-1], True)
        self.losses = seq2seq.sequence_loss(
            self.outputs, targets, self.target_weights[:-1], self.vocab_size,
            softmax_loss_function=softmax_loss_function)
        # project outputs for decoding
        if output_projection is not None:
            self.outputs = [
                tf.nn.xw_plus_b(output, output_projection[0], output_projection[1])
                for output in self.outputs
            ]
    else:
        self.outputs, self.states = seq2seq_f(self.encoder_inputs,
                                              self.decoder_inputs[:-1], False)
        self.losses = seq2seq.sequence_loss(
            self.outputs, targets, self.target_weights[:-1], self.vocab_size,
            softmax_loss_function=softmax_loss_function)

    # gradients and SGD update operation for training
    params = tf.trainable_variables()
    self.params = params
    if not forward_only:
        self.gradient_norms = []
        self.updates = []
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        gradients = tf.gradients(self.losses, params)
        clipped_gradients, norm = tf.clip_by_global_norm(gradients,
                                                         max_gradient_norm)
        self.gradient_norms = norm
        self.updates = opt.apply_gradients(zip(clipped_gradients, params),
                                           global_step=self.global_step)
    self.saver = tf.train.Saver(tf.all_variables())
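
# The sampled_loss defined above is a cheap training-time approximation of the
# full softmax cross-entropy over the whole vocabulary. The helper below is an
# illustrative NumPy sketch (a hypothetical function, not used by the model) of
# the exact quantity that tf.nn.sampled_softmax_loss approximates, using the
# same projection shapes as the "proj_w"/"proj_b" variables above.
def _sketch_full_softmax_loss(hidden, proj_w, proj_b, target_id):
    """Cross-entropy -log p(target_id | hidden) under the full output projection.

    hidden: [size] hidden state, proj_w: [size, vocab_size], proj_b: [vocab_size].
    """
    import numpy as np
    logits = np.dot(hidden, proj_w) + proj_b
    logits = logits - np.max(logits)                       # numerical stability
    log_probs = logits - np.log(np.sum(np.exp(logits)))    # log-softmax
    return -log_probs[target_id]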
def __init__(self, source_vocab_size, target_vocab_size, buckets, embedding_size,
             hidden_size, num_layers, max_gradient_norm, batch_size, learning_rate,
             learning_rate_decay_factor, use_lstm=False, num_samples=512,
             forward_only=False, dtype=tf.float32, opt_algorithm="sgd",
             encoder="reverse", use_sequence_length=False, use_src_mask=False,
             maxout_layer=False, init_backward=False, no_pad_symbol=False,
             variable_prefix=None, rename_variable_prefix=None, init_const=False,
             use_bow_mask=False, max_to_keep=0, keep_prob=1.0, initializer=None,
             legacy=False, train_align=None):
    """Create the model.

    Args:
      source_vocab_size: size of the source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: a list of pairs (I, O), where I specifies maximum input length
        that will be processed in that bucket, and O specifies maximum output
        length. Training instances that have inputs longer than I or outputs
        longer than O will be pushed to the next bucket and padded accordingly.
        We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
        (See the bucketing sketch after this constructor.)
      embedding_size: size of the word embeddings.
      hidden_size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training;
        the model construction is independent of batch_size, so it can be
        changed after initialization if this is convenient, e.g., for decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when needed.
      use_lstm: if true, we use LSTM cells instead of GRU cells.
      num_samples: number of samples for sampled softmax.
      forward_only: if set, we do not construct the backward pass in the model.
      dtype: the data type to use to store internal variables.
    """
    self.source_vocab_size = source_vocab_size
    self.target_vocab_size = target_vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    with tf.variable_scope(variable_prefix or ""):
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        self.global_step = tf.Variable(0, trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
    self.no_pad_symbol = no_pad_symbol

    # If we use sampled softmax, we need an output projection.
    output_projection = None
    softmax_loss_function = None
    # Sampled softmax only makes sense if we sample less than vocabulary size.
    if num_samples > 0 and num_samples < self.target_vocab_size:
        w_t = tf.get_variable("proj_w", [self.target_vocab_size, hidden_size],
                              dtype=dtype)
        w = tf.transpose(w_t)
        b = tf.get_variable("proj_b", [self.target_vocab_size], dtype=dtype)
        logging.info("Using output projection of shape (%d, %d)"
                     % (hidden_size, self.target_vocab_size))
        output_projection = (w, b)

        def sampled_loss(inputs, labels):
            labels = tf.reshape(labels, [-1, 1])
            # We need to compute the sampled_softmax_loss using 32bit floats to
            # avoid numerical instabilities.
            local_w_t = tf.cast(w_t, tf.float32)
            local_b = tf.cast(b, tf.float32)
            local_inputs = tf.cast(inputs, tf.float32)
            return tf.cast(
                tf.nn.sampled_softmax_loss(local_w_t, local_b, local_inputs, labels,
                                           num_samples, self.target_vocab_size),
                dtype)

        softmax_loss_function = sampled_loss
    else:
        logging.info("Using maxout_layer=%r and full softmax loss" % maxout_layer)

    # Create the internal multi-layer cell for our RNN.
    if use_lstm:
        logging.info("Using LSTM cells of size={}".format(hidden_size))
        if initializer:
            single_cell = rnn_cell.LSTMCell(hidden_size, initializer=initializer)
        else:
            # NOTE: to use peephole connections, cell clipping or a projection layer, use LSTMCell instead
            single_cell = rnn_cell.BasicLSTMCell(hidden_size)
    else:
        logging.info("Using GRU cells of size={}".format(hidden_size))
        single_cell = rnn_cell.GRUCell(hidden_size)
    cell = single_cell
    if encoder == "bidirectional":
        logging.info("Bidirectional model")
        if init_backward:
            logging.info("Use backward encoder state to initialize decoder state")
        cell = BidirectionalRNNCell([single_cell] * 2)
    elif encoder == "bow":
        logging.info("BOW model")
        if not forward_only and use_lstm and keep_prob < 1:
            logging.info("Adding dropout wrapper around lstm cells")
            single_cell = rnn_cell.DropoutWrapper(single_cell,
                                                  output_keep_prob=keep_prob)
        if num_layers > 1:
            logging.info("Model with %d layers for the decoder" % num_layers)
            cell = BOWCell(rnn_cell.MultiRNNCell([single_cell] * num_layers))
        else:
            cell = BOWCell(single_cell)
    elif num_layers > 1:
        logging.info("Model with %d layers" % num_layers)
        cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)

    # The seq2seq function: we use embedding for the input and attention.
    logging.info("Embedding size={}".format(embedding_size))
    scope = None
    if variable_prefix is not None:
        scope = variable_prefix + "/embedding_attention_seq2seq"
        logging.info("Using variable scope {}".format(scope))

    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode, bucket_length):
        return embedding_attention_seq2seq(
            encoder_inputs, decoder_inputs, cell,
            num_encoder_symbols=source_vocab_size,
            num_decoder_symbols=target_vocab_size,
            embedding_size=embedding_size,
            output_projection=output_projection,
            feed_previous=do_decode,
            dtype=dtype,
            encoder=encoder,
            sequence_length=self.sequence_length,
            bucket_length=bucket_length,
            src_mask=self.src_mask,
            maxout_layer=maxout_layer,
            init_backward=init_backward,
            bow_emb_size=hidden_size,
            scope=scope,
            init_const=init_const,
            bow_mask=self.bow_mask,
            keep_prob=keep_prob,
            legacy=legacy)

    # Feeds for inputs.
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    self.alignments = []
    for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
        self.encoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i)))
    for i in xrange(buckets[-1][1] + 1):
        self.decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None], name="decoder{0}".format(i)))
        self.target_weights.append(
            tf.placeholder(dtype, shape=[None], name="weight{0}".format(i)))
    if train_align is not None and not forward_only:
        for i in xrange(self.batch_size):
            self.alignments.append(
                tf.placeholder(tf.float32, shape=[None], name="align{0}".format(i)))

    if use_sequence_length:
        logging.info("Using sequence length for encoder")
        self.sequence_length = tf.placeholder(tf.int32, shape=[None], name="seq_len")
    else:
        self.sequence_length = None

    if use_src_mask:
        logging.info("Using source mask for decoder")
        self.src_mask = tf.placeholder(tf.float32, shape=[None, None],
                                       name="src_mask")
    else:
        self.src_mask = None

    if use_bow_mask:
        logging.info("Using bow mask for output layer")
        self.bow_mask = tf.placeholder(tf.float32, shape=[None, None],
                                       name="bow_mask")
    else:
        self.bow_mask = None

    # Our targets are decoder inputs shifted by one.
    targets = [self.decoder_inputs[i + 1]
               for i in xrange(len(self.decoder_inputs) - 1)]

    # Training outputs and losses.
    if forward_only:
        self.outputs, self.losses = model_with_buckets(
            self.encoder_inputs, self.decoder_inputs, targets,
            self.target_weights, buckets,
            lambda x, y, z: seq2seq_f(x, y, True, z),
            softmax_loss_function=softmax_loss_function)
        # If we use output projection, we need to project outputs for decoding.
        if output_projection is not None:
            for b in xrange(len(buckets)):
                # This is similar to what is done in the loop function (where
                # xw_plus_b is used instead of matmul). The loop function also takes
                # the argmax, but the result is not saved; we pass the logits and
                # take the argmax again in the vanilla decoder.
                self.outputs[b] = [
                    tf.matmul(output, output_projection[0]) + output_projection[1]
                    for output in self.outputs[b]
                ]
    else:
        self.outputs, self.losses = model_with_buckets(
            self.encoder_inputs, self.decoder_inputs, targets,
            self.target_weights, buckets,
            lambda x, y, z: seq2seq_f(x, y, False, z),
            softmax_loss_function=softmax_loss_function,
            alignments=self.alignments)

    # Gradients and SGD update operation for training the model.
    params = tf.trainable_variables()
    if not forward_only:
        self.gradient_norms = []
        self.updates = []
        if opt_algorithm == "sgd":
            logging.info("Using optimizer GradientDescentOptimizer")
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        elif opt_algorithm == "adagrad":
            logging.info("Using optimizer AdagradOptimizer")
            lr = 3.0
            init_acc = 0.1
            opt = tf.train.AdagradOptimizer(lr, init_acc)
        elif opt_algorithm == "adadelta":
            logging.info("Using optimizer AdadeltaOptimizer")
            rho = 0.95
            epsilon = 1e-6
            opt = tf.train.AdadeltaOptimizer(rho=rho, epsilon=epsilon)

        for b in xrange(len(buckets)):
            gradients = tf.gradients(self.losses[b], params)
            clipped_gradients, norm = tf.clip_by_global_norm(gradients,
                                                             max_gradient_norm)
            self.gradient_norms.append(norm)
            self.updates.append(
                opt.apply_gradients(zip(clipped_gradients, params),
                                    global_step=self.global_step))

    if variable_prefix:
        # save only the variables that belong to the prefix
        logging.info("Using variable prefix={}".format(variable_prefix))
        self.saver = tf.train.Saver(
            {v.op.name: v for v in tf.global_variables()
             if v.op.name.startswith(variable_prefix)},
            max_to_keep=max_to_keep,
            write_version=saver_pb2.SaverDef.V1)
    else:
        self.saver = tf.train.Saver(tf.global_variables(),
                                    max_to_keep=max_to_keep,
                                    write_version=saver_pb2.SaverDef.V1)

    if rename_variable_prefix:
        # create a saver that explicitly stores model variables with a prefix
        logging.info("Saving model with new prefix={}".format(rename_variable_prefix))
        self.saver_prefix = tf.train.Saver(
            {v.op.name.replace(variable_prefix, rename_variable_prefix): v
             for v in tf.global_variables()},
            write_version=saver_pb2.SaverDef.V1)
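
# The docstring above describes buckets as sorted (I, O) pairs: a training pair
# goes into the smallest bucket whose lengths it fits, and is padded up to that
# bucket's sizes. The helper below is a minimal, hypothetical sketch of that
# bucket-selection rule only; it is not this class's input pipeline, which may
# additionally add GO/EOS symbols and reverse the source. PAD id 0 is assumed
# here for illustration.
def _sketch_bucket_and_pad(source_ids, target_ids, buckets, pad_id=0):
    """Return (bucket_id, padded_source, padded_target), or None if no bucket fits."""
    for bucket_id, (source_size, target_size) in enumerate(buckets):
        if len(source_ids) <= source_size and len(target_ids) <= target_size:
            padded_source = source_ids + [pad_id] * (source_size - len(source_ids))
            padded_target = target_ids + [pad_id] * (target_size - len(target_ids))
            return bucket_id, padded_source, padded_target
    return None  # too long for the largest bucket

# Example: with buckets [(2, 4), (8, 16)], a pair of lengths (3, 5) falls into
# bucket 1 and is padded to lengths 8 and 16 respectively.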
def __init__(self, config, variable_prefix, is_training, use_log_probs=False,
             optimizer="sgd", rename_variable_prefix=None):
    self.batch_size = batch_size = config.batch_size
    self.num_steps = num_steps = config.num_steps
    hidden_size = config.hidden_size
    vocab_size = config.vocab_size

    self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
    self._targets = tf.placeholder(tf.int32, [batch_size, num_steps])
    self.global_step = tf.Variable(0, trainable=False)

    # Slightly better results can be obtained with forget gate biases
    # initialized to 1 but the hyperparameters of the model would need to be
    # different than reported in the paper.
    lstm_cell = rnn_cell.BasicLSTMCell(hidden_size, forget_bias=0.0)
    if is_training and config.keep_prob < 1:
        lstm_cell = rnn_cell.DropoutWrapper(lstm_cell,
                                            output_keep_prob=config.keep_prob)
    cell = rnn_cell.MultiRNNCell([lstm_cell] * config.num_layers)

    if is_training or use_log_probs:
        logging.info("Using LSTM cells of size={}".format(hidden_size))
        logging.info("Model with %d layer(s)" % config.num_layers)
        logging.info("Model with %i unrolled step(s)" % config.num_steps)

    self._initial_state = cell.zero_state(batch_size, tf.float32)

    with tf.device("/cpu:0"):
        embedding = tf.get_variable("embedding", [vocab_size, hidden_size])
        inputs = tf.nn.embedding_lookup(embedding, self._input_data)

    if is_training and config.keep_prob < 1:
        inputs = tf.nn.dropout(inputs, config.keep_prob)

    outputs = []
    state = self._initial_state
    # Simplified version of tensorflow.models.rnn.rnn.py's rnn().
    # This builds an unrolled LSTM for tutorial purposes only.
    # In general, use the rnn() or state_saving_rnn() from rnn.py.
    # with tf.variable_scope("RNN"):
    #   for time_step in range(num_steps):
    #     if time_step > 0: tf.get_variable_scope().reuse_variables()
    #     (cell_output, state) = cell(inputs[:, time_step, :], state)
    #     outputs.append(cell_output)
    inputs = [tf.squeeze(input_, [1]) for input_ in tf.split(1, num_steps, inputs)]
    outputs, state = rnn.rnn(cell, inputs, initial_state=self._initial_state)
    self._final_state = state

    output = tf.reshape(tf.concat(1, outputs), [-1, hidden_size])
    softmax_w = tf.get_variable("softmax_w", [hidden_size, vocab_size])
    softmax_b = tf.get_variable("softmax_b", [vocab_size])
    logits = tf.matmul(output, softmax_w) + softmax_b

    if use_log_probs:
        logging.info("Softmax")
        probs = tf.nn.softmax(logits)
        self._log_probs = tf.log(probs)
    else:
        loss = tf.nn.seq2seq.sequence_loss_by_example(
            [logits], [tf.reshape(self._targets, [-1])],
            [tf.ones([batch_size * num_steps])])
        self._cost = cost = tf.reduce_sum(loss) / batch_size

    if is_training:
        self._lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                          config.max_grad_norm)
        if optimizer == "adadelta":
            self.lr = 1.0
            rho = 0.95
            epsilon = 1e-6
            logging.info("Use AdaDeltaOptimizer with lr={}".format(self.lr))
            optimizer = tf.train.AdadeltaOptimizer(self.lr, rho=rho, epsilon=epsilon)
        elif optimizer == "adagrad":
            self.lr = 0.5
            logging.info("Use AdaGradOptimizer with lr={}".format(self.lr))
            optimizer = tf.train.AdagradOptimizer(self.lr)
        elif optimizer == "adam":
            # Default values are same as in Keras library
            logging.info("Use AdamOptimizer with default values")
            optimizer = tf.train.AdamOptimizer()
        elif optimizer == "rmsprop":
            self.lr = 0.5
            logging.info("Use RMSPropOptimizer with lr={}".format(self.lr))
            optimizer = tf.train.RMSPropOptimizer(self.lr)
        else:
            logging.info("Use GradientDescentOptimizer")
            optimizer = tf.train.GradientDescentOptimizer(self.lr)
        self._train_op = optimizer.apply_gradients(zip(grads, tvars),
                                                   global_step=self.global_step)

    self.saver = tf.train.Saver(
        {v.op.name: v for v in tf.all_variables()
         if v.op.name.startswith(variable_prefix)},
        max_to_keep=2)
    if rename_variable_prefix:
        self.saver_prefix = tf.train.Saver(
            {v.op.name.replace(variable_prefix, rename_variable_prefix): v
             for v in tf.all_variables()
             if v.op.name.startswith(variable_prefix)},
            max_to_keep=2)
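
# The language-model cost above is the per-token cross-entropy from
# sequence_loss_by_example, summed over all batch_size * num_steps positions and
# divided by batch_size. The helper below is an illustrative NumPy sketch
# (hypothetical, not used by the model) of that computation, assuming all target
# weights are one as in the call above.
def _sketch_sequence_cost(logits, targets, batch_size):
    """Return sum_t -log p(target_t) / batch_size, mirroring self._cost above.

    logits: [batch_size * num_steps, vocab_size]; targets: [batch_size * num_steps] ids.
    """
    import numpy as np
    logits = logits - np.max(logits, axis=1, keepdims=True)               # stabilize
    log_probs = logits - np.log(np.sum(np.exp(logits), axis=1, keepdims=True))
    token_losses = -log_probs[np.arange(len(targets)), targets]           # per-token CE
    return np.sum(token_losses) / batch_size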