def __init__(self,
             source_vocab_size,
             target_vocab_size,
             buckets,
             size,
             num_layers,
             max_gradient_norm,
             batch_size,
             learning_rate,
             learning_rate_decay_factor,
             use_lstm=False,
             num_samples=512,
             forward_only=False,
             scope_name='seq2seq',
             dtype=tf.float32):
    """Create the model.

    Args:
      source_vocab_size: size of the source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: a list of pairs (I, O), where I specifies maximum input length
        that will be processed in that bucket, and O specifies maximum output
        length. Training instances that have inputs longer than I or outputs
        longer than O will be pushed to the next bucket and padded
        accordingly. We assume that the list is sorted, e.g.,
        [(2, 4), (8, 16)].
      size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training; the feed
        placeholders are created with an unspecified batch dimension, so it
        can be changed after initialization if this is convenient, e.g., for
        decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when
        needed.
      use_lstm: if true, we use LSTM cells instead of GRU cells.
      num_samples: number of samples for sampled softmax.
      forward_only: if set, the seq2seq function is built with
        feed_previous=True and, when an output projection is in use, the
        outputs are projected through it. Gradient and update ops are built
        in both modes.
      scope_name: variable scope the model is built under; the saver only
        covers global variables whose name starts with this string.
      dtype: the data type to use to store internal variables.
    """
    self.scope_name = scope_name
    with tf.variable_scope(self.scope_name):
        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(
            float(learning_rate), trainable=False, dtype=dtype)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)
        self.dummy_dialogs = []  # [TODO] load dummy sentences

        # If we use sampled softmax, we need an output projection.
        output_projection = None
        softmax_loss_function = None
        # Sampled softmax only makes sense if we sample less than vocabulary
        # size.
        if 0 < num_samples < self.target_vocab_size:
            w_t = tf.get_variable(
                "proj_w", [self.target_vocab_size, size], dtype=dtype)
            w = tf.transpose(w_t)
            # Named proj_bias (not `b`) so that no later loop variable can
            # rebind the name captured by the sampled_loss closure.
            proj_bias = tf.get_variable(
                "proj_b", [self.target_vocab_size], dtype=dtype)
            output_projection = (w, proj_bias)

            def sampled_loss(labels, inputs):
                labels = tf.reshape(labels, [-1, 1])
                # We need to compute the sampled_softmax_loss using 32bit
                # floats to avoid numerical instabilities.
                local_w_t = tf.cast(w_t, tf.float32)
                local_b = tf.cast(proj_bias, tf.float32)
                local_inputs = tf.cast(inputs, tf.float32)
                return tf.cast(
                    tf.nn.sampled_softmax_loss(
                        weights=local_w_t,
                        biases=local_b,
                        labels=labels,
                        inputs=local_inputs,
                        num_sampled=num_samples,
                        num_classes=self.target_vocab_size),
                    dtype)

            softmax_loss_function = sampled_loss

        # Create the internal multi-layer cell for our RNN.
        def single_cell():
            if use_lstm:
                return tf.contrib.rnn.BasicLSTMCell(size)
            return tf.contrib.rnn.GRUCell(size)

        cell = single_cell()
        if num_layers > 1:
            # One fresh cell object per layer.
            cell = tf.contrib.rnn.MultiRNNCell(
                [single_cell() for _ in range(num_layers)])

        # The seq2seq function: we use embedding for the input and attention.
        def seq2seq_f(encoder_inputs, decoder_inputs, feed_previous):
            return tf_seq2seq.embedding_attention_seq2seq(
                encoder_inputs,
                decoder_inputs,
                cell,
                num_encoder_symbols=source_vocab_size,
                num_decoder_symbols=target_vocab_size,
                embedding_size=size,
                output_projection=output_projection,
                feed_previous=feed_previous,  # do_decode
                dtype=dtype)

        # Feeds for inputs.
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []
        for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
            self.encoder_inputs.append(
                tf.placeholder(tf.int32, shape=[None],
                               name="encoder{0}".format(i)))
        for i in xrange(buckets[-1][1] + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32, shape=[None],
                               name="decoder{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(dtype, shape=[None],
                               name="weight{0}".format(i)))

        # Our targets are decoder inputs shifted by one.
        targets = self.decoder_inputs[1:]

        # for reinforcement learning
        # self.force_dec_input = tf.placeholder(tf.bool, name="force_dec_input")
        # self.en_output_proj = tf.placeholder(tf.bool, name="en_output_proj")

        # Training outputs and losses.
        if forward_only:
            self.outputs, self.losses, self.encoder_state = (
                tf_seq2seq.model_with_buckets(
                    self.encoder_inputs, self.decoder_inputs, targets,
                    self.target_weights, buckets,
                    lambda x, y: seq2seq_f(x, y, True),
                    softmax_loss_function=softmax_loss_function))
            # If we use output projection, we need to project outputs for
            # decoding.
            if output_projection is not None:
                for bucket_id in xrange(len(buckets)):
                    self.outputs[bucket_id] = [
                        tf.matmul(output, output_projection[0])
                        + output_projection[1]
                        for output in self.outputs[bucket_id]
                    ]
        else:
            self.outputs, self.losses, self.encoder_state = (
                tf_seq2seq.model_with_buckets(
                    self.encoder_inputs, self.decoder_inputs, targets,
                    self.target_weights, buckets,
                    lambda x, y: seq2seq_f(x, y, False),
                    softmax_loss_function=softmax_loss_function))

        # Disabled alternative (single graph switched by placeholders):
        # # Training outputs and losses.
        # self.outputs, self.losses, self.encoder_state = tf_seq2seq.model_with_buckets(
        #     self.encoder_inputs, self.decoder_inputs, targets,
        #     self.target_weights, buckets,
        #     lambda x, y: seq2seq_f(x, y, tf.where(self.force_dec_input, False, True)),
        #     softmax_loss_function=softmax_loss_function
        # )
        # # If we use output projection, we need to project outputs for decoding.
        # # if output_projection is not None:
        # for b in xrange(len(buckets)):
        #   self.outputs[b] = [
        #     control_flow_ops.cond(
        #       self.en_output_proj,
        #       lambda: tf.matmul(output, output_projection[0]) + output_projection[1],
        #       lambda: output
        #     )
        #     for output in self.outputs[b]
        #   ]

        # Gradients and SGD update operation for training the model.
        # These are built regardless of forward_only.
        params = tf.trainable_variables()
        # if not forward_only:
        self.gradient_norms = []
        self.updates = []
        # One advantage placeholder per bucket; not wired into the losses in
        # this variant (see the disabled subtraction below).
        self.advantage = [
            tf.placeholder(tf.float32, name="advantage_%i" % i)
            for i in xrange(len(buckets))
        ]
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        for bucket_id in xrange(len(buckets)):
            # self.losses[b] = tf.subtract(self.losses[b], self.advantage[b])
            gradients = tf.gradients(self.losses[bucket_id], params)
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms.append(norm)
            self.updates.append(
                opt.apply_gradients(
                    zip(clipped_gradients, params),
                    global_step=self.global_step))

        # Only save/restore the variables whose name starts with this model's
        # scope name.
        all_variables = [
            k for k in tf.global_variables()
            if k.name.startswith(self.scope_name)
        ]
        self.saver = tf.train.Saver(all_variables)
def __init__(self,
             source_vocab_size,
             target_vocab_size,
             buckets,
             size,
             num_layers,
             max_gradient_norm,
             batch_size,
             learning_rate,
             learning_rate_decay_factor,
             use_lstm=False,
             num_samples=512,
             forward_only=False,
             scope_name='seq2seq',
             dtype=tf.float32):
    """Create the model.

    Args:
      source_vocab_size: size of the source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: a list of pairs (I, O), where I specifies maximum input length
        that will be processed in that bucket, and O specifies maximum output
        length. Training instances that have inputs longer than I or outputs
        longer than O will be pushed to the next bucket and padded
        accordingly. We assume that the list is sorted, e.g.,
        [(2, 4), (8, 16)].
      size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training; the feed
        placeholders are created with an unspecified batch dimension, so it
        can be changed after initialization if this is convenient, e.g., for
        decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when
        needed.
      use_lstm: if true, we use LSTM cells instead of GRU cells.
      num_samples: number of samples for sampled softmax.
      forward_only: if set, the seq2seq function is built with
        feed_previous=True and, when an output projection is in use, the
        outputs are projected through it. Gradient and update ops are built
        in both modes.
      scope_name: variable scope the model is built under; the saver only
        covers global variables whose name starts with this string.
      dtype: the data type to use to store internal variables.
    """
    self.scope_name = scope_name
    with tf.variable_scope(self.scope_name):
        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(
            float(learning_rate), trainable=False, dtype=dtype)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)
        self.dummy_dialogs = []
        # self.dummy_dialogs = [["what", "?"], ["No","comment"], ["i", "'", "m", "sorry"], ["i", "'", "m", "not"], ["yeah."], ["okay", "."], ["why"], ["no", "."], ["yes"], ["fine"], ["yes"], ["sure"]]  # [TODO] load dummy sentences
        # encode dummies
        # self.dummy_dialogs = [str.encode(dum) for dum in self.dummy_dialogs]

        # If we use sampled softmax, we need an output projection.
        output_projection = None
        softmax_loss_function = None
        # Sampled softmax only makes sense if we sample less than vocabulary
        # size.
        if 0 < num_samples < self.target_vocab_size:
            w_t = tf.get_variable(
                "proj_w", [self.target_vocab_size, size], dtype=dtype)
            w = tf.transpose(w_t)
            # Named proj_bias (not `b`) so that no later loop variable can
            # rebind the name captured by the sampled_loss closure.
            proj_bias = tf.get_variable(
                "proj_b", [self.target_vocab_size], dtype=dtype)
            output_projection = (w, proj_bias)

            def sampled_loss(labels, inputs):
                labels = tf.reshape(labels, [-1, 1])
                # We need to compute the sampled_softmax_loss using 32bit
                # floats to avoid numerical instabilities.
                local_w_t = tf.cast(w_t, tf.float32)
                local_b = tf.cast(proj_bias, tf.float32)
                local_inputs = tf.cast(inputs, tf.float32)
                return tf.cast(
                    tf.nn.sampled_softmax_loss(
                        weights=local_w_t,
                        biases=local_b,
                        labels=labels,
                        inputs=local_inputs,
                        num_sampled=num_samples,
                        num_classes=self.target_vocab_size),
                    dtype)

            softmax_loss_function = sampled_loss

        # Create the internal multi-layer cell for our RNN.
        def single_cell():
            if use_lstm:
                return tf.contrib.rnn.BasicLSTMCell(size)
            return tf.contrib.rnn.GRUCell(size)

        cell = single_cell()
        if num_layers > 1:
            # One fresh cell object per layer.
            cell = tf.contrib.rnn.MultiRNNCell(
                [single_cell() for _ in range(num_layers)])

        # load training corpus
        def get_gold(workspace=args.workspace):
            """Read <sys.path[-1]>/<workspace>/data/train/chat.txt.gz.

            Returns the raw file content split on newlines.
            """
            data_dir = "%s/data" % (workspace)
            full_path = (
                str(sys.path[-1]) + "/" + data_dir + "/train/chat.txt.gz")
            print(full_path)
            # The file is opened in binary mode, so split on a bytes
            # separator: this is identical on Python 2 and avoids the
            # bytes/str TypeError on Python 3. The `with` block closes the
            # file, so no explicit close() is needed.
            with gzip.open(full_path, 'rb') as zi:
                return zi.read().split(b"\n")

        # The seq2seq function: we use embedding for the input and attention.
        def seq2seq_f(encoder_inputs, decoder_inputs, feed_previous):
            return tf_seq2seq.embedding_attention_seq2seq(
                encoder_inputs,
                decoder_inputs,
                cell,
                num_encoder_symbols=source_vocab_size,
                num_decoder_symbols=target_vocab_size,
                embedding_size=size,
                output_projection=output_projection,
                feed_previous=feed_previous,  # do_decode
                dtype=dtype)

        # Feeds for inputs.
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []
        for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
            self.encoder_inputs.append(
                tf.placeholder(tf.int32, shape=[None],
                               name="encoder{0}".format(i)))
        for i in xrange(buckets[-1][1] + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32, shape=[None],
                               name="decoder{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(dtype, shape=[None],
                               name="weight{0}".format(i)))

        # Our targets are decoder inputs shifted by one.
        targets = self.decoder_inputs[1:]

        # for reinforcement learning
        # NOTE(review): these two placeholders are created but not consumed
        # by the graph built below (only by the disabled alternative).
        self.force_dec_input = tf.placeholder(tf.bool, name="force_dec_input")
        self.en_output_proj = tf.placeholder(tf.bool, name="en_output_proj")

        # Training outputs and losses.
        if forward_only:
            self.outputs, self.losses, self.encoder_state = (
                tf_seq2seq.model_with_buckets(
                    self.encoder_inputs, self.decoder_inputs, targets,
                    self.target_weights, buckets,
                    lambda x, y: seq2seq_f(x, y, True),
                    softmax_loss_function=softmax_loss_function))
            # If we use output projection, we need to project outputs for
            # decoding.
            if output_projection is not None:
                for bucket_id in xrange(len(buckets)):
                    self.outputs[bucket_id] = [
                        tf.matmul(output, output_projection[0])
                        + output_projection[1]
                        for output in self.outputs[bucket_id]
                    ]
        else:
            self.outputs, self.losses, self.encoder_state = (
                tf_seq2seq.model_with_buckets(
                    self.encoder_inputs, self.decoder_inputs, targets,
                    self.target_weights, buckets,
                    lambda x, y: seq2seq_f(x, y, False),
                    softmax_loss_function=softmax_loss_function))

        # Disabled alternative (single graph switched by placeholders):
        # # Training outputs and losses.
        # self.outputs, self.losses, self.encoder_state = tf_seq2seq.model_with_buckets(
        #     self.encoder_inputs, self.decoder_inputs, targets,
        #     self.target_weights, buckets,
        #     lambda x, y: seq2seq_f(x, y, tf.where(self.force_dec_input, False, True)),
        #     softmax_loss_function=softmax_loss_function
        # )
        # # If we use output projection, we need to project outputs for decoding.
        # # if output_projection is not None:
        # for b in xrange(len(buckets)):
        #   self.outputs[b] = [
        #     control_flow_ops.cond(
        #       self.en_output_proj,
        #       lambda: tf.matmul(output, output_projection[0]) + output_projection[1],
        #       lambda: output
        #     )
        #     for output in self.outputs[b]
        #   ]

        # Gradients and SGD update operation for training the model.
        # These are built regardless of forward_only.
        params = tf.trainable_variables()
        # if not forward_only:
        self.gradient_norms = []
        self.updates = []
        self.advantage = [
            tf.placeholder(tf.float32, name="advantage_%i" % i)
            for i in xrange(len(buckets))
        ]
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        for bucket_id in xrange(len(buckets)):
            # NOTE(review): the advantage is a fed placeholder, so
            # subtracting it shifts the reported loss value but should not
            # change the gradients w.r.t. params -- confirm this is the
            # intended reinforcement-learning update.
            self.losses[bucket_id] = tf.subtract(
                self.losses[bucket_id], self.advantage[bucket_id])
            gradients = tf.gradients(self.losses[bucket_id], params)
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms.append(norm)
            self.updates.append(
                opt.apply_gradients(
                    zip(clipped_gradients, params),
                    global_step=self.global_step))

        # Only save/restore the variables whose name starts with this model's
        # scope name.
        all_variables = [
            k for k in tf.global_variables()
            if k.name.startswith(self.scope_name)
        ]
        print("updating new weights...................")
        self.saver = tf.train.Saver(all_variables)
        # NOTE(review): the corpus is read here but the result is never used
        # in this constructor; kept so construction-time behaviour (path
        # print, failure on a missing file) is unchanged -- consider removing.
        data_ = get_gold()
def __init__(self,
             source_vocab_size,
             target_vocab_size,
             buckets,
             size,
             num_layers,
             max_gradient_norm,
             batch_size,
             learning_rate,
             learning_rate_decay_factor,
             use_lstm=False,
             num_samples=512,
             scope_name='seq2seq',
             dtype=tf.float32):
    """Create the model.

    Decoding behaviour is selected at run time through two boolean
    placeholders created here: `self.force_dec_input` (its negation is passed
    as feed_previous to the seq2seq function) and `self.en_output_proj`
    (selects whether outputs are passed through the output projection).

    Args:
      source_vocab_size: size of the source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: a list of pairs (I, O), where I specifies maximum input length
        that will be processed in that bucket, and O specifies maximum output
        length. Training instances that have inputs longer than I or outputs
        longer than O will be pushed to the next bucket and padded
        accordingly. We assume that the list is sorted, e.g.,
        [(2, 4), (8, 16)].
      size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training; in this
        variant the feed placeholders are created with shape [batch_size].
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when
        needed.
      use_lstm: if true, we use LSTM cells instead of GRU cells.
      num_samples: number of samples for sampled softmax.
      scope_name: variable scope the model is built under; the saver only
        covers variables whose name starts with this string.
      dtype: the data type to use to store internal variables.
    """
    self.scope_name = scope_name
    with tf.variable_scope(self.scope_name):
        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(
            float(learning_rate), trainable=False, dtype=dtype)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)
        self.dummy_dialogs = []  # [TODO] load dummy sentences

        # If we use sampled softmax, we need an output projection.
        output_projection = None
        softmax_loss_function = None
        # Sampled softmax only makes sense if we sample less than vocabulary
        # size.
        if 0 < num_samples < self.target_vocab_size:
            w = tf.get_variable(
                "proj_w", [size, self.target_vocab_size], dtype=dtype)
            w_t = tf.transpose(w)
            # Named proj_bias (not `b`) so that no later loop variable can
            # rebind the name captured by the sampled_loss closure.
            proj_bias = tf.get_variable(
                "proj_b", [self.target_vocab_size], dtype=dtype)
            output_projection = (w, proj_bias)

            def sampled_loss(inputs, labels):
                labels = tf.reshape(labels, [-1, 1])
                # We need to compute the sampled_softmax_loss using 32bit
                # floats to avoid numerical instabilities.
                local_w_t = tf.cast(w_t, tf.float32)
                local_b = tf.cast(proj_bias, tf.float32)
                local_inputs = tf.cast(inputs, tf.float32)
                # Keyword arguments: the positional order of inputs/labels
                # is not the same in every TensorFlow release.
                return tf.cast(
                    tf.nn.sampled_softmax_loss(
                        weights=local_w_t,
                        biases=local_b,
                        inputs=local_inputs,
                        labels=labels,
                        num_sampled=num_samples,
                        num_classes=self.target_vocab_size),
                    dtype)

            softmax_loss_function = sampled_loss

        # Create the internal multi-layer cell for our RNN.
        def single_cell():
            if use_lstm:
                return tf.nn.rnn_cell.BasicLSTMCell(size)
            return tf.nn.rnn_cell.GRUCell(size)

        cell = single_cell()
        if num_layers > 1:
            # One fresh cell object per layer instead of the same instance
            # repeated num_layers times.
            cell = tf.nn.rnn_cell.MultiRNNCell(
                [single_cell() for _ in range(num_layers)])

        # The seq2seq function: we use embedding for the input and attention.
        def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
            return tf_seq2seq.embedding_attention_seq2seq(
                encoder_inputs,
                decoder_inputs,
                cell,
                num_encoder_symbols=source_vocab_size,
                num_decoder_symbols=target_vocab_size,
                embedding_size=size,
                output_projection=output_projection,
                feed_previous=do_decode,
                dtype=dtype)

        # Feeds for inputs.
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []
        for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
            self.encoder_inputs.append(
                tf.placeholder(tf.int32, shape=[batch_size],
                               name="encoder{0}".format(i)))
        for i in xrange(buckets[-1][1] + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32, shape=[batch_size],
                               name="decoder{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(dtype, shape=[batch_size],
                               name="weight{0}".format(i)))

        # Our targets are decoder inputs shifted by one.
        targets = self.decoder_inputs[1:]

        # for reinforcement learning
        self.force_dec_input = tf.placeholder(tf.bool, name="force_dec_input")
        self.en_output_proj = tf.placeholder(tf.bool, name="en_output_proj")

        # Training outputs and losses.
        # if forward_only:  # testing or reinforcement training
        # feed_previous is the negation of force_dec_input; logical_not is
        # equivalent to the former tf.select(pred, False, True).
        self.outputs, self.losses, self.encoder_state = (
            tf_seq2seq.model_with_buckets(
                self.encoder_inputs, self.decoder_inputs, targets,
                self.target_weights, buckets,
                lambda x, y: seq2seq_f(
                    x, y, tf.logical_not(self.force_dec_input)),
                softmax_loss_function=softmax_loss_function))

        # If we use output projection, we need to project outputs for
        # decoding. Without one (num_samples is 0 or not smaller than the
        # target vocabulary) there is nothing to project, so the outputs are
        # left untouched instead of indexing into None.
        self.projected_outputs = {}
        if output_projection is not None:
            for bucket_id in xrange(len(buckets)):
                self.outputs[bucket_id] = [
                    control_flow_ops.cond(
                        self.en_output_proj,
                        lambda: tf.matmul(output, output_projection[0])
                        + output_projection[1],
                        lambda: output)
                    for output in self.outputs[bucket_id]
                ]
        # else:  # normal training
        #   self.outputs, self.losses, self.encoder_state = tf_seq2seq.model_with_buckets(
        #       self.encoder_inputs, self.decoder_inputs, targets,
        #       self.target_weights, buckets,
        #       lambda x, y: seq2seq_f(x, y, False),
        #       softmax_loss_function=softmax_loss_function)

        # Gradients and SGD update operation for training the model.
        params = tf.trainable_variables()
        self.gradient_norms = []
        self.updates = []
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        for bucket_id in xrange(len(buckets)):
            gradients = tf.gradients(self.losses[bucket_id], params)
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms.append(norm)
            self.updates.append(
                opt.apply_gradients(
                    zip(clipped_gradients, params),
                    global_step=self.global_step))

        # self.saver = tf.train.Saver(tf.all_variables())
        # Only save/restore the variables whose name starts with this model's
        # scope name.
        all_variables = [
            k for k in tf.all_variables()
            if k.name.startswith(self.scope_name)
        ]
        self.saver = tf.train.Saver(all_variables)