def __init__(self, args, infer=False):
    """Build the RNN language-model graph: placeholders, loss and train op.

    Args:
        args: Hyper-parameter object. This constructor reads ``model``
            ('rnn', 'gru' or 'lstm'), ``rnn_size``, ``num_layers``,
            ``batch_size``, ``seq_length``, ``vocab_size`` and ``grad_clip``.
        infer: When true, ``args.batch_size`` and ``args.seq_length`` are
            overwritten with 1 (on the caller's object) and the decoder is
            handed a loop function that maps each output to the embedding of
            its arg-max symbol.

    Raises:
        Exception: If ``args.model`` names an unsupported cell type.
    """
    self.args = args
    if infer:
        args.batch_size = 1
        args.seq_length = 1

    if args.model == 'rnn':
        cell_fn = core_rnn_cell.BasicRNNCell
    elif args.model == 'gru':
        cell_fn = core_rnn_cell.GRUCell
    elif args.model == 'lstm':
        cell_fn = core_rnn_cell.BasicLSTMCell
    else:
        raise Exception("model type not supported: {}".format(args.model))

    def make_cell():
        # `state_is_tuple` is an LSTM-only constructor argument; passing it
        # to the basic RNN or GRU cell raises a TypeError.
        if args.model == 'lstm':
            return cell_fn(args.rnn_size, state_is_tuple=True)
        return cell_fn(args.rnn_size)

    # Build a distinct cell object for every layer instead of repeating one
    # object with `[cell] * n`, so that no two layers share a cell instance.
    self.cell = cell = core_rnn_cell.MultiRNNCell(
        [make_cell() for _ in range(args.num_layers)], state_is_tuple=True)

    self.input_data = tf.placeholder(
        tf.int32, [args.batch_size, args.seq_length], name="input_data")
    self.targets = tf.placeholder(
        tf.int32, [args.batch_size, args.seq_length], name="targets")
    self.initial_state = cell.zero_state(args.batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("softmax_w",
                                    [args.rnn_size, args.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding",
                                        [args.vocab_size, args.rnn_size])
            # Split the looked-up embeddings along axis 1 into seq_length
            # pieces, then drop that axis from each piece.
            inputs = tf.split(
                tf.nn.embedding_lookup(embedding, self.input_data),
                args.seq_length, 1)
            inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    def loop(prev, _):
        # Project the previous output to vocabulary logits, take the arg-max
        # symbol (with gradients stopped) and return its embedding.
        prev = tf.matmul(prev, softmax_w) + softmax_b
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(embedding, prev_symbol)

    outputs, last_state = seq2seq.rnn_decoder(
        inputs, self.initial_state, cell,
        loop_function=loop if infer else None, scope='rnnlm')
    output = tf.reshape(tf.concat(outputs, 1), [-1, args.rnn_size])
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits, name="prob_results")
    # NOTE(review): `args.vocab_size` is passed as the fourth positional
    # argument exactly as before; confirm it matches the signature of the
    # `seq2seq` module this file imports.
    loss = seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.targets, [-1])],
        [tf.ones([args.batch_size * args.seq_length])],
        args.vocab_size)
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    self.final_state = last_state

    # The learning rate is a non-trainable variable (initialised to 0.0).
    self.lr = tf.Variable(0.0, trainable=False, name="LR_")
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                      args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, vocabularySize, config_param):
    """Build a multi-layer basic-RNN language-model graph.

    Args:
        vocabularySize: Number of distinct symbols; used for the embedding
            table and the softmax projection.
        config_param: Configuration object. This constructor reads
            ``batch_size``, ``sequence_size``, ``embeddingSize``,
            ``hidden_size`` and ``num_layers``.
    """
    self.vocabularySize = vocabularySize
    self.config = config_param

    self._inputX = tf.placeholder(
        tf.int32,
        [self.config.batch_size, self.config.sequence_size],
        "InputsX")
    self._inputTargetsY = tf.placeholder(
        tf.int32,
        [self.config.batch_size, self.config.sequence_size],
        "InputTargetsY")

    # Convert the input ids to embedded form. The embedding ops are pinned
    # to the CPU device.
    with tf.device("/cpu:0"):
        embedding = tf.get_variable(
            "embedding", [self.vocabularySize, self.config.embeddingSize])
        embeddingLookedUp = tf.nn.embedding_lookup(embedding, self._inputX)
        inputs = tf.split(axis=1,
                          num_or_size_splits=self.config.sequence_size,
                          value=embeddingLookedUp)
        inputTensorsAsList = [tf.squeeze(input_, [1]) for input_ in inputs]

    # Define the stacked RNN. Each layer gets its own cell object; repeating
    # one object with `[cell] * n` would make every layer the same instance.
    layer_cells = [rnn_cell.BasicRNNCell(self.config.hidden_size)
                   for _ in range(self.config.num_layers)]
    self.multilayerRNN = rnn_cell.MultiRNNCell(layer_cells)
    self._initial_state = self.multilayerRNN.zero_state(
        self.config.batch_size, tf.float32)

    # Define the logits.
    hidden_layer_output, last_state = rnn.static_rnn(
        self.multilayerRNN,
        inputTensorsAsList,
        initial_state=self._initial_state)
    hidden_layer_output = tf.reshape(
        tf.concat(axis=1, values=hidden_layer_output),
        [-1, self.config.hidden_size])
    self._logits = tf.nn.xw_plus_b(
        hidden_layer_output,
        tf.get_variable("softmax_w",
                        [self.config.hidden_size, self.vocabularySize]),
        tf.get_variable("softmax_b", [self.vocabularySize]))
    self._predictionSoftmax = tf.nn.softmax(self._logits)

    # Define the loss: every position is weighted 1, and the summed loss is
    # divided by the batch size.
    loss = seq2seq.sequence_loss_by_example(
        [self._logits],
        [tf.reshape(self._inputTargetsY, [-1])],
        [tf.ones([self.config.batch_size * self.config.sequence_size])],
        self.vocabularySize)
    self._cost = tf.div(tf.reduce_sum(loss), self.config.batch_size)
    self._final_state = last_state
def build_input_sequence(self, gpu_id=0):
    """Build the stacked peephole-LSTM encoder over the input embedding.

    Args:
        gpu_id: Index into ``self.state_list``, ``self.output_list`` and
            ``self.split_seqLengths`` selecting which slot this call fills.
    """
    # Embedding layer.
    self.__build_embedding_layer__()

    def make_layer():
        # A fresh LSTM (with dropout on both input and output) per layer, so
        # that layers do not all point at the same cell object.
        lstm = rnn_cell.LSTMCell(self.hidden_size,
                                 use_peepholes=True,
                                 state_is_tuple=True)
        return rnn_cell.DropoutWrapper(lstm,
                                       input_keep_prob=self.keep_prob,
                                       output_keep_prob=self.keep_prob)

    with get_new_variable_scope('rnn_lstm'):
        cell = rnn_cell.MultiRNNCell(
            [make_layer() for _ in range(self.num_layers)],
            state_is_tuple=True)
        # NOTE(review): if `dynamic_rnn` is TensorFlow's `tf.nn.dynamic_rnn`
        # it returns (outputs, state), in which case these two targets are
        # swapped. Left as in the original; confirm against the import.
        self.state_list[gpu_id], self.output_list[gpu_id] = dynamic_rnn(
            cell,
            self.input_embedding,
            self.split_seqLengths[gpu_id],
            dtype=tf.float32)

    if self.input_params is None:
        # Skips the first trainable variable -- presumably the embedding
        # created above; TODO confirm.
        self.input_params = tf.trainable_variables()[1:]
def get_dec_cell(self, cell_size):
    """Return the decoder cell: two stacked GRU layers of ``cell_size`` units.

    Args:
        cell_size: Number of units in each GRU layer.

    Returns:
        A ``MultiRNNCell`` wrapping two separately constructed GRU cells.
    """
    # TODO: dropout is currently not applied to the decoder cell. The
    # previous single-layer + DropoutWrapper path sat behind `if True: ...
    # else:` and was unreachable, so it has been removed.
    num_layers = 2
    # One GRU object per layer rather than `[cell] * num_layers`, which would
    # put the very same cell instance in both layers.
    layers = [core_rnn_cell.GRUCell(cell_size) for _ in range(num_layers)]
    return core_rnn_cell.MultiRNNCell(layers)
def __init__(self, is_training, config, input_):
    """Build the stacked-LSTM language-model graph.

    Args:
        is_training: When false, construction stops after the cost and final
            state are defined (no optimizer ops are created).
        config: Configuration object. This constructor reads
            ``hidden_size``, ``vocab_size``, ``keep_prob``, ``num_layers``
            and ``max_grad_norm``.
        input_: Input object providing ``batch_size``, ``num_steps``,
            ``input_data`` and ``targets``.
    """
    self._input = input_

    batch_size = input_.batch_size
    num_steps = input_.num_steps
    size = config.hidden_size
    vocab_size = config.vocab_size
    use_dropout = is_training and config.keep_prob < 1

    # Slightly better results can be obtained with forget gate biases
    # initialized to 1 but the hyperparameters of the model would need to be
    # different than reported in the paper.
    # NOTE(review): no `forget_bias` is passed below, so the library default
    # applies -- confirm which value is intended.
    def make_layer():
        # A new cell object per layer: `[lstm_cell] * n` would place the
        # same instance in every layer.
        layer = core_rnn_cell.BasicLSTMCell(num_units=size,
                                            state_is_tuple=True)
        if use_dropout:
            layer = tf.contrib.rnn.DropoutWrapper(
                layer, output_keep_prob=config.keep_prob)
        return layer

    cell = core_rnn_cell.MultiRNNCell(
        [make_layer() for _ in range(config.num_layers)],
        state_is_tuple=True)

    self._initial_state = cell.zero_state(batch_size, data_type())

    with tf.device("/cpu:0"):
        embedding = tf.get_variable(
            "embedding", [vocab_size, size], dtype=data_type())
        inputs = tf.nn.embedding_lookup(embedding, input_.input_data)

    if use_dropout:
        inputs = tf.nn.dropout(inputs, config.keep_prob)

    # Unroll the cell manually over the time axis, reusing the variables
    # created at the first step for every later step.
    outputs = []
    state = self._initial_state
    with tf.variable_scope("RNN"):
        for time_step in range(num_steps):
            if time_step > 0:
                tf.get_variable_scope().reuse_variables()
            (cell_output, state) = cell(inputs[:, time_step, :], state)
            outputs.append(cell_output)

    output = tf.reshape(tf.concat(outputs, 1), [-1, size])
    softmax_w = tf.get_variable(
        "softmax_w", [size, vocab_size], dtype=data_type())
    softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type())
    logits = tf.matmul(output, softmax_w) + softmax_b
    loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
        [logits],
        [tf.reshape(input_.targets, [-1])],
        [tf.ones([batch_size * num_steps], dtype=data_type())])
    self._cost = cost = tf.reduce_sum(loss) / batch_size
    self._final_state = state

    if not is_training:
        return

    # Training-only ops: clipped-gradient SGD with an assignable learning
    # rate.
    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                      config.max_grad_norm)
    optimizer = tf.train.GradientDescentOptimizer(self._lr)
    self._train_op = optimizer.apply_gradients(
        zip(grads, tvars),
        global_step=tf.contrib.framework.get_or_create_global_step())

    self._new_lr = tf.placeholder(
        tf.float32, shape=[], name="new_learning_rate")
    self._lr_update = tf.assign(self._lr, self._new_lr)
def __init__(self, pre_trained_seq2seq, pre_trained_backward, vocab_size,
             buckets, layer_size, num_layers, max_gradient_norm, batch_size,
             learning_rate, learning_rate_decay_factor, use_lstm=False,
             num_samples=512, forward_only = False, dtype= tf.float32):
    """Create a Model.

    Similar to the seq2seq_model_rl.py code but it differs in the loss
    function.

    Args:
        pre_trained_seq2seq: Model object whose ``outputs[bucket_id]`` is
            read as the forward logits.
        pre_trained_backward: Model object whose ``outputs[bucket_id]`` is
            read as the backward logits.
        vocab_size: size of vocabulary.
        buckets: a list of pairs (I, O), where I specifies maximum input
            length that will be processed in that bucket, and O specifies
            maximum output length. Training instances that have inputs
            longer than I or outputs longer than O will be pushed to the
            next bucket and padded accordingly. We assume that the list is
            sorted. ** We may not use bucketing for Dialogue.
        layer_size: the number of units in each layer.
        num_layers: the number of the layers in the model.
        max_gradient_norm: gradients will be clipped to maximally this norm.
        batch_size: batch size; used to reshape per-word slices and the
            cost / log-probability vectors.
        learning_rate: learning rate to start with.
        learning_rate_decay_factor: decay learning rate by this much when
            needed.
        use_lstm: True -> LSTM cells, False -> GRU cells.
        num_samples: not read by this constructor.
        forward_only: if set, we do not construct the backward pass in the
            model.
        dtype: the data type to use to store internal variables.
    """
    self.vocab_size = vocab_size
    self.buckets = buckets
    # Backward buckets pair each output length with itself.
    self.buckets_back = [(x[1],x[1]) for x in buckets]
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False, dtype = dtype)
    self.learning_rate_decay_op = self.learning_rate.assign(self.learning_rate*learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)
    self.pre_trained_seq2seq = pre_trained_seq2seq
    self.pre_trained_backward = pre_trained_backward
    # The bucket is fixed to the first one.
    self.bucket_id = 0

    # Variables
    w_t = tf.get_variable("proj_w",[self.vocab_size, layer_size], dtype = dtype)
    w = tf.transpose(w_t)
    b = tf.get_variable("proj_b", [self.vocab_size], dtype=dtype)
    output_projection = (w,b)

    def make_single_cell():
        # Fresh cell object on every call so stacked layers never share one
        # instance (as `[single_cell]*num_layers` did).
        if use_lstm:
            return core_rnn_cell.BasicLSTMCell(layer_size)
        return core_rnn_cell.GRUCell(layer_size)

    if num_layers > 1:
        cell = core_rnn_cell.MultiRNNCell(
            [make_single_cell() for _ in range(num_layers)])
    else:
        cell = make_single_cell()

    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
        return tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
            encoder_inputs, decoder_inputs, cell,
            num_encoder_symbols = vocab_size,
            num_decoder_symbols=vocab_size,
            embedding_size = layer_size,
            output_projection = output_projection,
            feed_previous = do_decode,
            dtype = dtype)

    # Placeholders, one per position up to the largest bucket.
    self.states, self.states_back, self.action_dums = [], [], []  # states_back : the 2nd half of the states (each)
    self.actions , self.actions_back = [], []
    self.weights, self.weights_back = [],[]
    for i in xrange(self.buckets[-1][0]):
        self.states.append(tf.placeholder(tf.int32, shape=[None], name ="state{0}".format(i)))
    for i in xrange(self.buckets[-1][1]):
        self.action_dums.append(tf.placeholder(tf.int32, shape=[None], name ="action_dum{0}".format(i)))
        self.actions.append(tf.placeholder(tf.int32, shape=[None], name ="action{0}".format(i)))
        self.weights.append(tf.placeholder(dtype, shape=[None], name ="weight_rl{0}".format(i)))
    for i in xrange(self.buckets_back[-1][0]):
        self.actions_back.append(tf.placeholder(tf.int32, shape=[None], name ="action_back{0}".format(i)))
    for i in xrange(self.buckets_back[-1][1]):
        self.states_back.append(tf.placeholder(tf.int32, shape=[None], name ="state_back{0}".format(i)))
    for i in xrange(self.buckets[-1][1]):
        self.weights_back.append(tf.placeholder(dtype, shape=[None], name="weight_rl_back{0}".format(i)))

    # 1. Get batch actions
    self.actions_sam, self.logprob = self.generate_batch_action(
        self.states, self.action_dums, self.bucket_id,
        lambda x,y:seq2seq_f(x,y,True),
        output_projection= output_projection)

    # 2. Get the loss
    def mi_score(states, actions, weights, states_back, actions_back, weights_back):
        """Return the sum of the weighted forward and backward scores.

        Only ``actions``, ``weights``, ``states_back`` and ``weights_back``
        are read; ``states`` and ``actions_back`` are accepted but unused.
        """
        n_fwd = self.buckets[self.bucket_id][1]
        n_bwd = self.buckets_back[self.bucket_id][1]

        output_logits = self.pre_trained_seq2seq.outputs[self.bucket_id]
        log_prob = []
        # NOTE(review): dim=0 normalizes over the axis indexed by `word_idx`
        # below, not over the vocabulary axis; the explicit log-sum-exp in
        # the loop then renormalizes over the vocabulary. Confirm this is
        # the intended computation.
        logprob_s2s = tf.nn.log_softmax(output_logits,dim=0)
        for word_idx in xrange(n_fwd):
            one_hot_mat = tf.one_hot(actions[word_idx],depth=self.vocab_size, on_value = 1.0, off_value=0.0, axis =1, dtype=tf.float32 )
            tmp1 = tf.reshape(tf.slice(logprob_s2s, [word_idx,0,0],[1,-1,-1]), shape = (self.batch_size, self.vocab_size))
            # Value at the chosen symbol minus the log-sum-exp over axis 1.
            log_prob_word = tf.subtract(tf.reduce_sum(tf.multiply(tmp1 , one_hot_mat),1), tf.log(tf.reduce_sum(tf.exp(tmp1),1)))
            log_prob.append(tf.multiply(log_prob_word, weights[word_idx]))

        output_logits_back = self.pre_trained_backward.outputs[self.bucket_id]
        log_prob_back = []
        logprob_back = tf.nn.log_softmax(output_logits_back,dim=0)
        # Backward weights are shifted right by one position, with an
        # all-ones vector in front.
        w_back_new = [np.ones(self.batch_size, dtype = np.float32)] + weights_back[:-1]
        for word_idx in xrange(n_bwd):
            one_hot_mat = tf.one_hot(states_back[word_idx],depth=self.vocab_size, on_value = 1.0, off_value=0.0, axis =1, dtype=tf.float32 )
            tmp2 = tf.reshape(tf.slice(logprob_back, [word_idx,0,0],[1,-1,-1]), shape = (self.batch_size, self.vocab_size))
            log_prob_word = tf.subtract(tf.reduce_sum(tf.multiply(tmp2 , one_hot_mat),1), tf.log(tf.reduce_sum(tf.exp(tmp2),1)))
            log_prob_back.append(tf.multiply(log_prob_word, w_back_new[word_idx]))

        # Each direction's weighted sum is divided by its total weight.
        return tf.divide(tf.add_n(log_prob), tf.add_n(weights[:n_fwd])) + tf.divide(tf.add_n(log_prob_back), tf.add_n(w_back_new[:n_bwd]))

    if not forward_only:
        self.neg_penalty = tf.placeholder(tf.float32, shape=[None], name="neg_penalty")
        # Reward = MI score plus 0.05 times the summed forward weights.
        self.reward = mi_score(self.states, self.actions, self.weights, self.states_back, self.actions_back, self.weights_back) + tf.scalar_mul(tf.constant(0.05,shape=()), tf.add_n(self.weights[:self.buckets[self.bucket_id][1]]))
        joint_logprob = tf.reduce_sum(self.logprob,axis=0)

        # 3. Gradient Descent Optimization
        # Only variables with a "mi" component in their name are trained.
        params = [x for x in tf.trainable_variables() if "mi" in str(x.name).split("/")]
        cost = tf.scalar_mul(tf.constant(-1.0,shape=()), tf.add(self.neg_penalty, self.reward))
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        # The differentiated scalar is the inner product of `cost` and
        # `joint_logprob` over the batch.
        gradients = tf.gradients(tf.matmul(tf.reshape(cost, shape=(self.batch_size,1)), tf.reshape(joint_logprob,shape=(self.batch_size,1)), transpose_a=True), params)
        # Clips values of multiple tensors by the ratio of the sum of their norms.
        clipped_gradients, global_norm = tf.clip_by_global_norm(gradients, max_gradient_norm)
        # An Operation that applies the specified gradients. If global_step
        # was not None, that operation also increments global_step.
        self.updates = opt.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step)

    # Save/restore only the variables with a "mi" name component, keyed by
    # their name without the ":0" suffix.
    self.names = {str(x.name).split(":0")[0] : x for x in tf.global_variables() if 'mi' in str(x.name).split("/")}
    self.saver = tf.train.Saver(self.names)
def __init__(self, vocab_size, buckets, layer_size, num_layers,
             max_gradient_norm, batch_size, learning_rate,
             learning_rate_decay_factor, use_lstm=False, num_samples=512,
             MI_use=False, forward_only=False, dtype=tf.float32):
    """Create a Model.

    Similar to the seq2seq_model.py code in the tensorflow version 0.12.1.

    Args:
        vocab_size: size of vocabulary.
        buckets: a list of pairs (I, O), where I specifies maximum input
            length that will be processed in that bucket, and O specifies
            maximum output length. Training instances that have inputs
            longer than I or outputs longer than O will be pushed to the
            next bucket and padded accordingly. We assume that the list is
            sorted. ** We may not use bucketing for Dialogue.
        layer_size: the number of units in each layer.
        num_layers: the number of the layers in the model.
        max_gradient_norm: gradients will be clipped to maximally this norm.
        batch_size: the size of the batches used during training; the model
            construction is independent of batch_size, so it can be changed
            after initialization if this is convenient, e.g., for decoding.
        learning_rate: learning rate to start with.
        learning_rate_decay_factor: decay learning rate by this much when
            needed.
        use_lstm: True -> LSTM cells, False -> GRU cells.
        num_samples: the number of samples for sampled softmax.
        MI_use: if set, the saver only covers variables that have a
            'forward' component in their name.
        forward_only: if set, we do not construct the backward pass in the
            model.
        dtype: the data type to use to store internal variables.
    """
    self.vocab_size = vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(float(learning_rate),
                                     trainable=False,
                                     dtype=dtype)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    output_projection = None
    softmax_loss_function = None
    # Sampled softmax only makes sense if we sample less than vocabulary size.
    if num_samples > 0 and num_samples < self.vocab_size:
        w_t = tf.get_variable("proj_w", [self.vocab_size, layer_size],
                              dtype=dtype)
        w = tf.transpose(w_t)
        b = tf.get_variable("proj_b", [self.vocab_size], dtype=dtype)
        output_projection = (w, b)

        def sampled_loss(labels, inputs):
            # Argument order is (labels, inputs), the reverse of the 0.12.x
            # API.
            labels = tf.reshape(labels, [-1, 1])
            # We need to compute the sampled_softmax_loss using 32bit floats
            # to avoid numerical instabilities.
            local_w_t = tf.cast(w_t, tf.float32)
            local_b = tf.cast(b, tf.float32)
            local_inputs = tf.cast(inputs, tf.float32)
            return tf.cast(
                tf.nn.sampled_softmax_loss(weights=local_w_t,
                                           biases=local_b,
                                           labels=labels,
                                           inputs=local_inputs,
                                           num_sampled=num_samples,
                                           num_classes=self.vocab_size),
                dtype)

        softmax_loss_function = sampled_loss
    self.softmax_loss_function = softmax_loss_function

    # Create the internal multi-layer cell for our RNN. Every layer gets its
    # own cell object; `[single_cell] * num_layers` would reuse one instance.
    def make_single_cell():
        if use_lstm:
            return core_rnn_cell.BasicLSTMCell(layer_size)
        return core_rnn_cell.GRUCell(layer_size)

    if num_layers > 1:
        cell = core_rnn_cell.MultiRNNCell(
            [make_single_cell() for _ in range(num_layers)])
    else:
        cell = make_single_cell()

    # The seq2seq function: we use embedding for the input and attention.
    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
        return tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
            encoder_inputs,
            decoder_inputs,
            cell,
            num_encoder_symbols=vocab_size,
            num_decoder_symbols=vocab_size,
            embedding_size=layer_size,
            output_projection=output_projection,
            feed_previous=do_decode,
            dtype=dtype)

    self.seq2seq_f = seq2seq_f

    # Feeds for inputs.
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    for i in xrange(buckets[-1][0]):
        self.encoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="encoder{0}".format(i)))
    for i in xrange(buckets[-1][1] + 1):  # For EOS
        self.decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="decoder{0}".format(i)))
        self.target_weights.append(
            tf.placeholder(dtype, shape=[None],
                           name="weight{0}".format(i)))

    # Targets are the decoder inputs shifted by one: (i+1) because of the GO
    # symbol at the beginning.
    targets = [
        self.decoder_inputs[i + 1]
        for i in xrange(len(self.decoder_inputs) - 1)
    ]

    # Training outputs and losses (a list (len(buckets)) of 1-D batch-sized
    # tensors).
    # The bucket loops below use `bucket_idx`, not `b`: rebinding `b` would
    # change what the stored `sampled_loss` closure sees as its bias.
    if forward_only:
        self.outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets(
            self.encoder_inputs,
            self.decoder_inputs,
            targets,
            self.target_weights,
            buckets,
            lambda x, y: seq2seq_f(x, y, True),
            softmax_loss_function=softmax_loss_function)
        if output_projection is not None:
            # Apply the output projection to the raw decoder outputs.
            for bucket_idx in xrange(len(buckets)):
                self.outputs[bucket_idx] = [
                    tf.matmul(output, output_projection[0]) +
                    output_projection[1]
                    for output in self.outputs[bucket_idx]
                ]
    else:
        self.outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets(
            self.encoder_inputs,
            self.decoder_inputs,
            targets,
            self.target_weights,
            buckets,
            lambda x, y: seq2seq_f(x, y, False),
            softmax_loss_function=softmax_loss_function)

    # All variables created with trainable=True.
    params = tf.trainable_variables()
    if not forward_only:
        self.gradient_norms = []
        self.updates = []
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        for bucket_idx in xrange(len(buckets)):
            gradients = tf.gradients(self.losses[bucket_idx], params)
            # Clips values of multiple tensors by the ratio of the sum of
            # their norms.
            clipped_gradients, global_norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms.append(global_norm)
            # An Operation that applies the specified gradients. If
            # global_step was not None, that operation also increments
            # global_step.
            self.updates.append(
                opt.apply_gradients(zip(clipped_gradients, params),
                                    global_step=self.global_step))

    if MI_use:
        # Restrict the saver to variables with a 'forward' name component,
        # keyed by their name without the ":0" suffix.
        self.names = {
            str(x.name).split(":0")[0]: x
            for x in tf.global_variables()
            if 'forward' in str(x.name).split("/")
        }
        self.saver = tf.train.Saver(self.names)
    else:
        self.saver = tf.train.Saver(tf.global_variables())
def __init__(self, sess, pre_trained_seq2seq, pre_trained_backward,
             vocab_size, buckets, layer_size, num_layers, max_gradient_norm,
             candidate_size, learning_rate, learning_rate_decay_factor,
             use_lstm=False, num_samples=512, forward_only = False,
             dtype= tf.float32):
    """Create a Model.

    Similar to the seq2seq_model_rl.py code but it differs in the loss
    function: the per-bucket losses are placeholders fed from outside.

    Args:
        sess: session object, stored as ``self.sess``.
        pre_trained_seq2seq: stored as ``self.pre_trained_seq2seq``.
        pre_trained_backward: stored as ``self.pre_trained_backward``.
        vocab_size: size of vocabulary.
        buckets: a list of pairs (I, O), where I specifies maximum input
            length that will be processed in that bucket, and O specifies
            maximum output length. Training instances that have inputs
            longer than I or outputs longer than O will be pushed to the
            next bucket and padded accordingly. We assume that the list is
            sorted. ** We may not use bucketing for Dialogue.
        layer_size: the number of units in each layer.
        num_layers: the number of the layers in the model.
        max_gradient_norm: gradients will be clipped to maximally this norm.
        candidate_size: the number of candidates (actions); not read by this
            constructor.
        learning_rate: learning rate to start with.
        learning_rate_decay_factor: decay learning rate by this much when
            needed.
        use_lstm: True -> LSTM cells, False -> GRU cells.
        num_samples: the number of samples for sampled softmax; the output
            projection is only created when 0 < num_samples < vocab_size.
        forward_only: if set, we do not construct the backward pass in the
            model.
        dtype: the data type to use to store internal variables.
    """
    self.sess = sess
    self.vocab_size = vocab_size
    self.buckets = buckets
    self.buckets_back = [(x[1],x[1]) for x in buckets]
    # NOTE(review): this constructor has no batch_size parameter; the
    # attribute is set to this placeholder string exactly as before.
    self.batch_size = """? necessary?"""
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False, dtype = dtype)
    self.learning_rate_decay_op = self.learning_rate.assign(self.learning_rate*learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)
    self.pre_trained_seq2seq = pre_trained_seq2seq
    self.pre_trained_backward = pre_trained_backward
    # The bucket is fixed to the last one.
    self.bucket_id = len(buckets)-1

    # Default to no projection so that `seq2seq_f` below never closes over an
    # unbound name when the sampled-softmax condition is false.
    output_projection = None
    if num_samples > 0 and num_samples < self.vocab_size:
        w_t = tf.get_variable("proj_w_mi",[self.vocab_size, layer_size], dtype = dtype)
        w = tf.transpose(w_t)
        b = tf.get_variable("proj_b_mi", [self.vocab_size], dtype=dtype)
        output_projection = (w,b)

    def make_single_cell():
        # Fresh cell object on every call so stacked layers never share one
        # instance (as `[single_cell]*num_layers` did).
        if use_lstm:
            return core_rnn_cell.BasicLSTMCell(layer_size)
        return core_rnn_cell.GRUCell(layer_size)

    if num_layers > 1:
        cell = core_rnn_cell.MultiRNNCell(
            [make_single_cell() for _ in range(num_layers)])
    else:
        cell = make_single_cell()

    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
        return tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
            encoder_inputs, decoder_inputs, cell,
            num_encoder_symbols = vocab_size,
            num_decoder_symbols=vocab_size,
            embedding_size = layer_size,
            output_projection = output_projection,
            feed_previous = do_decode,
            dtype = dtype)

    self.seq2seq_f = seq2seq_f

    # Placeholders, one per position up to the largest bucket.
    self.states, self.states_back = [], []
    self.actions , self.actions_back = [], []
    self.weights, self.weights_back = [], []
    for i in xrange(self.buckets[-1][0]):
        self.states.append(tf.placeholder(tf.int32, shape=[None], name ="state{0}".format(i)))
    for i in xrange(self.buckets_back[-1][1]):
        self.states_back.append(tf.placeholder(tf.int32, shape=[None], name ="state_back{0}".format(i)))
    for i in xrange(self.buckets[-1][1]):
        self.actions.append(tf.placeholder(tf.int32, shape=[None], name ="action{0}".format(i)))
        self.actions_back.append(tf.placeholder(tf.int32, shape=[None], name ="action_back{0}".format(i)))
        self.weights.append(tf.placeholder(dtype, shape=[None], name="weight_rl{0}".format(i)))
        self.weights_back.append(tf.placeholder(dtype, shape=[None], name="weight_rl_back{0}".format(i)))

    # One loss placeholder per bucket.
    # NOTE(review): the gradients below are taken of these placeholders with
    # respect to the trainable variables; nothing in this constructor
    # connects the two, so confirm the gradients are usable.
    self.losses = []
    for i in xrange(len(buckets)):
        self.losses.append(tf.placeholder(tf.float32, shape = [None], name = "losses{0}".format(i)))

    params = tf.trainable_variables()
    # A leftover `pdb.set_trace()` breakpoint stood here; it stopped every
    # construction at a debugger prompt and has been removed.
    if not forward_only:
        self.gradient_norms = []
        self.updates = []
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        for bucket_idx in xrange(len(buckets)):
            gradients = tf.gradients(self.losses[bucket_idx],params)
            # Clips values of multiple tensors by the ratio of the sum of
            # their norms.
            clipped_gradients, global_norm = tf.clip_by_global_norm(gradients, max_gradient_norm)
            self.gradient_norms.append(global_norm)
            # An Operation that applies the specified gradients. If
            # global_step was not None, that operation also increments
            # global_step.
            self.updates.append(opt.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step))

    self.saver = tf.train.Saver(tf.global_variables())
def _construct_graph(self):
    """Construct Tensorflow graph.

    Reads ``hidden_size``, ``net_type`` ('rnn' or 'brnn'), ``cell_type``
    ('lstm', 'gru' or 'block_lstm'), ``num_layers``, ``mapping`` and
    ``treshold`` from the instance and stores the graph, its placeholders,
    outputs, loss, optimizer op and saver back on it.

    Raises:
        ValueError: If ``cell_type`` or ``net_type`` is not recognised.
    """
    self.graph = tf.Graph()

    # A bidirectional net concatenates forward and backward outputs, so the
    # projection sees twice the hidden size.
    hidden_state_size = self.hidden_size
    if self.net_type == 'brnn':
        hidden_state_size *= 2

    with self.graph.as_default():
        self.words = tf.placeholder(tf.int32, shape=(None, None),
                                    name='words')
        self.syllable_labels = tf.placeholder(tf.int32, shape=(None, None),
                                              name='syllable_labels')
        self.seq_lengths = tf.placeholder(tf.int32, shape=(None,),
                                          name='lengths')
        batch_size = tf.shape(self.words)[0]

        # Two-class output projection.
        W = tf.Variable(tf.truncated_normal([hidden_state_size, 2]),
                        dtype=tf.float32)
        b = tf.Variable(np.zeros([2]), dtype=tf.float32)

        embedding_matrix = tf.Variable(tf.truncated_normal(
            [len(self.mapping), self.hidden_size],
            stddev=np.sqrt(2.0 / self.hidden_size)))
        embedding = tf.nn.embedding_lookup(embedding_matrix, self.words)

        treshold = tf.Variable(np.array([self.treshold]), dtype=tf.float32,
                               name='treshold')

        # Sum of the labels over the positions inside each sequence length.
        self.num_syllables = tf.reduce_sum(
            tf.cast(self.syllable_labels, tf.float32) *
            tf.sequence_mask(self.seq_lengths,
                             tf.reduce_max(self.seq_lengths),
                             dtype=tf.float32),
            1)

        if self.cell_type == 'lstm':
            cell_constructor = rnn_cell.LSTMCell
        elif self.cell_type == 'gru':
            cell_constructor = rnn_cell.GRUCell
        elif self.cell_type == 'block_lstm':
            cell_constructor = tf.contrib.rnn.LSTMBlockCell
        else:
            raise ValueError('Unknown cell type.')

        fw_multicell = rnn_cell.MultiRNNCell(
            [cell_constructor(self.hidden_size)
             for i in range(self.num_layers)])
        bw_multicell = rnn_cell.MultiRNNCell(
            [cell_constructor(self.hidden_size)
             for i in range(self.num_layers)])

        if self.net_type == 'rnn':
            # The unidirectional net runs on the forward stack; the name
            # `rnn_multicell` used here before was never defined.
            self.outputs, _ = dynamic_rnn(fw_multicell,
                                          embedding,
                                          sequence_length=self.seq_lengths,
                                          dtype=tf.float32,
                                          swap_memory=True)
        elif self.net_type == 'brnn':
            self.outputs, _ = dynamic_brnn(fw_multicell,
                                           bw_multicell,
                                           embedding,
                                           sequence_length=self.seq_lengths,
                                           dtype=tf.float32,
                                           swap_memory=True)
            # Join the two directions along the last (feature) axis.
            self.outputs = tf.concat(self.outputs, 2)
        else:
            # Fail here rather than later on a missing `self.outputs`.
            raise ValueError('Unknown net type.')

        outputs_reshape = tf.reshape(self.outputs, [-1, hidden_state_size])
        logits = tf.matmul(outputs_reshape, W) + b
        self.logits = tf.reshape(logits, [batch_size, -1, 2])

        probs = tf.nn.softmax(self.logits)
        # probabilities only for positive class:
        self.sliced_probs = tf.slice(probs, [0, 0, 1], [-1, -1, -1])
        self.sliced_probs = tf.squeeze(self.sliced_probs, axis=2)
        greater = tf.greater(self.sliced_probs, treshold)
        self.separation_indices = tf.where(greater)
        # NOTE(review): this is an all-zero tensor shaped like `greater`;
        # confirm whether a cast of `greater` was intended instead.
        self.prediction = tf.zeros_like(greater, dtype=tf.float32)

        # Cross-entropy averaged over the unmasked (in-length) positions.
        unmasked_ce = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.logits, labels=self.syllable_labels)
        mask = tf.sequence_mask(self.seq_lengths,
                                tf.reduce_max(self.seq_lengths),
                                dtype=tf.float32)
        self.loss = tf.reduce_sum(unmasked_ce * mask) / tf.reduce_sum(mask)

        self.optimizer = tf.train.AdamOptimizer().minimize(self.loss)
        self.saver = tf.train.Saver()