Beispiel #1
0
    def __init__(self, args, infer=False):
        """Build the RNN language-model graph (training or sampling).

        Args:
            args: Hyper-parameter object. Reads ``model`` ('rnn', 'gru' or
                'lstm'), ``rnn_size``, ``num_layers``, ``batch_size``,
                ``seq_length``, ``vocab_size`` and ``grad_clip``.
            infer: When true, ``args.batch_size`` and ``args.seq_length`` are
                overwritten with 1 (on the caller's object) and every decoder
                step after the first is fed the embedding of the previous
                step's arg-max symbol.

        Raises:
            Exception: If ``args.model`` is not one of the supported types.
        """
        self.args = args
        if infer:
            # NOTE: mutates the caller's ``args`` in place.
            args.batch_size = 1
            args.seq_length = 1

        if args.model == 'rnn':
            cell_fn = core_rnn_cell.BasicRNNCell
        elif args.model == 'gru':
            cell_fn = core_rnn_cell.GRUCell
        elif args.model == 'lstm':
            cell_fn = core_rnn_cell.BasicLSTMCell
        else:
            raise Exception("model type not supported: {}".format(args.model))

        # ``state_is_tuple`` is an LSTM-only constructor argument; passing it
        # to BasicRNNCell / GRUCell raises TypeError, so forward it only for
        # the LSTM case.
        cell_kwargs = {'state_is_tuple': True} if args.model == 'lstm' else {}

        # Build one cell object per layer. Repeating a single instance
        # (``[cell] * n``) makes every layer the same object, which newer
        # TensorFlow 1.x releases either reject or turn into weight sharing.
        layers = [cell_fn(args.rnn_size, **cell_kwargs)
                  for _ in range(args.num_layers)]
        self.cell = cell = core_rnn_cell.MultiRNNCell(layers, state_is_tuple=True)

        self.input_data = tf.placeholder(
            tf.int32, [args.batch_size, args.seq_length], name="input_data")
        self.targets = tf.placeholder(
            tf.int32, [args.batch_size, args.seq_length], name="targets")
        self.initial_state = cell.zero_state(args.batch_size, tf.float32)

        with tf.variable_scope('rnnlm'):
            softmax_w = tf.get_variable("softmax_w", [args.rnn_size, args.vocab_size])
            softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
            with tf.device("/cpu:0"):
                embedding = tf.get_variable("embedding", [args.vocab_size, args.rnn_size])
                embedded = tf.nn.embedding_lookup(embedding, self.input_data)
                # Split along the time axis into seq_length slices, then drop
                # the singleton time dimension of each slice.
                inputs = tf.split(embedded, args.seq_length, 1)
                inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

        def loop(prev, _):
            # Project the previous output to vocabulary logits, pick the
            # arg-max symbol (no gradient through the choice) and return its
            # embedding as the next input.
            prev = tf.matmul(prev, softmax_w) + softmax_b
            prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
            return tf.nn.embedding_lookup(embedding, prev_symbol)

        outputs, last_state = seq2seq.rnn_decoder(
            inputs, self.initial_state, cell,
            loop_function=loop if infer else None, scope='rnnlm')
        output = tf.reshape(tf.concat(outputs, 1), [-1, args.rnn_size])
        self.logits = tf.matmul(output, softmax_w) + softmax_b
        self.probs = tf.nn.softmax(self.logits, name="prob_results")
        # The former 4th positional argument (vocab_size) is not part of the
        # TF 1.x signature -- it landed on ``average_across_timesteps`` -- so
        # it is no longer passed.
        loss = seq2seq.sequence_loss_by_example(
            [self.logits],
            [tf.reshape(self.targets, [-1])],
            [tf.ones([args.batch_size * args.seq_length])])
        self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
        self.final_state = last_state
        # Learning rate lives in a non-trainable variable (initialised to 0.0).
        self.lr = tf.Variable(0.0, trainable=False, name="LR_")
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                          args.grad_clip)
        optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))
    def __init__(self, vocabularySize, config_param):
        """Build a multi-layer basic-RNN language-model graph.

        Args:
            vocabularySize: Number of distinct symbols; sizes the embedding
                table and the softmax projection.
            config_param: Configuration object. Reads ``batch_size``,
                ``sequence_size``, ``embeddingSize``, ``hidden_size`` and
                ``num_layers``.
        """
        self.vocabularySize = vocabularySize
        self.config = config_param

        self._inputX = tf.placeholder(
            tf.int32, [self.config.batch_size, self.config.sequence_size],
            "InputsX")
        self._inputTargetsY = tf.placeholder(
            tf.int32, [self.config.batch_size, self.config.sequence_size],
            "InputTargetsY")

        # Convert the input ids to embeddings. The embedding table and lookup
        # are pinned to the CPU device.
        with tf.device("/cpu:0"):
            embedding = tf.get_variable(
                "embedding", [self.vocabularySize, self.config.embeddingSize])
            embeddingLookedUp = tf.nn.embedding_lookup(embedding, self._inputX)
            # One tensor per time step, with the singleton time axis removed.
            inputs = tf.split(axis=1,
                              num_or_size_splits=self.config.sequence_size,
                              value=embeddingLookedUp)
            inputTensorsAsList = [tf.squeeze(input_, [1]) for input_ in inputs]

        # Define the stacked RNN. Each layer gets its own cell object;
        # repeating one instance (``[cell] * n``) makes all layers the same
        # object, which newer TensorFlow 1.x releases reject or treat as
        # weight sharing.
        rnn_layers = [rnn_cell.BasicRNNCell(self.config.hidden_size)
                      for _ in range(self.config.num_layers)]
        self.multilayerRNN = rnn_cell.MultiRNNCell(rnn_layers)
        self._initial_state = self.multilayerRNN.zero_state(
            self.config.batch_size, tf.float32)

        # Define the logits.
        hidden_layer_output, last_state = rnn.static_rnn(
            self.multilayerRNN,
            inputTensorsAsList,
            initial_state=self._initial_state)
        hidden_layer_output = tf.reshape(
            tf.concat(axis=1, values=hidden_layer_output),
            [-1, self.config.hidden_size])
        self._logits = tf.nn.xw_plus_b(
            hidden_layer_output,
            tf.get_variable("softmax_w",
                            [self.config.hidden_size, self.vocabularySize]),
            tf.get_variable("softmax_b", [self.vocabularySize]))
        self._predictionSoftmax = tf.nn.softmax(self._logits)

        # Define the loss. The vocabulary size used to be passed as a 4th
        # positional argument; in the TF 1.x signature that slot is
        # ``average_across_timesteps``, so it is no longer passed.
        loss = seq2seq.sequence_loss_by_example(
            [self._logits], [tf.reshape(self._inputTargetsY, [-1])],
            [tf.ones([self.config.batch_size * self.config.sequence_size])])
        # Summed loss divided by batch size only (not by sequence length).
        self._cost = tf.div(tf.reduce_sum(loss), self.config.batch_size)

        self._final_state = last_state
Beispiel #3
0
 def build_input_sequence(self, gpu_id=0):
     """Build the embedding layer and the stacked LSTM for one device slot.

     Args:
         gpu_id: Index into ``self.state_list``, ``self.output_list`` and
             ``self.split_seqLengths`` selecting which slot to fill.

     Side effects:
         Stores the two values returned by ``dynamic_rnn`` in
         ``self.state_list[gpu_id]`` and ``self.output_list[gpu_id]``, and
         initialises ``self.input_params`` on first use.
     """
     # embedding layer
     self.__build_embedding_layer__()

     def make_layer():
         # One peephole LSTM wrapped in input/output dropout.
         lstm = rnn_cell.LSTMCell(self.hidden_size,
                                  use_peepholes=True,
                                  state_is_tuple=True)
         return rnn_cell.DropoutWrapper(lstm,
                                        input_keep_prob=self.keep_prob,
                                        output_keep_prob=self.keep_prob)

     with get_new_variable_scope('rnn_lstm'):
         # A fresh cell per layer: repeating one instance (``[cell] * n``)
         # makes all layers the same object, which newer TensorFlow 1.x
         # releases reject or treat as weight sharing.
         cell = rnn_cell.MultiRNNCell(
             [make_layer() for _ in range(self.num_layers)],
             state_is_tuple=True)
         # NOTE(review): if ``dynamic_rnn`` is TensorFlow's
         # ``tf.nn.dynamic_rnn`` it returns ``(outputs, final_state)``, in
         # which case the two targets below are swapped relative to their
         # names. Left as-is because consumers may rely on it -- confirm.
         self.state_list[gpu_id], self.output_list[gpu_id] = dynamic_rnn(
             cell,
             self.input_embedding,
             self.split_seqLengths[gpu_id],
             dtype=tf.float32)
     if self.input_params is None:
         # Everything except the first trainable variable (presumably the
         # embedding created above -- verify).
         self.input_params = tf.trainable_variables()[1:]
Beispiel #4
0
 def get_dec_cell(self, cell_size):
     """Return the decoder cell: a stack of two GRU layers.

     Args:
         cell_size: Number of units in each GRU layer.

     Returns:
         A ``MultiRNNCell`` wrapping two independent ``GRUCell`` instances.
     """
     # TODO: dropout is currently disabled. Earlier experiments wrapped the
     # cell in ``DropoutWrapper`` (keep probability 0.5 on inputs and/or
     # outputs) when ``self.phase_train`` was set; the single-layer variant
     # with dropout was unreachable (``if True``) and has been removed.
     num_layers = 2
     # One cell object per layer: repeating a single instance
     # (``[cell] * n``) makes all layers the same object, which newer
     # TensorFlow 1.x releases reject or treat as weight sharing.
     layers = [core_rnn_cell.GRUCell(cell_size) for _ in range(num_layers)]
     return core_rnn_cell.MultiRNNCell(layers)
Beispiel #5
0
    def __init__(self, is_training, config, input_):
        """Build the stacked-LSTM language-model graph.

        Args:
            is_training: When true, dropout is applied (if
                ``config.keep_prob < 1``) and the optimiser / learning-rate
                ops are created; otherwise construction stops after the cost.
            config: Configuration object. Reads ``hidden_size``,
                ``vocab_size``, ``keep_prob``, ``num_layers`` and
                ``max_grad_norm``.
            input_: Input object. Reads ``batch_size``, ``num_steps``,
                ``input_data`` and ``targets``.
        """
        self._input = input_

        batch_size = input_.batch_size
        num_steps = input_.num_steps
        size = config.hidden_size
        vocab_size = config.vocab_size
        use_dropout = is_training and config.keep_prob < 1

        def make_cell():
            # Slightly better results can be obtained with forget gate biases
            # initialized to 1 but the hyperparameters of the model would need
            # to be different than reported in the paper.
            # NOTE(review): no ``forget_bias`` is passed here, so the cell's
            # default applies -- confirm whether ``forget_bias=0.0`` was
            # intended to match the comment above.
            layer = core_rnn_cell.BasicLSTMCell(num_units=size,
                                                state_is_tuple=True)
            if use_dropout:
                layer = tf.contrib.rnn.DropoutWrapper(
                    layer, output_keep_prob=config.keep_prob)
            return layer

        # One cell object per layer: repeating a single instance
        # (``[cell] * n``) makes all layers the same object, which newer
        # TensorFlow 1.x releases reject or treat as weight sharing.
        cell = core_rnn_cell.MultiRNNCell(
            [make_cell() for _ in range(config.num_layers)],
            state_is_tuple=True)

        self._initial_state = cell.zero_state(batch_size, data_type())

        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding", [vocab_size, size],
                                        dtype=data_type())
            inputs = tf.nn.embedding_lookup(embedding, input_.input_data)

        if use_dropout:
            inputs = tf.nn.dropout(inputs, config.keep_prob)

        # Manually unrolled RNN: the cell variables are created on the first
        # step and reused on every later step.
        outputs = []
        state = self._initial_state
        with tf.variable_scope("RNN"):
            for time_step in range(num_steps):
                if time_step > 0:
                    tf.get_variable_scope().reuse_variables()
                (cell_output, state) = cell(inputs[:, time_step, :], state)
                outputs.append(cell_output)

        output = tf.reshape(tf.concat(outputs, 1), [-1, size])
        softmax_w = tf.get_variable("softmax_w", [size, vocab_size],
                                    dtype=data_type())
        softmax_b = tf.get_variable("softmax_b", [vocab_size],
                                    dtype=data_type())
        logits = tf.matmul(output, softmax_w) + softmax_b
        loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
            [logits], [tf.reshape(input_.targets, [-1])],
            [tf.ones([batch_size * num_steps], dtype=data_type())])
        # Summed loss divided by batch size only (not by num_steps).
        self._cost = cost = tf.reduce_sum(loss) / batch_size
        self._final_state = state

        if not is_training:
            return

        # Learning rate is a non-trainable variable updated through
        # ``self._lr_update`` by feeding ``self._new_lr``.
        self._lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                          config.max_grad_norm)
        optimizer = tf.train.GradientDescentOptimizer(self._lr)
        self._train_op = optimizer.apply_gradients(
            zip(grads, tvars),
            global_step=tf.contrib.framework.get_or_create_global_step())

        self._new_lr = tf.placeholder(tf.float32,
                                      shape=[],
                                      name="new_learning_rate")
        self._lr_update = tf.assign(self._lr, self._new_lr)
Beispiel #6
0
	def __init__(self, pre_trained_seq2seq, pre_trained_backward, vocab_size, buckets, layer_size, num_layers,
		max_gradient_norm, batch_size, learning_rate, learning_rate_decay_factor,
		use_lstm=False, num_samples=512, forward_only = False, dtype= tf.float32):
		"""Create the mutual-information (MI) reward model.

		Similar to the seq2seq_model_rl.py code but with a different loss function.

		INPUTS:
			pre_trained_seq2seq: forward model; its ``outputs[bucket_id]`` logits are read.
			pre_trained_backward: backward model; its ``outputs[bucket_id]`` logits are read.
			vocab_size: size of vocabulary
			buckets: a list of pairs (I,O), where I specifies maximum input length that
				will be processed in that bucket, and O specifies maximum output length. Training
				instances that have inputs longer than I or outputs longer than O will be pushed
				to the next bucket and padded accordingly. We assume that the list is sorted.
				** We may not use bucketing for Dialogue.
			layer_size: the number of units in each layer
			num_layers: the number of the layers in the model
			max_gradient_norm : gradients are clipped to at most this global norm.
			batch_size : batch size; used for the static reshapes in the reward computation.
			learning_rate : learning rate to start with.
			learning_rate_decay_factor : decay learning rate by this much when needed.
			use_lstm: True -> LSTM cells, False -> GRU cells
			num_samples: accepted for signature compatibility; not used in this constructor.
			forward_only : if set, we do not construct the backward pass in the model
			dtype: the data type to use to store internal variables.
		"""
		self.vocab_size = vocab_size
		self.buckets = buckets
		# Backward buckets pair each output length with itself.
		self.buckets_back = [(x[1],x[1]) for x in buckets]
		self.batch_size = batch_size
		self.learning_rate = tf.Variable(float(learning_rate), trainable=False, dtype = dtype)
		self.learning_rate_decay_op = self.learning_rate.assign(self.learning_rate*learning_rate_decay_factor)
		self.global_step = tf.Variable(0, trainable=False)
		self.pre_trained_seq2seq = pre_trained_seq2seq
		self.pre_trained_backward = pre_trained_backward
		# The bucket is fixed to the first one at graph-construction time.
		self.bucket_id = 0

		# Output projection variables.
		w_t = tf.get_variable("proj_w",[self.vocab_size, layer_size], dtype = dtype)
		w = tf.transpose(w_t)
		b = tf.get_variable("proj_b", [self.vocab_size], dtype=dtype)
		output_projection = (w,b)

		def make_cell():
			if use_lstm:
				return core_rnn_cell.BasicLSTMCell(layer_size)
			return core_rnn_cell.GRUCell(layer_size)

		if num_layers > 1:
			# One cell object per layer: repeating a single instance
			# (``[cell] * n``) makes all layers the same object, which newer
			# TensorFlow 1.x releases reject or treat as weight sharing.
			cell = core_rnn_cell.MultiRNNCell([make_cell() for _ in range(num_layers)])
		else:
			cell = make_cell()

		def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
			return tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(encoder_inputs, decoder_inputs, cell,
			 num_encoder_symbols = vocab_size, num_decoder_symbols=vocab_size, embedding_size = layer_size,
			 output_projection = output_projection, feed_previous = do_decode, dtype = dtype)

		self.states, self.states_back, self.action_dums = [], [], [] # states_back : the 2nd half of the states (each)
		self.actions , self.actions_back = [], []
		self.weights, self.weights_back = [],[]

		# Placeholders are sized for the largest bucket.
		for i in xrange(self.buckets[-1][0]):
			self.states.append(tf.placeholder(tf.int32, shape=[None], name ="state{0}".format(i)))

		for i in xrange(self.buckets[-1][1]):
			self.action_dums.append(tf.placeholder(tf.int32, shape=[None], name ="action_dum{0}".format(i)))
			self.actions.append(tf.placeholder(tf.int32, shape=[None], name ="action{0}".format(i)))
			self.weights.append(tf.placeholder(dtype, shape=[None], name ="weight_rl{0}".format(i)))

		for i in xrange(self.buckets_back[-1][0]):
			self.actions_back.append(tf.placeholder(tf.int32, shape=[None], name ="action_back{0}".format(i)))

		for i in xrange(self.buckets_back[-1][1]):
			self.states_back.append(tf.placeholder(tf.int32, shape=[None], name ="state_back{0}".format(i)))

		for i in xrange(self.buckets[-1][1]):
			self.weights_back.append(tf.placeholder(dtype, shape=[None], name="weight_rl_back{0}".format(i)))

		# 1. Get batch actions (decoding with feed_previous=True).
		self.actions_sam, self.logprob = self.generate_batch_action(self.states, self.action_dums, self.bucket_id, lambda x,y:seq2seq_f(x,y,True), output_projection= output_projection)

		# 2. Get the loss
		def mi_score(states, actions, weights, states_back, actions_back, weights_back):
			"""Weighted average log-probability under the forward and backward models.

			Args
			#	actions, weights : per-step action ids and weights for the forward term
			#	states_back, weights_back : per-step ids and weights for the backward term
			#	states, actions_back : accepted but not read here
			"""
			num_steps = self.buckets[self.bucket_id][1]
			num_steps_back = self.buckets_back[self.bucket_id][1]

			# The logits are sliced below as [step, batch, vocab], so the
			# log-softmax must normalise over the last (vocabulary) axis.
			# The previous ``dim=0`` normalised across time steps instead.
			output_logits = self.pre_trained_seq2seq.outputs[self.bucket_id]
			logprob_s2s = tf.nn.log_softmax(output_logits, dim=-1)
			log_prob = []
			for word_idx in xrange(num_steps):
				one_hot_mat = tf.one_hot(actions[word_idx],depth=self.vocab_size, on_value = 1.0, off_value=0.0, axis =1, dtype=tf.float32 )
				tmp1 = tf.reshape(tf.slice(logprob_s2s, [word_idx,0,0],[1,-1,-1]), shape = (self.batch_size, self.vocab_size))
				# Pick the log-probability of the chosen word; no extra
				# log-sum-exp term is needed once tmp1 is normalised.
				log_prob_word = tf.reduce_sum(tf.multiply(tmp1 , one_hot_mat),1)
				log_prob.append(tf.multiply(log_prob_word, weights[word_idx]))

			output_logits_back = self.pre_trained_backward.outputs[self.bucket_id]
			logprob_back = tf.nn.log_softmax(output_logits_back, dim=-1)
			log_prob_back = []
			# Backward weights shifted right by one step, with an all-ones
			# weight for the first step.
			w_back_new = [np.ones(self.batch_size, dtype = np.float32)] + weights_back[:-1]

			for word_idx in xrange(num_steps_back):
				one_hot_mat = tf.one_hot(states_back[word_idx],depth=self.vocab_size, on_value = 1.0, off_value=0.0, axis =1, dtype=tf.float32 )
				tmp2 = tf.reshape(tf.slice(logprob_back, [word_idx,0,0],[1,-1,-1]), shape = (self.batch_size, self.vocab_size))
				log_prob_word = tf.reduce_sum(tf.multiply(tmp2 , one_hot_mat),1)
				log_prob_back.append(tf.multiply(log_prob_word, w_back_new[word_idx]))

			# Each term is the weighted sum divided by the total weight.
			forward_term = tf.divide(tf.add_n(log_prob), tf.add_n(weights[:num_steps]))
			backward_term = tf.divide(tf.add_n(log_prob_back), tf.add_n(w_back_new[:num_steps_back]))
			return forward_term + backward_term

		if not forward_only:
			self.neg_penalty = tf.placeholder(tf.float32, shape=[None], name="neg_penalty") #repeat_penalty(self.actions)
			# Reward = MI score + 0.05 * (sum of the action weights).
			self.reward =  mi_score(self.states, self.actions, self.weights, self.states_back, self.actions_back, self.weights_back) + tf.scalar_mul(tf.constant(0.05,shape=()), tf.add_n(self.weights[:self.buckets[self.bucket_id][1]]))
			joint_logprob = tf.reduce_sum(self.logprob,axis=0)
			# 3. Gradient Descent Optimization
			# Only variables with a "mi" component in their scoped name are trained.
			params = [x for x in tf.trainable_variables() if "mi" in str(x.name).split("/")]
			cost = tf.scalar_mul(tf.constant(-1.0,shape=()), tf.add(self.neg_penalty, self.reward))
			opt = tf.train.GradientDescentOptimizer(self.learning_rate)
			# Differentiate the batch inner product <cost, joint_logprob>.
			gradients = tf.gradients(tf.matmul(tf.reshape(cost, shape=(self.batch_size,1)), tf.reshape(joint_logprob,shape=(self.batch_size,1)), transpose_a=True), params)
			clipped_gradients, global_norm = tf.clip_by_global_norm(gradients, max_gradient_norm) # Clips values of multiple tensors by the ratio of the sum of their norms.
			self.updates = opt.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step) # Also increments global_step.

		# Save/restore only the variables under a "mi" scope, keyed by name without the ":0" suffix.
		self.names = {str(x.name).split(":0")[0] : x for x in tf.global_variables() if 'mi' in str(x.name).split("/")}
		self.saver = tf.train.Saver(self.names)
Beispiel #7
0
    def __init__(self,
                 vocab_size,
                 buckets,
                 layer_size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 use_lstm=False,
                 num_samples=512,
                 MI_use=False,
                 forward_only=False,
                 dtype=tf.float32):
        """Create a Model.

        Similar to the seq2seq_model.py code in the tensorflow version 0.12.1

        INPUTS:
            vocab_size: size of vocabulary
            buckets: a list of pairs (I,O), where I specifies maximum input
                length that will be processed in that bucket, and O specifies
                maximum output length. Training instances that have inputs
                longer than I or outputs longer than O will be pushed to the
                next bucket and padded accordingly. We assume that the list
                is sorted.
                ** We may not use bucketing for Dialogue.
            layer_size: the number of units in each layer
            num_layers: the number of the layers in the model
            max_gradient_norm: gradients are clipped to at most this global
                norm.
            batch_size: the size of the batches used during training; the
                model construction is independent of batch_size, so it can be
                changed after initialization if this is convenient, e.g., for
                decoding.
            learning_rate: learning rate to start with.
            learning_rate_decay_factor: decay learning rate by this much when
                needed.
            use_lstm: True -> LSTM cells, False -> GRU cells
            num_samples: the number of samples for sampled softmax; sampled
                softmax is used only when 0 < num_samples < vocab_size.
            MI_use: if set, the saver only covers global variables whose
                scoped name contains a 'forward' component.
            forward_only: if set, we do not construct the backward pass in
                the model
            dtype: the data type to use to store internal variables.
        """
        self.vocab_size = vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False,
                                         dtype=dtype)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)

        output_projection = None
        softmax_loss_function = None

        # Sampled softmax only makes sense if we sample less than vocabulary size.
        if 0 < num_samples < self.vocab_size:
            w_t = tf.get_variable("proj_w", [self.vocab_size, layer_size],
                                  dtype=dtype)
            w = tf.transpose(w_t)
            b = tf.get_variable("proj_b", [self.vocab_size], dtype=dtype)
            output_projection = (w, b)

            def sampled_loss(labels, inputs):
                # Argument order is (labels, inputs): the reverse of the
                # TensorFlow 0.12.x loss-function convention.
                labels = tf.reshape(labels, [-1, 1])
                # Compute the sampled softmax loss in 32-bit floats to avoid
                # numerical instabilities, then cast back to ``dtype``.
                local_w_t = tf.cast(w_t, tf.float32)
                local_b = tf.cast(b, tf.float32)
                local_inputs = tf.cast(inputs, tf.float32)
                return tf.cast(
                    tf.nn.sampled_softmax_loss(weights=local_w_t,
                                               biases=local_b,
                                               labels=labels,
                                               inputs=local_inputs,
                                               num_sampled=num_samples,
                                               num_classes=self.vocab_size),
                    dtype)

            # Must stay inside the ``if``: ``sampled_loss`` only exists when
            # sampled softmax is enabled. Assigning it unconditionally raised
            # UnboundLocalError for num_samples == 0 or >= vocab_size.
            softmax_loss_function = sampled_loss

        self.softmax_loss_function = softmax_loss_function

        # Create the internal multi-layer cell for our RNN.
        def make_cell():
            if use_lstm:
                return core_rnn_cell.BasicLSTMCell(layer_size)
            return core_rnn_cell.GRUCell(layer_size)

        if num_layers > 1:
            # One cell object per layer: repeating a single instance
            # (``[cell] * n``) makes all layers the same object, which newer
            # TensorFlow 1.x releases reject or treat as weight sharing.
            cell = core_rnn_cell.MultiRNNCell(
                [make_cell() for _ in range(num_layers)])
        else:
            cell = make_cell()

        # The seq2seq function: we use embedding for the input and attention.
        def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
            return tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
                encoder_inputs,
                decoder_inputs,
                cell,
                num_encoder_symbols=vocab_size,
                num_decoder_symbols=vocab_size,
                embedding_size=layer_size,
                output_projection=output_projection,
                feed_previous=do_decode,
                dtype=dtype)

        self.seq2seq_f = seq2seq_f

        # Feeds for inputs, sized for the largest bucket.
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []
        for i in xrange(buckets[-1][0]):
            self.encoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="encoder{0}".format(i)))

        for i in xrange(buckets[-1][1] + 1):  # For EOS
            self.decoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="decoder{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(dtype, shape=[None],
                               name="weight{0}".format(i)))

        # Targets are the decoder inputs shifted by one (GO symbol first).
        targets = self.decoder_inputs[1:]

        # Outputs and losses: one entry per bucket. When forward_only is set
        # the decoder feeds back its own previous output.
        do_decode = bool(forward_only)
        self.outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets(
            self.encoder_inputs,
            self.decoder_inputs,
            targets,
            self.target_weights,
            buckets,
            lambda x, y: seq2seq_f(x, y, do_decode),
            softmax_loss_function=softmax_loss_function)
        if forward_only and output_projection is not None:
            # Project decoder outputs to vocabulary logits for decoding.
            # The loop variable is deliberately not called ``b``: that name
            # is the projection bias captured by ``sampled_loss``.
            for bucket_idx in range(len(buckets)):
                self.outputs[bucket_idx] = [
                    tf.matmul(output, output_projection[0]) +
                    output_projection[1]
                    for output in self.outputs[bucket_idx]
                ]

        # All variables created with trainable=True.
        params = tf.trainable_variables()
        if not forward_only:
            self.gradient_norms = []
            self.updates = []
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            for bucket_idx in range(len(buckets)):
                gradients = tf.gradients(self.losses[bucket_idx], params)
                # Clip by the global norm across all gradient tensors.
                clipped_gradients, global_norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms.append(global_norm)
                # apply_gradients also increments global_step.
                self.updates.append(
                    opt.apply_gradients(zip(clipped_gradients, params),
                                        global_step=self.global_step))

        if MI_use:
            # Save/restore only variables under a 'forward' scope, keyed by
            # name without the ":0" suffix.
            self.names = {
                str(x.name).split(":0")[0]: x
                for x in tf.global_variables()
                if 'forward' in str(x.name).split("/")
            }
            self.saver = tf.train.Saver(self.names)
        else:
            self.saver = tf.train.Saver(tf.global_variables())
Beispiel #8
0
	def __init__(self, sess, pre_trained_seq2seq, pre_trained_backward, vocab_size, buckets, layer_size, num_layers,
		max_gradient_norm, candidate_size, learning_rate, learning_rate_decay_factor,
		use_lstm=False, num_samples=512, forward_only = False, dtype= tf.float32):
		"""Create a Model:
		Similar to the seq2seq_model_rl.py code but with a different loss function.

		INPUTS:
			sess: session object, stored on the instance.
			pre_trained_seq2seq: forward model, stored on the instance.
			pre_trained_backward: backward model, stored on the instance.
			vocab_size: size of vocabulary
			buckets: a list of pairs (I,O), where I specifies maximum input length that
				will be processed in that bucket, and O specifies maximum output length. Training
				instances that have inputs longer than I or outputs longer than O will be pushed
				to the next bucket and padded accordingly. We assume that the list is sorted.
				** We may not use bucketing for Dialogue.
			layer_size: the number of units in each layer
			num_layers: the number of the layers in the model
			max_gradient_norm : gradients are clipped to at most this global norm.
			candidate_size : the number of candidates (actions); not read in this constructor.
			learning_rate : learning rate to start with.
			learning_rate_decay_factor : decay learning rate by this much when needed.
			use_lstm: True -> LSTM cells, False -> GRU cells
			num_samples: the number of samples for sampled softmax; the projection
				variables are only created when 0 < num_samples < vocab_size.
			forward_only : if set, we do not construct the backward pass in the model
			dtype: the data type to use to store internal variables.
		"""
		self.sess = sess
		self.vocab_size = vocab_size
		self.buckets = buckets
		# Backward buckets pair each output length with itself.
		self.buckets_back = [(x[1],x[1]) for x in buckets]
		# NOTE(review): placeholder value left by the author; nothing in this
		# constructor reads it. Decide whether the attribute is needed.
		self.batch_size = """? necessary?"""
		self.learning_rate = tf.Variable(float(learning_rate), trainable=False, dtype = dtype)
		self.learning_rate_decay_op = self.learning_rate.assign(self.learning_rate*learning_rate_decay_factor)
		self.global_step = tf.Variable(0, trainable=False)
		self.pre_trained_seq2seq = pre_trained_seq2seq
		self.pre_trained_backward = pre_trained_backward
		# Use the largest bucket.
		self.bucket_id = len(buckets)-1

		# Defined up front so ``seq2seq_f`` never closes over an unbound name
		# when the sampled-softmax condition below is false.
		output_projection = None
		if num_samples > 0 and num_samples < self.vocab_size:
			w_t = tf.get_variable("proj_w_mi",[self.vocab_size, layer_size], dtype = dtype)
			w = tf.transpose(w_t)
			b = tf.get_variable("proj_b_mi", [self.vocab_size], dtype=dtype)
			output_projection = (w,b)
			# A session-based ``mi_score`` loss used to be sketched here as a
			# disabled string literal; it was never executed and was removed.

		def make_cell():
			if use_lstm:
				return core_rnn_cell.BasicLSTMCell(layer_size)
			return core_rnn_cell.GRUCell(layer_size)

		if num_layers > 1:
			# One cell object per layer: repeating a single instance
			# (``[cell] * n``) makes all layers the same object, which newer
			# TensorFlow 1.x releases reject or treat as weight sharing.
			cell = core_rnn_cell.MultiRNNCell([make_cell() for _ in range(num_layers)])
		else:
			cell = make_cell()

		def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
			return tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(encoder_inputs, decoder_inputs, cell,
			 num_encoder_symbols = vocab_size, num_decoder_symbols=vocab_size, embedding_size = layer_size,
			 output_projection = output_projection, feed_previous = do_decode, dtype = dtype)
		self.seq2seq_f = seq2seq_f

		self.states, self.states_back = [], []
		self.actions , self.actions_back = [], []
		self.weights, self.weights_back = [], []

		# Placeholders are sized for the largest bucket.
		for i in xrange(self.buckets[-1][0]):
			self.states.append(tf.placeholder(tf.int32, shape=[None], name ="state{0}".format(i)))

		for i in xrange(self.buckets_back[-1][1]):
			self.states_back.append(tf.placeholder(tf.int32, shape=[None], name ="state_back{0}".format(i)))

		for i in xrange(self.buckets[-1][1]):
			self.actions.append(tf.placeholder(tf.int32, shape=[None], name ="action{0}".format(i)))
			self.actions_back.append(tf.placeholder(tf.int32, shape=[None], name ="action_back{0}".format(i)))
			self.weights.append(tf.placeholder(dtype, shape=[None], name="weight_rl{0}".format(i)))
			self.weights_back.append(tf.placeholder(dtype, shape=[None], name="weight_rl_back{0}".format(i)))

		# Per-bucket losses are fed in from outside the graph.
		self.losses = []
		for i in xrange(len(buckets)):
			self.losses.append(tf.placeholder(tf.float32, shape = [None], name = "losses{0}".format(i)))

		params = tf.trainable_variables()
		# (A leftover ``pdb.set_trace()`` breakpoint was removed here; it
		# halted every construction of the model.)
		if not forward_only:
			# NOTE(review): ``self.losses`` are bare placeholders, so no
			# trainable variable appears to feed them and ``tf.gradients``
			# presumably returns None for every parameter -- confirm this
			# training path works before relying on it.
			self.gradient_norms = []
			self.updates = []
			opt = tf.train.GradientDescentOptimizer(self.learning_rate)
			for b in xrange(len(buckets)):
				gradients = tf.gradients(self.losses[b],params)
				clipped_gradients, global_norm = tf.clip_by_global_norm(gradients, max_gradient_norm) # Clips values of multiple tensors by the ratio of the sum of their norms.
				self.gradient_norms.append(global_norm)
				self.updates.append(opt.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step)) # Also increments global_step.

		self.saver = tf.train.Saver(tf.global_variables())
    def _construct_graph(self):
        """Construct Tensorflow graph.

        Creates a new ``tf.Graph`` in ``self.graph`` and fills it with the
        input placeholders, an embedding lookup, a multi-layer (optionally
        bidirectional) RNN, a two-class projection, and a length-masked
        cross-entropy loss minimised with Adam.

        Reads ``self.hidden_size``, ``self.net_type``, ``self.cell_type``,
        ``self.num_layers``, ``self.mapping`` and ``self.treshold``.

        Sets ``self.graph``, ``self.words``, ``self.syllable_labels``,
        ``self.seq_lengths``, ``self.num_syllables``, ``self.outputs``,
        ``self.logits``, ``self.sliced_probs``, ``self.separation_indices``,
        ``self.prediction``, ``self.loss``, ``self.optimizer`` and
        ``self.saver``.

        Raises:
            ValueError: If ``self.cell_type`` is not ``'lstm'``, ``'gru'`` or
                ``'block_lstm'``, or if ``self.net_type`` is not ``'rnn'`` or
                ``'brnn'``.
        """
        self.graph = tf.Graph()
        # The bidirectional net concatenates forward and backward outputs
        # along the feature axis, so the projection input is twice as wide.
        hidden_state_size = self.hidden_size
        if self.net_type == 'brnn':
            hidden_state_size *= 2
        with self.graph.as_default():
            self.words = tf.placeholder(tf.int32, shape=(None, None),
                                        name='words')
            self.syllable_labels = tf.placeholder(tf.int32,
                                                  shape=(None, None),
                                                  name='syllable_labels')
            self.seq_lengths = tf.placeholder(tf.int32, shape=(None,),
                                              name='lengths')
            batch_size = tf.shape(self.words)[0]

            # NOTE: the variables below are created in the same order as
            # before so that their auto-generated names do not change.
            W = tf.Variable(tf.truncated_normal([hidden_state_size, 2]),
                            dtype=tf.float32)
            b = tf.Variable(np.zeros([2]), dtype=tf.float32)
            embedding_matrix = tf.Variable(tf.truncated_normal(
                                           [len(self.mapping),
                                            self.hidden_size],
                                           stddev=np.sqrt(2.0 / self.hidden_size
                                                          )))
            embedding = tf.nn.embedding_lookup(embedding_matrix, self.words)
            treshold = tf.Variable(np.array([self.treshold]), dtype=tf.float32,
                                   name='treshold')

            # 1.0 for positions inside each sequence's length, 0.0 for
            # padding; shared by the syllable count and the loss.
            mask = tf.sequence_mask(self.seq_lengths,
                                    tf.reduce_max(self.seq_lengths),
                                    dtype=tf.float32)
            self.num_syllables = tf.reduce_sum(
                tf.cast(self.syllable_labels, tf.float32) * mask, 1)

            if self.cell_type == 'lstm':
                cell_constructor = rnn_cell.LSTMCell
            elif self.cell_type == 'gru':
                cell_constructor = rnn_cell.GRUCell
            elif self.cell_type == 'block_lstm':
                cell_constructor = tf.contrib.rnn.LSTMBlockCell
            else:
                raise ValueError('Unknown cell type.')
            # A fresh cell object per layer (not one cell repeated).
            fw_multicell = rnn_cell.MultiRNNCell(
                [cell_constructor(self.hidden_size)
                 for _ in range(self.num_layers)])
            bw_multicell = rnn_cell.MultiRNNCell(
                [cell_constructor(self.hidden_size)
                 for _ in range(self.num_layers)])

            if self.net_type == 'rnn':
                # The unidirectional net runs on the forward stack; the
                # previous code referenced an undefined `rnn_multicell` here.
                self.outputs, _ = dynamic_rnn(fw_multicell, embedding,
                                              sequence_length=self.seq_lengths,
                                              dtype=tf.float32,
                                              swap_memory=True)
            elif self.net_type == 'brnn':
                self.outputs, _ = dynamic_brnn(fw_multicell, bw_multicell,
                                               embedding,
                                               sequence_length=self.seq_lengths,
                                               dtype=tf.float32,
                                               swap_memory=True)
                # Join the (forward, backward) outputs on the feature axis.
                self.outputs = tf.concat(self.outputs, 2)
            else:
                # Fail here instead of with an AttributeError on
                # `self.outputs` further down.
                raise ValueError('Unknown net type.')

            # Flatten to 2-D for the matmul, then restore the batch axis.
            outputs_reshape = tf.reshape(self.outputs, [-1, hidden_state_size])
            logits = tf.matmul(outputs_reshape, W) + b
            self.logits = tf.reshape(logits, [batch_size, -1, 2])
            probs = tf.nn.softmax(self.logits)
            # probabilities only for positive class:
            self.sliced_probs = tf.slice(probs, [0, 0, 1], [-1, -1, -1])
            self.sliced_probs = tf.squeeze(self.sliced_probs, axis=2)
            greater = tf.greater(self.sliced_probs, treshold)
            self.separation_indices = tf.where(greater)
            # NOTE(review): this is an all-zeros tensor shaped like `greater`;
            # it looks like a stub -- confirm how callers use it.
            self.prediction = tf.zeros_like(greater, dtype=tf.float32)

            unmasked_ce = tf.nn.sparse_softmax_cross_entropy_with_logits(
                                        logits=self.logits, labels=self.syllable_labels)
            # Mean cross-entropy over the unmasked positions only.
            self.loss = tf.reduce_sum(unmasked_ce * mask) / tf.reduce_sum(mask)
            self.optimizer = tf.train.AdamOptimizer().minimize(self.loss)
            self.saver = tf.train.Saver()