def recurrent_neural_network(train_input):
    """Build an LSTM classifier that reads each sample's last valid output.

    The input is unstacked along axis 1 into ``seq_max_len`` per-step
    tensors and fed to a ``BasicLSTMCell`` through ``static_rnn`` with
    ``sequence_length=seqlen``.  For every sample the output at step
    ``seqlen - 1`` is gathered and passed through a linear layer.

    Relies on names defined outside this function: ``n_hidden``,
    ``n_classes``, ``seq_max_len``, ``rnn_size`` and ``seqlen``.

    Args:
        train_input: input tensor; presumably shaped
            (batch, seq_max_len, features) -- TODO confirm against caller.

    Returns:
        The gathered outputs multiplied by the projection weights, plus
        the projection biases.
    """
    # Weights are created before biases, as in the dict-literal form.
    projection = {
        'weights': tf.Variable(tf.random_normal([n_hidden, n_classes])),
        'biases': tf.Variable(tf.random_normal([n_classes]))
    }

    per_step_inputs = tf.unstack(train_input, seq_max_len, 1)
    print(per_step_inputs)

    # NOTE(review): the cell is sized with rnn_size while the reshape and
    # the projection weights below use n_hidden; presumably both hold the
    # same value -- confirm.
    cell = core_rnn_cell.BasicLSTMCell(rnn_size)
    per_step_outputs, _ = rnn.static_rnn(cell,
                                         per_step_inputs,
                                         dtype=tf.float32,
                                         sequence_length=seqlen)

    # Stack the per-step outputs and swap the first two axes.
    stacked = tf.transpose(tf.stack(per_step_outputs), [1, 0, 2])

    # Flat row index of each sample's last valid step once the stacked
    # outputs are reshaped to two dimensions.
    n_samples = tf.shape(stacked)[0]
    last_step_rows = tf.range(0, n_samples) * seq_max_len + (seqlen - 1)
    flattened = tf.reshape(stacked, [-1, n_hidden])
    last_outputs = tf.gather(flattened, last_step_rows)

    return tf.matmul(last_outputs, projection['weights']) + projection['biases']
Exemple #2
0
def recurrent_neural_network(train_input):
    """Build an LSTM classifier over fixed-size chunks of the input.

    The input has its first two axes swapped, is reshaped to rows of
    width ``chunk_size`` and split into ``n_chunks`` pieces along axis 0.
    Those pieces are run through a ``BasicLSTMCell`` with ``static_rnn``
    and the final step's output goes through a linear layer.

    Relies on names defined outside this function: ``rnn_size``,
    ``n_classes``, ``chunk_size`` and ``n_chunks``.

    Args:
        train_input: input tensor; presumably shaped
            (batch, n_chunks, chunk_size) -- TODO confirm against caller.

    Returns:
        ``last_output @ weights + biases``.
    """
    # Weights first, then biases: same creation order as the dict literal.
    out_weights = tf.Variable(tf.random_normal([rnn_size, n_classes]))
    out_biases = tf.Variable(tf.random_normal([n_classes]))

    swapped = tf.transpose(train_input, [1, 0, 2])
    rows = tf.reshape(swapped, [-1, chunk_size])
    chunk_list = tf.split(rows, n_chunks, 0)

    cell = core_rnn_cell.BasicLSTMCell(rnn_size, state_is_tuple=True)
    step_outputs, _ = rnn.static_rnn(cell, chunk_list, dtype=tf.float32)

    final_step = step_outputs[-1]
    return tf.matmul(final_step, out_weights) + out_biases
def seq_predict_model(X, w, b, time_step_size, vector_size):
    """Run an LSTM over ``X`` and project the last step's output linearly.

    Args:
        X: input tensor; the transpose below turns it into
            [time_step_size, batch_size, vector_size].
        w: weight matrix applied to the last LSTM output.
        b: bias added after the matrix product.
        time_step_size: number of pieces ``X`` is split into along axis 0.
        vector_size: width of each row after the reshape.

    Returns:
        A pair ``(prediction, state_size)`` where ``prediction`` is
        ``last_output @ w + b`` and ``state_size`` is the cell's
        ``state_size`` attribute.
    """
    # Transpose: X becomes [time_step_size, batch_size, vector_size].
    time_major = tf.transpose(X, [1, 0, 2])
    # Reshape (-1 leaves that dimension unspecified); the result has
    # shape [time_step_size * batch_size, vector_size].
    rows = tf.reshape(time_major, [-1, vector_size])
    # Split along axis 0 into time_step_size pieces, each of shape
    # [batch_size, vector_size].
    steps = tf.split(rows, time_step_size, 0)

    lstm = core_rnn_cell.BasicLSTMCell(num_units=10,
                                       forget_bias=1.0,
                                       state_is_tuple=True)
    step_outputs, _ = core_rnn.static_rnn(lstm, steps, dtype=tf.float32)

    prediction = tf.matmul(step_outputs[-1], w) + b
    return prediction, lstm.state_size
Exemple #4
0
def seq_predict_model(X, w, b, time_step_size, vector_size):
    """Build a single-layer LSTM predictor with a linear read-out.

    Args:
        X: input of shape [batch_size, time_step_size, vector_size].
        w: weight matrix for the linear read-out.
        b: bias for the linear read-out.
        time_step_size: number of time steps to split the input into.
        vector_size: size of each per-step input vector.

    Returns:
        Tuple of (``outputs[-1] @ w + b``, the LSTM cell's ``state_size``).
    """
    # [batch, time, vector] -> [time, batch, vector]
    #                       -> [time * batch, vector]
    flat_steps = tf.reshape(tf.transpose(X, [1, 0, 2]), [-1, vector_size])
    # List of time_step_size tensors, each [batch_size, vector_size].
    step_inputs = tf.split(flat_steps, time_step_size, 0)

    # LSTM with 10 units.
    rnn_cell = core_rnn_cell.BasicLSTMCell(
        num_units=10, forget_bias=1.0, state_is_tuple=True)
    rnn_outputs, _final_state = core_rnn.static_rnn(
        rnn_cell, step_inputs, dtype=tf.float32)

    # Linear activation on the last time step only.
    last_output = rnn_outputs[-1]
    linear = tf.matmul(last_output, w) + b
    return linear, rnn_cell.state_size
Exemple #5
0
    def __init__(self, is_training, config, input_):
        """Build the language-model graph: embedding, stacked LSTM, softmax.

        Args:
            is_training: when true, dropout is applied (if
                ``config.keep_prob < 1``) and the training ops are built.
            config: object providing ``hidden_size``, ``vocab_size``,
                ``keep_prob``, ``num_layers`` and ``max_grad_norm``.
            input_: object providing ``batch_size``, ``num_steps``,
                ``input_data`` and ``targets``.
        """
        self._input = input_

        batch_size = input_.batch_size
        num_steps = input_.num_steps
        size = config.hidden_size
        vocab_size = config.vocab_size

        use_dropout = is_training and config.keep_prob < 1

        def make_cell():
            """Return a fresh LSTM cell, wrapped in dropout when training."""
            # Slightly better results can be obtained with forget gate biases
            # initialized to 1 but the hyperparameters of the model would need
            # to be different than reported in the paper.
            # NOTE(review): no forget_bias is passed here, so the cell's
            # library default applies -- confirm that matches the remark above.
            lstm_cell = core_rnn_cell.BasicLSTMCell(num_units=size,
                                                    state_is_tuple=True)
            if use_dropout:
                lstm_cell = tf.contrib.rnn.DropoutWrapper(
                    lstm_cell, output_keep_prob=config.keep_prob)
            return lstm_cell

        # Build one cell object per layer.  Repeating a single instance
        # (``[cell] * n``) is rejected by some TF 1.x releases and silently
        # shares weights between layers in others.
        cell = core_rnn_cell.MultiRNNCell(
            [make_cell() for _ in range(config.num_layers)],
            state_is_tuple=True)

        self._initial_state = cell.zero_state(batch_size, data_type())

        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding", [vocab_size, size],
                                        dtype=data_type())
            inputs = tf.nn.embedding_lookup(embedding, input_.input_data)

        if use_dropout:
            inputs = tf.nn.dropout(inputs, config.keep_prob)

        # Unroll the network manually, reusing the RNN variables after the
        # first time step.
        outputs = []
        state = self._initial_state
        with tf.variable_scope("RNN"):
            for time_step in range(num_steps):
                if time_step > 0:
                    tf.get_variable_scope().reuse_variables()
                (cell_output, state) = cell(inputs[:, time_step, :], state)
                outputs.append(cell_output)

        # Concatenate the per-step outputs along axis 1 and flatten to rows
        # of width ``size`` before the softmax projection.
        output = tf.reshape(tf.concat(outputs, 1), [-1, size])
        softmax_w = tf.get_variable("softmax_w", [size, vocab_size],
                                    dtype=data_type())
        softmax_b = tf.get_variable("softmax_b", [vocab_size],
                                    dtype=data_type())
        logits = tf.matmul(output, softmax_w) + softmax_b
        # Every target position is weighted equally (weights are all ones).
        loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
            [logits], [tf.reshape(input_.targets, [-1])],
            [tf.ones([batch_size * num_steps], dtype=data_type())])
        self._cost = cost = tf.reduce_sum(loss) / batch_size
        self._final_state = state

        # Evaluation-only models stop here; no optimizer ops are created.
        if not is_training:
            return

        self._lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                          config.max_grad_norm)
        optimizer = tf.train.GradientDescentOptimizer(self._lr)
        self._train_op = optimizer.apply_gradients(
            zip(grads, tvars),
            global_step=tf.contrib.framework.get_or_create_global_step())

        # The learning rate is changed from outside by feeding this
        # placeholder and running the assign op.
        self._new_lr = tf.placeholder(tf.float32,
                                      shape=[],
                                      name="new_learning_rate")
        self._lr_update = tf.assign(self._lr, self._new_lr)
Exemple #6
0
	def __init__(self, pre_trained_seq2seq, pre_trained_backward, vocab_size, buckets, layer_size, num_layers,
		max_gradient_norm, batch_size, learning_rate, learning_rate_decay_factor,
		use_lstm=False, num_samples=512, forward_only = False, dtype= tf.float32):

		"""Create a Model:
		Similar to the seq2seq_model_rl.py code but it differs in the loss function.

		INPUTS:
			pre_trained_seq2seq: model object whose `outputs[bucket_id]` provides the
				forward logits used for the reward.
			pre_trained_backward: model object whose `outputs[bucket_id]` provides the
				backward logits used for the reward.
			vocab_size: size of vocabulary
			buckets: a list of pairs (I,O), where I specifies maximum input length that
				will be processed in that bucket, and O specifies maximum output length. Training
				instances that have inputs longer than I or outputs longer than O will be pushed
				to the next bucket and padded accordingly. We assume that the list is sorted.
				** We may not use bucketing for Dialogue.
			layer_size: the number of units in each layer
			num_layers: the number of the layers in the model
			max_gradient_norm : gradients are clipped to this global norm.
			batch_size : batch size used to reshape the per-step log-probabilities.
			learning_rate : learning rate to start with.
			learning_rate_decay_factor : decay learning rate by this much when needed.
			use_lstm: True -> LSTM cells, False -> GRU cells
			num_samples: accepted for interface compatibility; not referenced in this constructor.
			forward_only : if set, we do not construct the backward pass in the model
			dtype: the data type to use to store internal variables.
		"""
		self.vocab_size = vocab_size
		self.buckets = buckets
		# Backward buckets use the output length for both input and output.
		self.buckets_back = [(x[1],x[1]) for x in buckets]
		self.batch_size = batch_size
		self.learning_rate = tf.Variable(float(learning_rate), trainable=False, dtype = dtype)
		self.learning_rate_decay_op = self.learning_rate.assign(self.learning_rate*learning_rate_decay_factor)
		self.global_step = tf.Variable(0, trainable=False)
		self.pre_trained_seq2seq = pre_trained_seq2seq
		self.pre_trained_backward = pre_trained_backward
		# The graph is built for a single, fixed bucket.
		self.bucket_id = 0

		# Output projection variables.
		w_t = tf.get_variable("proj_w",[self.vocab_size, layer_size], dtype = dtype)
		w = tf.transpose(w_t)
		b = tf.get_variable("proj_b", [self.vocab_size], dtype=dtype)
		output_projection = (w,b)

		def make_single_cell():
			"""Return a fresh recurrent cell of the configured type."""
			if use_lstm:
				return core_rnn_cell.BasicLSTMCell(layer_size)
			return core_rnn_cell.GRUCell(layer_size)

		# One cell object per layer: repeating a single instance is rejected by
		# some TF 1.x releases and silently shares weights between layers in others.
		if num_layers > 1:
			cell = core_rnn_cell.MultiRNNCell([make_single_cell() for _ in range(num_layers)])
		else:
			cell = make_single_cell()

		def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
			"""Embedding + attention seq2seq; `do_decode` is passed as feed_previous."""
			return tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(encoder_inputs, decoder_inputs, cell,
				num_encoder_symbols = vocab_size, num_decoder_symbols=vocab_size, embedding_size = layer_size,
				output_projection = output_projection, feed_previous = do_decode, dtype = dtype)

		# states_back : the 2nd half of the states (each)
		self.states, self.states_back, self.action_dums = [], [], []
		self.actions , self.actions_back = [], []
		self.weights, self.weights_back = [],[]

		# Feeds, sized by the largest (last) bucket.
		for i in xrange(self.buckets[-1][0]):
			self.states.append(tf.placeholder(tf.int32, shape=[None], name ="state{0}".format(i)))

		for i in xrange(self.buckets[-1][1]):
			self.action_dums.append(tf.placeholder(tf.int32, shape=[None], name ="action_dum{0}".format(i)))
			self.actions.append(tf.placeholder(tf.int32, shape=[None], name ="action{0}".format(i)))
			self.weights.append(tf.placeholder(dtype, shape=[None], name ="weight_rl{0}".format(i)))

		for i in xrange(self.buckets_back[-1][0]):
			self.actions_back.append(tf.placeholder(tf.int32, shape=[None], name ="action_back{0}".format(i)))

		for i in xrange(self.buckets_back[-1][1]):
			self.states_back.append(tf.placeholder(tf.int32, shape=[None], name ="state_back{0}".format(i)))

		for i in xrange(self.buckets[-1][1]):
			self.weights_back.append(tf.placeholder(dtype, shape=[None], name="weight_rl_back{0}".format(i)))

		# 1. Get batch actions
		self.actions_sam, self.logprob = self.generate_batch_action(self.states, self.action_dums, self.bucket_id, lambda x,y:seq2seq_f(x,y,True), output_projection= output_projection)

		# 2. Get the loss
		def mi_score(states, actions, weights, states_back, actions_back, weights_back):
			"""Combine forward and backward weighted log-probabilities into a score.

			Reads the logits from `self.pre_trained_seq2seq.outputs` and
			`self.pre_trained_backward.outputs` for `self.bucket_id`. Only
			`actions`, `weights`, `states_back` and `weights_back` are used;
			`states` and `actions_back` are accepted but not referenced.
			"""
			n_actions = self.buckets[self.bucket_id][1]
			n_states_back = self.buckets_back[self.bucket_id][1]

			output_logits = self.pre_trained_seq2seq.outputs[self.bucket_id]
			log_prob = []
			# NOTE(review): log_softmax runs along dim 0 of the stacked per-step
			# logits (presumably the time axis, given the slice by word_idx below)
			# and each slice is then re-normalised over axis 1 -- confirm this is
			# the intended normalisation.
			logprob_s2s = tf.nn.log_softmax(output_logits,dim=0)

			for word_idx in xrange(n_actions):
				one_hot_mat = tf.one_hot(actions[word_idx],depth=self.vocab_size, on_value = 1.0, off_value=0.0, axis =1, dtype=tf.float32 )
				tmp1 = tf.reshape(tf.slice(logprob_s2s, [word_idx,0,0],[1,-1,-1]), shape = (self.batch_size, self.vocab_size))
				# Value at the chosen word minus the log-sum-exp over axis 1.
				log_prob_word = tf.subtract(tf.reduce_sum(tf.multiply(tmp1 , one_hot_mat),1), tf.log(tf.reduce_sum(tf.exp(tmp1),1)))
				log_prob.append(tf.multiply(log_prob_word, weights[word_idx]))

			output_logits_back = self.pre_trained_backward.outputs[self.bucket_id]
			log_prob_back = []
			logprob_back = tf.nn.log_softmax(output_logits_back,dim=0)
			# Shift the backward weights by one step, with all-ones weights first.
			w_back_new = [np.ones(self.batch_size, dtype = np.float32)] + weights_back[:-1]

			for word_idx in xrange(n_states_back):
				one_hot_mat = tf.one_hot(states_back[word_idx],depth=self.vocab_size, on_value = 1.0, off_value=0.0, axis =1, dtype=tf.float32 )
				tmp2 = tf.reshape(tf.slice(logprob_back, [word_idx,0,0],[1,-1,-1]), shape = (self.batch_size, self.vocab_size))
				log_prob_word = tf.subtract(tf.reduce_sum(tf.multiply(tmp2 , one_hot_mat),1), tf.log(tf.reduce_sum(tf.exp(tmp2),1)))
				log_prob_back.append(tf.multiply(log_prob_word, w_back_new[word_idx]))

			# Each direction is divided by the sum of its weights before adding.
			forward_term = tf.divide(tf.add_n(log_prob), tf.add_n(weights[:n_actions]))
			backward_term = tf.divide(tf.add_n(log_prob_back), tf.add_n(w_back_new[:n_states_back]))
			return forward_term + backward_term

		if not forward_only:
			self.neg_penalty = tf.placeholder(tf.float32, shape=[None], name="neg_penalty")
			# Reward = MI score plus 0.05 times the summed action weights.
			self.reward =  mi_score(self.states, self.actions, self.weights, self.states_back, self.actions_back, self.weights_back) + tf.scalar_mul(tf.constant(0.05,shape=()), tf.add_n(self.weights[:self.buckets[self.bucket_id][1]]))
			joint_logprob = tf.reduce_sum(self.logprob,axis=0)
			# 3. Gradient Descent Optimization
			# Only variables with a "mi" component in their name are trained.
			params = [x for x in tf.trainable_variables() if "mi" in str(x.name).split("/")]
			cost = tf.scalar_mul(tf.constant(-1.0,shape=()), tf.add(self.neg_penalty, self.reward))
			opt = tf.train.GradientDescentOptimizer(self.learning_rate)
			# Objective: inner product over the batch of cost and joint log-probability.
			objective = tf.matmul(tf.reshape(cost, shape=(self.batch_size,1)), tf.reshape(joint_logprob,shape=(self.batch_size,1)), transpose_a=True)
			gradients = tf.gradients(objective, params)
			# Clips values of multiple tensors by the ratio of the sum of their norms.
			clipped_gradients, global_norm = tf.clip_by_global_norm(gradients, max_gradient_norm)
			# apply_gradients also increments global_step.
			self.updates = opt.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step)

		# Save/restore only the variables that have a "mi" component in their name.
		self.names = {str(x.name).split(":0")[0] : x for x in tf.global_variables() if 'mi' in str(x.name).split("/")}
		self.saver = tf.train.Saver(self.names)
Exemple #7
0
    def __init__(self,
                 vocab_size,
                 buckets,
                 layer_size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 use_lstm=False,
                 num_samples=512,
                 MI_use=False,
                 forward_only=False,
                 dtype=tf.float32):
        """Create a Model.

        Similar to the seq2seq_model.py code in the tensorflow version 0.12.1.

        INPUTS:
            vocab_size: size of vocabulary
            buckets: a list of pairs (I,O), where I specifies maximum input
                length that will be processed in that bucket, and O specifies
                maximum output length. Training instances that have inputs
                longer than I or outputs longer than O will be pushed to the
                next bucket and padded accordingly. We assume that the list
                is sorted.
                ** We may not use bucketing for Dialogue.
            layer_size: the number of units in each layer
            num_layers: the number of the layers in the model
            max_gradient_norm: gradients are clipped to this global norm.
            batch_size: the size of the batches used during training; the
                model construction is independent of batch_size, so it can
                be changed after initialization if this is convenient,
                e.g., for decoding.
            learning_rate: learning rate to start with.
            learning_rate_decay_factor: decay learning rate by this much
                when needed.
            use_lstm: True -> LSTM cells, False -> GRU cells
            num_samples: the number of samples for sampled softmax; sampled
                softmax is only enabled when 0 < num_samples < vocab_size.
            MI_use: if set, the saver only covers global variables that have
                a 'forward' component in their name; otherwise it covers all
                global variables.
            forward_only: if set, we do not construct the backward pass in
                the model
            dtype: the data type to use to store internal variables.
        """
        self.vocab_size = vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False,
                                         dtype=dtype)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)

        output_projection = None
        softmax_loss_function = None

        # Sampled softmax only makes sense if we sample less than vocabulary size.
        if num_samples > 0 and num_samples < self.vocab_size:
            w_t = tf.get_variable("proj_w", [self.vocab_size, layer_size],
                                  dtype=dtype)
            w = tf.transpose(w_t)
            # Named proj_bias (not ``b``) so that no later loop variable can
            # rebind the name the sampled_loss closure reads.
            proj_bias = tf.get_variable("proj_b", [self.vocab_size],
                                        dtype=dtype)
            output_projection = (w, proj_bias)

            def sampled_loss(labels, inputs):
                """Sampled-softmax loss; argument order is (labels, inputs)."""
                labels = tf.reshape(labels, [-1, 1])
                # We need to compute the sampled_softmax_loss using 32bit
                # floats to avoid numerical instabilities.
                local_w_t = tf.cast(w_t, tf.float32)
                local_b = tf.cast(proj_bias, tf.float32)
                local_inputs = tf.cast(inputs, tf.float32)
                return tf.cast(
                    tf.nn.sampled_softmax_loss(weights=local_w_t,
                                               biases=local_b,
                                               labels=labels,
                                               inputs=local_inputs,
                                               num_sampled=num_samples,
                                               num_classes=self.vocab_size),
                    dtype)

            # Assigned inside the branch: sampled_loss only exists here, so
            # assigning it unconditionally raised NameError whenever sampled
            # softmax was disabled.
            softmax_loss_function = sampled_loss

        self.softmax_loss_function = softmax_loss_function

        def make_single_cell():
            """Return a fresh recurrent cell of the configured type."""
            if use_lstm:
                return core_rnn_cell.BasicLSTMCell(layer_size)
            return core_rnn_cell.GRUCell(layer_size)

        # Create the internal multi-layer cell for our RNN.  One cell object
        # per layer: repeating a single instance is rejected by some TF 1.x
        # releases and silently shares weights between layers in others.
        if num_layers > 1:
            cell = core_rnn_cell.MultiRNNCell(
                [make_single_cell() for _ in range(num_layers)])
        else:
            cell = make_single_cell()

        # The seq2seq function: we use embedding for the input and attention.
        def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
            return tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
                encoder_inputs,
                decoder_inputs,
                cell,
                num_encoder_symbols=vocab_size,
                num_decoder_symbols=vocab_size,
                embedding_size=layer_size,
                output_projection=output_projection,
                feed_previous=do_decode,
                dtype=dtype)

        self.seq2seq_f = seq2seq_f

        # Feeds for inputs, sized by the largest (last) bucket.
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []
        for i in xrange(buckets[-1][0]):
            self.encoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="encoder{0}".format(i)))

        for i in xrange(buckets[-1][1] + 1):  # For EOS
            self.decoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="decoder{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(dtype, shape=[None],
                               name="weight{0}".format(i)))

        # Targets are the decoder inputs shifted by one (GO symbol at the
        # beginning).
        targets = self.decoder_inputs[1:]

        # Training outputs and losses (a list(len(buckets)) of 1-D batch-sized
        # tensors).  When decoding (forward_only) the previous output is fed
        # back in; when training the given decoder inputs are used.
        feed_previous = bool(forward_only)
        self.outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets(
            self.encoder_inputs,
            self.decoder_inputs,
            targets,
            self.target_weights,
            buckets,
            lambda x, y: seq2seq_f(x, y, feed_previous),
            softmax_loss_function=softmax_loss_function)

        # With an output projection, decoding outputs are projected here.
        if forward_only and output_projection is not None:
            for bucket_idx in xrange(len(buckets)):
                self.outputs[bucket_idx] = [
                    tf.matmul(output, output_projection[0]) +
                    output_projection[1]
                    for output in self.outputs[bucket_idx]
                ]

        # All variables created with trainable=True.
        params = tf.trainable_variables()
        if not forward_only:
            self.gradient_norms = []
            self.updates = []
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            for bucket_idx in xrange(len(buckets)):
                gradients = tf.gradients(self.losses[bucket_idx], params)
                # Clips values of multiple tensors by the ratio of the sum of
                # their norms.
                clipped_gradients, global_norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms.append(global_norm)
                # apply_gradients also increments global_step.
                self.updates.append(
                    opt.apply_gradients(zip(clipped_gradients, params),
                                        global_step=self.global_step))

        if MI_use:
            # Restrict the saver to variables with a 'forward' name component,
            # keyed by their name without the ":0" suffix.
            self.names = {
                str(x.name).split(":0")[0]: x
                for x in tf.global_variables()
                if 'forward' in str(x.name).split("/")
            }
            self.saver = tf.train.Saver(self.names)
        else:
            self.saver = tf.train.Saver(tf.global_variables())
Exemple #8
0
	def __init__(self, sess, pre_trained_seq2seq, pre_trained_backward, vocab_size, buckets, layer_size, num_layers,
		max_gradient_norm, candidate_size, learning_rate, learning_rate_decay_factor,
		use_lstm=False, num_samples=512, forward_only = False, dtype= tf.float32):

		"""Create a Model:
		Similar to the seq2seq_model_rl.py code but it differs in the loss function.

		INPUTS:
			sess: session object, stored on the instance.
			pre_trained_seq2seq: pre-trained forward model, stored on the instance.
			pre_trained_backward: pre-trained backward model, stored on the instance.
			vocab_size: size of vocabulary
			buckets: a list of pairs (I,O), where I specifies maximum input length that
				will be processed in that bucket, and O specifies maximum output length. Training
				instances that have inputs longer than I or outputs longer than O will be pushed
				to the next bucket and padded accordingly. We assume that the list is sorted.
				** We may not use bucketing for Dialogue.
			layer_size: the number of units in each layer
			num_layers: the number of the layers in the model
			max_gradient_norm : gradients are clipped to this global norm.
			candidate_size : the number of candidates (actions); not referenced in this constructor.
			learning_rate : learning rate to start with.
			learning_rate_decay_factor : decay learning rate by this much when needed.
			use_lstm: True -> LSTM cells, False -> GRU cells
			num_samples: the number of samples for sampled softmax; the projection
				variables are only created when 0 < num_samples < vocab_size.
			forward_only : if set, we do not construct the backward pass in the model
			dtype: the data type to use to store internal variables.
		"""
		self.sess = sess
		self.vocab_size = vocab_size
		self.buckets = buckets
		# Backward buckets use the output length for both input and output.
		self.buckets_back = [(x[1],x[1]) for x in buckets]
		# NOTE(review): batch_size is set to a placeholder string, not a number -- confirm nothing reads it.
		self.batch_size = """? necessary?"""
		self.learning_rate = tf.Variable(float(learning_rate), trainable=False, dtype = dtype)
		self.learning_rate_decay_op = self.learning_rate.assign(self.learning_rate*learning_rate_decay_factor)
		self.global_step = tf.Variable(0, trainable=False)
		self.pre_trained_seq2seq = pre_trained_seq2seq
		self.pre_trained_backward = pre_trained_backward
		self.bucket_id = len(buckets)-1

		# Bound up front so seq2seq_f never hits an unbound name when the
		# projection below is not created.
		output_projection = None
		if num_samples > 0 and num_samples < self.vocab_size:
			w_t = tf.get_variable("proj_w_mi",[self.vocab_size, layer_size], dtype = dtype)
			w = tf.transpose(w_t)
			b = tf.get_variable("proj_b_mi", [self.vocab_size], dtype=dtype)
			output_projection = (w,b)
			# A disabled numpy-based mi_score prototype used to sit here as a
			# bare string literal; it was dead code and has been dropped.

		def make_single_cell():
			"""Return a fresh recurrent cell of the configured type."""
			if use_lstm:
				return core_rnn_cell.BasicLSTMCell(layer_size)
			return core_rnn_cell.GRUCell(layer_size)

		# One cell object per layer: repeating a single instance is rejected by
		# some TF 1.x releases and silently shares weights between layers in others.
		if num_layers > 1:
			cell = core_rnn_cell.MultiRNNCell([make_single_cell() for _ in range(num_layers)])
		else:
			cell = make_single_cell()

		def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
			"""Embedding + attention seq2seq; `do_decode` is passed as feed_previous."""
			return tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(encoder_inputs, decoder_inputs, cell,
				num_encoder_symbols = vocab_size, num_decoder_symbols=vocab_size, embedding_size = layer_size,
				output_projection = output_projection, feed_previous = do_decode, dtype = dtype)
		self.seq2seq_f = seq2seq_f

		self.states, self.states_back = [], []
		self.actions , self.actions_back = [], []
		self.weights, self.weights_back = [], []

		# Feeds, sized by the largest (last) bucket.
		for i in xrange(self.buckets[-1][0]):
			self.states.append(tf.placeholder(tf.int32, shape=[None], name ="state{0}".format(i)))

		for i in xrange(self.buckets_back[-1][1]):
			self.states_back.append(tf.placeholder(tf.int32, shape=[None], name ="state_back{0}".format(i)))

		for i in xrange(self.buckets[-1][1]):
			self.actions.append(tf.placeholder(tf.int32, shape=[None], name ="action{0}".format(i)))
			self.actions_back.append(tf.placeholder(tf.int32, shape=[None], name ="action_back{0}".format(i)))
			self.weights.append(tf.placeholder(dtype, shape=[None], name="weight_rl{0}".format(i)))
			self.weights_back.append(tf.placeholder(dtype, shape=[None], name="weight_rl_back{0}".format(i)))

		# One loss placeholder per bucket.
		# NOTE(review): the losses are plain placeholders, so it is unclear how
		# gradients taken below reach `params` -- confirm.
		self.losses = []
		for i in xrange(len(buckets)):
			self.losses.append(tf.placeholder(tf.float32, shape = [None], name = "losses{0}".format(i)))

		params = tf.trainable_variables()
		# (A leftover pdb.set_trace() that halted construction here was removed.)
		if not forward_only:
			self.gradient_norms = []
			self.updates = []
			opt = tf.train.GradientDescentOptimizer(self.learning_rate)
			for bucket_idx in xrange(len(buckets)):
				gradients = tf.gradients(self.losses[bucket_idx],params)
				# Clips values of multiple tensors by the ratio of the sum of their norms.
				clipped_gradients, global_norm = tf.clip_by_global_norm(gradients, max_gradient_norm)
				self.gradient_norms.append(global_norm)
				# apply_gradients also increments global_step.
				self.updates.append(opt.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step))

		self.saver = tf.train.Saver(tf.global_variables())
Exemple #9
0
    def build_model(self):
        self.x = tf.placeholder(tf.int32, [self.batch_size, self.XMAXLEN],
                                name="premise")
        self.x_length = tf.placeholder(tf.int32, [self.batch_size],
                                       name="premise_len")
        self.y = tf.placeholder(tf.int32, [self.batch_size, self.YMAXLEN],
                                name="hypothesis")
        self.y_length = tf.placeholder(tf.int32, [self.batch_size],
                                       name="hyp_len")
        self.target = tf.placeholder(
            tf.float32, [self.batch_size, 3],
            name="label")  # change this to int32 and it breaks.

        # DO NOT DO THIS
        # self.batch_size = tf.shape(self.x)[0]  # batch size
        # self.x_length = tf.shape(self.x)[1]  # batch size
        # print self.batch_size,self.x_length

        self.embed_matrix = tf.get_variable("embeddings",
                                            [self.vocab_size, self.dim])
        self.x_emb = tf.nn.embedding_lookup(self.embed_matrix, self.x)
        self.y_emb = tf.nn.embedding_lookup(self.embed_matrix, self.y)

        print(self.x_emb, self.y_emb)
        with tf.variable_scope("encode_x"):
            self.fwd_lstm = core_rnn_cell.BasicLSTMCell(self.h_dim,
                                                        state_is_tuple=True)
            self.x_output, self.x_state = tf.nn.dynamic_rnn(cell=self.fwd_lstm,
                                                            inputs=self.x_emb,
                                                            dtype=tf.float32)
            # self.x_output, self.x_state = tf.nn.bidirectional_dynamic_rnn(cell_fw=self.fwd_lstm,cell_bw=self.bwd_lstm,inputs=self.x_emb,dtype=tf.float32)
            print(self.x_output)
            # print self.x_state
        # print tf.shape(self.x)
        with tf.variable_scope("encode_y"):
            self.fwd_lstm = core_rnn_cell.BasicLSTMCell(self.h_dim,
                                                        state_is_tuple=True)
            self.y_output, self.y_state = tf.nn.dynamic_rnn(
                cell=self.fwd_lstm,
                inputs=self.y_emb,
                initial_state=self.x_state,
                dtype=tf.float32)
            # print self.y_output
            # print self.y_state

        self.Y = self.x_output  # its length must be x_length

        # self.h_n = self.last_relevant(self.y_output,self.x_length)   # TODO
        tmp5 = tf.transpose(self.y_output, [1, 0, 2])
        self.h_n = tf.gather(tmp5, int(tmp5.get_shape()[0]) - 1)
        print(self.h_n)

        # self.h_n_repeat = self.repeat(self.h_n,self.x_length)   # TODO
        self.h_n_repeat = tf.expand_dims(self.h_n, 1)
        pattern = tf.stack([1, self.XMAXLEN, 1])
        self.h_n_repeat = tf.tile(self.h_n_repeat, pattern)

        self.W_Y = tf.get_variable("W_Y", shape=[self.h_dim, self.h_dim])
        self.W_h = tf.get_variable("W_h", shape=[self.h_dim, self.h_dim])

        # Compute M = tanh(Y * W_Y + [h_n repeated XMAXLEN times] * W_h)
        tmp1 = tf.matmul(tf.reshape(
            self.Y, shape=[self.batch_size * self.XMAXLEN, self.h_dim]),
                         self.W_Y,
                         name="Wy")
        self.Wy = tf.reshape(tmp1,
                             shape=[self.batch_size, self.XMAXLEN, self.h_dim])
        tmp2 = tf.matmul(
            tf.reshape(self.h_n_repeat,
                       shape=[self.batch_size * self.XMAXLEN, self.h_dim]),
            self.W_h)
        self.Whn = tf.reshape(
            tmp2,
            shape=[self.batch_size, self.XMAXLEN, self.h_dim],
            name="Whn")
        self.M = tf.tanh(tf.add(self.Wy, self.Whn), name="M")
        # print "M",self.M

        # use attention
        self.W_att = tf.get_variable("W_att", shape=[self.h_dim, 1])  # h x 1
        tmp3 = tf.matmul(
            tf.reshape(self.M,
                       shape=[self.batch_size * self.XMAXLEN, self.h_dim]),
            self.W_att)
        # Keep a singleton middle axis so that att (nb x 1 x Xmax) can be
        # batch-multiplied with Y (nb x Xmax x h_dim) below.
        self.att = tf.nn.softmax(
            tf.reshape(tmp3,
                       shape=[self.batch_size, 1, self.XMAXLEN],
                       name="att"))  # nb x 1 x Xmax
        # print "att",self.att

        # COMPUTE WEIGHTED
        self.r = tf.reshape(tf.matmul(self.att, self.Y, name="r"),
                            shape=[self.batch_size, self.h_dim
                                   ])  # (nb,1,L) X (nb,L,k) = (nb,1,k)
        # get last step of Y as r which is (batch,k)
        # tmp4 = tf.transpose(self.Y, [1, 0, 2])
        # self.r = tf.gather(tmp4, int(tmp4.get_shape()[0]) - 1)
        # print "r",self.r

        self.W_p, self.b_p = tf.get_variable(
            "W_p",
            shape=[self.h_dim, self.h_dim
                   ]), tf.get_variable("b_p",
                                       shape=[self.h_dim],
                                       initializer=tf.constant_initializer())
        self.W_x, self.b_x = tf.get_variable(
            "W_x",
            shape=[self.h_dim, self.h_dim
                   ]), tf.get_variable("b_x",
                                       shape=[self.h_dim],
                                       initializer=tf.constant_initializer())
        self.Wpr = tf.matmul(self.r, self.W_p, name="Wy") + self.b_p
        self.Wxhn = tf.matmul(self.h_n, self.W_x, name="Wxhn") + self.b_x
        self.hstar = tf.tanh(tf.add(self.Wpr, self.Wxhn), name="hstar")
        # print "Wpr",self.Wpr
        # print "Wxhn",self.Wxhn
        # print "hstar",self.hstar

        self.W_pred = tf.get_variable("W_pred", shape=[self.h_dim, 3])
        self.pred = tf.nn.softmax(tf.matmul(self.hstar, self.W_pred),
                                  name="pred_layer")
        # print "pred",self.pred,"target",self.target
        correct = tf.equal(tf.argmax(self.pred, 1), tf.argmax(self.target, 1))
        self.acc = tf.reduce_mean(tf.cast(correct, "float"), name="accuracy")
        # self.H_n = self.last_relevant(self.en_output)
        self.loss = -tf.reduce_sum(self.target * tf.log(self.pred),
                                   name="loss")
        # print self.loss
        self.optimizer = tf.train.AdamOptimizer()
        self.optim = self.optimizer.minimize(self.loss,
                                             var_list=tf.trainable_variables())
        _ = tf.summary.scalar("loss", self.loss)