# TF 1.x contrib API; the exact import path varies across versions.
import tensorflow as tf
from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnLSTM


def lstm():  # parameter: number of input batches fed to the network
    global input_x, input_rnn, output
    w_in = weights['in']
    b_in = biases['in']
    # Flatten the tensor to 2-D for the matmul; the result feeds the hidden layer.
    input_x = tf.reshape(X, [-1, input_size])
    input_rnn = tf.matmul(input_x, w_in) + b_in
    # Back to 3-D as input to the LSTM cell.
    input_rnn = tf.reshape(input_rnn, [-1, num_steps, num_units])
    # CudnnLSTM consumes time-major input: [num_steps, batch_size, features].
    input_rnn = tf.transpose(input_rnn, [1, 0, 2])
    # After the input projection the features have size num_units, so that
    # (not the raw input_size) is the input size the cuDNN op must be given.
    rnn = CudnnLSTM(num_layers, num_units, num_units,
                    input_mode='linear_input', direction='unidirectional',
                    dropout=0.5 if is_training else 0.,  # drop probability; disabled at inference
                    seed=0)
    # All cuDNN weights live in one flat parameter buffer whose size is only
    # known as a graph tensor, hence validate_shape=False.
    params_size_t = rnn.params_size()
    params = tf.Variable(
        tf.random_uniform([params_size_t], minval=-0.1, maxval=0.1, dtype=tf.float32),
        validate_shape=False)
    output, output_h, output_c = rnn(is_training=is_training, input_data=input_rnn,
                                     input_h=H, input_c=C, params=params)
    output = tf.reshape(output, [-1, num_units])  # input to the output layer
    w_out = weights['out']
    b_out = biases['out']
    pred = tf.matmul(output, w_out) + b_out
    return pred
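The function reads everything from module-level globals. Below is a minimal sketch of the surrounding graph and a session run, assuming illustrative hyperparameter values and zero initial states; the names X, H, C, weights, and biases mirror the globals the function uses, but the concrete shapes and values here are assumptions, not taken from the original code.

import numpy as np

# Illustrative hyperparameters (assumptions, not from the original code).
input_size, num_units, num_layers = 1, 128, 2
num_steps, batch_size = 20, 64
is_training = True

weights = {'in': tf.Variable(tf.random_normal([input_size, num_units])),
           'out': tf.Variable(tf.random_normal([num_units, 1]))}
biases = {'in': tf.Variable(tf.zeros([num_units])),
          'out': tf.Variable(tf.zeros([1]))}

X = tf.placeholder(tf.float32, [batch_size, num_steps, input_size])
# cuDNN state tensors are shaped [num_layers, batch_size, num_units].
H = tf.placeholder(tf.float32, [num_layers, batch_size, num_units])
C = tf.placeholder(tf.float32, [num_layers, batch_size, num_units])

pred = lstm()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    batch_x = np.random.rand(batch_size, num_steps, input_size).astype(np.float32)
    zero_state = np.zeros([num_layers, batch_size, num_units], dtype=np.float32)
    out = sess.run(pred, feed_dict={X: batch_x, H: zero_state, C: zero_state})
    print(out.shape)  # (num_steps * batch_size, 1)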
def __init__(self, is_training, config, debug=False):
    self.batch_size = batch_size = config.batch_size
    self.num_steps = num_steps = config.num_steps
    self.size = size = config.hidden_size
    vocab_size = config.vocab_size
    self.num_layers = config.num_layers

    self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
    self._targets = tf.placeholder(tf.int32, [batch_size, num_steps])

    embedding = tf.get_variable("embedding", [vocab_size, size],
                                dtype=data_type(is_lstm_layer=False))
    inputs = tf.nn.embedding_lookup(embedding, self._input_data, name="inputs_to_rnn")
    if debug:
        variable_summaries(inputs, "inputs_to_rnn")
    if is_training and config.keep_prob < 1:
        inputs = tf.nn.dropout(inputs, config.keep_prob)

    # CudnnLSTM's dropout argument is a drop probability, not a keep probability.
    rnn = CudnnLSTM(config.num_layers, size, size, input_mode='linear_input',
                    direction='unidirectional', dropout=1. - config.keep_prob,
                    seed=0, seed2=0)
    params_size_t = rnn.params_size()

    # Initial cuDNN states: [num_layers, batch_size, size].
    self._initial_input_h = tf.placeholder(data_type(is_lstm_layer=True),
                                           shape=[config.num_layers, batch_size, size])
    # self._initial_input_h = tf.Variable(tf.zeros([config.num_layers, batch_size, size]))
    self._initial_input_c = tf.placeholder(data_type(is_lstm_layer=True),
                                           shape=[config.num_layers, batch_size, size])
    # self._initial_input_c = tf.Variable(tf.zeros([config.num_layers, batch_size, size]))

    # All cuDNN weights and biases live in one flat, opaque parameter buffer
    # whose size is only known as a graph tensor, hence validate_shape=False.
    # self.params = tf.get_variable("params", [params_size_t], validate_shape=False, dtype=data_type(is_lstm_layer=False))
    self.params = tf.Variable(
        tf.random_uniform([params_size_t], minval=-config.init_scale,
                          maxval=config.init_scale,
                          dtype=data_type(is_lstm_layer=True)),
        validate_shape=False)
    self.params_size_t = rnn.params_size()

    # cuDNN expects time-major input, hence the [1, 0, 2] transpose.
    outputs, output_h, output_c = rnn(
        is_training=is_training,
        input_data=tf.transpose(tf.cast(inputs, dtype=data_type(is_lstm_layer=True)), [1, 0, 2]),
        input_h=self._initial_input_h,
        input_c=self._initial_input_c,
        params=self.params)
    self._output_h = output_h
    self._output_c = output_c

    # Back to batch-major, then flatten to [batch_size * num_steps, size].
    output = tf.reshape(tf.transpose(outputs, [1, 0, 2]), [-1, size])
    if debug:
        variable_summaries(output, 'multiRNN_output')

    softmax_w = tf.get_variable("softmax_w", [size, vocab_size],
                                dtype=data_type(is_lstm_layer=False))
    softmax_b = tf.get_variable("softmax_b", [vocab_size],
                                dtype=data_type(is_lstm_layer=False))
    logits = tf.matmul(output if output.dtype == data_type(is_lstm_layer=False)
                       else tf.cast(output, data_type(is_lstm_layer=False)),
                       softmax_w) + softmax_b
    if debug:
        variable_summaries(logits, 'logits')

    # loss = tf.contrib.nn.seq2seq.sequence_loss_by_example(
    loss = sequence_loss_by_example(
        [logits],
        [tf.reshape(self._targets, [-1])],
        [tf.ones([batch_size * num_steps], dtype=data_type(is_lstm_layer=False))])
    self._cost = cost = tf.reduce_sum(loss) / batch_size

    if FLAGS.cost_function == 'avg':
        self._cost_to_optimize = cost_to_optimize = tf.reduce_mean(loss)
    else:
        self._cost_to_optimize = cost_to_optimize = cost

    # L2 regularization over all trainable variables.
    tvars = tf.trainable_variables()
    for v in tvars:
        cost_to_optimize += (FLAGS.reg_term *
                             tf.cast(tf.nn.l2_loss(v), dtype=data_type(False)) /
                             (batch_size * config.num_steps))
    self._cost_to_optimize = cost_to_optimize

    if debug:
        tf.summary.scalar('cost no regularization', cost)
        tf.summary.scalar('cost_to_optimize', cost_to_optimize)

    # self._final_state = state
    if not is_training:
        self.merged = tf.summary.merge_all()
        return

    self._lr = tf.Variable(0.0, trainable=False, dtype=data_type(is_lstm_layer=False))
    # if debug:
    #     tf.scalar_summary('learning rate', self._lr)

    # Group variables and gradients by dtype so each group can be clipped together.
    type2vars = dict()
    print("**************************")
    print("Trainable Variables")
    print("**************************")
    for var in tvars:
        print('Variable name: %s. With dtype: %s and shape: %s'
              % (var.name, var.dtype, var.get_shape()))
        if var.dtype not in type2vars:
            type2vars[var.dtype] = [var]
        else:
            type2vars[var.dtype].append(var)

    print("**************************")
    print("Gradients Variables")
    print("**************************")
    _grads = tf.gradients(cost_to_optimize, tvars)
    type2grads = dict()
    for g in _grads:
        print('Gradient name: %s. With dtype: %s' % (g.name, g.dtype))
        if g.dtype not in type2grads:
            type2grads[g.dtype] = [g]
        else:
            type2grads[g.dtype].append(g)

    type2clippedGrads = dict()
    for dtype in type2grads:
        cgrads, _ = tf.clip_by_global_norm(type2grads[dtype], config.max_grad_norm)
        type2clippedGrads[dtype] = cgrads

    if debug:
        for (gkey, vkey) in zip(type2clippedGrads.keys(), type2vars.keys()):
            for (clipped_gradient, variable) in zip(type2clippedGrads[gkey], type2vars[vkey]):
                variable_summaries(clipped_gradient, "clipped_dcost/d" + variable.name)
                variable_summaries(variable, variable.name)

    if FLAGS.optimizer == 'MomentumOptimizer':
        optimizer = tf.train.MomentumOptimizer(learning_rate=self._lr, momentum=0.9)
    elif FLAGS.optimizer == 'AdamOptimizer':
        optimizer = tf.train.AdamOptimizer()
    elif FLAGS.optimizer == 'RMSPropOptimizer':
        optimizer = tf.train.RMSPropOptimizer(learning_rate=self._lr)
    elif FLAGS.optimizer == 'AdagradOptimizer':
        optimizer = tf.train.AdagradOptimizer(learning_rate=self._lr)
    else:
        optimizer = tf.train.GradientDescentOptimizer(self._lr)

    allgrads = []
    allvars = []
    for dtype in type2clippedGrads:
        allgrads += type2clippedGrads[dtype]
    # WARNING: assumes both dicts iterate their keys in the same order.
    for dtype in type2vars:
        allvars += type2vars[dtype]
    self._train_op = optimizer.apply_gradients(zip(allgrads, allvars))

    self._new_lr = tf.placeholder(dtype=data_type(False), shape=[],
                                  name="new_learning_rate")
    self._lr_update = tf.assign(self._lr, self._new_lr)
    self.merged = tf.summary.merge_all()
def __init__(self, is_training, config, debug=False):
    self.batch_size = batch_size = config.batch_size
    self.num_steps = num_steps = config.num_steps
    self.size = size = config.hidden_size
    self.num_layers = config.num_layers

    self._input_data = tf.placeholder(tf.float32, [batch_size, num_steps, 1])
    self._targets = tf.placeholder(tf.float32, [batch_size, num_steps])

    inputs = self._input_data
    if is_training and config.keep_prob < 1:
        inputs = tf.nn.dropout(inputs, config.keep_prob)

    # The declared input size must match the trailing input dimension (1 here);
    # dropout is a drop probability, not a keep probability.
    rnn = CudnnLSTM(config.num_layers, size, 1, input_mode='linear_input',
                    direction='unidirectional', dropout=1. - config.keep_prob,
                    seed=0)
    params_size_t = rnn.params_size()

    # NOTE: the input below is fed without the time-major transpose, so cuDNN
    # reads dim 0 (batch_size) as time and dim 1 (num_steps) as batch; the
    # [num_layers, num_steps, size] state shapes match that swapped layout.
    self._initial_input_h = tf.placeholder(
        data_type(is_lstm_layer=True),
        shape=[config.num_layers, config.num_steps, size])
    # self._initial_input_h = tf.Variable(tf.zeros([config.num_layers, batch_size, size]))
    self._initial_input_c = tf.placeholder(
        data_type(is_lstm_layer=True),
        shape=[config.num_layers, config.num_steps, size])
    # self._initial_input_c = tf.Variable(tf.zeros([config.num_layers, batch_size, size]))

    # Flat, opaque cuDNN parameter buffer; its size is a graph-time tensor.
    # self.params = tf.get_variable("params", [params_size_t], validate_shape=False, dtype=data_type(is_lstm_layer=False))
    self.params = tf.Variable(
        tf.random_uniform([params_size_t], minval=-config.init_scale,
                          maxval=config.init_scale,
                          dtype=data_type(is_lstm_layer=True)),
        validate_shape=False)
    self.params_size_t = rnn.params_size()

    # outputs, output_h, output_c = rnn(is_training=is_training,
    #     input_data=tf.transpose(tf.cast(inputs, dtype=data_type(is_lstm_layer=True)), [1, 0, 2]),
    #     input_h=self._initial_input_h, input_c=self._initial_input_c, params=self.params)
    output, output_h, output_c = rnn(is_training=is_training, input_data=inputs,
                                     input_h=self._initial_input_h,
                                     input_c=self._initial_input_c,
                                     params=self.params)
    self._output_h = output_h
    self._output_c = output_c

    # output = tf.reshape(tf.transpose(outputs, [1, 0, 2]), [-1, size])
    # Per-batch-element linear readout: [batch, steps, size] x [batch, size, 1].
    softmax_w = tf.get_variable("softmax_w", [batch_size, size, 1],
                                dtype=data_type(is_lstm_layer=False))
    softmax_b = tf.get_variable("softmax_b", [batch_size, num_steps],
                                dtype=data_type(is_lstm_layer=False))
    # logits = tf.matmul(output if output.dtype == data_type(is_lstm_layer=False) else tf.cast(output, data_type(is_lstm_layer=False)), softmax_w) + softmax_b
    logits = tf.matmul(output, softmax_w)
    logits = tf.reshape(logits, [batch_size, num_steps])
    logits = logits + softmax_b
    self._logits = logits

    # RMSE between targets and predictions.
    loss = tf.sqrt(tf.losses.mean_squared_error(self._targets, logits))
    self._cost = cost = loss

    if not is_training:
        return

    self._lr = tf.Variable(0.0, trainable=False, dtype=data_type(is_lstm_layer=False))
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self._cost, tvars),
                                      config.max_grad_norm)
    optimizer = tf.train.GradientDescentOptimizer(self._lr)
    self._train_op = optimizer.apply_gradients(
        zip(grads, tvars),
        global_step=tf.contrib.framework.get_or_create_global_step())

    self._new_lr = tf.placeholder(tf.float32, shape=[], name="new_learning_rate")
    self._lr_update = tf.assign(self._lr, self._new_lr)
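Both models build the same `_new_lr` placeholder / `_lr_update` assign-op pair, the usual PTB-style mechanism for driving the learning rate from outside the graph. A minimal sketch of the helper one would add to either class, plus a driver loop; the decay-schedule config fields in the commented loop are hypothetical, not from the original code.

def assign_lr(self, session, lr_value):
    """Push a new learning-rate value into the already-built graph."""
    session.run(self._lr_update, feed_dict={self._new_lr: lr_value})

# Typical driver: decay the rate geometrically after a few warm-up epochs.
# (learning_rate, lr_decay, max_epoch, max_max_epoch are hypothetical config values.)
# for epoch in range(config.max_max_epoch):
#     decay = config.lr_decay ** max(epoch + 1 - config.max_epoch, 0.0)
#     model.assign_lr(session, config.learning_rate * decay)
#     run_epoch(session, model, train_iterator, config, model._train_op)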