def __init__(self, args, infer=False):
    """Build the RNN language-model graph with a train path and a test path.

    Args:
        args: Hyper-parameter namespace. Reads ``model``, ``rnn_size``,
            ``num_layers``, ``batch_size``, ``seq_length``, ``vocab_size``,
            ``embedding_dim`` and ``grad_clip``. When ``infer`` is true,
            ``batch_size`` and ``seq_length`` are overwritten with 1 on the
            object that was passed in.
        infer: If true, build a one-step graph and hand ``loop`` to the
            decoder so each prediction is fed back as the next input.

    Raises:
        Exception: If ``args.model`` is not 'rnn', 'gru' or 'lstm'.
    """
    self.args = args
    if infer:
        args.batch_size = 1
        args.seq_length = 1

    # Cell type selection.
    if args.model == 'rnn':
        cell_fn = rnn.BasicRNNCell
    elif args.model == 'gru':
        cell_fn = rnn.GRUCell
    elif args.model == 'lstm':
        cell_fn = rnn.BasicLSTMCell
    else:
        raise Exception("model type not supported: {}".format(args.model))

    # Dropout rate; defaults to 0 so that evaluation needs no explicit feed.
    self.dropout = tf.placeholder_with_default(0., shape=())

    # Stacked recurrent cell with output dropout on every layer.
    cells = []
    for _ in range(args.num_layers):
        cell = cell_fn(args.rnn_size)
        cell = rnn.DropoutWrapper(cell, output_keep_prob=1 - self.dropout)
        cells.append(cell)
    self.cell = cell = rnn.MultiRNNCell(cells)

    # Train placeholders, initial state, and bookkeeping variables.
    self.input_data = tf.placeholder(tf.int32,
                                     [args.batch_size, args.seq_length])
    self.targets = tf.placeholder(tf.int32,
                                  [args.batch_size, args.seq_length])
    self.initial_state = cell.zero_state(args.batch_size, tf.float32)
    self.batch_pointer = tf.Variable(0, name="batch_pointer",
                                     trainable=False, dtype=tf.int32)
    self.inc_batch_pointer_op = tf.assign(self.batch_pointer,
                                          self.batch_pointer + 1)
    self.epoch_pointer = tf.Variable(0, name="epoch_pointer",
                                     trainable=False)
    self.batch_time = tf.Variable(0.0, name="batch_time", trainable=False)

    # Test placeholders (same static shape as the train placeholders).
    self.test_x = tf.placeholder(tf.int32,
                                 shape=[args.batch_size, args.seq_length])
    self.test_y = tf.placeholder(tf.int32,
                                 [args.batch_size, args.seq_length])

    # Exposes batch_time as a scalar summary.
    tf.summary.scalar("time_batch", self.batch_time)

    def variable_summaries(var):
        """Attach mean/max/min scalar summaries to a tensor."""
        with tf.name_scope('summaries'):
            mean = tf.reduce_mean(var)
            tf.summary.scalar('mean', mean)
            tf.summary.scalar('max', tf.reduce_max(var))
            tf.summary.scalar('min', tf.reduce_min(var))

    with tf.variable_scope('rnnlm'):
        # Output projection: weights first, then biases.
        softmax_w = tf.get_variable(
            "softmax_w", [args.rnn_size, args.vocab_size],
            initializer=tf.truncated_normal_initializer(
                mean=0., stddev=.1, seed=2018, dtype=tf.float32))
        variable_summaries(softmax_w)
        softmax_b = tf.get_variable(
            "softmax_b", [args.vocab_size],
            initializer=tf.constant_initializer(
                np.repeat(0., args.vocab_size), tf.float32,
                args.vocab_size))
        variable_summaries(softmax_b)
        with tf.device("/cpu:0"):
            # W holds the word embeddings; it starts at zero and is filled
            # by running embedding_init with embedding_placeholder fed.
            self.W = tf.Variable(
                tf.constant(0.0,
                            shape=[args.vocab_size, args.embedding_dim]),
                name="W")
            self.embedding_placeholder = tf.placeholder(
                tf.float32, [args.vocab_size, args.embedding_dim])
            self.embedding_init = self.W.assign(self.embedding_placeholder)

            # One [batch, embedding_dim] tensor per time step.
            inputs = tf.split(
                tf.nn.embedding_lookup(self.W, self.input_data),
                args.seq_length, 1)
            inputs = [tf.squeeze(input_, [1]) for input_ in inputs]
            test_inputs = tf.split(
                tf.nn.embedding_lookup(self.W, self.test_x),
                args.seq_length, 1)
            test_inputs = [
                tf.squeeze(test_input_, [1]) for test_input_ in test_inputs
            ]

    def loop(prev, _):
        """Map the previous output to the embedding of its argmax symbol."""
        prev = tf.matmul(prev, softmax_w) + softmax_b
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        # The embedding table is self.W; the previous code referenced an
        # undefined name `embedding` here.
        return tf.nn.embedding_lookup(self.W, prev_symbol)

    # Train path: outputs, logits, probabilities, loss.
    outputs, last_state = legacy_seq2seq.rnn_decoder(
        inputs, self.initial_state, cell,
        loop_function=loop if infer else None, scope='rnnlm')
    output = tf.reshape(tf.concat(outputs, 1), [-1, args.rnn_size])
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    # Temperature placeholder; defaults to 1 (no rescaling).
    self.temp = tf.placeholder_with_default(1., shape=())
    self.temped_logits = self.logits / self.temp
    self.probs = tf.nn.softmax(self.temped_logits)
    loss = legacy_seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.targets, [-1])],
        [tf.ones([args.batch_size * args.seq_length])],
        args.vocab_size)
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length

    # Test path: same cell and projection, fed from test_x / test_y.
    test_outputs, test_last_state = legacy_seq2seq.rnn_decoder(
        test_inputs, self.initial_state, cell,
        loop_function=loop if infer else None, scope='rnnlm')
    test_output = tf.reshape(tf.concat(test_outputs, 1),
                             [-1, args.rnn_size])
    self.test_logits = tf.matmul(test_output, softmax_w) + softmax_b
    self.test_probs = tf.nn.softmax(self.test_logits)
    # One weight per flattened target, matching the train loss above.
    # The previous code used only batch_size weights, which does not match
    # the batch_size * seq_length rows of test_logits.
    test_loss = legacy_seq2seq.sequence_loss_by_example(
        [self.test_logits],
        [tf.reshape(self.test_y, [-1])],
        [tf.ones([args.batch_size * args.seq_length])],
        args.vocab_size)
    self.test_cost = tf.reduce_sum(
        test_loss) / args.batch_size / args.seq_length
    tf.summary.scalar("cost", self.cost)

    # Final recurrent states of both paths.
    self.final_state = last_state
    self.test_final_state = test_last_state

    # Optimizer; lr starts at 0.0 and is expected to be assigned externally.
    self.lr = tf.Variable(0.0, trainable=False)
    optimizer = tf.train.AdamOptimizer(self.lr)

    # Two optimisation channels so the embeddings can be trained on demand:
    # one over all trainable variables, one that leaves out any variable
    # whose name contains "W:0".
    self.tvars = tf.trainable_variables()
    self.tvars_no_W = [
        var for var in tf.trainable_variables() if "W:0" not in var.name
    ]
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(self.cost, self.tvars), args.grad_clip)
    grads_no_W, _ = tf.clip_by_global_norm(
        tf.gradients(self.cost, self.tvars_no_W), args.grad_clip)

    # Running either op applies one parameter update.
    self.train_op = optimizer.apply_gradients(zip(grads, self.tvars))
    self.train_op_no_W = optimizer.apply_gradients(
        zip(grads_no_W, self.tvars_no_W))
# Integer-encoded input and target sequences; the batch dimension is open.
X = tf.placeholder(tf.int32, shape=[None, sequence_length])
Y = tf.placeholder(tf.int32, shape=[None, sequence_length])

# One-hot encode the inputs and print the tensor to inspect its shape.
X_one_hot = tf.one_hot(X, num_classes)
print(X_one_hot)


def lstm_cell():
    """Return a new BasicLSTMCell with hidden_size units."""
    return rnn.BasicLSTMCell(hidden_size, state_is_tuple=True)


# Two stacked LSTM layers.
multi_cells = rnn.MultiRNNCell([lstm_cell(), lstm_cell()],
                               state_is_tuple=True)

# Run the stack over the one-hot inputs.
outputs, _states = tf.nn.dynamic_rnn(multi_cells, X_one_hot,
                                     dtype=tf.float32)

# Linear projection (no activation) applied to every time step at once.
X_for_fc = tf.reshape(outputs, [-1, hidden_size])
outputs = tf.contrib.layers.fully_connected(
    X_for_fc, num_classes, activation_fn=None)

# Back to [batch_size, sequence_length, num_classes] for the sequence loss.
outputs = tf.reshape(outputs, [batch_size, sequence_length, num_classes])

# Uniform weight of 1 for every position.
weights = tf.ones([batch_size, sequence_length])
with tf.device('/gpu:0'): input_data = tf.placeholder(tf.int32, shape=[batch_size, num_steps]) target = tf.placeholder(tf.int32, shape=[batch_size, num_steps]) keep_prob = tf.placeholder(tf.float32) embedding = tf.get_variable("embedding", [word_vocab_size, rnn_size]) inputs = tf.nn.embedding_lookup(embedding, input_data) def rnn_cell(): return tf.contrib.rnn.DropoutWrapper(rnn.BasicLSTMCell( num_hidden_units, reuse=False), output_keep_prob=keep_prob, variational_recurrent=True, dtype=tf.float32) cells = rnn.MultiRNNCell( [rnn_cell() for _ in range(num_hidden_layers)]) rnn_initial_state = cells.zero_state(batch_size, dtype=tf.float32) print(input_data.get_shape().as_list()) outputs, final_state = tf.nn.dynamic_rnn( cells, inputs, initial_state=rnn_initial_state, dtype=tf.float32) outputs = tf.reshape(tf.concat(outputs, 1), [-1, rnn_size]) softmax_w = tf.get_variable("softmax_w", [rnn_size, word_vocab_size]) softmax_b = tf.get_variable("softmax_b", [word_vocab_size]) logits = tf.matmul(outputs, softmax_w) + softmax_b logits = tf.reshape(logits, [batch_size, num_steps, word_vocab_size]) loss = tf.contrib.seq2seq.sequence_loss(logits, target, tf.ones(
def network(self):
    """Build the RNN classification graph.

    Creates, in order: the embedding lookup, the (optionally bidirectional,
    optionally multi-layer) recurrent encoder, a dense softmax output layer,
    the loss, and accuracy metrics. Results are stored on ``self``
    (``embedded_chars``, ``W``, ``b``, ``logits``, ``loss``, ``predicted``,
    ``accuracy``, ``num_correct``).

    ``self.logits`` keeps holding softmax probabilities, as before; the
    cross-entropy is now computed from the un-normalised scores so softmax is
    applied only once.

    :return: None
    """
    # 1. embedding layer
    with tf.name_scope('embedding'):
        if self.embedding_mat is None:
            self.Embedding = tf.Variable(
                tf.random_uniform([self.vocab_size, self.embedding_dims],
                                  -1., 1.),
                name='Embedding')
        # NOTE(review): when embedding_mat is provided, self.Embedding must
        # already have been set elsewhere; confirm against the constructor.
        self.embedded_chars = tf.nn.embedding_lookup(self.Embedding,
                                                     self.input_x)

    # 2. RNN hidden layer
    with tf.name_scope('rnn'):
        if self.cell.startswith("bi"):
            if self.num_layer > 1:
                # One cell object per layer: repeating a single instance
                # ([cell] * n) reuses the same object for every layer.
                # Assumes bi_dir_rnn() builds new cells on each call --
                # TODO confirm.
                pairs = [self.bi_dir_rnn() for _ in range(self.num_layer)]
                cell_fw = rnn.MultiRNNCell([fw for fw, _ in pairs],
                                           state_is_tuple=True)
                cell_bw = rnn.MultiRNNCell([bw for _, bw in pairs],
                                           state_is_tuple=True)
            else:
                cell_fw, cell_bw = self.bi_dir_rnn()

            outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, self.embedded_chars, dtype=tf.float32)
            # Concatenate forward and backward outputs along the feature
            # axis: [None, time_step, hidden_dims * 2].
            outputs = tf.concat(outputs, axis=2)
        else:
            if self.num_layer > 1:
                # Assumes witch_cell() builds a new cell on each call --
                # TODO confirm.
                cells = rnn.MultiRNNCell(
                    [self.witch_cell() for _ in range(self.num_layer)],
                    state_is_tuple=True)
            else:
                cells = self.witch_cell()
            # outputs: [batch, timestep_size, hidden_size]
            outputs, _ = tf.nn.dynamic_rnn(cells, self.embedded_chars,
                                           dtype=tf.float32)
        # Output of the last time step.
        h_state = outputs[:, -1, :]

    # 3. FC and softmax layer
    with tf.name_scope('output'):
        if self.cell.startswith('bi'):
            self.W = tf.Variable(
                tf.truncated_normal([self.hidden_unit * 2, self.num_tags],
                                    stddev=0.1),
                dtype=tf.float32, name='W')
        else:
            self.W = tf.Variable(
                tf.truncated_normal([self.hidden_unit, self.num_tags],
                                    stddev=0.1),
                dtype=tf.float32, name='W')
        self.b = tf.Variable(tf.constant(0.1, shape=[self.num_tags]),
                             dtype=tf.float32, name='b')
        # Un-normalised class scores, and their softmax.
        scores = tf.matmul(h_state, self.W) + self.b
        self.logits = tf.nn.softmax(scores)

    # 4. loss
    with tf.name_scope('loss'):
        # The op applies softmax internally, so it must receive the raw
        # scores rather than self.logits (which is already normalised).
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            logits=scores, labels=self.input_y)
        self.loss = tf.reduce_mean(cross_entropy)
        # NOTE(review): self.l2_loss is accumulated into, so it must be
        # initialised before this method runs; no regularisation
        # coefficient is applied here.
        self.l2_loss += tf.nn.l2_loss(self.W)
        self.l2_loss += tf.nn.l2_loss(self.b)
        self.loss += self.l2_loss

    # 5. accuracy
    with tf.name_scope('accuracy'):
        self.predicted = tf.equal(tf.argmax(self.logits, 1),
                                  tf.argmax(self.input_y, 1))
        self.accuracy = tf.reduce_mean(
            tf.cast(self.predicted, dtype=tf.float32))
    with tf.name_scope('num_prediction'):
        self.num_correct = tf.reduce_sum(
            tf.cast(self.predicted, dtype=tf.float32), name='num_correct')
def __init__(self, args, training=True):
    """Assemble the character-level RNN language-model graph.

    Args:
        args: Hyper-parameter namespace. Reads ``model``, ``rnn_size``,
            ``num_layers``, ``batch_size``, ``seq_length``, ``vocab_size``,
            ``input_keep_prob``, ``output_keep_prob`` and ``grad_clip``.
            When ``training`` is false, ``batch_size`` and ``seq_length``
            are overwritten with 1 on the passed-in object.
        training: Build the training graph (dropout enabled, no feedback
            loop) when true; otherwise build the sampling graph.

    Raises:
        Exception: If ``args.model`` is not one of 'rnn', 'gru', 'lstm',
            'nas'.
    """
    self.args = args
    sampling = not training
    if sampling:
        args.batch_size = 1
        args.seq_length = 1

    # Resolve the cell class lazily by attribute name.
    cell_class_names = {
        'rnn': 'BasicRNNCell',
        'gru': 'GRUCell',
        'lstm': 'BasicLSTMCell',
        'nas': 'NASCell',
    }
    if args.model not in cell_class_names:
        raise Exception("model type not supported: {}".format(args.model))
    cell_fn = getattr(rnn, cell_class_names[args.model])

    layers = []
    for _ in range(args.num_layers):
        layer = cell_fn(args.rnn_size)
        # Dropout is wrapped in only while training and only when at least
        # one keep probability is below 1.
        if training and (args.output_keep_prob < 1.0
                         or args.input_keep_prob < 1.0):
            layer = rnn.DropoutWrapper(
                layer,
                input_keep_prob=args.input_keep_prob,
                output_keep_prob=args.output_keep_prob)
        layers.append(layer)
    cell = rnn.MultiRNNCell(layers, state_is_tuple=True)
    self.cell = cell

    batch_shape = [args.batch_size, args.seq_length]
    self.input_data = tf.placeholder(tf.int32, batch_shape)
    self.targets = tf.placeholder(tf.int32, batch_shape)
    self.initial_state = cell.zero_state(args.batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("softmax_w",
                                    [args.rnn_size, args.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])

    embedding = tf.get_variable("embedding",
                                [args.vocab_size, args.rnn_size])
    embedded = tf.nn.embedding_lookup(embedding, self.input_data)

    # Input dropout reuses output_keep_prob (marked as experimental by the
    # original author).
    if training and args.output_keep_prob:
        embedded = tf.nn.dropout(embedded, args.output_keep_prob)

    # One tensor per time step, with the length-1 time axis removed.
    inputs = [
        tf.squeeze(step, [1])
        for step in tf.split(embedded, args.seq_length, 1)
    ]

    def loop(prev, _):
        """Feed back the embedding of the most likely previous symbol."""
        scores = tf.matmul(prev, softmax_w) + softmax_b
        best = tf.stop_gradient(tf.argmax(scores, 1))
        return tf.nn.embedding_lookup(embedding, best)

    outputs, last_state = legacy_seq2seq.rnn_decoder(
        inputs,
        self.initial_state,
        cell,
        loop_function=None if training else loop,
        scope='rnnlm')
    flat_output = tf.reshape(tf.concat(outputs, 1), [-1, args.rnn_size])

    self.logits = tf.matmul(flat_output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)
    loss = legacy_seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.targets, [-1])],
        [tf.ones([args.batch_size * args.seq_length])])
    with tf.name_scope('cost'):
        self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    self.final_state = last_state

    # lr starts at 0.0; it is a non-trainable variable.
    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                      args.grad_clip)
    with tf.name_scope('optimizer'):
        optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))

    # TensorBoard instrumentation.
    tf.summary.histogram('logits', self.logits)
    tf.summary.histogram('loss', loss)
    tf.summary.scalar('train_loss', self.cost)
def rnn_estimator(x, y):
    """Encode `x` with a (bi)directional RNN and apply the target predictor.

    Configuration (`cell_type`, `rnn_size`, `num_layers`, `bidirectional`,
    `attn_length`, `attn_size`, `attn_vec_size`, `sequence_length`,
    `initial_state`, `input_op_fn`, `target_predictor_fn`) is read from the
    enclosing scope.

    Raises:
        ValueError: If `cell_type` is not 'rnn', 'gru' or 'lstm'.
    """
    x = input_op_fn(x)

    if cell_type == 'rnn':
        cell_fn = contrib_rnn.BasicRNNCell
    elif cell_type == 'gru':
        cell_fn = contrib_rnn.GRUCell
    elif cell_type == 'lstm':
        # TODO(ipolosukhin): state_is_tuple=False is deprecated
        cell_fn = functools.partial(
            contrib_rnn.BasicLSTMCell, state_is_tuple=False)
    else:
        raise ValueError(
            'cell_type {} is not supported. '.format(cell_type))

    def make_layer():
        """Build one layer, wrapped with attention when attn_length is set."""
        base = cell_fn(rnn_size)
        if attn_length is None:
            return base
        return contrib_rnn.AttentionCellWrapper(
            base,
            attn_length=attn_length,
            attn_size=attn_size,
            attn_vec_size=attn_vec_size,
            state_is_tuple=False)

    def make_stack():
        """Build a num_layers-deep MultiRNNCell of new layers."""
        return contrib_rnn.MultiRNNCell(
            [make_layer() for _ in range(num_layers)],
            state_is_tuple=False)

    if not bidirectional:
        cell = make_stack()
        _, encoding = contrib_rnn.static_rnn(
            cell,
            x,
            dtype=dtypes.float32,
            sequence_length=sequence_length,
            initial_state=initial_state)
    else:
        # Forward stack is built first, then the backward stack.
        rnn_fw_cell = make_stack()
        rnn_bw_cell = make_stack()
        # pylint: disable=unexpected-keyword-arg, no-value-for-parameter
        _, encoding = bidirectional_rnn(
            rnn_fw_cell,
            rnn_bw_cell,
            x,
            dtype=dtypes.float32,
            sequence_length=sequence_length,
            initial_state_fw=initial_state,
            initial_state_bw=initial_state)

    return target_predictor_fn(encoding, y)
# Transaction fees as graph constants.
eth_transaction_fee = tf.constant(0.001)
etc_transaction_fee = tf.constant(0.01)

# Hyper-parameters.
SEQLEN = 30
BATCHSIZE = 200
INTERNALSIZE = 512
NLAYERS = 3
learning_rate = 0.001

# Inputs / outputs.
# NOTE(review): X is declared rank-1 but is passed to tf.nn.dynamic_rnn
# below -- confirm the intended input shape.
X = tf.placeholder(tf.float32, shape=[None], name="X")
Y_ = tf.placeholder(tf.float32, shape=[None], name="Y_")

# Concatenated input state of the multi-layer GRU:
# [BATCHSIZE, INTERNALSIZE * NLAYERS].
Hin = tf.placeholder(
    tf.float32, shape=[None, INTERNALSIZE * NLAYERS], name='Hin')

# NLAYERS stacked GRU cells with a single concatenated state tensor.
cells = [rnn.GRUCell(INTERNALSIZE) for _ in range(NLAYERS)]
multicell = rnn.MultiRNNCell(cells, state_is_tuple=False)
Yr, H = tf.nn.dynamic_rnn(
    multicell, X, dtype=tf.float32, initial_state=Hin)
# Give the output state a stable name in the graph.
H = tf.identity(H, name='H')

# Checkpoint directory.
if not os.path.exists("checkpoints"):
    os.mkdir("checkpoints")
saver = tf.train.Saver(max_to_keep=1000)

# All-zero initial state for the first batch.
istate = np.zeros((BATCHSIZE, INTERNALSIZE * NLAYERS))

# Create the session and initialise all variables.
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
step = 0
def add_graph(self, noyear=False, feedforward=False):
    """Create the tagging graph: placeholders, model, metrics, optimiser.

    Sets on ``self``: the placeholders ``X_word``, ``X_year``, ``Y_label``
    and ``embedding_matrix``; the variable ``year_embed_mat``; the per-token
    logits ``Y``; the metrics ``acc``, ``vec_acc``, ``log_perp``, ``perp``;
    the ``loss``; and the ``train_step`` op.

    Args:
        noyear: If true, the year embedding is replaced by zeros, so year
            information does not reach the model.
        feedforward: If true, use a single dense sigmoid layer instead of
            the stacked LSTM.
    """
    # Placeholders.
    self.X_word = tf.placeholder(tf.int32, [None, MAX_SENT_LENGTH])
    self.X_year = tf.placeholder(tf.int32, [None])
    self.Y_label = tf.placeholder(tf.int32, [None, MAX_SENT_LENGTH])
    self.embedding_matrix = tf.placeholder(tf.float32,
                                           [MAX_THRESHOLD, EMBED_DIM])

    # Word embeddings are looked up in the fed-in matrix.
    X_word = tf.nn.embedding_lookup(self.embedding_matrix, self.X_word)

    # Year embedding: shift years to start at 0, repeat the index for every
    # token position, then look it up in a trainable table.
    new_years = tf.subtract(self.X_year, START_YEAR)
    unembedded_year = tf.tile(tf.expand_dims(new_years, axis=1),
                              [1, MAX_SENT_LENGTH])
    self.year_embed_mat = tf.get_variable(
        name="year_embed_mat",
        shape=(NUM_YEAR, EMBED_DIM),
        initializer=tf.contrib.layers.xavier_initializer())
    embedded_year = tf.nn.embedding_lookup(self.year_embed_mat,
                                           unembedded_year)
    if noyear:
        embedded_year = tf.zeros_like(embedded_year)

    # Single combined input: word embedding followed by year embedding.
    X = tf.concat([X_word, embedded_year], axis=2)

    if feedforward:
        H = tf.layers.dense(inputs=X, units=LAYERS[0],
                            activation=tf.nn.sigmoid)
    else:
        rnn_layers = [rnn.LSTMCell(size) for size in LAYERS]
        multi_rnn_cell = rnn.MultiRNNCell(rnn_layers)
        H, _ = tf.nn.dynamic_rnn(cell=multi_rnn_cell, inputs=X,
                                 dtype=tf.float32)

    # POS-tag logits. fully_connected applies ReLU unless told otherwise,
    # which would clamp the logits to be non-negative; make the projection
    # linear so the values fed to softmax / cross-entropy are unbounded.
    self.Y = tf.contrib.layers.fully_connected(
        inputs=H,
        num_outputs=N_POS,
        activation_fn=None,
    )

    # Accuracy over all tokens, and per sentence.
    equal = tf.equal(tf.cast(tf.argmax(self.Y, axis=2), tf.int32),
                     tf.cast(self.Y_label, tf.int32))
    self.acc = tf.reduce_mean(tf.cast(equal, tf.float32))
    self.vec_acc = tf.reduce_mean(tf.cast(equal, tf.float32), axis=1)

    # Perplexity. log_softmax is used instead of log(softmax(.)) so a
    # probability that underflows to zero cannot produce -inf.
    mask = tf.cast(tf.one_hot(self.Y_label, N_POS), tf.float32)
    log_p = tf.reduce_sum(tf.nn.log_softmax(self.Y) * mask, axis=2)
    self.log_perp = -tf.reduce_sum(log_p, axis=1) / MAX_SENT_LENGTH
    self.perp = tf.exp(self.log_perp)

    # Loss.
    self.loss = tf.losses.sparse_softmax_cross_entropy(
        labels=self.Y_label,
        logits=self.Y,
    )

    # Adam step that minimises the loss.
    self.train_step = tf.train.AdamOptimizer(LR).minimize(self.loss)
weights={'in':tf.Variable(tf.random_normal([n_input,n_hidden])), 'out': tf.Variable(tf.random_normal([n_hidden,n_outputs]))} biases={'in':tf.Variable(tf.constant(0.1,shape=[n_hidden,])), 'out': tf.Variable(tf.constant(0.1,shape=[n_outputs,]))} X=train_x Y=train_y test=test_x w_in=weights['in'] b_in=biases['in'] inputs=tf.reshape(x,[-1,n_input]) input_rnn=tf.matmul(inputs,w_in)+b_in input_rnn=tf.reshape(input_rnn,[-1,time_step,n_hidden]) lstm_cells=[rnn.LSTMCell(n_hidden,forget_bias=1.0) for _ in range(n_layers)] lstm=rnn.MultiRNNCell(lstm_cells) outputs,states=tf.nn.dynamic_rnn(lstm,inputs=x,dtype=tf.float32,time_major=False) outputs=tf.reshape(outputs,[-1,n_hidden]) w_out=weights['out'] b_out=biases['out'] pred=tf.matmul(outputs,w_out)+b_out #损失函数 loss=tf.reduce_mean(tf.square(tf.reshape(pred,[-1])-tf.reshape(y, [-1]))) train_op=tf.train.AdamOptimizer(learning_rate).minimize(loss) with tf.Session() as sess: sess.run(tf.global_variables_initializer()) #重复训练10000次 for i in range(1000): step=0 start=0 end=start+batch_size
labels = tf.placeholder(tf.float32, shape=(None, 3), name='labels') def feed_dict(X, asp, lx, y): fd = {inputs[t]: X[t] for t in range(dm.max_seq_len)} fd.update({asp_inputs: asp}) fd = {} fd.update({labels: y}) return fd cell = rnn.BasicLSTMCell(cell_num) #cell = rnn.LSTMCell(cell_num, initializer=initializer) #cell = rnn.DropoutWrapper(cell, output_keep_prob=dropout_keep_prob) cells = [deepcopy(cell) for i in range(layer_num)] cell = rnn.MultiRNNCell(cells) with tf.name_scope('embedding'): if dm.use_pretrained_embedding: pre_trained_emb = embedding_frame.dropna().values.astype('float32') pre_trained_embedding = tf.get_variable( name="pre_trained_embedding", shape=pre_trained_emb.shape, initializer=tf.constant_initializer(pre_trained_emb), trainable=True) pad_embedding = tf.get_variable('pad_embedding', (dm.start_idx, embedding_size), dtype=tf.float32, initializer=initializer) embedding = tf.concat([pad_embedding, pre_trained_embedding], axis=0,
def multi_cell():
    """Return a MultiRNNCell stacking hyper.num_layer cells from single_cell()."""
    layers = []
    for _ in range(hyper.num_layer):
        layers.append(single_cell())
    return rnn.MultiRNNCell(layers)
# Sequence lengths; fed to the RNN as sequence_length below.
l = tf.placeholder(tf.int32, shape=[None])

# Output projection over the concatenated forward/backward features.
weights = tf.Variable(tf.random_normal([n_hidden * 2, n_classes]))
biases = tf.Variable(tf.random_normal([n_classes]))


# Build the graph.
def GRU_cell():
    """Return a new GRU cell wrapped with output dropout."""
    return rnn.DropoutWrapper(
        rnn.GRUCell(n_hidden, reuse=tf.get_variable_scope().reuse),
        output_keep_prob=keep_prob)


# Transpose x to put axis 1 first, flatten to rows of n_input features, then
# split into a list of n_steps tensors.
inputs = tf.split(
    tf.reshape(tf.transpose(x, perm=[1, 0, 2]), [-1, n_input]),
    n_steps)

# 1. Multi-layer forward and backward cells.
cell_fw = rnn.MultiRNNCell(
    [GRU_cell() for _ in range(layer_num)], state_is_tuple=True)
cell_bw = rnn.MultiRNNCell(
    [GRU_cell() for _ in range(layer_num)], state_is_tuple=True)

# 2. Zero initial states.
initial_state_fw = cell_fw.zero_state(batch_size, tf.float32)
initial_state_bw = cell_bw.zero_state(batch_size, tf.float32)

# 3. Bidirectional RNN (TensorFlow's static implementation).
outputs, _, _ = rnn.static_bidirectional_rnn(
    cell_fw,
    cell_bw,
    inputs,
    initial_state_fw=initial_state_fw,
    initial_state_bw=initial_state_bw,
    dtype=tf.float32,
    sequence_length=l)

# Project every step's 2 * n_hidden features to class scores.
output = tf.reshape(tf.concat(outputs, 1), [-1, 2 * n_hidden])
logits = tf.matmul(output, weights) + biases
def main():
    """Train and evaluate a stacked-LSTM classifier on the `mnist` data.

    Builds placeholders, a multi-layer LSTM that is unrolled manually over
    `n_steps` time steps, a softmax output layer, the loss and accuracy ops,
    then trains for 2000 iterations and reports accuracy on one test batch.

    Reads `n_steps`, `n_input`, `n_classes`, `n_hidden`, `n_layers`,
    `n_batch_size`, `keep_prob`, `lr`, `mnist` and `batch_size` from the
    enclosing module.
    """
    # Step 1: each example arrives flattened as n_steps * n_input values.
    X = tf.placeholder(tf.float32, shape=(None, n_steps * n_input), name="X")
    y = tf.placeholder(tf.float32, shape=(None, n_classes), name="y")

    def lstm_cell(n_hidden, keep_prob):
        """Return one LSTM layer wrapped with output dropout."""
        # Step 2: only n_hidden is specified for the cell.
        cell = rnn.LSTMCell(
            num_units=n_hidden,
            initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2),
            forget_bias=1.0,
            state_is_tuple=True)
        # Step 3: dropout on the outputs only (input_keep_prob stays 1.0).
        cell = rnn.DropoutWrapper(cell=cell, input_keep_prob=1.0,
                                  output_keep_prob=keep_prob)
        return cell

    # Step 4: stack the layers with MultiRNNCell.
    cells = [lstm_cell(n_hidden, keep_prob) for _ in range(n_layers)]
    mlstm_cell = rnn.MultiRNNCell(cells, state_is_tuple=True)

    # Step 5: all-zero initial state for a batch of n_batch_size examples.
    init_state = mlstm_cell.zero_state(n_batch_size, dtype=tf.float32)

    # Step 6: unroll over time by hand (the manual equivalent of
    # tf.nn.dynamic_rnn with time_major=False). The flat input is reshaped
    # to [batch, n_steps, n_input]; each step consumes its own slice and the
    # state returned by the previous step. The previous code fed the whole
    # flat X together with init_state at every step, so neither the time
    # axis nor the recurrent state was actually used.
    X_seq = tf.reshape(X, [-1, n_steps, n_input])
    outputs = list()
    state = init_state
    with tf.variable_scope('RNN'):
        for timestep in range(n_steps):
            if timestep > 0:
                tf.get_variable_scope().reuse_variables()
            # `state` holds the state of every LSTM layer.
            cell_output, state = mlstm_cell(X_seq[:, timestep, :], state)
            outputs.append(cell_output)
    h_state = outputs[-1]

    # Softmax classification layer on top of the last step's output.
    W = tf.Variable(tf.truncated_normal([n_hidden, n_classes]),
                    dtype=tf.float32)
    bias = tf.Variable(tf.constant(0.1, shape=[n_classes]), dtype=tf.float32)
    scores = tf.matmul(h_state, W) + bias
    y_ = tf.nn.softmax(scores)

    # Loss and evaluation. The cross-entropy op applies softmax itself, so
    # it receives the raw scores rather than y_.
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=scores))
    train_op = tf.train.AdamOptimizer(lr).minimize(cost)

    correct_prediction = tf.equal(tf.argmax(y_, 1), tf.argmax(y, 1))
    acc = tf.reduce_mean(tf.cast(correct_prediction, "float"))

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        for i in range(2000):
            train_X, train_y = mnist.train.next_batch(n_batch_size)
            session.run(train_op, feed_dict={
                X: train_X,
                y: train_y,
                keep_prob: 0.5
            })
            if (i + 1) % 200 == 0:
                train_acc, train_loss = session.run([acc, cost], feed_dict={
                    X: train_X,
                    y: train_y,
                    keep_prob: 1.0
                })
                # mnist.train.epochs_completed: number of finished epochs.
                print("Iter {}, step {}, loss {:6f}, train acc {}".format(
                    mnist.train.epochs_completed, (i + 1), train_loss,
                    train_acc))

        print("\nevaluation model")
        test_X = mnist.test.images[:n_batch_size]
        test_y = mnist.test.labels[:n_batch_size]
        # Accuracy on one batch of test data.
        # NOTE(review): `batch_size` is not defined in this function;
        # presumably a module-level placeholder -- confirm.
        test_acc, test_loss = session.run([acc, cost], feed_dict={
            X: test_X,
            y: test_y,
            keep_prob: 1.0,
            batch_size: n_batch_size
        })
        print("test acc {},test loss {}".format(test_acc, test_loss))
def add_rnn(layer_count, hidden_size, cell=rnn.BasicLSTMCell, activation=tf.tanh):
    """Return a MultiRNNCell of `layer_count` cells with `hidden_size` units.

    Args:
        layer_count: Number of stacked layers.
        hidden_size: Units per layer (e.g. 5).
        cell: Cell class called as ``cell(hidden_size, activation=...)``.
        activation: Activation passed to every cell.
    """
    stack = []
    for _ in range(layer_count):
        stack.append(cell(hidden_size, activation=activation))
    return rnn.MultiRNNCell(stack)
def __init__(self, params, training=True):
    """Build the stacked-LSTM language-model graph.

    Args:
        params: Hyper-parameter namespace. Reads ``num_layers``,
            ``rnn_size``, ``batch_size``, ``seq_length``, ``vocab_size`` and
            ``grad_clip``. When ``training`` is false, ``batch_size`` and
            ``seq_length`` are overwritten with 1 on the passed-in object.
        training: If false, build the sampling graph, which hands ``loop``
            to the decoder so predictions are fed back as inputs.
    """
    if not training:
        params.batch_size = 1
        params.seq_length = 1

    cells = []
    for _ in range(params.num_layers):
        cell = rnn.BasicLSTMCell(params.rnn_size)
        cells.append(cell)
    self.cell = cell = rnn.MultiRNNCell(cells, state_is_tuple=True)

    self.input_data = tf.placeholder(
        tf.int32, [params.batch_size, params.seq_length])
    self.targets = tf.placeholder(
        tf.int32, [params.batch_size, params.seq_length])
    self.initial_state = cell.zero_state(params.batch_size, tf.float32)

    with tf.variable_scope('lstm_lm'):
        softmax_w = tf.get_variable("softmax_w",
                                    [params.rnn_size, params.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [params.vocab_size])

    embedding = tf.get_variable("embedding",
                                [params.vocab_size, params.rnn_size])
    inputs = tf.nn.embedding_lookup(embedding, self.input_data)

    # One tensor per time step, with the length-1 time axis removed.
    inputs = tf.split(inputs, params.seq_length, 1)
    inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    def loop(prev, _):
        """Return the embedding of the argmax symbol of the previous output."""
        prev = tf.matmul(prev, softmax_w) + softmax_b
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(embedding, prev_symbol)

    outputs, last_state = legacy_seq2seq.rnn_decoder(
        inputs,
        self.initial_state,
        cell,
        loop_function=loop if not training else None,
        scope='lstm_lm')
    output = tf.reshape(tf.concat(outputs, 1), [-1, params.rnn_size])

    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)
    loss = legacy_seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.targets, [-1])],
        [tf.ones([params.batch_size * params.seq_length])])
    # The cost is built once, inside its name scope. The previous code also
    # built an identical copy just before this block whose result was
    # immediately overwritten, leaving unused ops in the graph.
    with tf.name_scope('cost'):
        self.cost = (tf.reduce_sum(loss) /
                     params.batch_size) / params.seq_length
    self.final_state = last_state

    # lr starts at 0.0; it is a non-trainable variable.
    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                      params.grad_clip)
    with tf.name_scope('optimizer'):
        optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))

    tf.summary.histogram('logits', self.logits)
    tf.summary.histogram('loss', loss)
    tf.summary.scalar('train_loss', self.cost)
# Problem dimensions.
numInputs = 2
numOutputs = 1
timesteps = 1
resultSet = []

# Output projection applied to the last time step.
w = tf.Variable(tf.truncated_normal([numHidden2, numOutputs]))
b = tf.Variable(tf.random_normal([numOutputs]))

# LSTM cells; only `lstm` and `lstm2` are stacked below.
lstm = rnn.LSTMCell(numHidden, state_is_tuple=True)
lstm2 = rnn.LSTMCell(numHidden2, state_is_tuple=True)
lstm3 = rnn.LSTMCell(numHidden2, state_is_tuple=True)
cell = rnn.MultiRNNCell([lstm, lstm2])


def LSTM(X):
    """Run the stacked LSTM over X and return tanh(last_output @ w + b)."""
    rnn_out, _final_state = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32)
    # Swap the first two axes so that indexing [-1] selects the last step.
    steps_first = tf.transpose(rnn_out, (1, 0, 2))
    return tf.tanh(tf.matmul(steps_first[-1], w) + b)


X = tf.placeholder(tf.float32, shape=[None, timesteps, numInputs])
Y = tf.placeholder(tf.float32, shape=[None, numOutputs])
def build_stacked_gru_model(
        embedding_layer,
        partial_sequence_length,
        gru_hidden_sizes,
        num_output_features,
        bidirectional):
    """Computes logits from the final states of a stack of GRU layers.

    Args:
        embedding_layer: Float32 tensor with shape
            [batch_size, max_length, num_features]. Input to the model.
        partial_sequence_length: Int32 tensor with shape [batch_size],
            passed as sequence_length to the dynamic RNN.
        gru_hidden_sizes: List of integers, number of units of each GRU
            layer.
        num_output_features: Integer, the number of output features.
        bidirectional: Boolean, whether to run a bidirectional RNN.

    Returns:
        Float tensor with shape [batch_size, num_output_features].
    """
    def new_stack():
        """Builds a MultiRNNCell with one new GRUCell per hidden size."""
        return contrib_rnn.MultiRNNCell(
            [tf.nn.rnn_cell.GRUCell(units) for units in gru_hidden_sizes])

    with tf.variable_scope('stacked_gru_model'):
        forward_stacked_gru = new_stack()
        shared_kwargs = dict(
            inputs=embedding_layer,
            sequence_length=partial_sequence_length,
            dtype=embedding_layer.dtype,
            time_major=False)

        if not bidirectional:
            _, final_states = tf.nn.dynamic_rnn(
                cell=forward_stacked_gru, **shared_kwargs)
        else:
            backward_stacked_gru = new_stack()
            _, (forward_states, backward_states) = (
                tf.nn.bidirectional_dynamic_rnn(
                    cell_fw=forward_stacked_gru,
                    cell_bw=backward_stacked_gru,
                    **shared_kwargs))
            # The two per-direction tuples
            #   (forward_gru_0, forward_gru_1, ...)
            #   (backward_gru_0, backward_gru_1, ...)
            # are joined into one flat tuple
            #   (forward_gru_0, ..., backward_gru_0, ...).
            final_states = forward_states + backward_states

        concat_final_states = tf.concat(
            final_states, axis=1, name='concatenate_gru_final_states')
        logits = tf.layers.dense(
            concat_final_states, num_output_features, name='logits')
    return logits
def get_rnn_cell(hparams=None, mode=None):
    """Builds an RNN cell from hyperparameters.

    Missing hyperparameters are filled in from
    ``default_rnn_cell_hparams()``.

    Args:
        hparams (dict or HParams, optional): Cell hyperparameters. The keys
            read here are "type", "kwargs", "num_layers", "dropout",
            "residual" and "highway".
        mode (optional): Passed through to ``switch_dropout`` together with
            each keep probability.

    Returns:
        A single cell when ``num_layers`` is 1, otherwise a
        ``MultiRNNCell`` stacking the layers.

    Raises:
        ValueError: If variational recurrent dropout is requested and
            ``len(dropout["input_size"])`` differs from ``num_layers``.
        ValueError: If ``num_layers`` > 1 and "type" is a cell instance
            rather than a class or a name.
    """
    if hparams is None or isinstance(hparams, dict):
        hparams = HParams(hparams, default_rnn_cell_hparams())

    dropout_hp = hparams["dropout"]
    variational = dropout_hp["variational_recurrent"]
    if variational and \
            len(dropout_hp["input_size"]) != hparams["num_layers"]:
        raise ValueError(
            "If variational_recurrent=True, input_size must be a list of "
            "num_layers(%d) integers. Got len(input_size)=%d."
            % (hparams["num_layers"], len(dropout_hp["input_size"])))

    cell_kwargs = hparams["kwargs"].todict()
    num_layers = hparams["num_layers"]
    cell_type = hparams["type"]
    cell_modules = ['tensorflow.contrib.rnn', 'texar.custom']
    keep_prob_keys = ("input_keep_prob", "output_keep_prob",
                      "state_keep_prob")

    # A ready-made instance cannot be stacked: every layer needs its own
    # cell object.
    is_instance = not is_str(cell_type) and not isinstance(cell_type, type)
    if is_instance and num_layers > 1:
        raise ValueError(
            "If 'num_layers'>1, then 'type' must be a cell class or "
            "its name/module path, rather than a cell instance.")

    layers = []
    for index in range(num_layers):
        # Base cell.
        layer = utils.check_or_get_instance(
            cell_type, cell_kwargs, cell_modules, rnn.RNNCell)

        # Dropout, only when at least one keep probability is below 1.
        if any(dropout_hp[key] < 1.0 for key in keep_prob_keys):
            extra = {}
            if dropout_hp["variational_recurrent"]:
                extra = {
                    "variational_recurrent": True,
                    "input_size": dropout_hp["input_size"][index],
                    "dtype": tf.float32
                }
            keep_in = switch_dropout(dropout_hp["input_keep_prob"], mode)
            keep_out = switch_dropout(dropout_hp["output_keep_prob"], mode)
            keep_state = switch_dropout(dropout_hp["state_keep_prob"], mode)
            layer = rnn.DropoutWrapper(
                cell=layer,
                input_keep_prob=keep_in,
                output_keep_prob=keep_out,
                state_keep_prob=keep_state,
                **extra)

        # Residual / highway connections are skipped on the first layer.
        if index > 0:
            if hparams["residual"]:
                layer = rnn.ResidualWrapper(layer)
            if hparams["highway"]:
                layer = rnn.HighwayWrapper(layer)

        layers.append(layer)

    if hparams["num_layers"] > 1:
        return rnn.MultiRNNCell(layers)
    return layers[0]
def __init__(self, args, infer=False):
    """Build the graph of the stacked-RNN language model.

    Creates the input/target placeholders, the multi-layer cell, the softmax
    projection, the per-token loss and an Adam training op with global-norm
    gradient clipping.

    Args:
        args: settings object; the attributes read here are `model`,
            `num_layers`, `rnn_size`, `batch_size`, `seq_length`,
            `vocab_size` and `grad_clip`.
        infer: when true, `args.batch_size` and `args.seq_length` are
            overwritten with 1 (the caller's object is mutated) and the
            decoder feeds its own arg-max prediction back as the next input.

    Raises:
        Exception: if `args.model` is not one of 'rnn', 'gru' or 'lstm'.
    """
    self.args = args
    if infer:
        args.batch_size = 1
        args.seq_length = 1

    # Map the model name onto its cell constructor.
    cell_types = {
        'rnn': rnn.BasicRNNCell,
        'gru': rnn.GRUCell,
        'lstm': rnn.BasicLSTMCell,
    }
    cell_fn = cell_types.get(args.model)
    if cell_fn is None:
        raise Exception("model type not supported: {}".format(args.model))

    layers = [cell_fn(args.rnn_size) for _ in range(args.num_layers)]
    cell = rnn.MultiRNNCell(layers)
    self.cell = cell

    batch_shape = [args.batch_size, args.seq_length]
    self.input_data = tf.placeholder(tf.int32, batch_shape)
    self.targets = tf.placeholder(tf.int32, batch_shape)
    self.initial_state = cell.zero_state(args.batch_size, tf.float32)

    # Bookkeeping variables kept inside the graph (none of them trainable).
    self.batch_pointer = tf.Variable(
        0, name="batch_pointer", trainable=False, dtype=tf.int32)
    self.inc_batch_pointer_op = tf.assign(
        self.batch_pointer, self.batch_pointer + 1)
    self.epoch_pointer = tf.Variable(
        0, name="epoch_pointer", trainable=False)
    self.batch_time = tf.Variable(0.0, name="batch_time", trainable=False)
    tf.summary.scalar("time_batch", self.batch_time)

    def summarize(var):
        # Mean / max / min scalar summaries for one tensor.
        with tf.name_scope('summaries'):
            tf.summary.scalar('mean', tf.reduce_mean(var))
            tf.summary.scalar('max', tf.reduce_max(var))
            tf.summary.scalar('min', tf.reduce_min(var))

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable(
            "softmax_w", [args.rnn_size, args.vocab_size])
        summarize(softmax_w)
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
        summarize(softmax_b)
        with tf.device("/cpu:0"):
            embedding = tf.get_variable(
                "embedding", [args.vocab_size, args.rnn_size])
            embedded = tf.nn.embedding_lookup(embedding, self.input_data)
            # One tensor per time step, with the time axis squeezed away.
            inputs = [
                tf.squeeze(step_input, [1])
                for step_input in tf.split(embedded, args.seq_length, 1)
            ]

    def feed_previous(prev, _):
        # Project the previous output to vocabulary logits and embed the
        # arg-max symbol; gradients are stopped at the chosen symbol.
        prev_logits = tf.matmul(prev, softmax_w) + softmax_b
        prev_symbol = tf.stop_gradient(tf.argmax(prev_logits, 1))
        return tf.nn.embedding_lookup(embedding, prev_symbol)

    loop_function = feed_previous if infer else None
    outputs, last_state = legacy_seq2seq.rnn_decoder(
        inputs, self.initial_state, cell,
        loop_function=loop_function, scope='rnnlm')
    flat_output = tf.reshape(tf.concat(outputs, 1), [-1, args.rnn_size])

    self.logits = tf.matmul(flat_output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)
    token_loss = legacy_seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.targets, [-1])],
        [tf.ones([args.batch_size * args.seq_length])],
        args.vocab_size)
    self.cost = tf.reduce_sum(token_loss) / args.batch_size / args.seq_length
    tf.summary.scalar("cost", self.cost)
    self.final_state = last_state

    # The learning rate starts at 0.0 and is not trainable.
    self.lr = tf.Variable(0.0, trainable=False)
    trainable = tf.trainable_variables()
    raw_grads = tf.gradients(self.cost, trainable)
    clipped_grads, _ = tf.clip_by_global_norm(raw_grads, args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(clipped_grads, trainable))
def getLayeredCell(layer_size, num_units, input_keep_prob, output_keep_prob=1.0):
    """Return a `MultiRNNCell` of `layer_size` dropout-wrapped LSTM layers.

    Each layer is a fresh `BasicLSTMCell(num_units)` wrapped in a
    `DropoutWrapper` that receives `input_keep_prob` and `output_keep_prob`.
    """
    layers = []
    for _ in range(layer_size):
        lstm = rnn.BasicLSTMCell(num_units)
        layers.append(
            rnn.DropoutWrapper(lstm, input_keep_prob, output_keep_prob))
    return rnn.MultiRNNCell(layers)
def main(_):
    """Train the character-level GRU language model and log progress.

    Reads the training/validation text from `FLAGS.text_dir`, builds the
    graph (one-hot inputs, `NLAYERS` stacked GRU cells with dropout, linear
    softmax readout, Adam optimizer) and runs the training loop, writing
    TensorBoard summaries, periodic validation statistics, generated sample
    text and checkpoints.

    Args:
        _: unused positional argument (presumably argv supplied by the app
            runner; the name is rebound inside the function).
    """
    # load data, either shakespeare, or the Python source of Tensorflow itself
    shakedir = FLAGS.text_dir
    # shakedir = "../tensorflow/**/*.py"
    codetext, valitext, bookranges = txt.read_data_files(shakedir, validation=True)

    # display some stats on the data
    epoch_size = len(codetext) // (FLAGS.train_batch_size * FLAGS.seqlen)
    txt.print_data_stats(len(codetext), len(valitext), epoch_size)

    #
    # the model (see FAQ in README.md)
    #
    lr = tf.placeholder(tf.float32, name='lr')  # learning rate
    pkeep = tf.placeholder(tf.float32, name='pkeep')  # dropout parameter
    batchsize = tf.placeholder(tf.int32, name='batchsize')

    # inputs
    X = tf.placeholder(tf.uint8, [None, None], name='X')  # [ BATCHSIZE, FLAGS.seqlen ]
    Xo = tf.one_hot(X, ALPHASIZE, 1.0, 0.0)  # [ BATCHSIZE, FLAGS.seqlen, ALPHASIZE ]
    # expected outputs = same sequence shifted by 1 since we are trying to predict the next character
    Y_ = tf.placeholder(tf.uint8, [None, None], name='Y_')  # [ BATCHSIZE, FLAGS.seqlen ]
    Yo_ = tf.one_hot(Y_, ALPHASIZE, 1.0, 0.0)  # [ BATCHSIZE, FLAGS.seqlen, ALPHASIZE ]
    # input state
    Hin = tf.placeholder(tf.float32, [None, INTERNALSIZE * NLAYERS], name='Hin')  # [ BATCHSIZE, INTERNALSIZE * NLAYERS]

    # using NLAYERS layers of GRU cells, unrolled FLAGS.seqlen times
    # dynamic_rnn infers FLAGS.seqlen from the size of the inputs Xo
    #
    # Each layer gets its OWN cell object. Passing the same instance
    # NLAYERS times ([dropcell] * NLAYERS) makes every layer refer to one
    # cell, which newer TF 1.x releases reject or turn into weights shared
    # between layers whose input sizes differ.
    cells = [rnn.GRUCell(INTERNALSIZE) for _ in range(NLAYERS)]
    dropcells = [rnn.DropoutWrapper(cell, input_keep_prob=pkeep) for cell in cells]
    multicell = rnn.MultiRNNCell(dropcells, state_is_tuple=False)
    multicell = rnn.DropoutWrapper(multicell, output_keep_prob=pkeep)
    Yr, H = tf.nn.dynamic_rnn(multicell, Xo, dtype=tf.float32, initial_state=Hin)
    # Yr: [ BATCHSIZE, FLAGS.seqlen, INTERNALSIZE ]
    # H:  [ BATCHSIZE, INTERNALSIZE*NLAYERS ] # this is the last state in the sequence

    H = tf.identity(H, name='H')  # just to give it a name

    # Softmax layer implementation:
    # Flatten the first two dimension of the output [ BATCHSIZE, FLAGS.seqlen, ALPHASIZE ] => [ BATCHSIZE x FLAGS.seqlen, ALPHASIZE ]
    # then apply softmax readout layer. This way, the weights and biases are shared across unrolled time steps.
    # From the readout point of view, a value coming from a cell or a minibatch is the same thing
    Yflat = tf.reshape(Yr, [-1, INTERNALSIZE])  # [ BATCHSIZE x FLAGS.seqlen, INTERNALSIZE ]
    Ylogits = layers.linear(Yflat, ALPHASIZE)  # [ BATCHSIZE x FLAGS.seqlen, ALPHASIZE ]
    Yflat_ = tf.reshape(Yo_, [-1, ALPHASIZE])  # [ BATCHSIZE x FLAGS.seqlen, ALPHASIZE ]
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=Ylogits, labels=Yflat_)  # [ BATCHSIZE x FLAGS.seqlen ]
    loss = tf.reshape(loss, [batchsize, -1])  # [ BATCHSIZE, FLAGS.seqlen ]
    Yo = tf.nn.softmax(Ylogits, name='Yo')  # [ BATCHSIZE x FLAGS.seqlen, ALPHASIZE ]
    Y = tf.argmax(Yo, 1)  # [ BATCHSIZE x FLAGS.seqlen ]
    Y = tf.reshape(Y, [batchsize, -1], name="Y")  # [ BATCHSIZE, FLAGS.seqlen ]
    train_step = tf.train.AdamOptimizer(lr).minimize(loss)

    # stats for display
    seqloss = tf.reduce_mean(loss, 1)
    batchloss = tf.reduce_mean(seqloss)
    accuracy = tf.reduce_mean(tf.cast(tf.equal(Y_, tf.cast(Y, tf.uint8)), tf.float32))
    loss_summary = tf.summary.scalar("batch_loss", batchloss)
    acc_summary = tf.summary.scalar("batch_accuracy", accuracy)
    summaries = tf.summary.merge([loss_summary, acc_summary])

    # Init Tensorboard stuff. This will save Tensorboard information into a different
    # folder at each run named '<summaries_dir>/<timestamp>-training' and
    # '<summaries_dir>/<timestamp>-validation'. Two sets of data are saved so that
    # you can compare training and validation curves visually in Tensorboard.
    timestamp = str(math.trunc(time.time()))
    summary_writer = tf.summary.FileWriter(os.path.join(FLAGS.summaries_dir, timestamp + "-training"))
    validation_writer = tf.summary.FileWriter(os.path.join(FLAGS.summaries_dir, timestamp + "-validation"))

    # Init for saving models. They will be saved into FLAGS.checkpoint_dir.
    # Only the last checkpoint is kept.
    if not os.path.exists(FLAGS.checkpoint_dir):
        # makedirs (not mkdir) so that a nested checkpoint path also works
        os.makedirs(FLAGS.checkpoint_dir)
    saver = tf.train.Saver(max_to_keep=1)

    # for display: init the progress bar
    DISPLAY_FREQ = 50
    # `step` counts characters, so "50 batches" is expressed in characters too
    _50_BATCHES = DISPLAY_FREQ * FLAGS.train_batch_size * FLAGS.seqlen
    progress = txt.Progress(DISPLAY_FREQ, size=111 + 2, msg="Training on next " + str(DISPLAY_FREQ) + " batches")

    # init
    istate = np.zeros([FLAGS.train_batch_size, INTERNALSIZE * NLAYERS])  # initial zero input state
    init = tf.global_variables_initializer()
    sess = tf.Session()
    sess.run(init)
    step = 0

    # training loop
    for x, y_, epoch in txt.rnn_minibatch_sequencer(codetext, FLAGS.train_batch_size, FLAGS.seqlen, nb_epochs=1000):

        # train on one minibatch
        feed_dict = {
            X: x,
            Y_: y_,
            Hin: istate,
            lr: FLAGS.learning_rate,
            pkeep: FLAGS.dropout_pkeep,
            batchsize: FLAGS.train_batch_size
        }
        _, y, ostate, smm = sess.run([train_step, Y, H, summaries], feed_dict=feed_dict)

        # save training data for Tensorboard
        summary_writer.add_summary(smm, step)

        # display a visual validation of progress (every 50 batches)
        if step % _50_BATCHES == 0:
            feed_dict = {
                X: x,
                Y_: y_,
                Hin: istate,
                pkeep: 1.0,
                batchsize: FLAGS.train_batch_size
            }  # no dropout for validation
            y, l, bl, acc = sess.run([Y, seqloss, batchloss, accuracy], feed_dict=feed_dict)
            txt.print_learning_learned_comparison(x, y, l, bookranges, bl, acc, epoch_size, step, epoch)

        # run a validation step every 50 batches
        # The validation text should be a single sequence but that's too slow (1s per 1024 chars!),
        # so we cut it up and batch the pieces (slightly inaccurate)
        # tested: validating with 5K sequences instead of 1K is only slightly more accurate, but a lot slower.
        if step % _50_BATCHES == 0 and len(valitext) > 0:
            VALI_SEQLEN = 1 * 1024  # Sequence length for validation. State will be wrong at the start of each sequence.
            bsize = len(valitext) // VALI_SEQLEN
            txt.print_validation_header(len(codetext), bookranges)
            vali_x, vali_y, _ = next(txt.rnn_minibatch_sequencer(valitext, bsize, VALI_SEQLEN, 1))  # all data in 1 batch
            vali_nullstate = np.zeros([bsize, INTERNALSIZE * NLAYERS])
            feed_dict = {
                X: vali_x,
                Y_: vali_y,
                Hin: vali_nullstate,
                pkeep: 1.0,  # no dropout for validation
                batchsize: bsize
            }
            ls, acc, smm = sess.run([batchloss, accuracy, summaries], feed_dict=feed_dict)
            txt.print_validation_stats(ls, acc)
            # save validation data for Tensorboard
            validation_writer.add_summary(smm, step)

        # display a short text generated with the current weights and biases (every 150 batches)
        if step // 3 % _50_BATCHES == 0:
            txt.print_text_generation_header()
            ry = np.array([[txt.convert_from_alphabet(ord("K"))]])
            rh = np.zeros([1, INTERNALSIZE * NLAYERS])
            for _ in range(1000):
                ryo, rh = sess.run([Yo, H], feed_dict={X: ry, pkeep: 1.0, Hin: rh, batchsize: 1})
                rc = txt.sample_from_probabilities(ryo, topn=10 if epoch <= 1 else 2)
                print(chr(txt.convert_to_alphabet(rc)), end="")
                ry = np.array([[rc]])
            txt.print_text_generation_footer()

        # save a checkpoint (every 500 batches)
        if step // 10 % _50_BATCHES == 0:
            saver.save(sess, FLAGS.checkpoint_dir + '/rnn_train_' + timestamp, global_step=step)

        # display progress bar
        progress.step(reset=step % _50_BATCHES == 0)

        # loop state around
        istate = ostate
        step += FLAGS.train_batch_size * FLAGS.seqlen
def build_graph(self):
    """Assemble the graph of the bidirectional-GRU encoder / GRU decoder.

    Reads its settings from `self.trainingManager.configs` and stores the
    placeholders, encoder/decoder tensors, loss, train step and accuracy on
    `self`. Summaries are only created when
    `self.trainingManager.is_local_env` is false. All sequence tensors are
    time-major (`time_major=True` is passed to both RNN calls).
    """
    configs = self.trainingManager.configs

    # Dropout keep probability, shared between train and test.
    self.keep_prop_tf = tf.placeholder(dtype=tf.float32, name="keep_prop_tf")

    def stacked_gru(num_units):
        # `configs.stacked_layers` GRU layers, each with output dropout.
        layers = []
        for _ in range(configs.stacked_layers):
            gru = tf.contrib.rnn.GRUCell(num_units)
            layers.append(tf.contrib.rnn.DropoutWrapper(
                gru, output_keep_prob=self.keep_prop_tf))
        return rnn.MultiRNNCell(layers, state_is_tuple=True)

    encoder_multi_cell_fw = stacked_gru(configs.internal_state_encoder)
    encoder_multi_cell_bw = stacked_gru(configs.internal_state_encoder)
    decoder_multi_cell = stacked_gru(configs.internal_state_decoder)

    with tf.variable_scope('train'):
        # Input placeholders.
        # encoder_inputs: [en_seq_len, batch]
        self.encoder_inputs = tf.placeholder(
            shape=(None, None), dtype=tf.int32, name='encoder_inputs')
        # encoder_inputs_length: [batch]
        self.encoder_inputs_length = tf.placeholder(
            shape=(None,), dtype=tf.int32, name='encoder_inputs_length')
        # decoder_inputs: [de_seq_len, batch], starts with KOKO_START token
        self.decoder_inputs = tf.placeholder(
            shape=(None, None), dtype=tf.int32, name='decoder_inputs')
        # decoder_inputs_length: [batch]; IMPORTANT: counts the start token
        self.decoder_inputs_length = tf.placeholder(
            shape=(None,), dtype=tf.int32, name='decoder_inputs_length')
        # decoder_outputs: [de_seq_len, batch], ground truth zero-padded at
        # the end
        self.decoder_outputs = tf.placeholder(
            shape=(None, None), dtype=tf.int32, name='decoder_outputs')

        # Token representation: learned embeddings or one-hot vectors.
        if configs.use_embedding:
            encoder_embeddings = tf.Variable(
                tf.random_uniform(
                    [configs.vocabulary_size_encoder,
                     configs.encoder_embedding_size], -1.0, 1.0),
                dtype=tf.float32)
            # [sequence_length, batch_size, encoder_embedding_size]
            encoder_inputs_to_rnn = tf.nn.embedding_lookup(
                encoder_embeddings, self.encoder_inputs)
            decoder_embeddings = tf.Variable(
                tf.random_uniform(
                    [configs.vocabulary_size_decoder,
                     configs.decoder_embedding_size], -1.0, 1.0),
                dtype=tf.float32)
            # [sequence_length, batch_size, decoder_embedding_size]
            decoder_inputs_to_rnn = tf.nn.embedding_lookup(
                decoder_embeddings, self.decoder_inputs)
        else:
            # [sequence_length, batch_size, vocabulary_size]
            encoder_inputs_to_rnn = tf.one_hot(
                self.encoder_inputs, configs.vocabulary_size_encoder, 1.0, 0.0)
            decoder_inputs_to_rnn = tf.one_hot(
                self.decoder_inputs, configs.vocabulary_size_decoder, 1.0, 0.0)

        # Bidirectional encoder.
        # outputs: [sequence_length, batch_size, internal_state_encoder]
        # final states: one [batch_size, internal_state_encoder] per layer
        encoder_outputs, encoder_final_states = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=encoder_multi_cell_fw,
            cell_bw=encoder_multi_cell_bw,
            inputs=encoder_inputs_to_rnn,
            sequence_length=self.encoder_inputs_length,
            dtype=tf.float32,
            time_major=True)
        self.encoder_fw_outputs, self.encoder_bw_outputs = encoder_outputs
        encoder_fw_final_state, encoder_bw_final_state = encoder_final_states

        # "Thought vector": per layer, forward and backward final states
        # concatenated along the feature axis. It is re-fed at inference.
        self.decoder_init_state_from_encoder = tuple(
            tf.concat((encoder_fw_final_state[layer],
                       encoder_bw_final_state[layer]), axis=1)
            for layer in range(configs.stacked_layers))

        # Decoder.
        # states outputs: [sequence_length, batch_size, internal_state_decoder]
        # final state: one [batch_size, internal_state_decoder] per layer
        self.decoder_states_outputs, self.decoder_final_state = tf.nn.dynamic_rnn(
            decoder_multi_cell,
            inputs=decoder_inputs_to_rnn,
            initial_state=self.decoder_init_state_from_encoder,
            time_major=True,
            sequence_length=self.decoder_inputs_length)

        # Projection onto the decoder vocabulary:
        # [sequence_length, batch_size, vocabulary_size_decoder]
        decoder_logits = tf.layers.dense(
            self.decoder_states_outputs,
            units=configs.vocabulary_size_decoder,
            use_bias=True)
        self.dec_probabilities = tf.nn.softmax(decoder_logits)

        # Sequence-length mask built from a lower-triangular matrix of ones
        # (sized by the DECODER max length): row `length - 1` has ones in
        # its first `length` columns, e.g. [[1. 0.], [1. 1.]].
        max_len = configs.max_seq_len_decoder
        lower_triangular_ones = tf.constant(
            np.tril(np.ones([max_len, max_len])), dtype=tf.float32)
        _, batch_size_tf = tf.unstack(tf.shape(self.encoder_inputs))
        mask_rows = tf.gather(
            lower_triangular_ones, self.decoder_inputs_length - 1)
        longest = tf.reduce_max(self.decoder_inputs_length)
        batch_major_mask = tf.slice(
            mask_rows, begin=[0, 0], size=[batch_size_tf, longest])
        seqlen_mask = tf.transpose(batch_major_mask)  # [sequence_length, batch_size]

        with tf.name_scope("optimization"):
            # Masked sequence loss (sparse softmax cross entropy).
            self.loss = tf.contrib.seq2seq.sequence_loss(
                decoder_logits, self.decoder_outputs, seqlen_mask)
            self.train_step = tf.train.RMSPropOptimizer(
                configs.learning_rate).minimize(self.loss)

        # Accuracy over non-padded positions only.
        predicted = tf.cast(tf.argmax(decoder_logits, 2), tf.int32)
        hits = tf.cast(
            tf.equal(predicted, self.decoder_outputs), dtype=tf.float32)
        correct = hits * seqlen_mask
        self.accuracy = tf.reduce_sum(correct) / tf.reduce_sum(seqlen_mask)

        # Summary tensors are skipped in the local environment.
        if not self.trainingManager.is_local_env:
            loss_summary = tf.summary.scalar("batch_loss", self.loss)
            acc_summary = tf.summary.scalar("batch_accuracy", self.accuracy)
            self.summaries = tf.summary.merge([loss_summary, acc_summary])
# One-hot row for the last of four symbols; h, e and l are defined above
# this fragment (presumably the other three one-hot rows - TODO confirm).
o = [0, 0, 0, 1]
# Batch of 3 sequences, 5 steps each, built from the one-hot rows.
x_data = np.array([[h, e, l, l, o], [e, o, l, l, l], [l, l, e, e, l]], dtype=np.float32)

# Disabled single-cell experiment, kept for reference:
# with tf.variable_scope('initial_state') as scope:
#     batch_size = 3
#     pp.pprint(x_data)
#
#     # One cell RNN input_dim (4) -> output_dim (5). sequence: 5, batch: 3
#     hidden_size = 2
#     cell = rnn.BasicLSTMCell(num_units=hidden_size, state_is_tuple=True)
#     initial_state = cell.zero_state(batch_size, tf.float32)
#     outputs, _states = tf.nn.dynamic_rnn(cell, x_data,
#                                          initial_state=initial_state, dtype=tf.float32)
#     sess.run(tf.global_variables_initializer())
#     pp.pprint(outputs.eval())

# Three stacked LSTM layers of 5 units each, run over x_data.
with tf.variable_scope('MultiRNNCell') as scope:
    # Make rnn
    # cell = rnn.BasicLSTMCell(num_units=5, state_is_tuple=True)
    def lstm_cell():
        # A fresh cell per call, so every layer gets its own instance.
        cell = rnn.BasicLSTMCell(5, state_is_tuple=True)
        return cell

    cells = rnn.MultiRNNCell([lstm_cell() for _ in range(3)], state_is_tuple=True)  # 3 layers

    # print(x_data)
    # rnn in/out
    outputs, _states = tf.nn.dynamic_rnn(cells, x_data, dtype=tf.float32)
    print("dynamic rnn: ", outputs)
    # `sess` and `pp` come from outside this fragment.
    sess.run(tf.global_variables_initializer())
    pp.pprint(outputs.eval())  # batch size, unrolling (time), hidden_size
def __init__(self, params):
    """Build the RNN classifier graph described by ``params``.

    :param params: dict of settings. Keys read here: ``n_steps``,
        ``n_input``, ``n_units``, ``n_classes``, ``batch_size``,
        ``n_layers``, ``cell_type``, ``learning_rate`` and, via ``.get``,
        the optional ``n_weights``.
    """
    self.params = params
    n_steps = params["n_steps"]
    n_input = params["n_input"]
    n_units = params["n_units"]
    n_classes = params["n_classes"]
    batch_size = params["batch_size"]

    # Example of a params dict:
    # "n_steps": 128,
    # "n_input": 128,
    # "n_units": 128,
    # "n_classes": 6,
    # "batch_size": 100,
    # "n_epochs": 50,
    # "learning_rate": 0.0003,
    # "display_step": 1,
    # "run_mode": "/cpu:0",
    # "split_png_data": "/Users/jw/Desktop/audio_data/1484131952_256_0.5/split_png_data/CASIA"

    # Start from an empty default graph; anything built earlier is dropped.
    tf.reset_default_graph()
    with tf.get_default_graph().as_default():
        with tf.name_scope("placeholder"):
            # Flat input, reshaped to [batch, n_steps, n_input] for the RNN.
            self.x = tf.placeholder("float", [None, n_steps * n_input], name="x")
            self.input = tf.reshape(self.x, [-1, n_steps, n_input])
            self.y = tf.placeholder("float", [None, n_classes], name="y")
            self.keep_prob = tf.placeholder(tf.float32)

        with tf.variable_scope("softmax"):
            # Output projection from the last RNN output to class scores.
            weights = tf.Variable(tf.random_normal([n_units, n_classes]), name='weights')
            biases = tf.Variable(tf.random_normal([n_classes]), name='biases')

        # x = tf.transpose(self.x, [1, 0, 2])
        # x = tf.reshape(x, [-1, n_input])
        # x = tf.split(0, n_steps, x)

        # Every sequence is declared to be exactly n_steps long.
        # NOTE(review): this array has a fixed length of batch_size while the
        # placeholders accept any batch size - confirm callers always feed
        # exactly batch_size rows.
        sequence_length = np.zeros([batch_size], dtype=int)
        sequence_length += n_steps

        state_size = self.params["n_units"]
        num_layers = self.params["n_layers"]
        cell_type = self.params["cell_type"]
        num_weights_for_custom_cell = self.params.get("n_weights")

        if cell_type == 'Custom':
            # NOTE(review): the CustomCell built here is overwritten by the
            # LSTM stack on the next statement, so 'Custom' currently ends up
            # with the same cell as 'LSTM' - confirm whether that is intended.
            cell = CustomCell(state_size, num_weights_for_custom_cell)
            cell = rnn.MultiRNNCell([
                rnn.DropoutWrapper(rnn.LSTMCell(state_size, state_is_tuple=True),
                                   input_keep_prob=self.keep_prob)
                for _ in range(num_layers)
            ])
        elif cell_type == 'GRU':
            cell = rnn.GRUCell(state_size)
        elif cell_type == 'LSTM':
            cell = rnn.MultiRNNCell([
                rnn.DropoutWrapper(rnn.LSTMCell(state_size, state_is_tuple=True),
                                   input_keep_prob=self.keep_prob)
                for _ in range(num_layers)
            ])
        elif cell_type == 'LN_LSTM':
            cell = LayerNormalizedLSTMCell(state_size)
        else:
            # Any other value falls back to a basic RNN cell.
            cell = rnn.BasicRNNCell(state_size)

        # Output dropout is applied on top of whichever cell was selected.
        cell = rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob)

        self.init_state = cell.zero_state(batch_size, dtype=tf.float32)
        outputs, self.final_state = tf.nn.dynamic_rnn(
            cell,
            self.input,
            dtype=tf.float32,
            initial_state=self.init_state,
            sequence_length=sequence_length)

        # outputs' shape is [batch_size, time_step, state_size]; transpose to
        # time-major so outputs[-1] is the last time step of every sequence.
        outputs = tf.transpose(outputs, [1, 0, 2])
        pred = tf.matmul(outputs[-1], weights) + biases

        self.cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=self.y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate=params['learning_rate']) \
            .minimize(self.cost)

        # Accuracy: fraction of rows whose arg-max class matches the label.
        correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(self.y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

        tf.summary.scalar("cost", self.cost)
        tf.summary.scalar("accuracy", self.accuracy)
        self.merge_summary_op = tf.summary.merge_all()
        # The log message reads "model construction finished".
        logger.info("模型构建完毕")
# View the input as sequences of 28 steps with 28 features each.
# _X, class_num, hidden_size, batch_size and lr come from outside this
# fragment.
X = tf.reshape(_X, [-1, 28, 28])
y = tf.placeholder(tf.float32, [None, class_num])
keep_prob = tf.placeholder(tf.float32)


def unit_lstm():
    """Return one LSTM layer with output dropout driven by `keep_prob`."""
    lstm_cell = rnn.BasicLSTMCell(num_units=hidden_size, forget_bias=1.0, state_is_tuple=True)
    lstm_cell = rnn.DropoutWrapper(cell=lstm_cell, input_keep_prob=1.0, output_keep_prob=keep_prob)
    return lstm_cell


# Three stacked layers, each built by its own unit_lstm() call.
mlstm_cell = rnn.MultiRNNCell([unit_lstm() for i in range(3)], state_is_tuple=True)
init_state = mlstm_cell.zero_state(batch_size, dtype=tf.float32)
outputs, state = tf.nn.dynamic_rnn(mlstm_cell, inputs=X, initial_state=init_state, time_major=False)
# Batch-major outputs: keep only the last time step of every sequence.
h_state = outputs[:, -1, :]

# Softmax readout from the last hidden output to the class scores.
W = tf.Variable(tf.truncated_normal([hidden_size, class_num], stddev=0.1), dtype=tf.float32)
bias = tf.Variable(tf.constant(0.1, shape=[class_num]), dtype=tf.float32)
y_pre = tf.nn.softmax(tf.matmul(h_state, W) + bias)

# Hand-written cross entropy, averaged over every element of y * log(y_pre).
# NOTE(review): tf.log of a softmax output is not guarded against a
# probability of exactly 0 - consider a clipped or logits-based loss.
cross_entropy = -tf.reduce_mean(y * tf.log(y_pre))
train_op = tf.train.AdamOptimizer(lr).minimize(cross_entropy)
# Model / training settings. hidden_num and layer_num are used below but
# defined outside this fragment.
time_step = 1
input_num = 8
output_num = 1
epoch_num = 50
batch_size = 72
# NOTE(review): learning_rate is not passed to AdamOptimizer() below, which
# is constructed with no arguments in this fragment.
learning_rate = 0.001

with tf.device('/gpu:0'):
    x = tf.placeholder("float", [None, time_step, input_num])
    y = tf.placeholder("float", [None, output_num])

    def lstm_cell():
        # The local variable shadows the function name inside the body only.
        lstm_cell = rnn.BasicLSTMCell(hidden_num)
        return lstm_cell

    with tf.variable_scope("lstm", reuse=None):
        # One fresh LSTM cell per layer.
        Multi_cell = rnn.MultiRNNCell([lstm_cell() for _ in range(layer_num)], state_is_tuple=True)
        outputs, _ = tf.nn.dynamic_rnn(Multi_cell, x, dtype=tf.float32)
    # Regress from the last time step's output to output_num values.
    prediction = tf.layers.dense(inputs=outputs[:, -1, :], units=output_num)
    # Mean absolute error.
    loss = tf.reduce_mean(tf.abs(prediction - y))
    train_step = tf.train.AdamOptimizer().minimize(loss)

init = tf.global_variables_initializer()
# allow_soft_placement is enabled together with device-placement logging.
with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)) as sess:
    sess.run(init)
    # Accumulators; how they are used is outside this fragment.
    loss_epoch = [0] * epoch_num
    xx = 0
    n = 0
    pre = []
    rmse = 0
    start_train = time.time()
#'cnnoutscale': tf.Variable(tf.ones([2048])), #'featbeta': tf.Variable(tf.zeros([4096])), #'featscale': tf.Variable(tf.ones([4096])), #'gbeta': tf.Variable(tf.zeros([1000])), #'gscale': tf.Variable(tf.ones([1000])) } # question-embedding #embed_ques_W = tf.Variable(tf.random_uniform([vocabulary_size, input_embedding_size], -0.08, 0.08), name='embed_ques_W') # encoder: RNN body lstm_1 = rnn_cell.LSTMCell(rnn_size, input_embedding_size, use_peepholes=True, state_is_tuple=False) lstm_dropout_1 = rnn_cell.DropoutWrapper(lstm_1, output_keep_prob = 1 - dropout_rate) lstm_2 = rnn_cell.LSTMCell(rnn_size, rnn_size, use_peepholes=True, state_is_tuple=False) lstm_dropout_2 = rnn_cell.DropoutWrapper(lstm_2, output_keep_prob = 1 - dropout_rate) stacked_lstm = rnn_cell.MultiRNNCell([lstm_dropout_1, lstm_dropout_2], state_is_tuple=False) image = tf.placeholder(tf.float32, [batch_size, 2048]) question = tf.placeholder(tf.int32, [batch_size, max_words_q]) #answers_true = tf.placeholder(tf.float32, (batch_size, 1000)) #noise = tf.placeholder(tf.float32, [batch_size, 4096]) #answers_false = tf.placeholder(tf.float32, (None, 1000)) #image_false = tf.placeholder(tf.float32, (None, 2048)) #question_false = tf.placeholder(tf.int32, [batch_size, max_words_q]) #state = tf.zeros([batch_size, stacked_lstm.state_size]) state = stacked_lstm.zero_state(batch_size, tf.float32) loss = 0.0
def __init__(self, num_emb, batch_size, emb_dim, hidden_dim,
             sequence_length, start_token, learning_rate=0.01,
             reward_gamma=0.95):
    """Build the generator graph: sampling, supervised pretraining and
    reward-weighted training.

    Args:
        num_emb: vocabulary size (rows of the embedding matrix, depth of the
            one-hot targets).
        batch_size: fixed batch size used for placeholders and states.
        emb_dim: embedding width.
        hidden_dim: hidden width of the recurrent state.
        sequence_length: number of unrolled steps.
        start_token: token id fed at step 0 for every batch row.
        learning_rate: initial value of the non-trainable learning-rate
            variable.
        reward_gamma: stored on the instance; not used in this method.
    """
    self.num_emb = num_emb
    self.batch_size = batch_size
    self.emb_dim = emb_dim
    self.hidden_dim = hidden_dim
    self.sequence_length = sequence_length
    self.start_token = tf.constant([start_token] * self.batch_size, dtype=tf.int32)
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    self.reward_gamma = reward_gamma
    self.temperature = 1.0
    self.grad_clip = 5.0

    self.expected_reward = tf.Variable(tf.zeros([self.sequence_length]))

    with tf.variable_scope('generator') as scope:
        self.g_embeddings = tf.Variable(self.init_matrix([self.num_emb, self.emb_dim]))
        self.g_recurrent_unit = self.create_recurrent_unit()  # maps h_tm1 to h_t for generator
        self.g_output_unit = self.create_output_unit()  # maps h_t to o_t (output token logits)

        # placeholder definition
        # sequence of indices of true data, not including start token
        self.x = tf.placeholder(tf.int32, shape=[self.batch_size, self.sequence_length])
        # get from rollout policy and discriminator
        self.rewards = tf.placeholder(tf.float32, shape=[self.batch_size, self.sequence_length])

        # processed for batch
        with tf.device("/cpu:0"):
            inputs = tf.split(axis=1,
                              num_or_size_splits=self.sequence_length,
                              value=tf.nn.embedding_lookup(self.g_embeddings, self.x))
            # seq_length x batch_size x emb_dim
            self.processed_x = tf.stack([tf.squeeze(input_, [1]) for input_ in inputs])

        cell = rnn.BasicLSTMCell(self.hidden_dim, state_is_tuple=True)
        self.cell = rnn.MultiRNNCell([cell] * 2, state_is_tuple=True)
        # NOTE: the first two assignments are immediately superseded; only
        # the zero_state value is the one actually used below.
        self.h0 = tf.zeros([self.batch_size, self.hidden_dim])
        self.h0 = tf.stack([self.h0, self.h0])
        self.h0 = self.cell.zero_state(self.batch_size, tf.float32)

        gen_o = tensor_array_ops.TensorArray(dtype=tf.float32,
                                             size=self.sequence_length,
                                             dynamic_size=False,
                                             infer_shape=True)
        gen_x = tensor_array_ops.TensorArray(dtype=tf.int32,
                                             size=self.sequence_length,
                                             dynamic_size=False,
                                             infer_shape=True)

        def _g_recurrence(i, x_t, h_tm1, gen_o, gen_x):
            # One sampling step: advance the state, sample the next token
            # and record it together with its probability at index i.
            h_t = self.g_recurrent_unit(x_t, h_tm1)  # hidden_memory_tuple
            o_t = self.g_output_unit(h_t)  # batch x vocab , logits not prob
            log_prob = tf.log(tf.nn.softmax(o_t))
            next_token = tf.cast(
                tf.reshape(tf.multinomial(log_prob, 1), [self.batch_size]),
                tf.int32)
            x_tp1 = tf.nn.embedding_lookup(self.g_embeddings, next_token)  # batch x emb_dim
            gen_o = gen_o.write(
                i,
                tf.reduce_sum(
                    tf.multiply(
                        tf.one_hot(next_token, self.num_emb, 1.0, 0.0),
                        tf.nn.softmax(o_t)), 1))  # [batch_size] , prob
            gen_x = gen_x.write(i, next_token)  # indices, batch_size
            return x_tp1, h_t, gen_o, gen_x

        # Sampling loop, unrolled in Python.
        initial_state = (tf.zeros([self.batch_size, self.hidden_dim]), self.h0)
        x_t = tf.nn.embedding_lookup(self.g_embeddings, self.start_token)
        h_t = initial_state
        for i in range(self.sequence_length):
            if i > 0:
                scope.reuse_variables()
            x_t, h_t, gen_o, gen_x = _g_recurrence(i, x_t, h_t, gen_o, gen_x)

        self.gen_o, self.gen_x = gen_o, gen_x
        self.gen_x = self.gen_x.stack()  # seq_length x batch_size
        self.gen_x = tf.transpose(self.gen_x, perm=[1, 0])  # batch_size x seq_length

        # supervised pretraining for generator
        g_predictions = tensor_array_ops.TensorArray(
            dtype=tf.float32,
            size=self.sequence_length,
            dynamic_size=False,
            infer_shape=True)
        g_logits = tensor_array_ops.TensorArray(dtype=tf.float32,
                                                size=self.sequence_length,
                                                dynamic_size=False,
                                                infer_shape=True)

        ta_emb_x = tensor_array_ops.TensorArray(dtype=tf.float32,
                                                size=self.sequence_length)
        ta_emb_x = ta_emb_x.unstack(self.processed_x)

        def _pretrain_recurrence(i, x_t, h_tm1, g_predictions, g_logits):
            # One teacher-forced step: record the prediction for position i
            # and feed the embedded ground-truth token i as the next input.
            h_t = self.g_recurrent_unit(x_t, h_tm1)
            o_t = self.g_output_unit(h_t)
            g_predictions = g_predictions.write(i, tf.nn.softmax(o_t))  # batch x vocab_size
            g_logits = g_logits.write(i, o_t)  # batch x vocab_size
            x_tp1 = ta_emb_x.read(i)
            return x_tp1, h_t, g_predictions, g_logits

        # Pretraining loop. The state must restart from the fresh initial
        # state: binding it to a differently named variable would let the
        # loop silently continue from the last state of the sampling loop.
        initial_state = (tf.zeros([self.batch_size, self.hidden_dim]), self.h0)
        x_t = tf.nn.embedding_lookup(self.g_embeddings, self.start_token)
        h_t = initial_state
        for i in range(self.sequence_length):
            if i > 0:
                scope.reuse_variables()
            x_t, h_t, g_predictions, g_logits = _pretrain_recurrence(
                i, x_t, h_t, g_predictions, g_logits)

        self.g_predictions, self.g_logits = g_predictions, g_logits

        # batch_size x seq_length x vocab_size
        self.g_predictions = tf.transpose(self.g_predictions.stack(), perm=[1, 0, 2])
        # batch_size x seq_length x vocab_size
        self.g_logits = tf.transpose(self.g_logits.stack(), perm=[1, 0, 2])

        # pretraining loss: mean negative log-likelihood of the true tokens,
        # with probabilities clipped to [1e-20, 1.0] before the log
        self.pretrain_loss = -tf.reduce_sum(
            tf.one_hot(tf.to_int32(tf.reshape(self.x, [-1])), self.num_emb, 1.0, 0.0)
            * tf.log(
                tf.clip_by_value(
                    tf.reshape(self.g_predictions, [-1, self.num_emb]),
                    1e-20, 1.0))) / (self.sequence_length * self.batch_size)

        # training updates
        pretrain_opt = self.g_optimizer(self.learning_rate)

        tvars = tf.trainable_variables()
        # only variables whose name contains 'generator' are updated
        g_params = [var for var in tvars if 'generator' in var.name]
        self.pretrain_grad, _ = tf.clip_by_global_norm(
            tf.gradients(self.pretrain_loss, g_params), self.grad_clip)
        self.pretrain_updates = pretrain_opt.apply_gradients(
            zip(self.pretrain_grad, g_params))

        #######################################################################################################
        #  Unsupervised Training
        #######################################################################################################
        # per-token log-likelihood of self.x weighted by self.rewards
        self.g_loss = -tf.reduce_sum(
            tf.reduce_sum(
                tf.one_hot(tf.to_int32(tf.reshape(self.x, [-1])), self.num_emb, 1.0, 0.0)
                * tf.log(
                    tf.clip_by_value(
                        tf.reshape(self.g_predictions, [-1, self.num_emb]),
                        1e-20, 1.0)), 1) * tf.reshape(self.rewards, [-1]))

        g_opt = self.g_optimizer(self.learning_rate)

        self.g_grad, _ = tf.clip_by_global_norm(
            tf.gradients(self.g_loss, g_params), self.grad_clip)
        self.g_updates = g_opt.apply_gradients(zip(self.g_grad, g_params))
def __init__(self,
             encoder_inputs,
             encoder_lengths,
             encoder_inputs_2,
             encoder_lengths_2,
             decoder_inputs,
             decoder_lengths,
             _embed_ph,
             learn_rate,
             start_token,
             if_test=False,
             temperature=None,
             end_rate=1):
    """Build the generator graph: encoder, teacher-forced decoder, sampler.

    Three sub-graphs are created under the 'generator' variable scope:

    1. An LSTM encoder (plus a second encoder over the decoder tokens when
       ``param.Use_VAE`` is set).
    2. A teacher-forced decoder producing ``self.logits``, ``self.probs``,
       ``self.loss`` and ``self.original_loss``. It is either a
       RelationalMemory loop (``Use_relational_memory``) or a
       dropout-wrapped multi-layer LSTM.
    3. A free-running sampler that copies the first
       ``param.start_length + 1`` tokens of ``decoder_inputs`` and then
       generates tokens with Gumbel noise, producing ``self.gen_o``,
       ``self.gen_x``, ``self.gen_x_onehot_adv`` and ``self.gen_x_length``.

    Args:
        encoder_inputs: Fed directly to ``tf.nn.dynamic_rnn`` as ``inputs``
            (presumably already-embedded float vectors -- TODO confirm
            against the caller).
        encoder_lengths: ``sequence_length`` for the first encoder.
        encoder_inputs_2: Only stored on ``self`` when ``param.Use_VAE``.
        encoder_lengths_2: Only stored on ``self`` when ``param.Use_VAE``.
        decoder_inputs: Integer token ids indexed as ``[batch, time]``.
            ``[:, :-1]`` is the decoder input and ``[:, 1:]`` the labels.
        decoder_lengths: ``sequence_length`` for the decoder and for the
            second (VAE) encoder.
        _embed_ph: Tensor assigned into the embedding matrix by running
            ``self._embed_init``.
        learn_rate: Stored as ``self.learn_rate``; not used in this method.
        start_token: Token id repeated ``BATCH_SIZE`` times into
            ``self.start_tokens``.
        if_test: When true, logits are scaled by 1.2 before the Gumbel
            noise is added in the sampler.
        temperature: Factor multiplied into the Gumbel-perturbed logits
            before the softmax stored in ``self.gen_x_onehot_adv``.
            NOTE(review): it is used unconditionally, so the default
            ``None`` looks unusable -- confirm callers always pass a value.
        end_rate: In the relational-memory sampler, the logit of token
            id 2 is multiplied by this value (other logits by 1).
    """
    self.start_tokens = tf.constant([start_token] * BATCH_SIZE,
                                    dtype=tf.int32)
    self.learn_rate = learn_rate
    # NOTE(review): this keep probability is fixed and is applied in every
    # branch below, including when if_test is true.
    keep_prob = 0.8

    with tf.variable_scope('generator',
                           initializer=tf.orthogonal_initializer()):
        with tf.variable_scope('embedding'):
            self.embedding = tf.get_variable(
                name='embedding',
                shape=[param.VOCAB_SIZE, INPUT_DIM],
                trainable=True)
            self._embed_ph = _embed_ph
            self._embed_init = self.embedding.assign(self._embed_ph)

        self.encoder_inputs = encoder_inputs
        self.encoder_lengths = encoder_lengths
        if param.Use_VAE:
            self.encoder_inputs_2 = encoder_inputs_2
            self.encoder_lengths_2 = encoder_lengths_2
        self.decoder_lengths = decoder_lengths
        # Drop the last token: the decoder is fed tokens 0..T-2 and is
        # scored against tokens 1..T-1 (see self.labels below).
        self.decoder_inputs = decoder_inputs[:, :-1]
        max_len = tf.shape(decoder_inputs)[-1]

        with tf.variable_scope('cell', initializer=xavier_initializer()):
            # ---------------- encoder(s) ----------------
            self.encoder_cell = rnn.MultiRNNCell([
                rnn.BasicLSTMCell(_NUM_UNITS, activation=tf.tanh)
                for _ in range(param.NUM_LAYERS)
            ])
            self.encoder_init_state = self.encoder_cell.zero_state(
                BATCH_SIZE, dtype=tf.float32)
            _, self.encoder_final_state = tf.nn.dynamic_rnn(
                cell=self.encoder_cell,
                initial_state=self.encoder_init_state,
                inputs=self.encoder_inputs,
                sequence_length=self.encoder_lengths,
                scope='encoder')

            if param.Use_VAE:
                # Second encoder reads the embedded decoder tokens.
                self.encoder_cell_2 = rnn.MultiRNNCell([
                    rnn.BasicLSTMCell(_NUM_UNITS, activation=tf.tanh)
                    for _ in range(param.NUM_LAYERS)
                ])
                # Fixed: take the zero state from the second encoder's own
                # cell rather than from self.encoder_cell.
                self.encoder_init_state_2 = self.encoder_cell_2.zero_state(
                    BATCH_SIZE, dtype=tf.float32)
                _, self.encoder_final_state_2 = tf.nn.dynamic_rnn(
                    cell=self.encoder_cell_2,
                    initial_state=self.encoder_init_state_2,
                    inputs=tf.nn.embedding_lookup(self.embedding,
                                                  self.decoder_inputs),
                    sequence_length=self.decoder_lengths,
                    scope='encoder_2')

            # ---------------- decoder cell ----------------
            self.decoder_cell = rnn.MultiRNNCell([
                rnn.BasicLSTMCell(_NUM_UNITS)
                for _ in range(param.NUM_LAYERS)
            ])
            self.decoder_cell_drop = tf.contrib.rnn.DropoutWrapper(
                self.decoder_cell, output_keep_prob=keep_prob)
            self.decoder_init_state = self.decoder_cell_drop.zero_state(
                BATCH_SIZE, dtype=tf.float32)
            if param.Use_VAE:
                self.decoder_input_state_2 = self.encoder_final_state_2
            # The decoder starts from the encoder's final state.
            self.decoder_input_state = self.encoder_final_state

            if Use_relational_memory:
                # Not referenced again in this method; kept because the
                # call may create variables -- TODO confirm and remove.
                g_output_unit = create_output_unit(
                    _NUM_UNITS * NUM_LAYERS * 2, param.VOCAB_SIZE)
                mem_slots = param.mem_slots
                head_size = param.head_size
                num_heads = param.num_heads
                gen_mem = RelationalMemory(mem_slots=mem_slots,
                                           head_size=head_size,
                                           num_heads=num_heads)

                # Pad one step back on the time axis so the loop below can
                # read an embedding at every index < max_len.
                self.decoder_inputs = tf.pad(self.decoder_inputs,
                                             [[0, 0], [0, 1]])
                x_emb = tf.transpose(
                    tf.nn.embedding_lookup(self.embedding,
                                           self.decoder_inputs),
                    perm=[1, 0, 2])  # seq_len x batch_size x emb_dim

                g_predictions = tensor_array_ops.TensorArray(
                    dtype=tf.float32,
                    size=max_len,
                    dynamic_size=True,
                    infer_shape=True)
                # Slot 0 is a placeholder entry; it is sliced off when
                # self.logits is built from the stacked array.
                g_predictions = g_predictions.write(
                    0,
                    tf.one_hot(self.decoder_inputs[:, 0], param.VOCAB_SIZE))

                ta_emb_x = tensor_array_ops.TensorArray(dtype=tf.float32,
                                                        size=max_len)
                ta_emb_x = ta_emb_x.unstack(x_emb)

                with tf.variable_scope('postprocessing',
                                       initializer=xavier_initializer()):
                    if Post_with_state:
                        self.softmax_w = tf.get_variable(
                            'softmax_w', [
                                head_size * num_heads + 2 * NUM_UNITS,
                                param.VOCAB_SIZE
                            ])
                    else:
                        self.softmax_w = tf.get_variable(
                            'softmax_w',
                            [head_size * num_heads, param.VOCAB_SIZE])
                    self.softmax_b = tf.get_variable(
                        'softmax_b', [param.VOCAB_SIZE])

                def _pretrain_recurrence(i, x_t, h_tm1, g_predictions):
                    """One teacher-forced step of the relational memory."""
                    mem_o_t, h_t = gen_mem(x_t, h_tm1)
                    if Post_with_state:
                        # Append a slice of the initial decoder state to
                        # the memory output before projecting.
                        mem_o_t = tf.concat([
                            tf.reshape(mem_o_t,
                                       [-1, head_size * num_heads]),
                            self.decoder_input_state[:, 0, :2 * NUM_UNITS]
                        ], -1)
                    else:
                        mem_o_t = tf.reshape(mem_o_t,
                                             [-1, head_size * num_heads])
                    o_t = tf.nn.bias_add(tf.matmul(mem_o_t, self.softmax_w),
                                         bias=self.softmax_b)
                    g_predictions = g_predictions.write(
                        i, o_t)  # batch_size x vocab_size
                    x_tp1 = ta_emb_x.read(i)
                    return i + 1, x_tp1, h_t, g_predictions

                # Stack the nested encoder state into one tensor, move the
                # batch axis first, and fold the rest into memory slots.
                # (Presumably [layers, 2, batch, units] before the
                # transpose -- TODO confirm.)
                self.decoder_input_state = tf.convert_to_tensor(
                    self.decoder_input_state)
                self.decoder_input_state = tf.transpose(
                    self.decoder_input_state, perm=[2, 0, 1, 3])
                self.decoder_input_state = tf.reshape(
                    self.decoder_input_state, [BATCH_SIZE, mem_slots, -1])

                if param.Use_latent_z:
                    # Prepend random noise of the same shape as the state.
                    self.decoder_input_state = tf.concat([
                        tf.truncated_normal(
                            shape=self.decoder_input_state.shape),
                        self.decoder_input_state
                    ], axis=-1)
                elif Use_VAE:
                    self.decoder_input_state_2 = tf.convert_to_tensor(
                        self.decoder_input_state_2)
                    self.decoder_input_state_2 = tf.transpose(
                        self.decoder_input_state_2, perm=[2, 0, 1, 3])
                    self.decoder_input_state_2 = tf.reshape(
                        self.decoder_input_state_2, [BATCH_SIZE, -1])
                    # Reparameterised sample: mn + epsilon * exp(sd).
                    self.mn = tf.layers.dense(self.decoder_input_state_2,
                                              units=NUM_UNITS * 2)
                    self.sd = 0.5 * tf.layers.dense(
                        self.decoder_input_state_2, units=NUM_UNITS * 2)
                    epsilon = tf.random_normal(
                        tf.stack([
                            tf.shape(self.decoder_input_state_2)[0],
                            NUM_UNITS * 2
                        ]))
                    self.decoder_input_state_2 = self.mn + tf.multiply(
                        epsilon, tf.exp(self.sd))
                    self.decoder_input_state_2 = tf.reshape(
                        self.decoder_input_state_2,
                        [BATCH_SIZE, mem_slots, -1])
                    self.decoder_input_state = tf.concat([
                        self.decoder_input_state_2,
                        self.decoder_input_state
                    ], axis=-1)

                # Run the teacher-forced loop over steps 1 .. max_len - 1.
                _, _, self.decoder_final_state, self.outputs = \
                    control_flow_ops.while_loop(
                        cond=lambda i, _1, _2, _3: i < max_len,
                        body=_pretrain_recurrence,
                        loop_vars=(tf.constant(1, dtype=tf.int32),
                                   tf.nn.embedding_lookup(
                                       self.embedding,
                                       self.decoder_inputs[:, 0]),
                                   self.decoder_input_state,
                                   g_predictions))
                # Drop placeholder slot 0 and put the batch axis first.
                self.logits = tf.transpose(self.outputs.stack()[1:, :, :],
                                           perm=[1, 0, 2])
            else:
                self.outputs, self.decoder_final_state = tf.nn.dynamic_rnn(
                    cell=self.decoder_cell_drop,
                    initial_state=self.decoder_input_state,  # initial state h
                    inputs=tf.nn.embedding_lookup(
                        self.embedding, self.decoder_inputs),  # input x
                    sequence_length=self.decoder_lengths,
                    dtype=tf.float32,
                    scope='decoder')

        # The LSTM decoder applies a plain softmax projection here; the
        # relational branch has already produced self.logits above.
        if not Use_relational_memory:
            with tf.variable_scope('cell/postprocessing',
                                   initializer=xavier_initializer()):
                self.softmax_w = tf.get_variable(
                    'softmax_w', [_NUM_UNITS, param.VOCAB_SIZE])
                self.softmax_b = tf.get_variable('softmax_b',
                                                 [param.VOCAB_SIZE])
                self.logits = tf.nn.bias_add(
                    tf.matmul(tf.reshape(self.outputs, [-1, _NUM_UNITS]),
                              self.softmax_w),
                    bias=self.softmax_b)

        self.probs = tf.reshape(tf.nn.softmax(self.logits),
                                [BATCH_SIZE, -1, param.VOCAB_SIZE])
        # The labels are one-hot vectors of the next token.
        self.labels = tf.one_hot(decoder_inputs[:, 1:],
                                 depth=param.VOCAB_SIZE,
                                 dtype=tf.int32)
        # Per-example count of arg-max predictions that hit the label,
        # divided by max_len.
        self.right_count = tf.reduce_sum(
            tf.reduce_sum(
                tf.multiply(
                    tf.one_hot(tf.argmax(self.probs, -1),
                               param.VOCAB_SIZE),
                    tf.to_float(self.labels)), -1),
            -1) / tf.to_float(max_len)
        self.logits = tf.reshape(self.logits,
                                 [BATCH_SIZE, -1, param.VOCAB_SIZE])
        loss = get_loss(LOSS_TYPE, self.logits, self.labels,
                        self.decoder_lengths, BATCH_SIZE)
        self.loss = loss
        self.original_loss = get_loss(1, self.logits, self.labels,
                                      self.decoder_lengths, BATCH_SIZE)

        # ----- generate tokens and relaxed one-hot results (adversarial) -----
        gen_o = tensor_array_ops.TensorArray(dtype=tf.float32,
                                             size=0,
                                             dynamic_size=True,
                                             infer_shape=True)  # the prob
        gen_x = tensor_array_ops.TensorArray(
            dtype=tf.int32, size=0, dynamic_size=True,
            infer_shape=True)  # sampled token
        gen_x_onehot_adv = tensor_array_ops.TensorArray(
            dtype=tf.float32, size=0, dynamic_size=True,
            infer_shape=True)  # generator output (relaxation of gen_x)

        random_start_length = tf.constant(param.start_length,
                                          dtype=tf.int32)

        def _start_recurrence(word_i, gen_o, gen_x, gen_x_onehot_adv):
            """Copy ground-truth token word_i into all three arrays."""
            gen_x = gen_x.write(word_i, decoder_inputs[:, word_i])
            gen_o = gen_o.write(
                word_i,
                tf.one_hot(decoder_inputs[:, word_i], param.VOCAB_SIZE,
                           1.0, 0.0))
            # The "relaxed" entry is the one-hot scaled by 1e6.
            gen_x_onehot_adv = gen_x_onehot_adv.write(
                word_i,
                tf.one_hot(decoder_inputs[:, word_i], param.VOCAB_SIZE,
                           1.0, 0.0) * 1000000)
            return word_i + 1, gen_o, gen_x, gen_x_onehot_adv

        # Seed positions 0 .. random_start_length with the given tokens.
        _, gen_o, gen_x, gen_x_onehot_adv = control_flow_ops.while_loop(
            cond=lambda i, _1, _2, _3: i < random_start_length + 1,
            body=_start_recurrence,
            loop_vars=(tf.constant(0), gen_o, gen_x, gen_x_onehot_adv))

        # Recurrent state after consuming the seed tokens.
        if Use_relational_memory:
            # Re-fetch the projection variables created during pre-training.
            with tf.variable_scope('cell/postprocessing', reuse=True):
                if Post_with_state:
                    self.softmax_w = tf.get_variable(
                        'softmax_w', [
                            param.head_size * param.num_heads +
                            2 * NUM_UNITS, param.VOCAB_SIZE
                        ])
                else:
                    self.softmax_w = tf.get_variable(
                        'softmax_w', [
                            param.head_size * param.num_heads,
                            param.VOCAB_SIZE
                        ])
                self.softmax_b = tf.get_variable('softmax_b',
                                                 [param.VOCAB_SIZE])

            if param.start_length >= 1:
                x_emb = tf.transpose(
                    tf.nn.embedding_lookup(self.embedding,
                                           self.decoder_inputs),
                    perm=[1, 0, 2])
                ta_emb_gen_x = tensor_array_ops.TensorArray(
                    dtype=tf.float32, size=max_len)
                ta_emb_gen_x = ta_emb_gen_x.unstack(x_emb)

                def _start_rel_recurrence(i, x_t, h_tm1):
                    """Advance the memory one seed token; outputs unused."""
                    # Only the new memory state is needed here. The output
                    # projection the original computed was never read, so
                    # it has been dropped.
                    _, h_t = gen_mem(x_t, h_tm1)
                    x_tp1 = ta_emb_gen_x.read(i)
                    return i + 1, x_tp1, h_t

                self.decoder_start_word_state = tf.cond(
                    random_start_length > 0,
                    lambda: (control_flow_ops.while_loop(
                        cond=lambda i, _1, _2: i < random_start_length + 1,
                        body=_start_rel_recurrence,
                        loop_vars=(tf.constant(1, dtype=tf.int32),
                                   tf.nn.embedding_lookup(
                                       self.embedding,
                                       self.decoder_inputs[:, 0]),
                                   self.decoder_input_state)))[2],
                    lambda: self.decoder_input_state)
            else:
                self.decoder_start_word_state = self.decoder_input_state
        else:
            with tf.variable_scope('cell', reuse=True):
                _, self.decoder_start_word_state = tf.nn.dynamic_rnn(
                    cell=self.decoder_cell_drop,
                    initial_state=self.encoder_final_state,  # initial state h
                    inputs=tf.nn.embedding_lookup(
                        self.embedding, self.decoder_inputs),  # input x
                    sequence_length=np.ones(BATCH_SIZE) *
                    random_start_length,
                    dtype=tf.float32,
                    scope='decoder')

        def _gen_recurrence(i, x_t, state, gen_o, gen_x, gen_x_onehot_adv):
            """Sample one token from the model and record it at index i."""
            if Use_relational_memory:
                mem_o_t, state = gen_mem(x_t, state)
                if Post_with_state:
                    mem_o_t = tf.concat([
                        tf.reshape(mem_o_t, [-1, head_size * num_heads]),
                        self.decoder_input_state[:, 0, :2 * NUM_UNITS]
                    ], -1)
                else:
                    mem_o_t = tf.reshape(mem_o_t,
                                         [-1, head_size * num_heads])
                logits = tf.nn.bias_add(tf.matmul(mem_o_t, self.softmax_w),
                                        bias=self.softmax_b)
                # Scale the logit of token id 2 by end_rate; every other
                # column of the multiplier is 1.
                pad = np.ones((BATCH_SIZE, VOCAB_SIZE))
                pad_2 = tf.one_hot(tf.to_int32(
                    tf.constant(np.ones((BATCH_SIZE)) * 2)),
                                   depth=VOCAB_SIZE)
                pad_2 = tf.multiply(pad_2, end_rate - 1)
                pad = tf.constant(pad)
                pad = tf.add(tf.to_float(pad), tf.to_float(pad_2))
                logits = tf.multiply(logits, pad)
            else:
                with tf.variable_scope('cell', reuse=True):
                    outputs, state = rnn.static_rnn(
                        cell=self.decoder_cell_drop,
                        initial_state=state,
                        inputs=[x_t],  # input x
                        sequence_length=np.ones(BATCH_SIZE),
                        dtype=tf.float32,
                        scope='decoder')
                    with tf.variable_scope('postprocessing', reuse=True):
                        logits = tf.nn.bias_add(
                            tf.matmul(
                                tf.reshape(outputs, [-1, _NUM_UNITS]),
                                self.softmax_w),
                            bias=self.softmax_b)

            if not if_test:
                gumbel_t = add_gumbel(logits)
            else:
                gumbel_t = add_gumbel(tf.multiply(1.2, logits))

            # The hard sample is the arg-max of the perturbed logits; no
            # gradient flows through it.
            next_token = tf.to_int32(
                tf.stop_gradient(tf.argmax(gumbel_t, axis=1)))
            x_onehot_appr = tf.multiply(
                gumbel_t,
                temperature)  # one-hot-like, [batch_size x vocab_size]

            gen_o = gen_o.write(i, logits)
            gen_x = gen_x.write(i, next_token)
            gen_x_onehot_adv = gen_x_onehot_adv.write(
                i, tf.nn.softmax(x_onehot_appr))

            x_tp1 = tf.nn.embedding_lookup(self.embedding, next_token)
            return i + 1, x_tp1, state, gen_o, gen_x, gen_x_onehot_adv

        # Generate positions random_start_length + 1 .. max_len - 1.
        _, _, _, self.gen_o, self.gen_x, self.gen_x_onehot_adv = \
            control_flow_ops.while_loop(
                cond=lambda i, _1, _2, _3, _4, _5: i < max_len,
                body=_gen_recurrence,
                loop_vars=(random_start_length + 1,
                           tf.nn.embedding_lookup(
                               self.embedding,
                               decoder_inputs[:, random_start_length]),
                           self.decoder_start_word_state, gen_o, gen_x,
                           gen_x_onehot_adv))

        self.gen_o = tf.transpose(
            self.gen_o.stack(),
            perm=[1, 0, 2])  # batch_size x seq_len x vocab_size
        self.gen_x = tf.transpose(self.gen_x.stack(), perm=[1, 0])
        self.gen_x_onehot_adv = tf.transpose(self.gen_x_onehot_adv.stack(),
                                             perm=[1, 0, 2])

        # Generated length estimate: the smallest (index - 1) at which the
        # arg-max of gen_o equals token id 1, averaged over the batch. The
        # last position always counts as a match so the minimum is bounded;
        # non-matching positions are pushed to 10000.
        # Fixed: the ramp was a 100-entry constant, which broke the
        # reshape below for sequences longer than 100 tokens.
        temp_list = tf.range(max_len) - 1
        temp_list = tf.tile(tf.reshape(temp_list, [1, max_len]),
                            [BATCH_SIZE, 1])
        self.ifequal = tf.equal(tf.to_int32(tf.argmax(self.gen_o, -1)),
                                tf.ones_like(self.gen_x))
        self.ifequal = tf.to_int32(self.ifequal[:, :-1])
        self.ifequal = tf.concat(
            [self.ifequal,
             tf.ones((BATCH_SIZE, 1), tf.int32)], -1)
        self.total_length = tf.multiply(
            tf.to_int32(self.ifequal),
            temp_list) + 10000 * (1 - tf.to_int32(self.ifequal))
        self.gen_x_length = tf.reduce_mean(
            tf.to_float(tf.reduce_min(self.total_length, -1)))
def __init__(self, args, infer=False):
    '''
    Assemble the TensorFlow graph of the trajectory model.

    (x, y) points are embedded with a ReLU layer, passed through a stack of
    dropout-wrapped LSTM cells, and every LSTM output is mapped to the five
    parameters of a bivariate Gaussian. The cost is the negative
    log-likelihood of the target points plus an L2 penalty, minimised with
    RMSProp on globally clipped gradients.

    Params:
    args: Settings object. Attributes read here: batch_size, seq_length,
          learning_rate, num_layers, rnn_size, embedding_size,
          lambda_param and grad_clip
    infer: If truthy, args.batch_size and args.seq_length are set to 1 on
           the passed-in object before the graph is built
    '''
    # Sampling mode works on one position of one trajectory at a time
    if infer:
        args.batch_size, args.seq_length = 1, 1
    self.args = args

    # TODO: (resolve) Do we need to use a fixed seq_length?
    # Inputs and targets are both sequences of (x, y) points
    point_seq_shape = [None, args.seq_length, 2]
    self.input_data = tf.placeholder(tf.float32, point_seq_shape)
    self.target_data = tf.placeholder(tf.float32, point_seq_shape)

    # Fraction of LSTM outputs to drop; must be fed when running the graph
    self.dropout = tf.placeholder(tf.float32)

    self.lr = tf.Variable(args.learning_rate,
                          trainable=False,
                          name="learning_rate")

    def _build_layer():
        # One LSTM layer with hidden size args.rnn_size, output dropout on
        lstm = rnn.BasicLSTMCell(args.rnn_size, state_is_tuple=True)
        return rnn.DropoutWrapper(lstm,
                                  output_keep_prob=1.0 - self.dropout)

    stacked_cell = rnn.MultiRNNCell(
        [_build_layer() for _ in range(args.num_layers)],
        state_is_tuple=True)
    self.cell = stacked_cell

    # All-zero starting state for args.batch_size trajectories
    self.initial_state = stacked_cell.zero_state(
        batch_size=args.batch_size, dtype=tf.float32)

    # 2 means + 2 standard deviations + 1 correlation
    output_size = 5

    # TODO: (improve) For now assume embedding_size = rnn_size
    with tf.variable_scope("coordinate_embedding"):
        embedding_w = tf.get_variable("embedding_w",
                                      [2, args.embedding_size])
        embedding_b = tf.get_variable("embedding_b",
                                      [args.embedding_size])

    with tf.variable_scope("rnnlm"):
        output_w = tf.get_variable(
            "output_w", [args.rnn_size, output_size],
            initializer=tf.truncated_normal_initializer(stddev=0.01),
            trainable=True)
        output_b = tf.get_variable(
            "output_b", [output_size],
            initializer=tf.constant_initializer(0.01),
            trainable=True)

    # One numPoints x 2 tensor per time step
    per_step_points = [
        tf.squeeze(step_slice, [1])
        for step_slice in tf.split(self.input_data, args.seq_length, 1)
    ]
    # ReLU embedding of each time step's coordinates
    embedded_inputs = [
        tf.nn.relu(tf.add(tf.matmul(points, embedding_w), embedding_b))
        for points in per_step_points
    ]

    # Unroll the LSTM stack over the embedded sequence
    outputs, last_state = tf.contrib.legacy_seq2seq.rnn_decoder(
        embedded_inputs,
        self.initial_state,
        stacked_cell,
        loop_function=None,
        scope="rnnlm")

    # Flatten to ? x rnn_size and apply the linear output layer
    flat_outputs = tf.reshape(tf.concat(outputs, 1), [-1, args.rnn_size])
    output = tf.nn.xw_plus_b(flat_outputs, output_w, output_b)

    self.final_state = last_state

    # Flatten targets the same way, then separate x and y columns
    flat_target_data = tf.reshape(self.target_data, [-1, 2])
    x_data, y_data = tf.split(flat_target_data, 2, 1)

    def _bivariate_pdf(x, y, mux, muy, sx, sy, rho):
        '''
        Density of a 2D normal at (x, y); also stored as self.result.
        Eq 3 in the paper, eq 24 & 25 in Graves (2013).
        '''
        dx = tf.subtract(x, mux)
        dy = tf.subtract(y, muy)
        sxsy = tf.multiply(sx, sy)
        x_term = tf.square(tf.div(dx, sx))
        y_term = tf.square(tf.div(dy, sy))
        cross_term = tf.div(tf.multiply(rho, tf.multiply(dx, dy)), sxsy)
        z = x_term + y_term - 2 * cross_term
        one_minus_rho_sq = 1 - tf.square(rho)
        numerator = tf.exp(tf.div(-z, 2 * one_minus_rho_sq))
        normaliser = 2 * np.pi * tf.multiply(sxsy,
                                             tf.sqrt(one_minus_rho_sq))
        density = tf.div(numerator, normaliser)
        self.result = density
        return density

    # Important difference between loss func of Social LSTM and Graves (2013)
    # is that it is evaluated over all time steps in the latter whereas it is
    # done from t_obs+1 to t_pred in the former
    def _negative_log_likelihood(z_mux, z_muy, z_sx, z_sy, z_corr, x_data,
                                 y_data):
        '''
        Summed -log density of the targets under the predicted Gaussians.
        '''
        density = _bivariate_pdf(x_data, y_data, z_mux, z_muy, z_sx, z_sy,
                                 z_corr)
        # Floor the density so the log stays finite
        floor = 1e-20
        per_point = -tf.log(tf.maximum(density, floor))
        # TODO: For now, implementing loss func over all time-steps
        return tf.reduce_sum(per_point)

    def _split_coefficients(raw):
        '''
        Turn the raw 5-column output into (mux, muy, sx, sy, corr).
        Eq 20 -> 22 of Graves (2013).
        '''
        # TODO : (resolve) Does Social LSTM paper do this as well?
        # Standard deviations must be positive and the correlation must
        # lie in [-1, 1], hence the exp and tanh below
        z_mux, z_muy, z_sx, z_sy, z_corr = tf.split(raw, 5, 1)
        z_sx = tf.exp(z_sx)
        z_sy = tf.exp(z_sy)
        z_corr = tf.tanh(z_corr)
        return [z_mux, z_muy, z_sx, z_sy, z_corr]

    o_mux, o_muy, o_sx, o_sy, o_corr = _split_coefficients(output)

    # Expose the raw output and the predicted distribution parameters
    self.output = output
    self.mux = o_mux
    self.muy = o_muy
    self.sx = o_sx
    self.sy = o_sy
    self.corr = o_corr

    lossfunc = _negative_log_likelihood(o_mux, o_muy, o_sx, o_sy, o_corr,
                                        x_data, y_data)

    # Average over batch_size * seq_length points
    self.cost = tf.div(lossfunc, (args.batch_size * args.seq_length))

    # L2 penalty over every trainable variable
    tvars = tf.trainable_variables()
    l2 = args.lambda_param * sum(map(tf.nn.l2_loss, tvars))
    self.cost = self.cost + l2

    # TODO: (resolve) We are clipping the gradients as is usually done in LSTM
    # implementations. Social LSTM paper doesn't mention about this at all
    self.gradients = tf.gradients(self.cost, tvars)
    grads, _ = tf.clip_by_global_norm(self.gradients, args.grad_clip)

    # NOTE: Using RMSprop as suggested by Social LSTM instead of Adam as
    # Graves (2013) does
    optimizer = tf.train.RMSPropOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))