def __init__(self, rnn_size, rnn_layer, batch_size, input_embedding_size, dim_image, dim_hidden, max_words_q, vocabulary_size, drop_out_rate):
    """Record the hyper-parameters and create the LSTM stack and weights.

    Every argument is stored on the instance under its own name.  The
    constructor then creates, in this order: the question-embedding matrix,
    two peephole LSTM cells (each wrapped in output dropout with keep
    probability ``1 - drop_out_rate``) stacked into one ``MultiRNNCell``,
    and the weight/bias variables of the state, image and score embeddings.
    All variables are initialised uniformly in [-0.08, 0.08].

    ``num_output`` is not a parameter; it is resolved from the enclosing
    scope when the score-embedding variables are created.
    """

    def uniform_variable(shape, name):
        # Shared initialiser: uniform noise in [-0.08, 0.08].
        return tf.Variable(tf.random_uniform(shape, -0.08, 0.08), name=name)

    # Keep the hyper-parameters around for later graph construction.
    self.rnn_size = rnn_size
    self.rnn_layer = rnn_layer
    self.batch_size = batch_size
    self.input_embedding_size = input_embedding_size
    self.dim_image = dim_image
    self.dim_hidden = dim_hidden
    self.max_words_q = max_words_q
    self.vocabulary_size = vocabulary_size
    self.drop_out_rate = drop_out_rate

    # Question embedding: vocabulary index -> dense vector.
    self.embed_ques_W = uniform_variable(
        [self.vocabulary_size, self.input_embedding_size], 'embed_ques_W')

    # Encoder body: two LSTM layers, each followed by output dropout.
    self.lstm_1 = core_rnn_cell.LSTMCell(
        rnn_size, input_embedding_size,
        use_peepholes=True, state_is_tuple=False)
    self.lstm_dropout_1 = core_rnn_cell.DropoutWrapper(
        self.lstm_1, output_keep_prob=1 - self.drop_out_rate)
    self.lstm_2 = core_rnn_cell.LSTMCell(
        rnn_size, rnn_size,
        use_peepholes=True, state_is_tuple=False)
    self.lstm_dropout_2 = core_rnn_cell.DropoutWrapper(
        self.lstm_2, output_keep_prob=1 - self.drop_out_rate)
    self.stacked_lstm = core_rnn_cell.MultiRNNCell(
        [self.lstm_dropout_1, self.lstm_dropout_2], state_is_tuple=False)

    # State embedding.
    # NOTE(review): the first dimension scales with rnn_layer although the
    # stack above always holds exactly two cells -- TODO confirm callers
    # pass rnn_layer == 2.
    self.embed_state_W = uniform_variable(
        [2 * rnn_size * rnn_layer, self.dim_hidden], 'embed_state_W')
    self.embed_state_b = uniform_variable([self.dim_hidden], 'embed_state_b')

    # Image embedding.
    self.embed_image_W = uniform_variable(
        [dim_image, self.dim_hidden], 'embed_image_W')
    self.embed_image_b = uniform_variable([dim_hidden], 'embed_image_b')

    # Score embedding: hidden representation -> num_output scores.
    self.embed_scor_W = uniform_variable(
        [dim_hidden, num_output], 'embed_scor_W')
    self.embed_scor_b = uniform_variable([num_output], 'embed_scor_b')
def __init__(self, isTraining, enc_attribs):
    """Initializer for encoder class.

    Args:
        isTraining: Whether the network is in training mode or not. This
            decides whether each layer is wrapped in dropout.
        enc_attribs: A dictionary of attributes used by the encoder; every
            key becomes an instance attribute. Keys read here:
            hidden_size: Hidden size of the LSTM cell used for encoding
            num_layers: Number of hidden layers used
            out_prob (optional): (1 - dropout probability). Defaults to
                1.0 (no dropout) when absent.
            The documented keys vocab_size and emb_size are stored but not
            read by this constructor.
    """
    self.isTraining = isTraining
    # Expose every entry of enc_attribs as an attribute of the same name.
    self.__dict__.update(enc_attribs)

    # out_prob is optional: fall back to "keep everything" instead of
    # failing with AttributeError when training without it.
    keep_prob = getattr(self, "out_prob", 1.0)

    def make_layer():
        # One independent LSTM layer, wrapped in dropout during training.
        layer = rnn_cell.BasicLSTMCell(self.hidden_size, state_is_tuple=True)
        if self.isTraining:
            layer = rnn_cell.DropoutWrapper(layer, output_keep_prob=keep_prob)
        return layer

    if self.num_layers > 1:
        # Build a distinct cell object per layer. Repeating one object
        # ([cell] * n) makes newer TensorFlow 1.x releases either reject
        # the graph or tie the layers' weights together.
        self.cell = rnn_cell.MultiRNNCell(
            [make_layer() for _ in range(self.num_layers)],
            state_is_tuple=True)
    else:
        self.cell = make_layer()
def __init__(self, args, data, infer=False):
    """Build the LSTM language-model graph, its loss and its training op.

    Args:
        args: Hyper-parameter object. Attributes read: ``batch_size``,
            ``seq_length``, ``state_size``, ``num_layers`` and
            ``grad_clip``. When ``infer`` is true, ``batch_size`` and
            ``seq_length`` are overwritten with 1 on this object.
        data: Object exposing ``vocab_size``.
        infer: True when the graph is used for one-step-at-a-time sampling.
    """
    if infer:
        # Sampling feeds a single symbol at a time.
        args.batch_size = 1
        args.seq_length = 1

    with tf.name_scope('inputs'):
        self.input_data = tf.placeholder(
            tf.int32, [args.batch_size, args.seq_length])
        self.target_data = tf.placeholder(
            tf.int32, [args.batch_size, args.seq_length])

    # NOTE(review): scope nesting below was reconstructed from the statement
    # order; confirm variable names against existing checkpoints.
    with tf.name_scope('model'):
        # One distinct cell object per layer: repeating a single object
        # ([cell] * n) is rejected, or silently shares weights, in newer
        # TensorFlow 1.x releases.
        layers = [rnn_cell.BasicLSTMCell(args.state_size)
                  for _ in range(args.num_layers)]
        self.cell = rnn_cell.MultiRNNCell(layers)
        self.initial_state = self.cell.zero_state(args.batch_size, tf.float32)

        with tf.variable_scope('rnnlm'):
            # Output projection from the RNN state to vocabulary logits.
            w = tf.get_variable(
                'softmax_w', [args.state_size, data.vocab_size])
            b = tf.get_variable('softmax_b', [data.vocab_size])
            with tf.device("/cpu:0"):
                embedding = tf.get_variable(
                    'embedding', [data.vocab_size, args.state_size])
                inputs = tf.nn.embedding_lookup(embedding, self.input_data)

        outputs, last_state = tf.nn.dynamic_rnn(
            self.cell, inputs, initial_state=self.initial_state)

    with tf.name_scope('loss'):
        # Flatten the RNN outputs so one matmul projects every time step.
        output = tf.reshape(outputs, [-1, args.state_size])
        self.logits = tf.matmul(output, w) + b
        self.probs = tf.nn.softmax(self.logits)
        self.last_state = last_state

        targets = tf.reshape(self.target_data, [-1])
        loss = seq2seq.sequence_loss_by_example(
            [self.logits], [targets],
            [tf.ones_like(targets, dtype=tf.float32)])
        # Summed loss, normalised by the batch size only.
        self.cost = tf.reduce_sum(loss) / args.batch_size
        tf.summary.scalar('loss', self.cost)

    with tf.name_scope('optimize'):
        # The learning rate is fed at run time.
        self.lr = tf.placeholder(tf.float32, [])
        tf.summary.scalar('learning_rate', self.lr)

        optimizer = tf.train.AdamOptimizer(self.lr)
        tvars = tf.trainable_variables()
        grads = tf.gradients(self.cost, tvars)
        for g in grads:
            # tf.gradients yields None for variables the cost does not
            # depend on; those have no name and nothing to plot.
            if g is not None:
                tf.summary.histogram(g.name, g)
        grads, _ = tf.clip_by_global_norm(grads, args.grad_clip)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))
        self.merged_op = tf.summary.merge_all()
def set_cell_config(self):
    """Create the LSTM cell used by decoder.

    Reads ``self.hidden_size``, ``self.num_layers``, ``self.isTraining``,
    ``self.vocab_size`` and, in training mode, ``self.out_prob``.

    Returns:
        The (optionally stacked, optionally dropout-wrapped) LSTM cell,
        wrapped in an ``OutputProjectionWrapper`` of size ``vocab_size``.
    """

    def make_layer():
        # Use the BasicLSTMCell - https://arxiv.org/pdf/1409.2329.pdf
        layer = rnn_cell.BasicLSTMCell(self.hidden_size, state_is_tuple=True)
        if self.isTraining:
            # During training we use a dropout wrapper
            layer = rnn_cell.DropoutWrapper(
                layer, output_keep_prob=self.out_prob)
        return layer

    if self.num_layers > 1:
        # If RNN is stacked then we use MultiRNNCell class. Each layer gets
        # its own cell object: repeating one object ([cell] * n) is
        # rejected, or shares weights between layers, in newer
        # TensorFlow 1.x releases.
        cell = rnn_cell.MultiRNNCell(
            [make_layer() for _ in range(self.num_layers)],
            state_is_tuple=True)
    else:
        cell = make_layer()

    # Use the OutputProjectionWrapper to project cell output to output
    # vocab size. This projection is fine for a small vocabulary output
    # but would be bad for large vocabulary output spaces.
    cell = rnn_cell.OutputProjectionWrapper(cell, self.vocab_size)
    return cell
def __init__(self, configs, data, infer=False):
    """Build the LSTM language-model graph, its loss and its training op.

    Args:
        configs: Hyper-parameter object. Attributes read: ``batch_size``,
            ``seq_length``, ``state_size``, ``num_layers`` and
            ``grad_clip``. When ``infer`` is true, ``batch_size`` and
            ``seq_length`` are overwritten with 1 on this object.
        data: Object exposing ``vocab_size``.
        infer: True when the graph is used for one-step-at-a-time sampling.
    """
    if infer:
        # Sampling feeds a single symbol at a time.
        configs.batch_size = 1
        configs.seq_length = 1

    self.input_data = tf.placeholder(
        tf.int32, [configs.batch_size, configs.seq_length])
    self.target_data = tf.placeholder(
        tf.int32, [configs.batch_size, configs.seq_length])
    # The learning rate is fed at run time.
    self.lr = tf.placeholder(tf.float32, [])

    # Cell definition. One distinct cell object per layer: repeating a
    # single object ([cell] * n) is rejected, or silently shares weights,
    # in newer TensorFlow 1.x releases.
    layers = [rnn.BasicLSTMCell(configs.state_size)
              for _ in range(configs.num_layers)]
    self.cell = rnn.MultiRNNCell(layers)
    self.initial_state = self.cell.zero_state(configs.batch_size, tf.float32)

    # Output projection parameters.
    w = tf.get_variable('softmax_w', [configs.state_size, data.vocab_size])
    b = tf.get_variable('softmax_b', [data.vocab_size])

    # Embedding lookup for the input symbols.
    embedding = tf.get_variable(
        'embedding', [data.vocab_size, configs.state_size])
    inputs = tf.nn.embedding_lookup(embedding, self.input_data)

    # Run the RNN and flatten its outputs so one matmul projects every
    # time step.
    output, last_state = tf.nn.dynamic_rnn(
        self.cell, inputs, initial_state=self.initial_state)
    output_new = tf.reshape(output, [-1, configs.state_size])

    # Logits and probabilities.
    self.logits = tf.matmul(output_new, w) + b
    self.probs = tf.nn.softmax(self.logits)
    self.last_state = last_state

    # Loss against the flattened targets; summed and normalised by the
    # batch size only.
    target = tf.reshape(self.target_data, [-1])
    loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
        [self.logits], [target], [tf.ones_like(target, dtype=tf.float32)])
    self.cost = tf.reduce_sum(loss) / configs.batch_size

    # Adam with global-norm gradient clipping.
    optimizer = tf.train.AdamOptimizer(self.lr)
    tvars = tf.trainable_variables()
    grads = tf.gradients(self.cost, tvars)
    grads, _ = tf.clip_by_global_norm(grads, configs.grad_clip)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, config, training=False):
    """Build the sequence model graph, its loss and its training step.

    Args:
        config: Configuration object. Attributes read: ``time_batch_len``,
            ``input_dim``, ``hidden_size``, ``num_layers``,
            ``dropout_prob``, ``input_dropout_prob``, ``cell_type``
            ('vanilla', 'gru' or 'lstm'), ``learning_rate`` and
            ``learning_rate_decay``. Both dropout values are passed as
            *keep* probabilities and must lie in (0, 1].
        training: When true, dropout is applied to the inputs and to each
            cell's output.

    Raises:
        Exception: if a dropout probability is outside (0, 1] or the cell
            type is unknown.
    """
    self.config = config
    self.time_batch_len = time_batch_len = config.time_batch_len
    self.input_dim = input_dim = config.input_dim
    hidden_size = config.hidden_size
    num_layers = config.num_layers
    dropout_prob = config.dropout_prob
    input_dropout_prob = config.input_dropout_prob
    cell_type = config.cell_type

    # Time-major input: [time_batch_len, batch, input_dim].
    self.seq_input = \
        tf.placeholder(tf.float32, shape=[self.time_batch_len, None, input_dim])

    if (dropout_prob <= 0.0 or dropout_prob > 1.0):
        raise Exception("Invalid dropout probability: {}".format(dropout_prob))

    if (input_dropout_prob <= 0.0 or input_dropout_prob > 1.0):
        raise Exception("Invalid input dropout probability: {}".format(input_dropout_prob))

    # setup variables
    # NOTE(review): the extent of this scope was reconstructed from the
    # statement order; confirm variable names against existing checkpoints.
    with tf.variable_scope("rnnlstm"):
        output_W = tf.get_variable("output_w", [hidden_size, input_dim])
        output_b = tf.get_variable("output_b", [input_dim])
        self.lr = tf.constant(config.learning_rate, name="learning_rate")
        self.lr_decay = tf.constant(config.learning_rate_decay, name="learning_rate_decay")

    def create_cell(input_size):
        # Build one recurrent layer of the configured type.
        if cell_type == "vanilla":
            cell_class = rnn_cell.BasicRNNCell
        elif cell_type == "gru":
            # NOTE(review): TensorFlow's rnn_cell module names this class
            # GRUCell; confirm that the imported rnn_cell really provides
            # BasicGRUCell.
            cell_class = rnn_cell.BasicGRUCell
        elif cell_type == "lstm":
            cell_class = rnn_cell.BasicLSTMCell
        else:
            raise Exception("Invalid cell type: {}".format(cell_type))

        cell = cell_class(hidden_size, input_size = input_size)
        if training:
            return rnn_cell.DropoutWrapper(cell, output_keep_prob = dropout_prob)
        else:
            return cell

    if training:
        self.seq_input_dropout = tf.nn.dropout(self.seq_input, keep_prob = input_dropout_prob)
    else:
        self.seq_input_dropout = self.seq_input

    # The first layer consumes the raw input, the remaining ones consume
    # the previous layer's hidden state.
    self.cell = rnn_cell.MultiRNNCell(
        [create_cell(input_dim)] + [create_cell(hidden_size) for i in range(1, num_layers)])

    # The input is time-major, so the batch dimension is axis 1. Axis 0 is
    # time_batch_len and would size the zero state by the number of time
    # steps instead of the number of sequences.
    batch_size = tf.shape(self.seq_input_dropout)[1]
    self.initial_state = self.cell.zero_state(batch_size, tf.float32)
    # One [batch, input_dim] tensor per time step.
    inputs_list = tf.unstack(self.seq_input_dropout)

    # rnn outputs a list of [batch_size x H] outputs
    outputs_list, self.final_state = rnn.static_rnn(self.cell, inputs_list,
                                                    initial_state=self.initial_state)

    # Project every time step with a single matmul, then restore the
    # [time, batch, input_dim] layout.
    outputs = tf.stack(outputs_list)
    outputs_concat = tf.reshape(outputs, [-1, hidden_size])
    logits_concat = tf.matmul(outputs_concat, output_W) + output_b
    logits = tf.reshape(logits_concat, [self.time_batch_len, -1, input_dim])

    # probabilities of each note
    self.probs = self.calculate_probs(logits)
    self.loss = self.init_loss(logits, logits_concat)
    self.train_step = tf.train.RMSPropOptimizer(self.lr, decay = self.lr_decay) \
                        .minimize(self.loss)
def _load_wind_speeds(path):
    """Return the wind-speed column of the CSV at *path* as floats, skipping 'NA'."""
    with open(path) as source:
        source.readline()  # Skip the header row.
        reader = csv.reader(source)
        # Every row must have exactly six columns:
        # date, hour, u, v, wind speed, wind direction.
        return [float(ws) for _date, _hour, _u, _v, ws, _wd in reader
                if ws != 'NA']


def _make_windows(series, sequence_length, predict_length):
    """Slice *series* into (input window, following label window) pairs."""
    n_windows = max(0, len(series) - sequence_length - predict_length + 1)
    # Each input step is wrapped in a one-element list (feature depth 1).
    inputs = [[[value] for value in series[start:start + sequence_length]]
              for start in range(n_windows)]
    labels = [series[start + sequence_length:
                     start + sequence_length + predict_length]
              for start in range(n_windows)]
    return inputs, labels


def rnn(hnum1, hnum2, hnum3):
    """Train a three-layer LSTM on ``wind.csv`` and append the results to a CSV.

    The function reads the wind-speed series, builds sliding windows of 100
    past values that predict the next 16 values, trains on the first 5000
    windows (shuffled) for 25 epochs with batches of 10, evaluates on
    windows 80001..89999 and on the training windows, saves the model to
    ``parameter_5.ckpt`` and appends the settings, per-horizon correlation
    coefficients and errors to ``short_result5.csv``.

    Args:
        hnum1: Number of units in the first LSTM layer.
        hnum2: Number of units in the second LSTM layer.
        hnum3: Number of units in the third LSTM layer.

    Raises:
        ValueError: if the data yields no training or no test windows.
    """
    raw_data = _load_wind_speeds(r"wind.csv")

    sequence_length = 100  # Width of the window of past values (tunable).
    predict_length = 16  # Number of values predicted at once (tunable).
    train_input_all, train_label_all = _make_windows(
        raw_data, sequence_length, predict_length)

    seperate_point = 5000  # End of the training range (tunable).
    test_point = 90000  # End of the test range (must exceed seperate_point).
    test_point_start = 80000
    train_input = train_input_all[0:seperate_point]
    test_input = train_input_all[test_point_start + 1:test_point]
    train_output = train_label_all[0:seperate_point]
    test_output = train_label_all[test_point_start + 1:test_point]
    if not train_input or not test_input:
        # Fail before training instead of dividing by zero afterwards.
        raise ValueError(
            "wind.csv does not contain enough samples for the configured "
            "train/test ranges")

    # Shuffle the training set, keeping inputs and labels aligned.
    index = list(range(len(train_input)))
    shuffle(index)
    train_input = [train_input[k] for k in index]
    train_output = [train_output[k] for k in index]

    # Placeholders: [batch, time, depth] inputs and [batch, horizon] labels.
    data = tf.placeholder(tf.float32, [None, sequence_length, 1])
    target = tf.placeholder(tf.float32, [None, predict_length], name='target')
    num_hidden = [hnum1, hnum2, hnum3]  # Units per hidden layer (tunable).

    # Stack of three LSTM layers.
    cell_layer1 = rnn_cell.LSTMCell(num_hidden[0], state_is_tuple=True)
    cell_layer2 = rnn_cell.LSTMCell(num_hidden[1], state_is_tuple=True)
    cell_layer3 = rnn_cell.LSTMCell(num_hidden[2], state_is_tuple=True)
    cells = rnn_cell.MultiRNNCell([cell_layer1, cell_layer2, cell_layer3])
    val, state = tf.nn.dynamic_rnn(cells, data, dtype=tf.float32)

    # Switch to time-major layout and keep only the last time step.
    val = tf.transpose(val, [1, 0, 2])
    last = tf.gather(val, int(val.get_shape()[0]) - 1)

    # Linear read-out from the last hidden state to the predicted horizon.
    weight = tf.Variable(
        tf.truncated_normal([num_hidden[-1], int(target.get_shape()[1])]))
    bias = tf.Variable(tf.constant(0.1, shape=[target.get_shape()[1]]))
    prediction = tf.matmul(last, weight) + bias

    loss = tf.reduce_mean(tf.square(prediction - target))
    optimizer = tf.train.AdamOptimizer()
    minimize = optimizer.minimize(loss)
    error = tf.reduce_mean(tf.square(prediction - target))
    error_sep = tf.square(prediction - target)  # Error of each component.

    init_op = tf.global_variables_initializer()
    saver = tf.train.Saver()

    batch_size = 10  # (tunable)
    no_of_batches = int(len(train_input) / batch_size)
    epoch = 25  # (tunable)

    with tf.Session() as sess:
        # Restoring a checkpoint afterwards would override these values.
        sess.run(init_op)

        def evaluate(inputs, labels):
            # Score one sample at a time; returns (RMSE, predictions,
            # mean squared error per predicted component).
            squared_error_sum = 0
            predictions = []
            component_errors = []
            for k in range(len(inputs)):
                feed = {data: inputs[k:k + 1], target: labels[k:k + 1]}
                # One run fetches all three values for this sample.
                predicted, err, err_sep = sess.run(
                    [prediction, error, error_sep], feed)
                squared_error_sum += err
                predictions.append(predicted[0])
                component_errors.append(err_sep)
            rmse = math.sqrt(squared_error_sum / len(inputs))
            return rmse, predictions, np.array(component_errors).mean(axis=0)

        for i in range(epoch):
            ptr = 0
            for j in range(no_of_batches):
                inp = train_input[ptr:ptr + batch_size]
                out = train_output[ptr:ptr + batch_size]
                ptr += batch_size
                sess.run(minimize, {data: inp, target: out})
            print("Epoch - ", str(i))

        # NOTE(review): evaluation is assumed to run once, after the last
        # epoch -- TODO confirm it was not meant to run every epoch.
        # How well are the test samples estimated?
        total_error, predict_result, total_error_sep = evaluate(
            test_input, test_output)
        # How well were the training samples fitted?
        _, predict_result1, _ = evaluate(train_input, train_output)

        # Report the number of completed epochs, not a sample index.
        print('Epoch {:2d} error {:3.5f}'.format(epoch, total_error))
        saver.save(sess, r"parameter_5.ckpt")

    # Correlation between truth and prediction for each forecast horizon.
    corrcoef_result_test = []
    corrcoef_result_train = []
    for cursor in range(predict_length):
        corrcoef_result_test.append(corrcoef(
            [row[cursor] for row in test_output],
            [row[cursor] for row in predict_result]))
    for cursor in range(predict_length):
        corrcoef_result_train.append(corrcoef(
            [row[cursor] for row in train_output],
            [row[cursor] for row in predict_result1]))

    # Recorded per run: the sample ranges, the model parameters (units per
    # hidden layer), the batch size, the number of epochs, the correlation
    # coefficients and the errors.
    with open(r'short_result5.csv', 'a') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([
            'epoch', 'seperate_point', 'test_point_start', 'test_point',
            'num_hidden', 'sequence_length', 'predict_length', 'batch_size'
        ])
        writer.writerow((epoch, seperate_point, test_point_start, test_point,
                         num_hidden, sequence_length, predict_length,
                         batch_size))
        writer.writerow(['corrcoef_result_test'])
        writer.writerow(corrcoef_result_test)
        writer.writerow(['corrcoef_result_train'])
        writer.writerow(corrcoef_result_train)
        writer.writerow(['prediction_result'])
        writer.writerow(['total_error_sep'])
        for row in total_error_sep:
            writer.writerow(row)
        writer.writerow(['total_error'])
        writer.writerow([total_error])
''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '''''' '' data = tf.placeholder(tf.float32, [None, sequence_length, 1]) #batch_size maxtime deepth target = tf.placeholder(tf.float32, [None, predict_length], name='target') num_hidden = [30, 30] #隐含层数量(可调参数) #cell = rnn_cell.BasicRNNCell(num_hidden) #cells = rnn_cell.LSTMCell(num_hidden[0], state_is_tuple=True) cell_layer1 = rnn_cell.LSTMCell(num_hidden[0], state_is_tuple=True) #cell_layer1 = rnn_cell.DropoutWrapper(cell_layer1, input_keep_prob=0.5, output_keep_prob=0.5) cell_layer2 = rnn_cell.LSTMCell(num_hidden[1], state_is_tuple=True) #cell_layer2 = rnn_cell.DropoutWrapper(cell_layer2, input_keep_prob=0.5, output_keep_prob=0.5) #cell_layer3 = rnn_cell.LSTMCell(num_hidden[2], state_is_tuple=True) #cell_layer4 = rnn_cell.LSTMCell(num_hidden[3], state_is_tuple=True) #cell_layer5 = rnn_cell.LSTMCell(num_hidden[4], state_is_tuple=True) cells = rnn_cell.MultiRNNCell([cell_layer1, cell_layer2]) #建立多层rnn val, state = tf.nn.dynamic_rnn(cells, data, dtype=tf.float32) val = tf.transpose(val, [1, 0, 2]) val_shape = val.get_shape() last = tf.gather(val, int(val.get_shape()[0]) - 1) last_shape = last.get_shape() weight = tf.Variable( tf.truncated_normal([num_hidden[-1], int(target.get_shape()[1])])) bias = tf.Variable(tf.constant(0.1, shape=[target.get_shape()[1]])) #weight_shape = weight.get_shape() #bias_shape = bias.get_shape()
def __init__(self, args, infer=False):
    """Build the character-level RNN language-model graph.

    Args:
        args: Hyper-parameter object. Attributes read: ``model`` ('rnn',
            'gru' or 'lstm'), ``rnn_size``, ``num_layers``, ``batch_size``,
            ``seq_length``, ``vocab_size``, ``grad_clip`` and
            ``learning_rate``. When ``infer`` is true, ``batch_size`` and
            ``seq_length`` are overwritten with 1 on this object.
        infer: True during sampling.

    Raises:
        Exception: if ``args.model`` names an unsupported cell type.
    """
    self.args = args
    if infer:
        # Worry about one character at a time during sampling; no batching
        # or BPTT.
        args.batch_size = 1
        args.seq_length = 1

    # Set cell_fn to the type of network cell we're creating -- RNN, GRU
    # or LSTM.
    if args.model == 'rnn':
        cell_fn = rnn_cell.BasicRNNCell
    elif args.model == 'gru':
        cell_fn = rnn_cell.GRUCell
    elif args.model == 'lstm':
        cell_fn = rnn_cell.BasicLSTMCell
    else:
        raise Exception("model type not supported: {}".format(args.model))

    def make_layer():
        # Create one layer of rnn_size units. Only the LSTM cell takes the
        # state_is_tuple argument; passing it to the plain RNN or GRU
        # constructors is an error.
        if args.model == 'lstm':
            return cell_fn(args.rnn_size, state_is_tuple=True)
        return cell_fn(args.rnn_size)

    # Stack num_layers layers. Every layer gets its own cell object:
    # repeating one object ([cell] * n) is rejected, or silently shares
    # weights between layers, in newer TensorFlow 1.x releases.
    self.cell = cell = rnn_cell.MultiRNNCell(
        [make_layer() for _ in range(args.num_layers)], state_is_tuple=True)

    # Create two TF placeholder nodes of 32-bit ints (NOT floats!), each of
    # shape batch_size x seq_length. input_data will receive input batches,
    # and targets will be what it compares against to calculate loss.
    # NOTE(review): the original comment says this shape matches the
    # batches built by create_batches in utils.py -- not visible here.
    self.input_data = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
    self.targets = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])

    # zero_state creates an all-zero network state that can be swapped in
    # at any time to reset the network. Because the stack is built with
    # state_is_tuple=True, this is a tuple holding one state per layer
    # rather than a single concatenated tensor.
    self.initial_state = self.cell.zero_state(args.batch_size, tf.float32)

    # Scope our new variables to the scope identifier string "rnnlm".
    with tf.variable_scope('rnnlm'):
        # softmax_w is a weights matrix from the top layer of the model (of
        # size rnn_size) to the vocabulary output (of size vocab_size).
        softmax_w = tf.get_variable("softmax_w", [args.rnn_size, args.vocab_size])
        # softmax_b is a bias vector of the output characters (of size
        # vocab_size).
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
        # TODO: document why the embedding is pinned to the CPU; the
        # original author was unsure as well.
        with tf.device("/cpu:0"):
            # 'embedding' connects the character input to the base layer of
            # the RNN; its role is the conceptual inverse of softmax_w.
            embedding = tf.get_variable("embedding", [args.vocab_size, args.rnn_size])
            # The lookup has dimensions batch_size x seq_length x rnn_size.
            # tf.split cuts it into seq_length tensors along dimension 1,
            # each of dimension batch_size x 1 x rnn_size.
            inputs = tf.split(tf.nn.embedding_lookup(
                embedding, self.input_data), args.seq_length, axis=1)
            # Squeeze away the degenerate second dimension, leaving a list
            # of seq_length tensors of dimension batch_size x rnn_size.
            inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    # The loop function is passed to seq2seq.rnn_decoder only during
    # sampling, never during training. The original author's notes state
    # that rnn_decoder applies it only to sequence items after the first,
    # so with seq_length == 1 during inference it is never actually called.
    # prev is a 2D Tensor of shape [batch_size x cell.output_size];
    # the result is a 2D Tensor of shape [batch_size x cell.input_size].
    def loop(prev, _):
        # Convert the top cell output into character logits.
        prev = tf.matmul(prev, softmax_w) + softmax_b
        # Pull the character with the greatest logit (argmax, no sampling).
        # NOTE(review): the original author questioned whether argmax here
        # is consistent with probabilistic sampling elsewhere.
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        # Re-embed that symbol as the next step's input, and return that.
        return tf.nn.embedding_lookup(embedding, prev_symbol)

    # Set up the seq2seq decoder; this constructs the outputs and state
    # nodes and defines the trainable parameters of the recurrent layers.
    # outputs is a list (of len seq_length, same as inputs) of tensors of
    # shape [batch_size x rnn_size]: the raw outputs of the top layer at
    # each time step, NOT yet projected into character space.
    outputs, self.final_state = seq2seq.rnn_decoder(
        inputs, self.initial_state, cell,
        loop_function=loop if infer else None, scope='rnnlm')

    # tf.concat joins the per-step outputs along the rnn_size dimension
    # into [batch_size x (seq_length * rnn_size)]; tf.reshape then yields
    # [(batch_size * seq_length) x rnn_size] with rows ordered
    #   (batch 0, seq 0), (batch 0, seq 1), ..., (batch 0, seq seq_len-1),
    #   (batch 1, seq 0), ..., (batch batch_size-1, seq seq_len-1).
    # Projecting this batch-concatenated sequence once is the approach
    # recommended in rnn_cell.py over an output-projection wrapper.
    output = tf.reshape(tf.concat(outputs, axis=1), [-1, args.rnn_size])

    # Logits: [(batch_size * seq_length) x rnn_size] times
    # [rnn_size x vocab_size] gives [(batch_size * seq_length) x vocab_size];
    # softmax_b is then added to every row.
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    # Probabilities have the same shape as the logits ([1 x vocab_size]
    # during sampling). Nothing in this constructor uses them for training.
    self.probs = tf.nn.softmax(self.logits)

    # sequence_loss_by_example returns a 1D float tensor of size
    # batch_size * seq_length where item i is the log-perplexity of the ith
    # logit distribution against the ith target character. The targets are
    # flattened in the same (batch-major) order as the logits rows.
    # NOTE(review): the fourth positional argument was num_decoder_symbols
    # in early TensorFlow releases -- confirm what it binds to in the
    # installed version.
    loss = seq2seq.sequence_loss_by_example(
        [self.logits],  # logits: 1-item list of 2D tensors
        [tf.reshape(self.targets, [-1])],  # targets: 1-item list of 1D int32 tensors
        [tf.ones([args.batch_size * args.seq_length])],  # weights: all ones
        args.vocab_size)

    # Cost is the arithmetic mean of the loss values (the sum divided by
    # the total number of elements); this is what the optimizer minimizes.
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    # Create a summary for our cost.
    tf.summary.scalar("cost", self.cost)

    # Non-trainable bookkeeping variables: the learning rate, plus the
    # epoch fraction and the seconds elapsed.
    self.lr = tf.Variable(args.learning_rate, trainable=False)
    self.global_epoch_fraction = tf.Variable(0.0, trainable=False)
    self.global_seconds_elapsed = tf.Variable(0.0, trainable=False)

    # tvars is a python list of all trainable TF Variable objects.
    tvars = tf.trainable_variables()
    # tf.gradients returns one tensor per entry of tvars (sum of dy/dx);
    # the gradients are clipped by their global norm.
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                      args.grad_clip)
    # Use the ADAM optimizer with the current learning rate.
    optimizer = tf.train.AdamOptimizer(self.lr)
    # zip pairs each gradient with its variable: (gradient, variable).
    # This is the op that a training session should be instructed to run.
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
    self.summary_op = tf.summary.merge_all()
def model(self, mode="train", num_layers=2, cell_size=32, cell_type="BasicLSTMCell", embedding_size=20, learning_rate=0.0001, tensorboard_verbose=0, checkpoint_path=None):
    """Build the seq2seq network and wrap it in a ``tflearn.DNN``.

    Args:
        mode: "train" or "predict". In "predict" mode the decoder is fed
            its own previous output (``feed_previous=True``).
        num_layers: Number of stacked recurrent layers.
        cell_size: Number of units per recurrent cell.
        cell_type: Name of the cell class looked up on ``core_rnn_cell``.
            The class is constructed with ``state_is_tuple=True``.
        embedding_size: Embedding size passed to the seq2seq function.
        learning_rate: Learning rate for the Adam optimizer.
        tensorboard_verbose: Passed through to ``tflearn.DNN``.
        checkpoint_path: Checkpoint path; defaults to
            ``s2s_checkpoint.tfl`` inside ``self.data_dir`` when set.

    Returns:
        The ``tflearn.DNN`` model.

    Raises:
        Exception: if ``self.seq2seq_model`` is not "embedding_rnn" or
            "embedding_attention".
    """
    assert mode in ["train", "predict"]

    checkpoint_path = checkpoint_path or (
        "%s%ss2s_checkpoint.tfl" %
        (self.data_dir or "", "/" if self.data_dir else ""))
    # Unique integer value used to trigger decoder outputs in the seq2seq RNN.
    GO_VALUE = self.out_max_int + 1

    # A single input carries the encoder and decoder sequences side by side.
    network = tflearn.input_data(
        shape=[None, self.in_seq_len + self.out_seq_len],
        dtype=tf.int32, name="XY")

    # Encoder inputs: the first in_seq_len columns, as a list of
    # in_seq_len tensors of shape [-1].
    encoder_inputs = tf.slice(network, [0, 0], [-1, self.in_seq_len],
                              name="enc_in")
    encoder_inputs = tf.unstack(encoder_inputs, axis=1)

    # Decoder inputs: the remaining out_seq_len columns, as a list of
    # out_seq_len tensors of shape [-1].
    decoder_inputs = tf.slice(network, [0, self.in_seq_len],
                              [-1, self.out_seq_len], name="dec_in")
    decoder_inputs = tf.unstack(decoder_inputs, axis=1)

    # Insert the "GO" symbol as the first decoder input and drop the last
    # decoder input.
    go_input = tf.multiply(
        tf.ones_like(decoder_inputs[0], dtype=tf.int32), GO_VALUE)
    decoder_inputs = [go_input] + decoder_inputs[:self.out_seq_len - 1]

    feed_previous = not (mode == "train")

    self.n_input_symbols = self.in_max_int + 1  # default is integers from 0 to 9
    self.n_output_symbols = self.out_max_int + 2  # extra "GO" symbol for decoder inputs

    def make_layer():
        # One independent recurrent layer of the requested type.
        return getattr(core_rnn_cell, cell_type)(cell_size,
                                                 state_is_tuple=True)

    if num_layers == 1:
        cell = make_layer()
    else:
        # Each layer gets its own cell object: repeating one object
        # ([cell] * n) is rejected, or silently shares weights between
        # layers, in newer TensorFlow 1.x releases.
        cell = core_rnn_cell.MultiRNNCell(
            [make_layer() for _ in range(num_layers)])

    if self.seq2seq_model == "embedding_rnn":
        model_outputs, states = legacy_seq2seq.embedding_rnn_seq2seq(
            encoder_inputs,  # A list of 2D Tensors [batch_size, input_size].
            decoder_inputs,
            cell,
            num_encoder_symbols=self.n_input_symbols,
            num_decoder_symbols=self.n_output_symbols,
            embedding_size=embedding_size,
            feed_previous=feed_previous)
    elif self.seq2seq_model == "embedding_attention":
        model_outputs, states = legacy_seq2seq.embedding_attention_seq2seq(
            encoder_inputs,  # A list of 2D Tensors [batch_size, input_size].
            decoder_inputs,
            cell,
            num_encoder_symbols=self.n_input_symbols,
            num_decoder_symbols=self.n_output_symbols,
            embedding_size=embedding_size,
            num_heads=1,
            initial_state_attention=False,
            feed_previous=feed_previous)
    else:
        raise Exception('[TFLearnSeq2Seq] Unknown seq2seq model %s' %
                        self.seq2seq_model)

    # For TFLearn to know what to save and restore.
    tf.add_to_collection(
        tf.GraphKeys.LAYER_VARIABLES + '/' + "seq2seq_model", model_outputs)

    # model_outputs is a list, of the same length as decoder_inputs, of 2D
    # tensors with shape [batch_size x output_size]. Stacking gives shape
    # [-1, n_decoder_inputs (= self.out_seq_len), num_decoder_symbols].
    network = tf.stack(model_outputs, axis=1)

    # Placeholder for the target variable (i.e. trainY input).
    with tf.name_scope("TargetsData"):
        targetY = tf.placeholder(shape=[None, self.out_seq_len],
                                 dtype=tf.int32, name="Y")

    network = tflearn.regression(network,
                                 placeholder=targetY,
                                 optimizer='adam',
                                 learning_rate=learning_rate,
                                 loss=self.sequence_loss,
                                 metric=self.accuracy,
                                 name="Y")

    model = tflearn.DNN(network,
                        tensorboard_verbose=tensorboard_verbose,
                        checkpoint_path=checkpoint_path)
    return model