import sys

import numpy as np
import tensorflow as tf

# Project-local helpers; the exact module paths are assumptions and may need to
# be adjusted to match this repository's layout.
from base import Model
from batch_loader import BatchLoader
from TDNN import TDNN
from ops import batch_norm, highway
from utils import progress


class LSTMTDNN(Model):
  """
  Time-delayed Neural Network (cf. http://arxiv.org/abs/1508.06615v4)
  """
  def __init__(self, sess,
               batch_size=100, rnn_size=650, layer_depth=2,
               word_embed_dim=650, char_embed_dim=15,
               feature_maps=[50, 100, 150, 200, 200, 200, 200],
               kernels=[1, 2, 3, 4, 5, 6, 7],
               seq_length=35, max_word_length=65,
               use_word=False, use_char=True, hsm=0,
               max_grad_norm=5, highway_layers=2, dropout_prob=0.5,
               use_batch_norm=True,
               checkpoint_dir="checkpoint", forward_only=False,
               data_dir="data", dataset_name="pdb", use_progressbar=False):
    """
    Initialize the parameters for the LSTM-TDNN.

    Args:
      rnn_size: the dimensionality of the LSTM hidden layers
      layer_depth: # of stacked LSTM layers
      batch_size: # of sequences per mini-batch
      word_embed_dim: the dimensionality of word embeddings
      char_embed_dim: the dimensionality of character embeddings
      feature_maps: list of feature map counts (one per kernel width)
      kernels: list of kernel widths
      seq_length: # of words (time steps) per sequence
      max_word_length: max length of a word, in characters (incl. padding)
      use_word: whether to use word embeddings or not
      use_char: whether to use character embeddings or not
      highway_layers: # of highway layers to use
      dropout_prob: the probability of dropout
      use_batch_norm: whether to use batch normalization or not
      hsm: whether to use hierarchical softmax
    """
    self.sess = sess

    self.batch_size = batch_size
    self.seq_length = seq_length

    # RNN
    self.rnn_size = rnn_size
    self.layer_depth = layer_depth

    # CNN
    self.use_word = use_word
    self.use_char = use_char
    self.word_embed_dim = word_embed_dim
    self.char_embed_dim = char_embed_dim
    self.feature_maps = feature_maps
    self.kernels = kernels

    # General
    self.highway_layers = highway_layers
    self.dropout_prob = dropout_prob
    self.use_batch_norm = use_batch_norm

    # Training
    self.max_grad_norm = max_grad_norm
    self.max_word_length = max_word_length
    self.hsm = hsm

    self.data_dir = data_dir
    self.dataset_name = dataset_name
    self.checkpoint_dir = checkpoint_dir

    self.forward_only = forward_only
    self.use_progressbar = use_progressbar

    self.loader = BatchLoader(self.data_dir, self.dataset_name,
                              self.batch_size, self.seq_length, self.max_word_length)
    print('Word vocab size: %d, Char vocab size: %d, Max word length (incl. padding): %d' % \
        (len(self.loader.idx2word), len(self.loader.idx2char), self.loader.max_word_length))

    self.max_word_length = self.loader.max_word_length
    self.char_vocab_size = len(self.loader.idx2char)
    self.word_vocab_size = len(self.loader.idx2word)

    # build the LSTM-TDNN graph
    self.prepare_model()

    # load checkpoints when running inference only
    if self.forward_only:
      if self.load(self.checkpoint_dir, self.dataset_name):
        print(" [*] Successfully loaded model for %s." % self.dataset_name)
      else:
        print(" [!] Failed to load model for %s." % self.dataset_name)
        sys.exit(1)
  def prepare_model(self):
    with tf.variable_scope("LSTMTDNN"):
      self.char_inputs = []
      self.word_inputs = []
      self.cnn_outputs = []

      if self.use_char:
        char_W = tf.get_variable("char_embed",
            [self.char_vocab_size, self.char_embed_dim])
      if self.use_word:
        word_W = tf.get_variable("word_embed",
            [self.word_vocab_size, self.word_embed_dim])

      with tf.variable_scope("CNN") as scope:
        self.char_inputs = tf.placeholder(tf.int32,
            [self.batch_size, self.seq_length, self.max_word_length])
        self.word_inputs = tf.placeholder(tf.int32,
            [self.batch_size, self.seq_length])

        # split the inputs into one tensor per time step
        char_indices = tf.split(1, self.seq_length, self.char_inputs)
        word_indices = tf.split(1, self.seq_length, tf.expand_dims(self.word_inputs, -1))

        for idx in xrange(self.seq_length):
          char_index = tf.reshape(char_indices[idx], [-1, self.max_word_length])
          word_index = tf.reshape(word_indices[idx], [-1, 1])

          if idx != 0:
            scope.reuse_variables()

          if self.use_char:
            # [batch_size, max_word_length, char_embed_dim]
            char_embed = tf.nn.embedding_lookup(char_W, char_index)

            # character-level CNN over the word; its output is the concatenation
            # of the max-pooled feature maps, i.e. sum(feature_maps) dimensions
            char_cnn = TDNN(char_embed, self.char_embed_dim, self.feature_maps, self.kernels)

            if self.use_word:
              word_embed = tf.nn.embedding_lookup(word_W, word_index)
              cnn_output = tf.concat(1, [char_cnn.output, tf.squeeze(word_embed, [1])])
            else:
              cnn_output = char_cnn.output
          else:
            cnn_output = tf.squeeze(tf.nn.embedding_lookup(word_W, word_index))

          if self.use_batch_norm:
            bn = batch_norm()
            norm_output = bn(tf.expand_dims(tf.expand_dims(cnn_output, 1), 1))
            cnn_output = tf.squeeze(norm_output)

          if self.highway_layers > 0:
            # highway(input_, input_dim_length, num_layers, bias)
            cnn_output = highway(cnn_output, cnn_output.get_shape()[1], self.highway_layers, 0)

          self.cnn_outputs.append(cnn_output)

      with tf.variable_scope("LSTM") as scope:
        self.cell = tf.nn.rnn_cell.BasicLSTMCell(self.rnn_size)
        self.stacked_cell = tf.nn.rnn_cell.MultiRNNCell([self.cell] * self.layer_depth)

        outputs, _ = tf.nn.rnn(self.stacked_cell, self.cnn_outputs, dtype=tf.float32)

        self.lstm_outputs = []
        self.true_outputs = tf.placeholder(tf.int64, [self.batch_size, self.seq_length])

        loss = 0
        true_outputs = tf.split(1, self.seq_length, self.true_outputs)

        for idx, (top_h, true_output) in enumerate(zip(outputs, true_outputs)):
          if self.dropout_prob > 0:
            # tf.nn.dropout takes a *keep* probability
            top_h = tf.nn.dropout(top_h, self.dropout_prob)

          if self.hsm > 0:
            # keep the raw LSTM output; the hierarchical-softmax loss is not
            # built in this method
            self.lstm_outputs.append(top_h)
          else:
            if idx != 0:
              scope.reuse_variables()
            proj = tf.nn.rnn_cell._linear(top_h, self.word_vocab_size, 0)
            self.lstm_outputs.append(proj)

          loss += tf.nn.sparse_softmax_cross_entropy_with_logits(
              self.lstm_outputs[idx], tf.squeeze(true_output))

        self.loss = tf.reduce_mean(loss) / self.seq_length

        tf.scalar_summary("loss", self.loss)
        tf.scalar_summary("perplexity", tf.exp(self.loss))

  def train(self, epoch):
    cost = 0
    target = np.zeros([self.batch_size, self.seq_length])

    N = self.loader.sizes[0]
    for idx in xrange(N):
      target.fill(0)
      x, y, x_char = self.loader.next_batch(0)
      for b in xrange(self.batch_size):
        for t, w in enumerate(y[b]):
          target[b][t] = w

      feed_dict = {
          self.word_inputs: x,
          self.char_inputs: x_char,
          self.true_outputs: target,
      }

      _, loss, step, summary_str = self.sess.run(
          [self.optim, self.loss, self.global_step, self.merged_summary],
          feed_dict=feed_dict)

      self.writer.add_summary(summary_str, step)

      if idx % 50 == 0:
        if self.use_progressbar:
          progress(float(idx) / N,
                   "epoch: [%2d] [%4d/%4d] loss: %2.6f" % (epoch, idx, N, loss))
        else:
          print("epoch: [%2d] [%4d/%4d] loss: %2.6f" % (epoch, idx, N, loss))

      cost += loss
    return cost / N
  def test(self, split_idx, max_batches=None):
    if split_idx == 1:
      set_name = 'Valid'
    else:
      set_name = 'Test'

    N = self.loader.sizes[split_idx]
    if max_batches is not None:
      N = min(max_batches, N)

    self.loader.reset_batch_pointer(split_idx)
    target = np.zeros([self.batch_size, self.seq_length])

    cost = 0
    for idx in xrange(N):
      target.fill(0)

      x, y, x_char = self.loader.next_batch(split_idx)
      for b in xrange(self.batch_size):
        for t, w in enumerate(y[b]):
          target[b][t] = w

      feed_dict = {
          self.word_inputs: x,
          self.char_inputs: x_char,
          self.true_outputs: target,
      }

      loss = self.sess.run(self.loss, feed_dict=feed_dict)

      if idx % 50 == 0:
        if self.use_progressbar:
          progress(float(idx) / N,
                   "> %s: loss: %2.6f, perplexity: %2.6f" % (set_name, loss, np.exp(loss)))
        else:
          print(" > %s: loss: %2.6f, perplexity: %2.6f" % (set_name, loss, np.exp(loss)))

      cost += loss

    return cost / N

  def run(self, epoch=25, learning_rate=1.0, learning_rate_decay=0.5):
    self.current_lr = learning_rate

    self.lr = tf.Variable(learning_rate, trainable=False)
    self.opt = tf.train.GradientDescentOptimizer(self.lr)
    #self.opt = tf.train.AdamOptimizer(learning_rate, beta1=0.5).minimize(self.loss)

    # clip gradients
    params = tf.trainable_variables()
    grads = []
    for grad in tf.gradients(self.loss, params):
      if grad is not None:
        grads.append(tf.clip_by_norm(grad, self.max_grad_norm))
      else:
        grads.append(grad)

    self.global_step = tf.Variable(0, name="global_step", trainable=False)
    self.optim = self.opt.apply_gradients(zip(grads, params),
                                          global_step=self.global_step)

    # ready for training
    tf.initialize_all_variables().run()

    if self.load(self.checkpoint_dir, self.dataset_name):
      print(" [*] Successfully loaded model for %s." % self.dataset_name)
    else:
      print(" [!] Failed to load model for %s." % self.dataset_name)

    self.saver = tf.train.Saver()
    self.merged_summary = tf.merge_all_summaries()
    self.writer = tf.train.SummaryWriter("./logs", self.sess.graph_def)

    self.log_loss = []
    self.log_perp = []

    if not self.forward_only:
      for idx in xrange(epoch):
        train_loss = self.train(idx)
        valid_loss = self.test(1)

        # Logging
        self.log_loss.append([train_loss, valid_loss])
        self.log_perp.append([np.exp(train_loss), np.exp(valid_loss)])

        state = {
            'perplexity': np.exp(train_loss),
            'epoch': idx,
            'learning_rate': self.current_lr,
            'valid_perplexity': np.exp(valid_loss),
        }
        print(state)

        # Learning rate annealing: decay when validation loss stops improving
        if len(self.log_loss) > 1 and \
            self.log_loss[idx][1] > self.log_loss[idx - 1][1] * 0.9999:
          self.current_lr = self.current_lr * learning_rate_decay
          self.lr.assign(self.current_lr).eval()
        if self.current_lr < 1e-5:
          break

        if idx % 2 == 0:
          self.save(self.checkpoint_dir, self.dataset_name)

    test_loss = self.test(2)
    print(" [*] Test loss: %2.6f, perplexity: %2.6f" % (test_loss, np.exp(test_loss)))
class LSTMTDNN(Model): """ Time-delayed Neural Network (cf. http://arxiv.org/abs/1508.06615v4) """ def __init__(self, sess, batch_size=100, rnn_size=650, layer_depth=2, word_embed_dim=650, char_embed_dim=15, feature_maps=[50, 100, 150, 200, 200, 200, 200], kernels=[1,2,3,4,5,6,7], seq_length=35, max_word_length=65, use_word=False, use_char=True, hsm=0, max_grad_norm=5, highway_layers=2, dropout_prob=0.5, use_batch_norm=True, checkpoint_dir="checkpoint", forward_only=False, data_dir="data", dataset_name="pdb", use_progressbar=False): """ Initialize the parameters for LSTM TDNN Args: rnn_size: the dimensionality of hidden layers layer_depth: # of depth in LSTM layers batch_size: size of batch per epoch word_embed_dim: the dimensionality of word embeddings char_embed_dim: the dimensionality of character embeddings feature_maps: list of feature maps (for each kernel width) kernels: list of kernel widths seq_length: max length of a word use_word: whether to use word embeddings or not use_char: whether to use character embeddings or not highway_layers: # of highway layers to use dropout_prob: the probability of dropout use_batch_norm: whether to use batch normalization or not hsm: whether to use hierarchical softmax """ self.sess = sess self.batch_size = batch_size self.seq_length = seq_length # RNN self.rnn_size = rnn_size self.layer_depth = layer_depth # CNN self.use_word = use_word self.use_char = use_char self.word_embed_dim = word_embed_dim self.char_embed_dim = char_embed_dim self.feature_maps = feature_maps self.kernels = kernels # General self.highway_layers = highway_layers self.dropout_prob = dropout_prob self.use_batch_norm = use_batch_norm # Training self.max_grad_norm = max_grad_norm self.max_word_length = max_word_length self.hsm = hsm self.data_dir = data_dir self.dataset_name = dataset_name self.checkpoint_dir = checkpoint_dir self.forward_only = forward_only self.use_progressbar = use_progressbar self.loader = BatchLoader(self.data_dir, self.dataset_name, self.batch_size, self.seq_length, self.max_word_length) print('Word vocab size: %d, Char vocab size: %d, Max word length (incl. padding): %d' % \ (len(self.loader.idx2word), len(self.loader.idx2char), self.loader.max_word_length)) self.max_word_length = self.loader.max_word_length self.char_vocab_size = len(self.loader.idx2char) self.word_vocab_size = len(self.loader.idx2word) # build LSTMTDNN model self.prepare_model() # load checkpoints if self.forward_only == True: if self.load(self.checkpoint_dir, self.dataset_name): print("[*] SUCCESS to load model for %s." % self.dataset_name) else: print("[!] Failed to load model for %s." 
% self.dataset_name) sys.exit(1) def prepare_model(self): with tf.variable_scope("LSTMTDNN"): self.char_inputs = [] self.word_inputs = [] self.cnn_outputs = [] if self.use_char: char_W = tf.get_variable("char_embed", [self.char_vocab_size, self.char_embed_dim]) if self.use_word: word_W = tf.get_variable("word_embed", [self.word_vocab_size, self.word_embed_dim]) with tf.variable_scope("CNN") as scope: self.char_inputs = tf.placeholder(tf.int32, [self.batch_size, self.seq_length, self.max_word_length]) self.word_inputs = tf.placeholder(tf.int32, [self.batch_size, self.seq_length]) char_indices = tf.split(1, self.seq_length, self.char_inputs) word_indices = tf.split(1, self.seq_length, tf.expand_dims(self.word_inputs, -1)) for idx in xrange(self.seq_length): char_index = tf.reshape(char_indices[idx], [-1, self.max_word_length]) word_index = tf.reshape(word_indices[idx], [-1, 1]) if idx != 0: scope.reuse_variables() if self.use_char: # [batch_size x word_max_length, char_embed] char_embed = tf.nn.embedding_lookup(char_W, char_index) char_cnn = TDNN(char_embed, self.char_embed_dim, self.feature_maps, self.kernels) if self.use_word: word_embed = tf.nn.embedding_lookup(word_W, word_index) cnn_output = tf.concat(1, [char_cnn.output, tf.squeeze(word_embed, [1])]) else: cnn_output = char_cnn.output else: cnn_output = tf.squeeze(tf.nn.embedding_lookup(word_W, word_index)) if self.use_batch_norm: bn = batch_norm() norm_output = bn(tf.expand_dims(tf.expand_dims(cnn_output, 1), 1)) cnn_output = tf.squeeze(norm_output) if highway: #cnn_output = highway(input_, input_dim_length, self.highway_layers, 0) cnn_output = highway(cnn_output, cnn_output.get_shape()[1], self.highway_layers, 0) self.cnn_outputs.append(cnn_output) with tf.variable_scope("LSTM") as scope: self.cell = tf.nn.rnn_cell.BasicLSTMCell(self.rnn_size) self.stacked_cell = tf.nn.rnn_cell.MultiRNNCell([self.cell] * self.layer_depth) outputs, _ = tf.nn.rnn(self.stacked_cell, self.cnn_outputs, dtype=tf.float32) self.lstm_outputs = [] self.true_outputs = tf.placeholder(tf.int64, [self.batch_size, self.seq_length]) loss = 0 true_outputs = tf.split(1, self.seq_length, self.true_outputs) for idx, (top_h, true_output) in enumerate(zip(outputs, true_outputs)): if self.dropout_prob > 0: top_h = tf.nn.dropout(top_h, self.dropout_prob) if self.hsm > 0: self.lstm_outputs.append(top_h) else: if idx != 0: scope.reuse_variables() proj = tf.nn.rnn_cell._linear(top_h, self.word_vocab_size, 0) self.lstm_outputs.append(proj) loss += tf.nn.sparse_softmax_cross_entropy_with_logits(self.lstm_outputs[idx], tf.squeeze(true_output)) self.loss = tf.reduce_mean(loss) / self.seq_length tf.scalar_summary("loss", self.loss) tf.scalar_summary("perplexity", tf.exp(self.loss)) def train(self, epoch): cost = 0 target = np.zeros([self.batch_size, self.seq_length]) N = self.loader.sizes[0] for idx in xrange(N): target.fill(0) x, y, x_char = self.loader.next_batch(0) for b in xrange(self.batch_size): for t, w in enumerate(y[b]): target[b][t] = w feed_dict = { self.word_inputs: x, self.char_inputs: x_char, self.true_outputs: target, } _, loss, step, summary_str = self.sess.run( [self.optim, self.loss, self.global_step, self.merged_summary], feed_dict=feed_dict) self.writer.add_summary(summary_str, step) if idx % 50 == 0: if self.use_progressbar: progress(idx/N, "epoch: [%2d] [%4d/%4d] loss: %2.6f" % (epoch, idx, N, loss)) else: print("epoch: [%2d] [%4d/%4d] loss: %2.6f" % (epoch, idx, N, loss)) cost += loss return cost / N def test(self, split_idx, max_batches=None): if split_idx 
== 1: set_name = 'Valid' else: set_name = 'Test' N = self.loader.sizes[split_idx] if max_batches != None: N = min(max_batches, N) self.loader.reset_batch_pointer(split_idx) target = np.zeros([self.batch_size, self.seq_length]) cost = 0 for idx in xrange(N): target.fill(0) x, y, x_char = self.loader.next_batch(split_idx) for b in xrange(self.batch_size): for t, w in enumerate(y[b]): target[b][t] = w feed_dict = { self.word_inputs: x, self.char_inputs: x_char, self.true_outputs: target, } loss = self.sess.run(self.loss, feed_dict=feed_dict) if idx % 50 == 0: if self.use_progressbar: progress(idx/N, "> %s: loss: %2.6f, perplexity: %2.6f" % (set_name, loss, np.exp(loss))) else: print(" > %s: loss: %2.6f, perplexity: %2.6f" % (set_name, loss, np.exp(loss))) cost += loss cost = cost / N return cost def run(self, epoch=25, learning_rate=1, learning_rate_decay=0.5): self.current_lr = learning_rate self.lr = tf.Variable(learning_rate, trainable=False) self.opt = tf.train.GradientDescentOptimizer(self.lr) #self.opt = tf.train.AdamOptimizer(learning_rate, beta1=0.5).minimize(self.loss) # clip gradients params = tf.trainable_variables() grads = [] for grad in tf.gradients(self.loss, params): if grad is not None: grads.append(tf.clip_by_norm(grad, self.max_grad_norm)) else: grads.append(grad) self.global_step = tf.Variable(0, name="global_step", trainable=False) self.optim = self.opt.apply_gradients(zip(grads, params), global_step=self.global_step) # ready for train tf.initialize_all_variables().run() if self.load(self.checkpoint_dir, self.dataset_name): print("[*] SUCCESS to load model for %s." % self.dataset_name) else: print("[!] Failed to load model for %s." % self.dataset_name) self.saver = tf.train.Saver() self.merged_summary = tf.merge_all_summaries() self.writer = tf.train.SummaryWriter("./logs", self.sess.graph_def) self.log_loss = [] self.log_perp = [] if not self.forward_only: for idx in xrange(epoch): train_loss = self.train(idx) valid_loss = self.test(1) # Logging self.log_loss.append([train_loss, valid_loss]) self.log_perp.append([np.exp(train_loss), np.exp(valid_loss)]) state = { 'perplexity': np.exp(train_loss), 'epoch': idx, 'learning_rate': self.current_lr, 'valid_perplexity': np.exp(valid_loss) } print(state) # Learning rate annealing if len(self.log_loss) > 1 and self.log_loss[idx][1] > self.log_loss[idx-1][1] * 0.9999: self.current_lr = self.current_lr * learning_rate_decay self.lr.assign(self.current_lr).eval() if self.current_lr < 1e-5: break if idx % 2 == 0: self.save(self.checkpoint_dir, self.dataset_name) test_loss = self.test(2) print("[*] Test loss: %2.6f, perplexity: %2.6f" % (test_loss, np.exp(test_loss)))
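

# ---------------------------------------------------------------------------
# Usage sketch (illustrative addition, not from the original module): one
# plausible way to drive the class, assuming a ./data/<dataset_name> layout
# that BatchLoader expects and that the Model base class provides load()/save().
# Hyperparameters mirror the defaults of __init__ and run().
# ---------------------------------------------------------------------------
if __name__ == "__main__":
  with tf.Session() as sess:  # run()/train() rely on this being the default session
    model = LSTMTDNN(sess,
                     batch_size=100,
                     seq_length=35,
                     use_char=True,
                     use_word=False,
                     dataset_name="pdb")
    # builds the optimizer, alternates train/valid epochs, then evaluates on test
    model.run(epoch=25, learning_rate=1.0, learning_rate_decay=0.5)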