def __init__(self, config, word_ix_to_pat_ixs, need_reuse=False):
    # get hyperparameters
    batch_size = config.batch_size
    num_steps = config.num_steps
    max_word_len = config.max_word_len
    pat_emb_dim = config.pat_emb_dim
    highway_size = config.highway_size
    init_scale = config.init_scale
    num_sampled = config.num_sampled
    pat_vocab_size = config.pat_vocab_size
    hidden_size = config.hidden_size
    num_layers = config.num_layers
    word_vocab_size = config.word_vocab_size
    drop_x = config.drop_x
    drop_i = config.drop_i
    drop_h = config.drop_h
    drop_o = config.drop_o
    weight_decay = config.weight_decay

    # pattern embedding matrix
    with tf.variable_scope('pat_emb', reuse=need_reuse):
        self.pat_embedding = tf.get_variable(
            "pat_embedding", [pat_vocab_size, pat_emb_dim],
            dtype=tf.float32,
            initializer=tf.random_uniform_initializer(-init_scale, init_scale))

    # placeholders for training data and labels
    self.x = tf.placeholder(tf.int32, [batch_size, num_steps, max_word_len])
    self.y = tf.placeholder(tf.int32, [batch_size, num_steps])
    y_float = tf.cast(self.y, tf.float32)

    # we first embed patterns ...
    words_embedded = tf.nn.embedding_lookup(self.pat_embedding, self.x)
    words_embedded = tf.reshape(words_embedded,
                                [-1, max_word_len, pat_emb_dim])

    # ... and then sum pattern vectors to get a word vector
    words_embedded_sum = tf.reduce_sum(words_embedded, axis=1)

    # we feed the word vector into a stack of two HW layers ...
    def highway_layer(highway_inputs):
        transf_weights = tf.get_variable(
            'transf_weights', [highway_size, highway_size],
            initializer=tf.random_uniform_initializer(-init_scale, init_scale),
            dtype=tf.float32)
        transf_biases = tf.get_variable(
            'transf_biases', [highway_size],
            initializer=tf.random_uniform_initializer(-2 - 0.01, -2 + 0.01),
            dtype=tf.float32)
        highw_weights = tf.get_variable(
            'highw_weights', [highway_size, highway_size],
            initializer=tf.random_uniform_initializer(-init_scale, init_scale),
            dtype=tf.float32)
        highw_biases = tf.get_variable(
            'highw_biases', [highway_size],
            initializer=tf.random_uniform_initializer(-init_scale, init_scale),
            dtype=tf.float32)
        transf_gate = tf.nn.sigmoid(
            tf.matmul(highway_inputs, transf_weights) + transf_biases)
        highw_output = tf.multiply(
            transf_gate,
            tf.nn.relu(tf.matmul(highway_inputs, highw_weights)
                       + highw_biases)) \
            + tf.multiply(
                tf.ones([highway_size], dtype=tf.float32) - transf_gate,
                highway_inputs)
        return highw_output, transf_gate

    with tf.variable_scope('highway1', reuse=need_reuse):
        highw1_output, self.t1 = highway_layer(words_embedded_sum)
    with tf.variable_scope('highway2', reuse=need_reuse):
        highw2_output, self.t2 = highway_layer(highw1_output)

    highw_output_reshaped = tf.reshape(highw2_output,
                                       [batch_size, num_steps, -1])
    if not need_reuse:
        highw_output_reshaped = tf.nn.dropout(highw_output_reshaped,
                                              1 - drop_x,
                                              [batch_size, num_steps, 1])

    # ... and then process it with a stack of two LSTMs
    lstm_input = tf.unstack(highw_output_reshaped, axis=1)

    # basic LSTM cell
    def lstm_cell():
        return tf.contrib.rnn.LSTMCell(hidden_size, forget_bias=1.0,
                                       reuse=need_reuse)

    cells = []
    for i in range(num_layers):
        with tf.variable_scope('layer' + str(i)):
            if not need_reuse:
                if i == 0:
                    cells.append(
                        my_dropout.MyDropoutWrapper(
                            lstm_cell(),
                            input_keep_prob=1 - drop_i,
                            state_keep_prob=1 - drop_h,
                            output_keep_prob=1 - drop_o,
                            variational_recurrent=True,
                            input_size=highway_size,
                            dtype=tf.float32))
                else:
                    cells.append(
                        my_dropout.MyDropoutWrapper(
                            lstm_cell(),
                            state_keep_prob=1 - drop_h,
                            output_keep_prob=1 - drop_o,
                            variational_recurrent=True,
                            input_size=hidden_size,
                            dtype=tf.float32))
            else:
                cells.append(lstm_cell())

    self.cell = tf.contrib.rnn.MultiRNNCell(cells)
    self.init_state = self.cell.zero_state(batch_size, dtype=tf.float32)

    with tf.variable_scope('lstm_rnn', reuse=need_reuse):
        outputs, self.state = tf.contrib.rnn.static_rnn(
            self.cell, lstm_input, dtype=tf.float32,
            initial_state=self.init_state)
    output = tf.reshape(tf.concat(axis=1, values=outputs), [-1, hidden_size])

    # finally we predict the next word according to a softmax normalization
    with tf.variable_scope('softmax_params', reuse=need_reuse):
        weights = tf.get_variable(
            'weights', [word_vocab_size, hidden_size],
            initializer=tf.random_uniform_initializer(-init_scale, init_scale),
            dtype=tf.float32)
        biases = tf.get_variable(
            'biases', [word_vocab_size],
            initializer=tf.random_uniform_initializer(-init_scale, init_scale),
            dtype=tf.float32)

    # and compute the cross-entropy between labels and predictions
    logits = tf.matmul(output, tf.transpose(weights)) + biases
    loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
        [logits],
        [tf.reshape(self.y, [-1])],
        [tf.ones([batch_size * num_steps], dtype=tf.float32)])
    self.cost = tf.reduce_sum(loss) / batch_size

    if not need_reuse:
        tvars = tf.trainable_variables()
        l2_loss = tf.add_n([
            tf.nn.l2_loss(v) for v in tvars
            if 'bias' not in v.name and 'Bias' not in v.name
        ]) * weight_decay
        self.full_cost = self.cost + l2_loss
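
# Standalone NumPy sketch (not part of the model above) of the gating
# arithmetic inside highway_layer(): y = t * relu(x W_H + b_H) + (1 - t) * x
# with t = sigmoid(x W_T + b_T). The shapes, random values, and the bias of
# -2 on the transform gate (mirroring the initializer above, which biases the
# layer toward carrying its input through) are illustrative assumptions.
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def highway_forward(x, W_T, b_T, W_H, b_H):
    t = sigmoid(x @ W_T + b_T)          # transform gate in (0, 1)
    h = np.maximum(x @ W_H + b_H, 0.0)  # ReLU candidate activation
    return t * h + (1.0 - t) * x        # gated mix of candidate and input

rng = np.random.RandomState(0)
d = 4                                   # assumed highway_size
x = rng.randn(3, d)                     # three word vectors of width d
W_T, W_H = rng.randn(d, d), rng.randn(d, d)
b_T, b_H = np.full(d, -2.0), np.zeros(d)
print(highway_forward(x, W_T, b_T, W_H, b_H).shape)      # (3, 4)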
def __init__(self, config, word_ix_to_morph_ixs, need_reuse=False):
    # get hyperparameters
    batch_size = config.batch_size
    num_steps = config.num_steps
    self.max_word_len = max_word_len = config.max_word_len
    self.morph_emb_dim = morph_emb_dim = config.morph_emb_dim
    self.highway_size = highway_size = config.highway_size
    self.init_scale = init_scale = config.init_scale
    num_sampled = config.num_sampled
    morph_vocab_size = config.morph_vocab_size
    hidden_size = config.hidden_size
    num_layers = config.num_layers
    word_vocab_size = config.word_vocab_size
    drop_x = config.drop_x
    drop_i = config.drop_i
    drop_h = config.drop_h
    drop_o = config.drop_o

    # morpheme embedding matrix
    with tf.variable_scope('morph_emb', reuse=need_reuse):
        self.morph_embedding = tf.get_variable(
            "morph_embedding", [morph_vocab_size, morph_emb_dim],
            dtype=tf.float32)

    # placeholders for training data and labels
    self.x = tf.placeholder(tf.int32, [batch_size, num_steps, max_word_len])
    self.y = tf.placeholder(tf.int32, [batch_size, num_steps])
    y_float = tf.cast(self.y, tf.float32)

    # we first embed morphemes ...
    words_embedded = tf.nn.embedding_lookup(self.morph_embedding, self.x)
    words_emb_as_list = tf.unstack(words_embedded, axis=1)

    words_list = []
    for word_emb in words_emb_as_list:
        # summing up morpheme embeddings
        morph_sum = tf.reduce_sum(word_emb, axis=1)
        words_list.append(morph_sum)
    words_packed_reshaped = tf.reshape(tf.stack(words_list, axis=1),
                                       [-1, morph_emb_dim])

    # we project word vectors to match the dimensionality of
    # the highway layer
    if morph_emb_dim != highway_size:
        with tf.variable_scope('projection', reuse=need_reuse):
            proj_w = tf.get_variable('proj_w', [morph_emb_dim, highway_size],
                                     dtype=tf.float32)
        words_packed_reshaped_proj = tf.matmul(words_packed_reshaped, proj_w)
    else:
        words_packed_reshaped_proj = words_packed_reshaped

    # we feed the word vector into a stack of two HW layers ...
    with tf.variable_scope('highway1', reuse=need_reuse):
        highw1_output = self.highway_layer(words_packed_reshaped_proj)
    with tf.variable_scope('highway2', reuse=need_reuse):
        highw2_output = self.highway_layer(highw1_output)

    highw_output_reshaped = tf.reshape(highw2_output,
                                       [batch_size, num_steps, -1])
    if not need_reuse:
        highw_output_reshaped = tf.nn.dropout(highw_output_reshaped,
                                              1 - drop_x,
                                              [batch_size, num_steps, 1])

    # ... and then process it with a stack of two LSTMs
    lstm_input = tf.unstack(highw_output_reshaped, axis=1)

    # basic LSTM cell
    def lstm_cell():
        return tf.nn.rnn_cell.LSTMCell(hidden_size, forget_bias=1.0,
                                       reuse=need_reuse)

    cells = []
    for i in range(num_layers):
        with tf.variable_scope('layer' + str(i)):
            if not need_reuse:
                if i == 0:
                    cells.append(
                        my_dropout.MyDropoutWrapper(
                            lstm_cell(),
                            input_keep_prob=1 - drop_i,
                            state_keep_prob=1 - drop_h,
                            output_keep_prob=1 - drop_o,
                            variational_recurrent=True,
                            input_size=highway_size,
                            dtype=tf.float32))
                else:
                    cells.append(
                        my_dropout.MyDropoutWrapper(
                            lstm_cell(),
                            state_keep_prob=1 - drop_h,
                            output_keep_prob=1 - drop_o,
                            variational_recurrent=True,
                            input_size=hidden_size,
                            dtype=tf.float32))
            else:
                cells.append(lstm_cell())

    self.cell = tf.nn.rnn_cell.MultiRNNCell(cells)
    self.init_state = self.cell.zero_state(batch_size, dtype=tf.float32)

    with tf.variable_scope('lstm_rnn', reuse=need_reuse):
        outputs, self.state = tf.contrib.rnn.static_rnn(
            self.cell, lstm_input, dtype=tf.float32,
            initial_state=self.init_state)
    output = tf.reshape(tf.concat(axis=1, values=outputs), [-1, hidden_size])

    # finally we predict the next word according to a softmax normalization
    if config.reuse_emb:
        self.morph_embedding_out = self.morph_embedding
    with tf.variable_scope('softmax_params', reuse=need_reuse):
        if morph_emb_dim != highway_size:
            proj_w_out = tf.get_variable('proj_w_out',
                                         [morph_emb_dim, highway_size],
                                         dtype=tf.float32)
        if highway_size != hidden_size:
            proj2_w_out = tf.get_variable('proj2_w_out',
                                          [highway_size, hidden_size],
                                          dtype=tf.float32)
        if not config.reuse_emb:
            self.morph_embedding_out = tf.get_variable(
                "morph_embedding_out", [morph_vocab_size, morph_emb_dim],
                dtype=tf.float32)
        biases = tf.get_variable('biases', [word_vocab_size], dtype=tf.float32)

    if config.ssm == 1 and not need_reuse:

        def _sum_rows(x):
            """Returns a vector summing up each row of the matrix x."""
            # _sum_rows(x) is equivalent to math_ops.reduce_sum(x, 1) when x is
            # a matrix. The gradient of _sum_rows(x) is more efficient than
            # reduce_sum(x, 1)'s gradient in today's implementation. Therefore,
            # we use _sum_rows(x) in the nce_loss() computation since the loss
            # is mostly used for training.
            cols = tf.shape(x)[1]
            ones_shape = tf.stack([cols, 1])
            ones = tf.ones(ones_shape, x.dtype)
            return tf.reshape(tf.matmul(x, ones), [-1])

        labels = tf.cast(tf.reshape(self.y, [-1, 1]), tf.int64)
        labels_flat = tf.reshape(labels, [-1])

        # Sample the negative labels.
        #   sampled shape: [num_sampled] tensor
        #   true_expected_count shape = [batch_size, 1] tensor
        #   sampled_expected_count shape = [num_sampled] tensor
        sampled_values = tf.nn.log_uniform_candidate_sampler(
            true_classes=labels,
            num_true=1,
            num_sampled=num_sampled,
            unique=True,
            range_max=word_vocab_size)
        sampled, true_expected_count, sampled_expected_count = (
            tf.stop_gradient(s) for s in sampled_values)
        sampled = tf.cast(sampled, tf.int64)

        # labels_flat is a [batch_size * num_steps] tensor
        # sampled is a [num_sampled] int tensor
        all_ids = tf.concat([labels_flat, sampled], 0)

        words_in_morphs = tf.nn.embedding_lookup(word_ix_to_morph_ixs, all_ids)
        words_in_morphs_embedded = tf.nn.embedding_lookup(
            self.morph_embedding_out, words_in_morphs)
        all_w_full = tf.reduce_sum(words_in_morphs_embedded, axis=1)
        if morph_emb_dim != highway_size:
            all_w_proj = tf.matmul(all_w_full, proj_w_out)
        else:
            all_w_proj = all_w_full
        with tf.variable_scope(
                'highway1' if config.reuse_hw1 else 'highway1_out',
                reuse=config.reuse_hw1 or need_reuse):
            all_w_hw1_output = self.highway_layer(all_w_proj)
        with tf.variable_scope(
                'highway2' if config.reuse_hw2 else 'highway2_out',
                reuse=config.reuse_hw2 or need_reuse):
            all_w_hw2_output = self.highway_layer(all_w_hw1_output)
        if highway_size != hidden_size:
            all_w = tf.matmul(all_w_hw2_output, proj2_w_out)
        else:
            all_w = all_w_hw2_output

        # true_w shape is [batch_size * num_true, dim]
        true_w = tf.slice(all_w, [0, 0],
                          tf.stack([tf.shape(labels_flat)[0], -1]))
        sampled_w = tf.slice(all_w,
                             tf.stack([tf.shape(labels_flat)[0], 0]),
                             [-1, -1])
        sampled_logits = tf.matmul(output, sampled_w, transpose_b=True)

        all_b = tf.nn.embedding_lookup(biases, all_ids)
        # true_b is a [batch_size * num_true] tensor
        # sampled_b is a [num_sampled] float tensor
        true_b = tf.slice(all_b, [0], tf.shape(labels_flat))
        sampled_b = tf.slice(all_b, tf.shape(labels_flat), [-1])

        dim = tf.shape(true_w)[1:2]
        new_true_w_shape = tf.concat([[-1, 1], dim], 0)
        row_wise_dots = tf.multiply(tf.expand_dims(output, 1),
                                    tf.reshape(true_w, new_true_w_shape))
        # We want the row-wise dot plus biases which yields a
        # [batch_size, num_true] tensor of true_logits.
        dots_as_matrix = tf.reshape(row_wise_dots, tf.concat([[-1], dim], 0))
        true_logits = tf.reshape(_sum_rows(dots_as_matrix), [-1, 1])
        true_b = tf.reshape(true_b, [-1, 1])
        true_logits += true_b
        sampled_logits += sampled_b

        true_logits -= tf.log(true_expected_count)
        sampled_logits -= tf.log(sampled_expected_count)

        # Construct output logits and labels. The true labels/logits start
        # at col 0.
        out_logits = tf.concat([true_logits, sampled_logits], 1)
        # true_logits is a float tensor, ones_like(true_logits) is a float
        # tensor of ones. We then divide by num_true to ensure the per-example
        # labels sum to 1.0, i.e. form a proper probability distribution.
        out_labels = tf.concat(
            [tf.ones_like(true_logits), tf.zeros_like(sampled_logits)], 1)

        loss = tf.nn.softmax_cross_entropy_with_logits(labels=out_labels,
                                                       logits=out_logits)
    else:
        words_in_morphs_embedded = tf.nn.embedding_lookup(
            self.morph_embedding_out, word_ix_to_morph_ixs)
        weights_full = tf.reduce_sum(words_in_morphs_embedded, axis=1)
        if morph_emb_dim != highway_size:
            weights_proj = tf.matmul(weights_full, proj_w_out)
        else:
            weights_proj = weights_full
        with tf.variable_scope(
                'highway1' if config.reuse_hw1 else 'highway1_out',
                reuse=config.reuse_hw1 or need_reuse):
            weights_highw1_output = self.highway_layer(weights_proj)
        with tf.variable_scope(
                'highway2' if config.reuse_hw2 else 'highway2_out',
                reuse=config.reuse_hw2 or need_reuse):
            weights_highw2_output = self.highway_layer(weights_highw1_output)
        if highway_size != hidden_size:
            weights = tf.matmul(weights_highw2_output, proj2_w_out)
        else:
            weights = weights_highw2_output

        logits = tf.matmul(output, tf.transpose(weights)) + biases
        loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
            [logits],
            [tf.reshape(self.y, [-1])],
            [tf.ones([batch_size * num_steps], dtype=tf.float32)])

    self.cost = tf.reduce_sum(loss) / batch_size
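
# Standalone NumPy sketch (not part of the model above) of the row-wise dot
# trick used to compute true_logits in the sampled-softmax branch: expanding
# dims, multiplying elementwise, and summing each row (the _sum_rows helper)
# equals dotting each LSTM output row with the embedding of its own true
# label. The toy shapes and random values below are assumptions only.
import numpy as np

rng = np.random.RandomState(0)
n, d = 4, 3                             # assumed batch_size*num_steps and dim
output = rng.randn(n, d)                # stand-in for the LSTM outputs
true_w = rng.randn(n, d)                # stand-in for the true-label embeddings

row_wise_dots = (output[:, None, :] * true_w.reshape(n, 1, d)).reshape(-1, d)
true_logits = row_wise_dots.sum(axis=1).reshape(-1, 1)   # _sum_rows equivalent

reference = np.einsum('nd,nd->n', output, true_w).reshape(-1, 1)
print(np.allclose(true_logits, reference))               # True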
def __init__(self, config, word_ix_to_syl_ixs, need_reuse=False):
    # get hyperparameters
    batch_size = config.batch_size
    num_steps = config.num_steps
    self.max_word_len = max_word_len = config.max_word_len
    self.syl_emb_dim = syl_emb_dim = config.syl_emb_dim
    self.highway_size = highway_size = config.highway_size
    self.init_scale = init_scale = config.init_scale
    num_sampled = config.num_sampled
    syl_vocab_size = config.syl_vocab_size
    hidden_size = config.hidden_size
    num_layers = config.num_layers
    word_vocab_size = config.word_vocab_size
    drop_x = config.drop_x
    drop_i = config.drop_i
    drop_h = config.drop_h
    drop_o = config.drop_o

    # syllable embedding matrix
    with tf.variable_scope('syl_emb', reuse=need_reuse):
        self.syl_embedding = tf.get_variable("syl_embedding",
                                             [syl_vocab_size, syl_emb_dim],
                                             dtype=tf.float32)

    # placeholders for training data and labels
    self.x = tf.placeholder(tf.int32, [batch_size, num_steps, max_word_len])
    self.y = tf.placeholder(tf.int32, [batch_size, num_steps])
    y_float = tf.cast(self.y, tf.float32)

    # we first embed syllables ...
    words_embedded = tf.nn.embedding_lookup(self.syl_embedding, self.x)
    words_embedded_unrolled = tf.unstack(words_embedded, axis=1)

    # ... and then concatenate them to obtain word vectors
    words_list = []
    for word in words_embedded_unrolled:
        syls = tf.unstack(word, axis=1)
        syls_concat = tf.concat(axis=1, values=syls)
        words_list.append(syls_concat)
    words_packed_reshaped = tf.reshape(tf.stack(words_list, axis=1),
                                       [-1, max_word_len * syl_emb_dim])

    # we project word vectors to match the dimensionality of
    # the highway layer
    with tf.variable_scope('projection', reuse=need_reuse):
        proj_w = tf.get_variable(
            'proj_w', [max_word_len * syl_emb_dim, highway_size],
            dtype=tf.float32)
    words_packed_reshaped_proj = tf.matmul(words_packed_reshaped, proj_w)

    # we feed the word vector into a stack of two HW layers ...
    with tf.variable_scope('highway1', reuse=need_reuse):
        highw1_output = self.highway_layer(words_packed_reshaped_proj)
    with tf.variable_scope('highway2', reuse=need_reuse):
        highw2_output = self.highway_layer(highw1_output)

    highw_output_reshaped = tf.reshape(highw2_output,
                                       [batch_size, num_steps, -1])
    if not need_reuse:
        highw_output_reshaped = tf.nn.dropout(highw_output_reshaped,
                                              1 - drop_x,
                                              [batch_size, num_steps, 1])

    # ... and then process it with a stack of two LSTMs
    lstm_input = tf.unstack(highw_output_reshaped, axis=1)

    # basic LSTM cell
    def lstm_cell():
        return tf.nn.rnn_cell.LSTMCell(hidden_size, forget_bias=1.0,
                                       reuse=need_reuse)

    cells = []
    for i in range(num_layers):
        with tf.variable_scope('layer' + str(i)):
            if not need_reuse:
                if i == 0:
                    cells.append(
                        my_dropout.MyDropoutWrapper(
                            lstm_cell(),
                            input_keep_prob=1 - drop_i,
                            state_keep_prob=1 - drop_h,
                            output_keep_prob=1 - drop_o,
                            variational_recurrent=True,
                            input_size=highway_size,
                            dtype=tf.float32))
                else:
                    cells.append(
                        my_dropout.MyDropoutWrapper(
                            lstm_cell(),
                            state_keep_prob=1 - drop_h,
                            output_keep_prob=1 - drop_o,
                            variational_recurrent=True,
                            input_size=hidden_size,
                            dtype=tf.float32))
            else:
                cells.append(lstm_cell())

    self.cell = tf.nn.rnn_cell.MultiRNNCell(cells)
    self.init_state = self.cell.zero_state(batch_size, dtype=tf.float32)

    with tf.variable_scope('lstm_rnn', reuse=need_reuse):
        outputs, self.state = tf.contrib.rnn.static_rnn(
            self.cell, lstm_input, dtype=tf.float32,
            initial_state=self.init_state)
    output = tf.reshape(tf.concat(axis=1, values=outputs), [-1, hidden_size])

    # finally we predict the next word according to a softmax normalization
    if config.reuse_emb:
        self.syl_embedding_out = self.syl_embedding
        proj_w_out = proj_w
    with tf.variable_scope('softmax_params', reuse=need_reuse):
        if highway_size != hidden_size:
            proj2_w_out = tf.get_variable('proj2_w_out',
                                          [highway_size, hidden_size],
                                          dtype=tf.float32)
        if not config.reuse_emb:
            self.syl_embedding_out = tf.get_variable(
                "syl_embedding_out", [syl_vocab_size, syl_emb_dim],
                dtype=tf.float32)
            proj_w_out = tf.get_variable(
                'proj_w_out', [max_word_len * syl_emb_dim, highway_size],
                dtype=tf.float32)
        biases = tf.get_variable('biases', [word_vocab_size], dtype=tf.float32)

    words_in_syls = []
    for word_ix in range(word_vocab_size):
        words_in_syls.append(word_ix_to_syl_ixs[word_ix])
    words_in_syls_embedded = tf.nn.embedding_lookup(self.syl_embedding_out,
                                                    words_in_syls)
    syls = tf.unstack(words_in_syls_embedded, axis=1)
    weights_full = tf.concat(syls, axis=1)
    weights_proj = tf.matmul(weights_full, proj_w_out)
    with tf.variable_scope(
            'highway1' if config.reuse_hw1 else 'highway1_out',
            reuse=config.reuse_hw1 or need_reuse):
        weights_highw1_output = self.highway_layer(weights_proj)
    with tf.variable_scope(
            'highway2' if config.reuse_hw2 else 'highway2_out',
            reuse=config.reuse_hw2 or need_reuse):
        weights_highw2_output = self.highway_layer(weights_highw1_output)
    if highway_size != hidden_size:
        weights = tf.matmul(weights_highw2_output, proj2_w_out)
    else:
        weights = weights_highw2_output

    # and compute the cross-entropy between labels and predictions
    if config.ssm == 1 and not need_reuse:
        loss = tf.nn.sampled_softmax_loss(weights, biases,
                                          tf.reshape(y_float, [-1, 1]),
                                          output, num_sampled,
                                          word_vocab_size,
                                          partition_strategy="div")
    else:
        logits = tf.matmul(output, tf.transpose(weights)) + biases
        loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
            [logits],
            [tf.reshape(self.y, [-1])],
            [tf.ones([batch_size * num_steps], dtype=tf.float32)])

    self.cost = tf.reduce_sum(loss) / batch_size
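
# Standalone NumPy sketch (not part of the model above) of how the syllable
# model forms a word vector: the max_word_len syllable embeddings of a word
# are concatenated and then projected down to highway_size by proj_w. The toy
# sizes and random values below are assumptions, not values from any config.
import numpy as np

rng = np.random.RandomState(0)
max_word_len, syl_emb_dim, highway_size = 3, 5, 4        # assumed toy sizes
syl_vectors = rng.randn(max_word_len, syl_emb_dim)       # one word's syllables
word_vec = syl_vectors.reshape(-1)                       # concatenation -> (15,)
proj_w = rng.randn(max_word_len * syl_emb_dim, highway_size)
print((word_vec @ proj_w).shape)                         # (4,) == highway_size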
def __init__(self, config, word_ix_to_char_ixs, need_reuse=False):
    # get hyperparameters
    batch_size = config.batch_size
    num_steps = config.num_steps
    self.max_word_len = max_word_len = config.max_word_len
    self.char_emb_dim = char_emb_dim = config.char_emb_dim
    self.highway_size = highway_size = config.highway_size
    self.init_scale = init_scale = config.init_scale
    num_sampled = config.num_sampled
    char_vocab_size = config.char_vocab_size
    hidden_size = config.hidden_size
    num_layers = config.num_layers
    word_vocab_size = config.word_vocab_size
    drop_x = config.drop_x
    drop_i = config.drop_i
    drop_h = config.drop_h
    drop_o = config.drop_o
    filter_widths = config.filter_widths
    filters_per_width = config.filters_per_width
    cnn_output_dim = config.cnn_output_dim

    # character embedding matrix
    with tf.variable_scope('char_emb', reuse=need_reuse):
        self.char_embedding = tf.get_variable("char_embedding",
                                              [char_vocab_size, char_emb_dim],
                                              dtype=tf.float32)

    # placeholders for training data and labels
    self.x = tf.placeholder(tf.int32, [batch_size, num_steps, max_word_len])
    self.y = tf.placeholder(tf.int32, [batch_size, num_steps])
    y_float = tf.cast(self.y, tf.float32)

    # we first embed characters ...
    words_embedded = tf.nn.embedding_lookup(self.char_embedding, self.x)
    words_embedded = tf.reshape(words_embedded,
                                [-1, max_word_len, char_emb_dim])

    def conv_layer(cur_char_inputs, filt_shape, bias_shape):
        new_filt_shape = [1, 1] + filt_shape
        filt = tf.get_variable('filt', new_filt_shape)
        bias = tf.get_variable('bias', bias_shape)
        cur_char_inputs = tf.expand_dims(tf.expand_dims(cur_char_inputs, 1), 1)
        conv = tf.nn.conv3d(cur_char_inputs, filt, [1, 1, 1, 1, 1],
                            padding='VALID')
        feature_map = tf.nn.tanh(conv + bias)
        feature_map_reshaped = tf.squeeze(feature_map, axis=1)
        pool = tf.nn.max_pool(feature_map_reshaped,
                              [1, 1, max_word_len - filt_shape[0] + 1, 1],
                              [1, 1, 1, 1], 'VALID')
        return tf.squeeze(pool, axis=[1, 2])

    def words_filter(cur_char_inputs):
        pools = []
        for w in filter_widths:
            with tf.variable_scope('filter' + str(w)):
                pools.append(
                    conv_layer(cur_char_inputs,
                               [w, char_emb_dim, filters_per_width[w]],
                               [filters_per_width[w]]))
        return tf.concat(axis=1, values=pools)

    with tf.variable_scope('cnn_output', reuse=need_reuse) as scope:
        cnn_output = tf.reshape(words_filter(words_embedded),
                                [-1, cnn_output_dim])

    # we feed the word vector into a stack of two HW layers ...
    with tf.variable_scope('highway1', reuse=need_reuse):
        highw1_output = self.highway_layer(cnn_output)
    with tf.variable_scope('highway2', reuse=need_reuse):
        highw2_output = self.highway_layer(highw1_output)

    highw_output_reshaped = tf.reshape(highw2_output,
                                       [batch_size, num_steps, -1])
    if not need_reuse:
        highw_output_reshaped = tf.nn.dropout(highw_output_reshaped,
                                              1 - drop_x,
                                              [batch_size, num_steps, 1])

    # ... and then process it with a stack of two LSTMs
    lstm_input = tf.unstack(highw_output_reshaped, axis=1)

    # basic LSTM cell
    def lstm_cell():
        return tf.nn.rnn_cell.LSTMCell(hidden_size, forget_bias=1.0,
                                       reuse=need_reuse)

    cells = []
    for i in range(num_layers):
        with tf.variable_scope('layer' + str(i)):
            if not need_reuse:
                if i == 0:
                    cells.append(
                        my_dropout.MyDropoutWrapper(
                            lstm_cell(),
                            input_keep_prob=1 - drop_i,
                            state_keep_prob=1 - drop_h,
                            output_keep_prob=1 - drop_o,
                            variational_recurrent=True,
                            input_size=highway_size,
                            dtype=tf.float32))
                else:
                    cells.append(
                        my_dropout.MyDropoutWrapper(
                            lstm_cell(),
                            state_keep_prob=1 - drop_h,
                            output_keep_prob=1 - drop_o,
                            variational_recurrent=True,
                            input_size=hidden_size,
                            dtype=tf.float32))
            else:
                cells.append(lstm_cell())

    self.cell = tf.nn.rnn_cell.MultiRNNCell(cells)
    self.init_state = self.cell.zero_state(batch_size, dtype=tf.float32)

    with tf.variable_scope('lstm_rnn', reuse=need_reuse):
        outputs, self.state = tf.contrib.rnn.static_rnn(
            self.cell, lstm_input, dtype=tf.float32,
            initial_state=self.init_state)
    output = tf.reshape(tf.concat(axis=1, values=outputs), [-1, hidden_size])

    # finally we predict the next word according to a softmax normalization
    if config.reuse_emb:
        self.char_embedding_out = self.char_embedding
    with tf.variable_scope('softmax_params', reuse=need_reuse):
        if highway_size != hidden_size:
            proj2_w_out = tf.get_variable('proj2_w_out',
                                          [highway_size, hidden_size],
                                          dtype=tf.float32)
        if not config.reuse_emb:
            self.char_embedding_out = tf.get_variable(
                "char_embedding_out", [char_vocab_size, char_emb_dim],
                dtype=tf.float32)
        biases = tf.get_variable('biases', [word_vocab_size], dtype=tf.float32)

    slice_num = word_vocab_size // 1000
    for i in range(slice_num):
        word_in_chars = []
        a = i * 1000
        b = i * 1000 + 1000 if i != slice_num - 1 else word_vocab_size
        for word_ix in range(a, b):
            word_in_chars.append(word_ix_to_char_ixs[word_ix])
        word_in_chars_embedded = tf.nn.embedding_lookup(
            self.char_embedding_out, word_in_chars)

        with tf.variable_scope(
                'cnn_output' if config.reuse_cnn else 'cnn_output_out',
                reuse=config.reuse_cnn or (i > 0) or need_reuse):
            weight_full = words_filter(word_in_chars_embedded)
        with tf.variable_scope(
                'highway1' if config.reuse_hw1 else 'highway1_out',
                reuse=config.reuse_hw1 or (i > 0) or need_reuse):
            weight_highw1_output = self.highway_layer(weight_full)
        with tf.variable_scope(
                'highway2' if config.reuse_hw2 else 'highway2_out',
                reuse=config.reuse_hw2 or (i > 0) or need_reuse):
            weight_highw2_output = self.highway_layer(weight_highw1_output)
        if highway_size != hidden_size:
            weight = tf.matmul(weight_highw2_output, proj2_w_out)
        else:
            weight = weight_highw2_output

        if i == 0:
            weights = weight
        else:
            weights = tf.concat(values=[weights, weight], axis=0)
        del word_in_chars[:]

    self.weights = weights

    # and compute the cross-entropy between labels and predictions
    if config.ssm == 1 and not need_reuse:
        loss = tf.nn.sampled_softmax_loss(weights, biases,
                                          tf.reshape(y_float, [-1, 1]),
                                          output, num_sampled,
                                          word_vocab_size,
                                          partition_strategy="div")
    else:
        logits = tf.matmul(output, tf.transpose(weights)) + biases
        loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
            [logits],
            [tf.reshape(self.y, [-1])],
            [tf.ones([batch_size * num_steps], dtype=tf.float32)])

    self.cost = tf.reduce_sum(loss) / batch_size
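
# Standalone NumPy sketch (not part of the model above) of the char-CNN
# building block in conv_layer()/words_filter(): a narrow convolution of
# width w over the character positions of a single word, a tanh, and
# max-over-time pooling yield one feature per filter. Toy sizes and random
# values are illustrative assumptions only.
import numpy as np

rng = np.random.RandomState(0)
max_word_len, char_emb_dim, w, n_filters = 7, 5, 3, 2    # assumed toy sizes
chars = rng.randn(max_word_len, char_emb_dim)            # one embedded word
filt = rng.randn(w, char_emb_dim, n_filters)
bias = np.zeros(n_filters)

feature_map = np.stack([
    np.tanh(np.tensordot(chars[i:i + w], filt, axes=([0, 1], [0, 1])) + bias)
    for i in range(max_word_len - w + 1)])   # (max_word_len - w + 1, n_filters)
pooled = feature_map.max(axis=0)             # max over character positions
print(pooled.shape)                          # (2,) == n_filters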