def parse(self, x, context, is_training):
    """Build the attention-based copy/generate output graph for ``x``.

    A bidirectional CudnnLSTM encodes the input, a second CudnnLSTM driven by
    all-zero inputs yields one query vector per output position, and additive
    (Bahdanau-style) attention over the input positions is used both to pool
    the encoder states and to mix a "generate" term with a "copy" term.

    Args:
        x: Input tensor. It is transposed with ``[1, 0, 2]`` and reshaped to
            ``(bs, seq_in, 1, n_out)`` below, so it is expected to be
            batch-major with ``n_out`` features per position.
        context: Not read by this method.
        is_training: Not read by this method.

    Returns:
        Tensor of shape ``(bs, seq_out, n_out)``.
    """
    with tf.variable_scope(self.scope):
        # Encoder: CudnnLSTM is fed time-major data, hence the transposes.
        encoder = CudnnLSTM(
            1, 128, direction=CUDNN_RNN_BIDIRECTION, name="in_rnn")
        enc_time_major, _ = encoder(tf.transpose(x, [1, 0, 2]))
        enc_batch_major = tf.transpose(enc_time_major, [1, 0, 2])
        # Presumably 256 = 2 directions x 128 units.
        h_in = tf.reshape(
            enc_batch_major,
            (self.bs, self.seq_in, 1, 256))  # (bs, seq_in, 1, 256)

        # Decoder queries: the LSTM only ever sees zeros as input
        # (teacher forcing was noted as something to consider).
        zero_inputs = tf.zeros((self.seq_out, self.bs, 1))
        decoder = CudnnLSTM(1, 128, name="out_rnn")
        dec_time_major, _ = decoder(zero_inputs)
        h_out = tf.reshape(
            tf.transpose(dec_time_major, [1, 0, 2]),
            (self.bs, 1, self.seq_out, 128))  # (bs, 1, seq_out, 128)

        # Additive attention; broadcasting pairs every output position with
        # every input position.
        query_proj = layers.fully_connected(h_out, 128, activation_fn=None)
        key_proj = layers.fully_connected(h_in, 128, activation_fn=None)
        scores = layers.fully_connected(
            tf.nn.tanh(query_proj + key_proj), 1,
            activation_fn=None)  # (bs, seq_in, seq_out, 1)
        # Normalise over the input positions (axis 1).
        att = tf.nn.softmax(scores, axis=1)  # (bs, seq_in, seq_out, 1)
        attended_h = tf.reduce_sum(att * h_in, axis=1)  # (bs, seq_out, 256)

        # Gate between generating and copying.
        p_gen = layers.fully_connected(
            attended_h, 1, activation_fn=tf.nn.sigmoid)  # (bs, seq_out, 1)
        p_copy = (1 - p_gen)

        # Generate branch.
        gen = layers.fully_connected(
            attended_h, self.n_out, activation_fn=None)
        gen = tf.reshape(gen, (self.bs, self.seq_out, self.n_out))

        # Copy branch: attention-weighted sum of the raw inputs, in log space
        # (1e-8 keeps the log finite).
        x_per_position = tf.reshape(
            x, (self.bs, self.seq_in, 1, self.n_out))
        copy = tf.log(
            tf.reduce_sum(att * x_per_position, axis=1)
            + 1e-8)  # (bs, seq_out, n_out)

        output_logits = p_copy * copy + p_gen * gen
        return output_logits
def _build_rnn(self, name, is_cuda, rnn_dim, inputs, state_dropout_rate,
               output_dropout_rate):
    """Build a single bidirectional LSTM layer followed by output dropout.

    Args:
        name: Variable scope under which the layer is created.
        is_cuda: If true, use the fused ``CudnnLSTM``; otherwise build the
            layer from ``CudnnCompatibleLSTMCell`` cells.
        rnn_dim: Number of units per direction.
        inputs: Input tensor. The non-cuDNN branch runs with
            ``time_major=True``, so time-major input is assumed.
        state_dropout_rate: Recurrent-state dropout rate; only used by the
            non-cuDNN branch and only active when ``self._is_training``.
        output_dropout_rate: Dropout rate applied to the layer outputs.

    Returns:
        The layer outputs after dropout.
    """
    with tf.variable_scope(name):
        if is_cuda:
            fused_lstm = CudnnLSTM(num_layers=1,
                                   num_units=rnn_dim,
                                   direction='bidirectional')
            outputs, _ = fused_lstm(inputs)
        else:
            # The keep probability collapses to 1.0 when not training.
            training_flag = tf.cast(self._is_training, tf.float32)
            state_keep_prob = 1. - state_dropout_rate * training_flag

            def make_cell():
                """Create one dropout-wrapped cuDNN-compatible LSTM cell."""
                return DropoutWrapper(
                    CudnnCompatibleLSTMCell(rnn_dim),
                    state_keep_prob=state_keep_prob,
                    variational_recurrent=True,
                    input_size=inputs.get_shape()[-1],
                    dtype=tf.float32)

            with tf.variable_scope('cudnn_lstm'):
                forward_cells = [make_cell()]
                backward_cells = [make_cell()]
                outputs, _, _ = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
                    forward_cells,
                    backward_cells,
                    inputs,
                    time_major=True,
                    dtype=tf.float32)
            outputs = tf.concat(outputs, axis=-1)

        # A leading 1 in the noise shape shares one dropout mask across the
        # first axis of ``outputs``.
        shared_noise_shape = tf.concat([[1], tf.shape(outputs)[1:]], axis=0)
        outputs = tf.layers.dropout(outputs,
                                    output_dropout_rate,
                                    training=self._is_training,
                                    noise_shape=shared_noise_shape)
    return outputs
def check(**kwargs):
    """Check that a CudnnLSTM's reported parameter size matches its weights.

    Builds a ``CudnnLSTM`` from ``kwargs``, creates a random flat parameter
    variable of the reported size, converts it to canonical weights and
    biases, and asserts that their element counts add up to the reported
    size. Progress is printed along the way.

    Relies on ``session`` and ``assert_equal`` being available from the
    enclosing scope.

    Args:
        **kwargs: Forwarded unchanged to the ``CudnnLSTM`` constructor.
    """
    print("kwargs:", kwargs)
    model = CudnnLSTM(**kwargs)
    flat_params = tf.Variable(
        tf.random_uniform([model.params_size()]), validate_shape=False)
    session.run(flat_params.initializer)

    reported_size = model.params_size().eval()
    print("param size:", reported_size)

    weights, biases = model.params_to_canonical(flat_params)
    for label, tensors in (("weight:", weights), ("bias:", biases)):
        for tensor in tensors:
            print(label, tensor, "shape:", tf.shape(tensor).eval())

    canonical_size = sum(
        tf.reduce_prod(tf.shape(tensor)).eval()
        for tensor in weights + biases)
    print("summed up size:", canonical_size)
    assert_equal(reported_size, canonical_size)
def BiLSTM(x, filters, dropout=0.0, name='BiLSTM', layers=1,
           return_state=False):
    """Run a bidirectional CudnnLSTM over one tensor or a pair of tensors.

    When ``x`` is a pair, both members go through the *same* layer, so they
    share weights.

    Args:
        x: A batch-major rank-3 tensor, or a list/tuple of exactly two such
            tensors.
        filters: Number of LSTM units per direction.
        dropout: Dropout rate applied to whatever is returned
            (``keep_prob = 1 - dropout``).
        name: Name given to the ``CudnnLSTM`` layer.
        layers: Number of stacked LSTM layers.
        return_state: If true, return the first element of the final state
            with its leading axis unstacked and concatenated along the
            feature axis; otherwise return the batch-major output sequence.

    Returns:
        One tensor for a single input, or a 2-tuple of tensors for a pair.
    """
    cudnn_lstm = CudnnLSTM(layers, filters, direction='bidirectional',
                           name=name)

    def _encode(seq):
        """Run the shared LSTM on one batch-major tensor, then apply dropout."""
        # cudnn compatibility: time first, batch second
        outputs, state = cudnn_lstm(tf.transpose(seq, [1, 0, 2]))
        if return_state:
            # state[0] is laid out as [2, bs, dim] per the original author's
            # note; join the slices along the feature axis.
            result = tf.concat(tf.unstack(state[0], axis=0), axis=-1)
        else:
            result = tf.transpose(outputs, [1, 0, 2])
        return tf.nn.dropout(result, 1 - dropout)

    # isinstance (rather than an exact type comparison) also accepts tuples
    # and list subclasses, which previously fell through to the single-tensor
    # branch.
    if isinstance(x, (list, tuple)):
        assert len(x) == 2
        first, second = x
        return _encode(first), _encode(second)
    return _encode(x)
def __init__(self, GPU, num_layers, num_units, dropout=0.,
             dtype=tf.dtypes.float32, name=None):
    """Create an LSTM adapter.

    The wrapped model is a ``CudnnLSTM`` when ``GPU`` is truthy, otherwise a
    ``MultiFusedRNNCell`` stack of ``LSTMBlockFusedCell`` layers.

    Args:
        GPU: Selects the cuDNN implementation when truthy.
        num_layers: Number of stacked LSTM layers.
        num_units: Number of units per layer.
        dropout: Stored on the instance as ``self.dropout``; it is not passed
            to the wrapped model here.
        dtype: Data type for the layer and the wrapped model.
        name: Layer name. In the non-GPU branch each sub-cell is named
            ``'<name>_<index>'``.
    """
    base_layer.Layer.__init__(self, dtype=dtype, name=name)
    self.GPU = GPU
    self.dropout = dropout
    if GPU:
        self.model = CudnnLSTM(num_layers, num_units, dtype=self.dtype,
                               name=name)
    else:
        # NOTE(review): with the default name=None these cells end up named
        # 'None_0', 'None_1', ... -- confirm that is acceptable to callers.
        self.model = MultiFusedRNNCell([
            LSTMBlockFusedCell(num_units, dtype=self.dtype,
                               name='%s_%d' % (name, i))
            for i in range(num_layers)
        ])
def build_cudnn_lstm(inps, num_layers, num_units):
    """Apply a multi-layer ``CudnnLSTM`` to ``inps`` and return its outputs.

    Args:
        inps: Input tensor passed directly to the layer.
        num_layers: Number of stacked LSTM layers.
        num_units: Number of units per layer.

    Returns:
        The output sequence; the final state is discarded.
    """
    layer = CudnnLSTM(num_layers, num_units, input_mode='linear_input')
    hidden_seq, _ = layer(inps)
    return hidden_seq
def get_lstm_outputs2(self, chars, last_state=None, reuse=False):
    """Embed character ids and run them through a CudnnLSTM.

    Also stores the embedding matrix on ``self.char_embedding``.

    Args:
        chars: Integer ids looked up in the embedding matrix.
        last_state: Not read by this method.
        reuse: Passed as ``reuse`` to both variable scopes.

    Returns:
        ``(outputs, (a, b))`` where ``a`` and ``b`` are the second and third
        values returned by the LSTM call (presumably the final h and c
        states -- TODO confirm against the CudnnLSTM variant in use).
    """
    embedding_shape = (self.NUM_CHARS, self.CHAR_EMBEDDING_SIZE)
    with tf.variable_scope('char_embedding', reuse=reuse):
        initial_embedding = tf.orthogonal_initializer()(embedding_shape)
        self.char_embedding = tf.get_variable(
            'char_embedding',
            initializer=initial_embedding,
            dtype=tf.float32)
        embedded = tf.nn.embedding_lookup(self.char_embedding, chars)

    with tf.variable_scope('spam_gen_rnn', reuse=reuse):
        # NOTE(review): three positional constructor arguments and a
        # four-argument call returning three values look like the older
        # op-style CudnnLSTM API -- confirm which class is imported.
        rnn = CudnnLSTM(self.LAYERS,
                        self.HIDDEN_LAYER_SIZE,
                        self.CHAR_EMBEDDING_SIZE,
                        dropout=0.5)
        outputs, a, b = rnn(embedded, None, None, {})
    return outputs, (a, b)
def build_stacked_cudnn_lstm(inps, num_layers, num_units):
    """Chain ``num_layers`` single-layer ``CudnnLSTM`` instances.

    Each layer consumes the output sequence of the previous one; the
    per-layer final states are discarded.

    Args:
        inps: Input tensor fed to the first layer.
        num_layers: How many single-layer LSTMs to stack.
        num_units: Number of units in every layer.

    Returns:
        The output sequence of the last layer (``inps`` itself when
        ``num_layers`` is 0).
    """
    # All layers are constructed up front, then applied in order.
    stack = [
        CudnnLSTM(1, num_units, input_mode='linear_input')
        for _ in range(num_layers)
    ]
    hidden = inps
    for layer in stack:
        hidden, _ = layer(hidden)
    return hidden
def add_cudnn_lstm(inps, state, num_layers, num_units, input_dim,
                   init_parameter):
    """Apply a multi-layer ``CudnnLSTM`` with a caller-provided initial state.

    The kernel is initialised from a truncated normal whose stddev comes from
    ``compute_stddevs``; the state is passed through ``prepare_init_state``
    before use.

    Args:
        inps: Input tensor for the LSTM.
        state: Incoming state, handed to ``prepare_init_state``.
        num_layers: Number of stacked LSTM layers.
        num_units: Number of units per layer.
        input_dim: Input dimensionality; the larger of this and ``num_units``
            is what is passed to ``compute_stddevs``.
        init_parameter: Forwarded to ``compute_stddevs``.

    Returns:
        ``(output, new_state)`` as returned by the LSTM call.
    """
    effective_input_dim = max(input_dim, num_units)
    kernel_stddev = compute_stddevs(
        [num_units], effective_input_dim, init_parameter)[0]
    kernel_init = tf.truncated_normal_initializer(stddev=kernel_stddev)
    lstm = CudnnLSTM(
        num_layers,
        num_units,
        input_mode='linear_input',
        kernel_initializer=kernel_init)

    init_state = prepare_init_state(state, inps, lstm, 'cudnn')
    output, new_state = lstm(inps, initial_state=init_state)
    return output, new_state
def cudnn_lstm_module(input, name, train, units, recomp=False):
    """
    Single-layer unidirectional CUDNN LSTM applied to a batch-major tensor.

    :param input: input tensor (batch-major; transposed to time-major
        internally and back again before returning)
    :param name: name for variable / scope
    :param train: is_train placeholder
    :param units: number of LSTM units
    :param recomp: whether used in recompute_gradient environment
    :return: output tensor after LSTM
    """
    # NOTE(review): this flag is computed but never consumed -- the LSTM below
    # is always called with training=True. Confirm whether it was meant to be
    # passed as the ``training`` argument.
    should_learn = tf.logical_and(train, tf.logical_not(recomp))

    class BiasInit:
        """
        Bias initializer that hands out the given constants one per call,
        starting over once the list is exhausted.
        """

        def __init__(self, init):
            self.count = 0
            self.init = init

        def __call__(self, shape, dtype):
            position = self.count if self.count < len(self.init) else 0
            bias = tf.constant(self.init[position], dtype=dtype, shape=shape)
            self.count = position + 1
            return bias

    kernel_init = tf.contrib.layers.xavier_initializer(uniform=True)
    # initialize forget gate bias to 1.0
    # according to [An empirical exploration of recurrent network architectures, Jozefowicz et al., ICML'15]
    # https://github.com/keras-team/keras/blob/04cbccc8038c105374eef6eb2ce96d6746999860/keras/layers/cudnn_recurrent.py#L448
    bias_init = BiasInit([0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0])

    lstm = CudnnLSTM(
        num_layers=1,
        dtype=tf.float32,
        num_units=units,
        direction='unidirectional',
        name=name,
        kernel_initializer=kernel_init,
        bias_initializer=bias_init)

    # The LSTM works time-major: swap batch and time on the way in ...
    time_major = tf.transpose(input, perm=[1, 0, 2])
    hidden, _final_state = lstm(time_major, training=True)
    # ... and swap them back on the way out.
    return tf.transpose(hidden, perm=[1, 0, 2])
def add_stacked_cudnn_lstm(inps, state, num_units, input_dim,
                           init_parameter):
    """Chain single-layer ``CudnnLSTM`` instances, threading per-layer state.

    One layer is created per entry of ``num_units``; its kernel is initialised
    from a truncated normal with the matching stddev from
    ``compute_stddevs``. The incoming ``state`` goes through
    ``prepare_init_state`` and is then consumed one element per layer.

    Args:
        inps: Input tensor fed to the first layer.
        state: Incoming state, handed to ``prepare_init_state``.
        num_units: Iterable of unit counts, one per layer.
        input_dim: Forwarded to ``compute_stddevs``.
        init_parameter: Forwarded to ``compute_stddevs``.

    Returns:
        ``(output, new_state)`` where ``output`` is the last layer's output
        sequence and ``new_state`` is the list of final states produced by
        each layer, in layer order.
    """
    stddevs = compute_stddevs(num_units, input_dim, init_parameter)
    lstms = [
        CudnnLSTM(
            1,
            nu,
            input_mode='linear_input',
            kernel_initializer=tf.truncated_normal_initializer(stddev=stddev))
        for nu, stddev in zip(num_units, stddevs)
    ]
    state = prepare_init_state(state, inps, lstms, 'cudnn_stacked')

    inter = inps
    new_state = []
    for lstm, s in zip(lstms, state):
        inter, new_s = lstm(inter, initial_state=s)
        # Collect the state the layer just produced. Appending the incoming
        # ``s`` here would hand the caller back its own initial state, so the
        # recurrent state would never advance between calls.
        new_state.append(new_s)
    return inter, new_state
def unrolled_rnn(self, inputs, lengths):
    """Run the recurrent network over ``inputs`` and return logits and state.

    Without cuDNN, ``tf.nn.dynamic_rnn`` is run on ``self.cell()``. With
    cuDNN, a ``CudnnLSTM`` is followed by an ``AffineProjectionLayer`` to the
    vocabulary size, and the logits are multiplied by the mask from
    ``self.cost_mask``.

    Args:
        inputs: Input tensor. The cuDNN branch transposes the first two axes
            before and after the LSTM, so batch-major input is assumed there.
        lengths: Sequence lengths, used as ``sequence_length`` for
            ``dynamic_rnn`` and to build the mask in the cuDNN branch.

    Returns:
        ``(logits, state)``.
    """
    if not self.use_cudnn_rnn:
        cell = self.cell()
        logits, state = tf.nn.dynamic_rnn(
            cell,
            inputs,
            sequence_length=lengths,
            dtype=self.FLOAT_TYPE,
            time_major=self.time_major_optimization,
            swap_memory=self.dynamic_rnn_swap_memory)
        return logits, state

    rnn = CudnnLSTM(self.rnn_num_layers, self.rnn_num_units)
    from layers_utils import AffineProjectionLayer
    proj = AffineProjectionLayer(self.rnn_num_units, self.vocab_size,
                                 self.FLOAT_TYPE)

    # CudnnLSTM is fed time-major data; convert in and back out.
    time_major_inputs = tf.transpose(inputs, (1, 0, 2))
    time_major_out, state = rnn(time_major_inputs)
    batch_major_out = tf.transpose(time_major_out, (1, 0, 2))

    logits = proj(batch_major_out)
    # CudnnLSTM is not given the sequence lengths here, so the mask is applied
    # to the logits explicitly.
    mask = self.cost_mask(lengths, self.max_length(), False)
    logits = logits * tf.expand_dims(mask, -1)
    return logits, state
def get_cell(cell_type, size, layers=1, direction='unidirectional'):
    """Construct the recurrent cell or layer named by ``cell_type``.

    Args:
        cell_type: One of "layer_norm_basic", "lstm_block_fused",
            "cudnn_lstm", "cudnn_gru", "lstm_block", "gru_block", "rnn" or
            "cudnn_rnn". Any other value yields a ``BasicLSTMCell``.
        size: Number of units.
        layers: Number of layers; only used by the cuDNN variants.
        direction: Direction; only passed to "cudnn_lstm" and "cudnn_gru".

    Returns:
        The newly constructed cell or layer.
    """
    if cell_type == "layer_norm_basic":
        return LayerNormBasicLSTMCell(size)
    if cell_type == "lstm_block_fused":
        return tf.contrib.rnn.LSTMBlockFusedCell(size)
    if cell_type == "cudnn_lstm":
        return CudnnLSTM(layers, size, direction=direction)
    if cell_type == "cudnn_gru":
        return CudnnGRU(layers, size, direction=direction)
    if cell_type == "lstm_block":
        return LSTMBlockCell(size)
    if cell_type == "gru_block":
        return GRUBlockCell(size)
    if cell_type == "rnn":
        return BasicRNNCell(size)
    if cell_type == "cudnn_rnn":
        return CudnnRNNTanh(layers, size)
    # Fallback for unrecognised names.
    return BasicLSTMCell(size)
def build(self, input_shape):
    """Create the per-layer cells and auxiliary variables.

    For every entry of ``self.rnn_layers`` a dict is appended to
    ``self.weights`` holding the built ``'cell'`` and, depending on the
    configuration, an ``'h_var_backup'`` variable (training with
    ``wdrop > 0``) and ``'w_proj'`` / ``'b_proj'`` projection variables
    (positive integer ``self.projection_dims``).

    Args:
        input_shape: Shape of the layer input. On CPU only
            ``input_shape[1:]`` is used to build the cell.
    """
    use_projection = (isinstance(self.projection_dims, int)
                      and self.projection_dims > 0)
    with tf.variable_scope(self.name, reuse=self.reuse):
        self.weights = []
        for idx, layer in enumerate(self.rnn_layers):
            units = layer['units']

            if self.is_cpu:
                # Only use cpu in inference mode for now
                self.is_training = False
                cell = CudnnCompatibleLSTMCell(num_units=units)
                # Require 2 dimension only
                cell.build(tf.TensorShape(input_shape[1:]))
            else:
                cell = CudnnLSTM(num_layers=1,
                                 num_units=units,
                                 input_mode='linear_input',
                                 direction='unidirectional',
                                 dropout=0.0)
                cell.build(input_shape)
            weight = {'cell': cell}

            wdrop = layer.get('wdrop', 0.0)
            if self.is_training and wdrop > 0.0:
                # Non-trainable buffer shaped [4 * units, units].
                weight['h_var_backup'] = tf.Variable(
                    initial_value=tf.zeros(shape=[4 * units, units]),
                    trainable=False,
                    name='h_var_backup_' + str(idx))

            if use_projection:
                weight['w_proj'] = tf.get_variable(
                    name='w_proj_{}'.format(idx),
                    shape=(units, self.projection_dims),
                    initializer=tf.glorot_uniform_initializer())
                weight['b_proj'] = tf.get_variable(
                    name='b_proj_{}'.format(idx),
                    shape=(self.projection_dims, ),
                    initializer=tf.zeros_initializer())
                next_width = self.projection_dims
            else:
                next_width = units
            # The next layer is built against this layer's output width.
            input_shape = (None, None, next_width)

            self.weights.append(weight)
import os

import tensorflow as tf
from tensorflow.contrib.cudnn_rnn import CudnnLSTM as CudnnLSTM

# Script: build two CudnnLSTM layers against a dummy input shape, initialise
# their variables and write them out with tf.train.Saver.

inp = tf.zeros([10, 32, 100])

# One single-layer LSTM with 128 units and one two-layer LSTM with 256 units.
lstm1 = CudnnLSTM(1, 128)
lstm2 = CudnnLSTM(2, 256)
lstm1.build(inp.shape)
lstm2.build(inp.shape)

saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    save_path = 'test_cudnn_lstm_save/1'
    # NOTE(review): this creates a directory at the checkpoint prefix itself
    # (which also creates its parent); confirm whether only the parent
    # directory was intended.
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    saver.save(sess, save_path)
def __build_uni_model(inputs, name):
    """Build the character-CNN + stacked CudnnLSTM graph under scope ``name``.

    ``self`` is not a parameter; it is taken from the enclosing scope.

    Args:
        inputs: Rank-3 tensor of ids used for the embedding lookup; the
            existing comment below labels its layout as [T, B, C].
        name: Variable scope for everything created here.

    Returns:
        A dict with keys 'layer_outputs' (list of per-layer outputs),
        'rnn_outputs' (masked output of the last layer) and 'decoder'
        (projection of 'rnn_outputs' reshaped to
        [T, B, self.vocab_size]).
    """
    model = {}
    with tf.variable_scope(name, reuse=self.reuse):
        s = tf.shape(inputs)  # Get input shape
        # Reshape from [T, B, C] to [T * B, C]
        inputs = tf.reshape(inputs, [s[0] * s[1], s[2]])
        with tf.device('/cpu:0'):
            W = tf.get_variable(
                shape=[self.char_vocab_size, self.char_vec_size],
                initializer=tf.glorot_uniform_initializer(),
                name="embedding_weight")
            if self.is_training and self.drop_e > 0.0:
                W = embedding_dropout(W, dropout=self.drop_e)
            char_embed = tf.nn.embedding_lookup(W, inputs)
        # One conv1d per (filter size, filter count) pair, each max-pooled
        # over axis 1 and then concatenated along the feature axis.
        conv_out = []
        for fsz, num in self.char_cnn_layers:
            x = tf.layers.conv1d(
                char_embed,
                num,
                fsz,
                activation=tf.nn.relu,
                kernel_initializer=tf.glorot_uniform_initializer(),
                padding='same')
            x = tf.reduce_max(x, axis=1)
            conv_out.append(x)
        embedding = tf.concat(conv_out, axis=-1)
        # Restore the leading [T, B] axes; the feature width is the total
        # number of filters across all conv layers.
        embedding = tf.reshape(
            embedding,
            (s[0], s[1], sum(x for _, x in self.char_cnn_layers)))
        input_shape = s
        ops = []
        inputs = embedding
        layer_outputs = []
        for idx, l in enumerate(self.rnn_layers):
            cell = CudnnLSTM(num_layers=1,
                             num_units=l['units'],
                             input_mode='linear_input',
                             direction='unidirectional',
                             dropout=0.0)
            # Non-trainable variables that carry the LSTM state between runs;
            # they are registered in the LSTM_SAVED_STATE collection.
            saved_state = (tf.get_variable(
                shape=[1, 1, l['units']],
                name='c_' + str(idx),
                trainable=False),
                           tf.get_variable(
                               shape=[1, 1, l['units']],
                               name='h_' + str(idx),
                               trainable=False))
            for x in saved_state:
                tf.add_to_collection(LSTM_SAVED_STATE, x)
            zeros = tf.zeros([1, input_shape[1], l['units']],
                             dtype=tf.float32)
            zero_state = (zeros, zeros)

            # Branches for the tf.cond below: zero state when
            # self.reset_state is true, otherwise the saved state.
            def if_true():
                return zero_state

            def if_false():
                return saved_state

            drop_i = l.get('drop_i', 0.0)
            if self.is_training and drop_i > 0.0:
                # The leading 1 in noise_shape shares one dropout mask across
                # the first axis.
                inputs = tf.nn.dropout(x=inputs,
                                       keep_prob=1 - drop_i,
                                       noise_shape=[
                                           1, input_shape[1],
                                           inputs.shape[-1]
                                       ],
                                       name='drop_i_' + str(idx))
            cell.build(inputs.shape)
            wdrop = l.get('wdrop', 0.0)
            if self.is_training and wdrop > 0.0:
                # Weight drop: rows of a [4 * units, units] slice of the flat
                # parameter variable are zeroed at random; the zeroed values
                # are parked in h_var_backup and added back in on the next
                # run.
                cell_var = cell.variables[0]
                h_var_backup = tf.Variable(initial_value=tf.zeros(
                    shape=[4 * l['units'], l['units']]),
                                           trainable=False,
                                           name='h_var_backup_' + str(idx))
                # NOTE(review): this slice is presumably the hidden-to-hidden
                # kernel, with the input kernel before it and 8 * units bias
                # values after it -- confirm against the cuDNN parameter
                # layout.
                h_var = cell_var[inputs.shape[-1] * l['units'] *
                                 4:-l['units'] * 8]
                h_var = tf.reshape(
                    h_var, [4 * l['units'], l['units']]) + h_var_backup
                keep_prob = 1 - wdrop
                random_tensor = keep_prob
                random_tensor += tf.random_uniform(
                    [4 * l['units'], 1], dtype=h_var.dtype)
                # 0. if [keep_prob, 1.0) and 1. if [1.0, 1.0 + keep_prob)
                binary_tensor = tf.floor(random_tensor)
                new_h_var = tf.multiply(h_var, binary_tensor)
                new_h_var = tf.reshape(
                    new_h_var, [4 * l['units'] * l['units']])
                # Store exactly the entries that were just masked out.
                h_var_backup = tf.assign(
                    h_var_backup,
                    tf.multiply(h_var, tf.subtract(1.0, binary_tensor)),
                    validate_shape=True,
                    use_locking=True,
                    name='assign_h_var_backup_' + str(idx))
                # Rebuild the flat parameter vector with the masked slice in
                # place of the original one.
                new_cell_var = tf.concat([
                    cell_var[:inputs.shape[-1] * l['units'] * 4],
                    new_h_var, cell_var[-l['units'] * 8:]
                ],
                                         axis=0,
                                         name='new_cell_var_' + str(idx))
                op = tf.assign(cell_var,
                               new_cell_var,
                               validate_shape=True,
                               use_locking=True,
                               name='assign_new_cell_var_' + str(idx))
                # Both assignments must run before the LSTM reads its
                # parameters.
                with tf.control_dependencies([op, h_var_backup]):
                    outputs, state = cell.call(
                        inputs=inputs,
                        initial_state=tf.cond(
                            self.reset_state, if_true, if_false),
                        training=self.is_training)
            else:
                outputs, state = cell.call(
                    inputs=inputs,
                    initial_state=tf.cond(self.reset_state, if_true,
                                          if_false),
                    training=self.is_training)
            if isinstance(self.fine_tune_lr, list):
                outputs = apply_custom_lr(outputs, self.fine_tune_lr[idx])
            drop_o = l.get('drop_o', 0.0)
            if self.is_training and drop_o > 0.0:
                outputs = tf.nn.dropout(x=outputs,
                                        keep_prob=1 - drop_o,
                                        noise_shape=[
                                            1, input_shape[1],
                                            outputs.shape[-1]
                                        ],
                                        name='drop_o_' + str(idx))
            # Write the final state back to the saved-state variables;
            # validate_shape=False because the declared shape is [1, 1, units]
            # while the state is built with input_shape[1] on the middle axis.
            ops.append(
                tf.assign(saved_state[0], state[0], validate_shape=False))
            ops.append(
                tf.assign(saved_state[1], state[1], validate_shape=False))
            inputs = outputs
            layer_outputs.append(outputs)
        model['layer_outputs'] = layer_outputs
        ops = tf.group(ops)
        # Tie the state updates to the output so that evaluating
        # 'rnn_outputs' also saves the state.
        with tf.control_dependencies([ops]):
            rnn_outputs = tf.multiply(inputs,
                                      tf.expand_dims(self.seq_masks,
                                                     axis=-1),
                                      name='rnn_outputs')
        model['rnn_outputs'] = rnn_outputs
        # Flatten to 2-D for the shared decoder projection, then restore the
        # leading two axes.
        decoder = tf.nn.xw_plus_b(
            tf.reshape(rnn_outputs, [
                input_shape[0] * input_shape[1],
                self.rnn_layers[-1]['units']
            ]), self.share_decode_W, self.share_decode_b)
        decoder = tf.reshape(
            decoder, [input_shape[0], input_shape[1], self.vocab_size])
        model['decoder'] = decoder
    return model
def BiLSTM(x, filters, dropout=0.0, name='BiLSTM'):
    """Apply a single-layer bidirectional ``CudnnLSTM`` followed by dropout.

    Args:
        x: Input tensor, passed to the layer without transposition
            (assumes the caller already provides the layout CudnnLSTM
            expects -- TODO confirm).
        filters: Number of LSTM units per direction.
        dropout: Dropout rate applied to the outputs
            (``keep_prob = 1 - dropout``).
        name: Name given to the ``CudnnLSTM`` layer.

    Returns:
        The LSTM output sequence after dropout; the final state is discarded.
    """
    layer = CudnnLSTM(1, filters, direction='bidirectional', name=name)
    outputs, _ = layer(x)
    return tf.nn.dropout(outputs, 1 - dropout)