def reshape_state(state, other_shapes):
    '''
    Reshape a state so that its leading dimensions become 'other_shapes'
    while the size of its last dimension is kept.

    <Args>
    - state: A Tensor, or an LSTMStateTuple whose 'c' and 'h' are reshaped
      separately.
    - other_shapes: A list of leading dimension sizes.
    <Return>
    - The reshaped Tensor, or a new LSTMStateTuple of reshaped tensors.
    '''
    def _restore(tensor):
        # New shape = requested leading dims + the tensor's own last dim.
        return tf.reshape(tensor, other_shapes + [shape(tensor, -1)])

    with tf.name_scope(sys._getframe().f_code.co_name):
        if not isinstance(state, LSTMStateTuple):
            return _restore(state)
        return LSTMStateTuple(c=_restore(state.c), h=_restore(state.h))
def add_byway_attn_state(self, attention_states, attention_lengths):
    '''
    Prepend one trainable 'byway' vector to the attention states of every
    example and increase the attention lengths by one accordingly.

    <Args>
    - attention_states: A Tensor; axis 0 is used as the batch axis, axis 1 as
      the axis the byway vector is prepended on, and the last axis as the
      state size (presumably [batch_size, length, state_size]).
    - attention_lengths: Lengths matching 'attention_states'.
    <Return>
    - (attention_states, attention_lengths) including the byway entry.
    '''
    with tf.name_scope('add_byway_attn_state'):
        batch_size = shape(attention_states, 0)
        state_size = shape(attention_states, -1)

        byway = tf.get_variable('byway_state', [state_size])  # [state_size]
        # Add two leading axes: [state_size] -> [1, 1, state_size].
        byway = tf.expand_dims(tf.expand_dims(byway, 0), 0)
        # Repeat along the batch axis: [batch_size, 1, state_size].
        byway = tf.tile(byway, [batch_size, 1, 1])

        attention_states = tf.concat([byway, attention_states], axis=1)
        attention_lengths += 1
    return attention_states, attention_lengths
def setup_birnn(inputs, sequence_length, cell_type, hidden_size, use_residual,
                keep_prob):
    '''
    Build a forward and a backward cell and run them over 'inputs' with
    rnn.bidirectional_dynamic_rnn.

    <Args>
    - inputs: Input Tensor; its axis 0 is used as the batch size.
    - sequence_length: Passed to bidirectional_dynamic_rnn as is.
    - cell_type, hidden_size, use_residual, keep_prob: Forwarded to
      setup_cell for both directions.
    <Return>
    - (outputs, state) as returned by bidirectional_dynamic_rnn.
    '''
    def _build_direction(scope_name, batch_size):
        # For 'initial_state' of CustomLSTMCell, different scopes are
        # required, so each direction is created in its own variable scope.
        with tf.variable_scope(scope_name):
            cell = setup_cell(cell_type, hidden_size, use_residual,
                              keep_prob=keep_prob)
            initial_state = None
            if hasattr(cell, 'initial_state'):
                initial_state = cell.initial_state(batch_size)
        return cell, initial_state

    with tf.name_scope(sys._getframe().f_code.co_name):
        n_examples = shape(inputs, 0)
        cell_fw, initial_state_fw = _build_direction('fw_cell', n_examples)
        cell_bw, initial_state_bw = _build_direction('bw_cell', n_examples)

        outputs, state = rnn.bidirectional_dynamic_rnn(
            cell_fw,
            cell_bw,
            inputs,
            initial_state_fw=initial_state_fw,
            initial_state_bw=initial_state_bw,
            sequence_length=sequence_length,
            dtype=tf.float32)
    return outputs, state
def define_combination(self, all_models):
    '''
    Define adversarial layers.
    *Note* this function must be executed after all other models were defined.

    <Args>
    - all_models: Passed to self.set_label_by_model, which returns the
      adversarial models, their input representations and one output label id
      per model.
    <Return>
    - loss: tf.reduce_mean over the per-model classification losses.
    - gradients: average_gradients over the per-model gradients, each computed
      from the loss scaled by self.loss_weight.
    '''
    adv_models, input_repls, output_label_ids = self.set_label_by_model(
        all_models)
    n_labels = max(output_label_ids) + 1

    loss_by_model = []
    gradients_by_model = []
    # The model itself is not needed below (it was only used by a disabled
    # per-task device assignment); it stays in the zip so the iteration length
    # is unchanged.
    for _, input_repl, output_id in zip(adv_models, input_repls,
                                        output_label_ids):
        with tf.name_scope('adversarial'):
            # The classifier sees the representation through flip_gradient.
            hidden = flip_gradient(input_repl)
            for depth in range(self.config.ffnn_depth - 1):
                with tf.variable_scope('hidden%d' % (depth + 1)) as scope:
                    hidden = linear(hidden, shape(hidden, -1), scope=scope)
                    hidden = tf.nn.dropout(hidden, self.keep_prob)

            with tf.variable_scope('output') as scope:
                logits = linear(hidden, n_labels, activation=None,
                                scope=scope)

            # Every row of the logits gets this model's label id as target.
            tiled_output_label_id = tf.tile([output_id], [shape(logits, 0)])
            loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=tiled_output_label_id, logits=logits))

        loss_by_model.append(loss)
        gradients_by_model.append(
            self.compute_gradients(self.loss_weight * loss))

    loss = tf.reduce_mean(loss_by_model)
    gradients = average_gradients(gradients_by_model)
    return loss, gradients
def setup_decoder_cell(self, config, keep_prob, use_beam_search, init_state,
                       attention_states, attention_lengths):
    '''
    Build the decoder cell(s) wrapped with an attention mechanism and the
    matching initial state.

    <Args>
    - config: Reads attention_type, use_attention_input_feeding, hidden_size
      and top_attention; also forwarded to _setup_decoder_cell.
    - keep_prob: Forwarded to _setup_decoder_cell.
    - use_beam_search: If true, states, memory and lengths are tiled by
      self.beam_width; also used as 'alignment_history' of the wrapper.
    - init_state: Initial state of the decoder cells.
    - attention_states, attention_lengths: Attention memory and its lengths.
    <Return>
    - (cell, init_state): the wrapped cell and its initial state.
    '''
    batch_size = get_state_shape(init_state)[0]
    if use_beam_search:
        beam_width = self.beam_width
        attention_states = tile_batch(attention_states, multiplier=beam_width)
        init_state = nest.map_structure(
            lambda s: tile_batch(s, self.beam_width), init_state)
        attention_lengths = tile_batch(attention_lengths,
                                       multiplier=beam_width)
        batch_size = batch_size * beam_width

    # The mechanism class is looked up by name in tf.contrib.seq2seq.
    mechanism_class = getattr(tf.contrib.seq2seq, config.attention_type)
    attention_mechanism = mechanism_class(
        shape(attention_states, -1),
        attention_states,
        memory_sequence_length=attention_lengths)

    def cell_input_fn(inputs, attention):
        # define cell input function to keep input/output dimension same
        if config.use_attention_input_feeding:
            attn_project = tf.layers.Dense(config.hidden_size,
                                           dtype=tf.float32,
                                           name='attn_input_feeding',
                                           activation=self.activation)
            return attn_project(tf.concat([inputs, attention], axis=-1))
        return inputs

    cells = _setup_decoder_cell(config, keep_prob)
    wrapper_options = dict(attention_mechanism=attention_mechanism,
                           name="AttentionWrapper",
                           attention_layer_size=config.hidden_size,
                           alignment_history=use_beam_search,
                           cell_input_fn=cell_input_fn)

    if not config.top_attention:
        # Attention wraps the whole stack of decoder layers.
        stacked_cell = AttentionWrapper(MultiRNNCell(cells),
                                        initial_cell_state=init_state,
                                        **wrapper_options)
        wrapped_state = stacked_cell.zero_state(
            batch_size=batch_size,
            dtype=tf.float32).clone(cell_state=init_state)
        return stacked_cell, wrapped_state

    # apply attention mechanism only on the top decoder layer
    top_cell = AttentionWrapper(cells[-1],
                                initial_cell_state=init_state[-1],
                                **wrapper_options)
    cells[-1] = top_cell
    layer_states = list(init_state)
    layer_states[-1] = top_cell.zero_state(batch_size=batch_size,
                                           dtype=tf.float32)
    return MultiRNNCell(cells), tuple(layer_states)
def get_state_shape(state):
    '''
    Return the size of an encoder-state as a list with one entry per
    dimension.
    If 'state' is a list of states, return that of the first one.
    For an LSTMStateTuple the shape of its 'h' is returned.
    '''
    # LSTMStateTuple has to be tested before nest.is_sequence(): it is a
    # namedtuple in TensorFlow, so nest.is_sequence() is true for it as well
    # and a lone tuple would otherwise be measured through state[0] (its 'c')
    # instead of its 'h'.
    if isinstance(state, LSTMStateTuple):
        state = state.h
    elif nest.is_sequence(state):
        return get_state_shape(state[0])
    return [shape(state, axis) for axis in range(len(state.get_shape()))]
def char_encode(self, inputs):
    '''
    Encode character ids into one vector per word with a CNN.

    Args:
    - inputs: [*, max_sequence_length, max_word_length], or None.
    Return:
    - outputs: [*, max_sequence_length, cnn_output_size], or None if 'inputs'
      is None.
    '''
    if inputs is None:
        return inputs

    with tf.variable_scope(self.scope or "WordEncoder"):
        # Flatten the input tensor to each word (rank-3 tensor).
        with tf.name_scope('flatten'):
            # [*, max_word_len, char_emb_size]
            char_repls = tf.nn.embedding_lookup(self.embeddings.char, inputs)

            # Sizes of every axis except the last two.
            n_leading_axes = len(char_repls.get_shape()[:-2])
            other_shapes = [
                shape(char_repls, axis) for axis in range(n_leading_axes)
            ]
            n_words = reduce(lambda x, y: x * y, other_shapes)
            word_len = shape(char_repls, -2)
            char_emb_size = shape(char_repls, -1)
            flat_char_repls = tf.reshape(char_repls,
                                         [n_words, word_len, char_emb_size])

        # [flattened_batch_size, cnn_output_size]
        cnn_outputs = cnn(flat_char_repls)

        # Restore the leading axes: [*, cnn_output_size]
        restored_shape = other_shapes + [shape(cnn_outputs, -1)]
        outputs = tf.nn.dropout(tf.reshape(cnn_outputs, restored_shape),
                                self.keep_prob)
    return outputs
def add_bos_and_eos(self, target, start_token, end_token):
    '''
    Build the decoder's input and output token sequences from 'target'.

    <Args>
    - target: Token ids; axis 0 is the batch axis and tokens are concatenated
      along axis 1 (presumably an int32 Tensor of [batch_size, length]).
    - start_token: Id prepended to make the decoder's input.
    - end_token: Id appended to make the decoder's output.
    <Return>
    - (dec_input_tokens, dec_output_tokens)
    '''
    def _column_of(token, n_rows):
        # One copy of 'token' per example, shaped [n_rows, 1].
        ids = tf.tile(tf.constant([token], dtype=tf.int32), [n_rows])
        return tf.expand_dims(ids, 1)

    with tf.name_scope('add_BOS_and_EOS'):
        # add start_token (end_token) to decoder's input (output).
        batch_size = shape(target, 0)
        with tf.name_scope('start_tokens'):
            bos_column = _column_of(start_token, batch_size)
        with tf.name_scope('end_tokens'):
            eos_column = _column_of(end_token, batch_size)

        dec_input_tokens = tf.concat([bos_column, target], axis=1)
        dec_output_tokens = tf.concat([target, eos_column], axis=1)
    return dec_input_tokens, dec_output_tokens
def extend_vocab_for_oov(embeddings, inputs, unk_id):
    '''
    Copy the embeddings of OOV to expand word embedding matrix by the number
    of unique OOV words (mainly for CopyNet)

    <Args>
    - embeddings: A Tensor ([vocab_size, emb_size]).
    - inputs: A Tensor of word ids; ids equal to or larger than vocab_size are
      treated as OOV words.
    - unk_id: An integer.
    <Return>
    - A Tensor ([vocab_size + num_oov_words, emb_size]) whose appended rows
      are copies of the embedding of 'unk_id'.
    '''
    with tf.name_scope(sys._getframe().f_code.co_name):
        unk_emb = tf.expand_dims(embeddings[unk_id, :], 0)  # [1, emb_size]
        vocab_size = shape(embeddings, 0)

        # Ids are 0-based, so the largest id 'm' needs a matrix of m + 1 rows,
        # i.e. m - (vocab_size - 1) rows on top of the vocabulary. Clamping
        # the largest id first keeps the count non-negative when there is no
        # OOV id at all.
        last_in_vocab_id = vocab_size - 1
        max_id = tf.maximum(tf.reduce_max(inputs), last_in_vocab_id)
        num_oov_words = max_id - last_in_vocab_id

        oov_embeddings = tf.tile(
            unk_emb, [num_oov_words, 1])  # [num_oov_words, emb_size]
        extended_embeddings = tf.concat([embeddings, oov_embeddings], axis=0)
    return extended_embeddings
def decode_train(self,
                 dec_input_tokens,
                 dec_lengths,
                 init_state,
                 *attention_args,
                 decoder_class=BasicDecoder,
                 decoder_kwoptions=None):
    '''
    Build the training-time decoder and return its logits.

    <Args>
    - dec_input_tokens: Token ids fed to the decoder (looked up in
      self.embeddings).
    - dec_lengths: Lengths of the decoder inputs; their maximum bounds the
      number of decoding steps.
    - init_state: Initial state handed to self.setup_decoder_cell.
    - attention_args: Extra positional arguments forwarded to
      self.setup_decoder_cell.
    - decoder_class: Decoder class to instantiate.
    - decoder_kwoptions: Optional dict of extra keyword arguments for
      'decoder_class'.
    <Return>
    - (logits, final_state); the logits are clipped to [-20, 20].
    '''
    # A fresh dict per call instead of a shared mutable default argument.
    if decoder_kwoptions is None:
        decoder_kwoptions = {}

    with tf.variable_scope(self.scope or "Decoder") as scope:
        train_cell, init_state = self.setup_decoder_cell(
            self.config, self.keep_prob, False, init_state, *attention_args)

        self.input_project = tf.layers.Dense(units=self.config.hidden_size,
                                             name="input_projection",
                                             activation=self.activation)

        vocab_size = shape(self.embeddings, 0)
        if hasattr(self.config, 'use_emb_as_out_proj') and \
           self.config.use_emb_as_out_proj == True:
            # Make the dim of decoder's output be hidden_size to emb_size.
            emb_project = tf.layers.Dense(units=self.config.hidden_size,
                                          use_bias=False,
                                          activation=None,
                                          name='emb_projection')
            # The projected embeddings, transposed, serve as the kernel of
            # the output projection.
            output_kernel = tf.transpose(emb_project(self.embeddings))
            self.output_project = SharedKernelDense(
                units=vocab_size,
                shared_kernel=output_kernel,
                use_bias=False,
                activation=None,
                name='output_projection')
        else:
            self.output_project = tf.layers.Dense(units=vocab_size,
                                                  name='output_projection',
                                                  use_bias=False,
                                                  activation=None)

        with tf.name_scope('Train'):
            inputs = tf.nn.embedding_lookup(self.embeddings,
                                            dec_input_tokens)
            inputs = self.input_project(inputs)
            inputs = tf.nn.dropout(inputs, self.keep_prob)

            helper = TrainingHelper(inputs,
                                    sequence_length=dec_lengths,
                                    time_major=False)
            train_decoder = decoder_class(train_cell,
                                          helper,
                                          init_state,
                                          output_layer=self.output_project,
                                          **decoder_kwoptions)
            max_dec_len = tf.reduce_max(dec_lengths, name="max_dec_len")
            outputs, final_state, _ = dynamic_decode(
                train_decoder,
                impute_finished=True,
                maximum_iterations=max_dec_len,
                scope=scope)
            logits = outputs.rnn_output

            # To prevent the training loss to be NaN.
            logits += 1e-9
            logits = tf.clip_by_value(logits, -20.0, 20.0,
                                      name='clip_logits')

        self.train_decoder = train_decoder
    return logits, final_state
def _get_lstm_state_size(state):
    '''Return the size of every dimension of 'state.h' as a list.'''
    hidden = state.h
    n_dims = len(hidden.get_shape())
    return [shape(hidden, axis) for axis in range(n_dims)]
def encode(self, inputs, sequence_length):
    '''
    Encode sequences with stacked BiRNN layers followed by RNN layers.

    <Args>
    - inputs: A Tensor, or a list of Tensors (None entries are dropped) that
      is concatenated on the last axis ([*, max_sequence_len, hidden_size]).
    - sequence_length: Lengths of the sequences; reshaped to
      [flattened_batch_size].
    <Return>
    - outputs: [*, max_sequence_len, output_size]
    - state: A tuple of the states of all BiRNN layers followed by those of
      the RNN, each reshaped back to the leading dimensions of the input.
    '''
    config = self.config
    with tf.variable_scope(self.scope or "RNNEncoder"):
        if isinstance(inputs, list):
            inputs = [x for x in inputs if x is not None]
            sent_repls = tf.concat(
                inputs, axis=-1)  # [*, max_sequence_len, hidden_size]
        else:
            sent_repls = inputs

        # Flatten the input tensor to a rank 3 tensor
        # ([*, max_sequence_len, hidden_size]), to handle inputs with more
        # than 3 rank. (e.g. context as list of utterances)
        input_hidden_size = shape(sent_repls, -1)
        max_sequence_len = shape(sent_repls, -2)
        other_shapes = [
            shape(sent_repls, i)
            for i in range(len(sent_repls.get_shape()[:-2]))
        ]
        flattened_batch_size = reduce(lambda x, y: x * y, other_shapes)
        flattened_shape = [
            flattened_batch_size, max_sequence_len, input_hidden_size
        ]
        flattened_sent_repls = tf.reshape(sent_repls, flattened_shape)
        flattened_sequence_length = tf.reshape(sequence_length,
                                               [flattened_batch_size])
        inputs = flattened_sent_repls

        # Project input before the main RNN, to keep the dims of inputs equal
        # to hidden_size in both cases of using birnn or not.
        input_project = tf.layers.Dense(units=config.hidden_size,
                                        dtype=tf.float32,
                                        name="input_projection",
                                        activation=self.activation)
        inputs = input_project(inputs)
        inputs = tf.nn.dropout(inputs, self.keep_prob)

        birnn_state = []
        for i in range(config.num_layers.birnn):
            with tf.variable_scope('BiRNN/L%d' % i):
                # The first BiRNN layer never gets a residual connection.
                # This per-layer flag (not config.use_residual) is what has
                # to be handed to setup_birnn.
                use_residual = config.use_residual if i > 0 else False
                outputs, state = setup_birnn(inputs,
                                             flattened_sequence_length,
                                             config.cell_type,
                                             config.hidden_size,
                                             use_residual,
                                             self.keep_prob)

                # Concat and project the outputs and the state from BiRNN to
                # hidden_size.
                state = merge_state(state, tf.concat)
                state = project_state(state, config.hidden_size)
                birnn_state.append(state)

                outputs = tf.concat(outputs, axis=-1)
                output_project = tf.layers.Dense(units=config.hidden_size,
                                                 dtype=tf.float32,
                                                 name="output_projection",
                                                 activation=self.activation)
                outputs = output_project(outputs)
                outputs = tf.nn.dropout(outputs, self.keep_prob)
                # The output of this layer feeds the next one.
                inputs = outputs

        rnn_state = []
        if config.num_layers.rnn > 0:
            with tf.variable_scope('RNN'):
                outputs, rnn_state = setup_rnn(inputs,
                                               flattened_sequence_length,
                                               config.cell_type,
                                               config.hidden_size,
                                               config.use_residual,
                                               self.keep_prob,
                                               config.num_layers.rnn)

        # Turn the shape of outputs and state back.
        outputs = tf.reshape(
            outputs, other_shapes + [max_sequence_len,
                                     shape(outputs, -1)])
        state = list(birnn_state) + list(rnn_state)
        state = tuple([reshape_state(s, other_shapes) for s in state])
    return outputs, state