def decoder(inputs, encoder_outputs, is_training, batch_size, mel_targets):
    """
    Decoder: attention RNN (with prenet) -> residual GRU stack -> post-processing CBHG.

    All hyper-parameters are read from the module-level ``hparams`` object.

    @param inputs           int32 Tensor with shape [N, T_in] where N is batch size, T_in is
                            number of steps in the input time series, and values are character IDs
    @param encoder_outputs  outputs from the encoder with shape [N, T_in, prenet_depth=256]
    @param is_training      flag for training or eval; selects the teacher-forcing helper
    @param batch_size       number of samples per batch
    @param mel_targets      float32 Tensor with shape [N, T_out, M] where N is batch size, T_out
                            is number of steps in the output time series, M is num_mels, and
                            values are entries in the mel spectrogram (used only when training)
    @return                 linear_outputs, mel_outputs and alignments
    """
    num_mels = hparams.num_mels
    frames_per_step = hparams.outputs_per_step

    # Training feeds ground-truth frames back in; evaluation feeds back predictions.
    if is_training:
        helper = TacoTrainingHelper(inputs, mel_targets, num_mels, frames_per_step)
    else:
        helper = TacoTestHelper(batch_size, num_mels, frames_per_step)

    # Attention over the encoder outputs.
    attention_rnn = GRUCell(hparams.attention_depth)
    attention_mechanism = BahdanauAttention(hparams.attention_depth, encoder_outputs)
    raw_attention_cell = AttentionWrapper(
        attention_rnn,
        attention_mechanism,
        alignment_history=True,
        output_attention=False)

    # The prenet is applied to the decoder input before it reaches the attention wrapper.
    attention_cell = DecoderPrenetWrapper(
        raw_attention_cell, is_training, hparams.prenet_depths)

    # Attention context vector and RNN cell output are concatenated into one vector.
    concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)

    # Decoder stack, bottom to top: projection of the concat cell, then two residual GRUs.
    decoder_layers = [OutputProjectionWrapper(concat_cell, hparams.decoder_depth)]
    decoder_layers.extend(
        ResidualWrapper(GRUCell(hparams.decoder_depth)) for _ in range(2))
    decoder_cell = MultiRNNCell(decoder_layers, state_is_tuple=True)

    # Each RNN step predicts `frames_per_step` mel frames at once.
    output_cell = OutputProjectionWrapper(decoder_cell, num_mels * frames_per_step)
    decoder_init_state = output_cell.zero_state(
        batch_size=batch_size, dtype=tf.float32)

    basic_decoder = BasicDecoder(output_cell, helper, decoder_init_state)
    (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
        basic_decoder, maximum_iterations=hparams.max_iters)

    # Unfold the grouped frames so that there is one mel frame per time step.
    mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, num_mels])

    # Post-processing CBHG followed by a projection to the linear spectrogram.
    post_outputs = post_cbhg(
        mel_outputs, num_mels, is_training, hparams.postnet_depth)
    linear_outputs = tf.layers.dense(post_outputs, hparams.num_freq)

    # The attention state is the first entry of the MultiRNNCell state tuple.
    alignment_history = final_decoder_state[0].alignment_history.stack()
    alignments = tf.transpose(alignment_history, [1, 2, 0])

    log('Decoder Network ...')
    dimension_report = [
        (' attention out: %d', attention_cell.output_size),
        (' concat attn & out: %d', concat_cell.output_size),
        (' decoder cell out: %d', decoder_cell.output_size),
        (' decoder out (%d frames): %d',
         (hparams.outputs_per_step, decoder_outputs.shape[-1])),
        (' decoder out (1 frame): %d', mel_outputs.shape[-1]),
        (' postnet out: %d', post_outputs.shape[-1]),
        (' linear out: %d', linear_outputs.shape[-1]),
    ]
    for template, values in dimension_report:
        log(template % values)

    return linear_outputs, mel_outputs, alignments
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None,
               pml_targets=None, is_training=False, gta=False, locked_alignments=None,
               logs_enabled=True):
    """
    Initializes the model for inference.

    Sets the "inputs", "input_lengths", "pml_intermediates", "pml_outputs", "alignments"
    and "pml_targets" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
        lengths of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
        number of steps in the output time series, M is num_mels, and values are entries in
        the mel spectrogram. Accepted for interface compatibility; not read by this method.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is
        number of steps in the output time series, F is num_freq, and values are entries in
        the linear spectrogram. Accepted for interface compatibility; not read by this method.
      pml_targets: float32 Tensor with shape [N, T_out, P] where N is batch_size, T_out is
        number of steps in the PML vocoder features trajectories, P is pml_dimension, and
        values are PML vocoder features. Used for teacher forcing when is_training or gta.
      is_training: boolean flag that is set to True during training
      gta: boolean flag that is set to True when ground truth alignment is required
      locked_alignments: when explicit attention alignment is required, the locked alignments
        are passed in this parameter and the attention alignments are locked to these values
      logs_enabled: boolean flag that defaults to True, if False no construction logs output
    """
    # Fix the alignments shape to (batch_size, encoder_steps, decoder_steps) if it does not
    # already include the batch dimension.
    locked_alignments_ = locked_alignments
    if locked_alignments_ is not None:
        if np.ndim(locked_alignments_) < 3:
            locked_alignments_ = np.expand_dims(locked_alignments_, 0)

    with tf.variable_scope('inference'):
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

        # Encoder
        encoder_outputs = conv_and_gru(
            embedded_inputs,
            input_lengths,
            conv_layers=hp.encoder_conv_layers,
            conv_width=hp.encoder_conv_width,
            conv_channels=hp.encoder_conv_channels,
            gru_units_unidirectional=hp.encoder_gru_units,
            is_training=is_training,
            scope='encoder',
        )

        # Attention (optionally locked to externally supplied alignments)
        attention_cell = LockableAttentionWrapper(
            GRUCell(hp.attention_depth),
            LocationSensitiveAttention(hp.attention_depth, encoder_outputs),
            alignment_history=True,
            locked_alignments=locked_alignments_,
            output_attention=False,
            name='attention_wrapper')

        # Concatenate attention context vector and RNN cell output.
        concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)

        # Decoder (layers specified bottom to top):
        cells = [GRUCell(hp.decoder_gru_units) for _ in range(hp.decoder_gru_layers)]
        decoder_cell = MultiRNNCell([concat_cell] + cells, state_is_tuple=True)

        # Project onto r PML feature vectors (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(
            decoder_cell, hp.pml_dimension * hp.outputs_per_step)

        # Teacher forcing is used both for training and for ground-truth-aligned synthesis.
        if is_training or gta:
            if hp.scheduled_sampling:
                helper = TacoScheduledOutputTrainingHelper(
                    inputs, pml_targets, hp.pml_dimension, hp.outputs_per_step,
                    hp.scheduled_sampling_probability)
            else:
                helper = TacoTrainingHelper(
                    inputs, pml_targets, hp.pml_dimension, hp.outputs_per_step)
        else:
            helper = TacoTestHelper(batch_size, hp.pml_dimension, hp.outputs_per_step)

        decoder_init_state = output_cell.zero_state(
            batch_size=batch_size, dtype=tf.float32)
        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

        # Reshape outputs to be one output per entry
        pml_intermediates = tf.reshape(
            decoder_outputs, [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

        # Add Post-Processing Conv and GRU layer:
        expand_outputs = conv_and_gru(
            pml_intermediates,
            None,
            conv_layers=hp.expand_conv_layers,
            conv_width=hp.expand_conv_width,
            conv_channels=hp.expand_conv_channels,
            gru_units_unidirectional=hp.expand_gru_units,
            is_training=is_training,
            scope='expand',
        )
        pml_outputs = tf.layers.dense(expand_outputs, hp.pml_dimension)  # [N, T_out, P]

        # Grab alignments from the final decoder state (the attention state is the first
        # entry of the MultiRNNCell state tuple):
        alignments = tf.transpose(
            final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.pml_intermediates = pml_intermediates
        self.pml_outputs = pml_outputs
        self.alignments = alignments
        self.pml_targets = pml_targets

        if logs_enabled:
            log('Initialized Tacotron model. Dimensions: ')
            log(' Train mode: {}'.format(is_training))
            # Report the gta flag itself; this line previously echoed is_training.
            log(' GTA mode: {}'.format(gta))
            log(' Embedding: {}'.format(embedded_inputs.shape[-1]))
            log(' Encoder out: {}'.format(encoder_outputs.shape[-1]))
            log(' Attention out: {}'.format(attention_cell.output_size))
            log(' Concat attn & out: {}'.format(concat_cell.output_size))
            log(' Decoder cell out: {}'.format(decoder_cell.output_size))
            log(' Decoder out ({} frames): {}'.format(
                hp.outputs_per_step, decoder_outputs.shape[-1]))
            log(' Decoder out (1 frame): {}'.format(pml_intermediates.shape[-1]))
            log(' Expand out: {}'.format(expand_outputs.shape[-1]))
            log(' PML out: {}'.format(pml_outputs.shape[-1]))
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None,
               stop_token_targets=None, global_step=None):
    '''Builds the inference graph of the model.

    Sets the "mel_outputs", "linear_outputs", "stop_token_outputs" and "alignments" fields
    (together with the inputs and targets that were passed in). Training mode is inferred
    from `linear_targets` being supplied.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
        lengths of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
        number of steps in the output time series, M is num_mels, and values are entries in
        the mel spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is
        number of steps in the output time series, F is num_freq, and values are entries in
        the linear spectrogram. Only needed for training.
      stop_token_targets: stored on the model as-is; not otherwise read by this method.
      global_step: forwarded to the training helper when training.
    '''
    with tf.variable_scope('inference'):
        hp = self._hparams
        is_training = linear_targets is not None
        batch_size = tf.shape(inputs)[0]
        frames_per_step = hp.outputs_per_step

        # Character embeddings.
        embedding_table = tf.get_variable(
            'embedding',
            [len(symbols), hp.embed_depth],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

        # Encoder: prenet followed by the CBHG module.
        prenet_outputs = prenet(embedded_inputs, is_training, hp.prenet_depths)
        encoder_outputs = encoder_cbhg(
            prenet_outputs, input_lengths, is_training, hp.encoder_depth)

        # Location sensitive attention over the encoder outputs.
        attention_mechanism = LocationSensitiveAttention(
            hp.attention_depth, encoder_outputs)

        # Decoder RNN stack (bottom to top): two residual GRU layers.
        residual_layers = [
            ResidualWrapper(GRUCell(hp.decoder_depth)) for _ in range(2)
        ]
        multi_rnn_cell = MultiRNNCell(residual_layers, state_is_tuple=True)

        # Projection to r mel frames per decoder step.
        frame_projection = FrameProjection(hp.num_mels * frames_per_step)

        # Projection to r stop-token values per decoder step.
        stop_projection = StopProjection(is_training, shape=frames_per_step)

        decoder_cell = TacotronDecoderWrapper(
            is_training, attention_mechanism, multi_rnn_cell,
            frame_projection, stop_projection)

        # Teacher forcing when training, free-running otherwise.
        if is_training:
            helper = TacoTrainingHelper(
                inputs, mel_targets, hp.num_mels, frames_per_step, global_step)
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels, frames_per_step)

        decoder_init_state = decoder_cell.zero_state(
            batch_size=batch_size, dtype=tf.float32)
        custom_decoder = CustomDecoder(decoder_cell, helper, decoder_init_state)
        decode_result = tf.contrib.seq2seq.dynamic_decode(
            custom_decoder, maximum_iterations=hp.max_frame_num)
        (decoder_outputs, stop_token_outputs, _), final_decoder_state, _ = decode_result

        # Unfold the grouped frames: one mel frame / one stop token per time step.
        mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])
        stop_token_outputs = tf.reshape(stop_token_outputs, [batch_size, -1])

        # Post-processing CBHG and projection to the linear spectrogram.
        post_outputs = post_cbhg(
            mel_outputs, hp.num_mels, is_training, hp.postnet_depth)
        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)

        # Alignment history is read directly off the final decoder state.
        alignment_history = final_decoder_state.alignment_history.stack()
        alignments = tf.transpose(alignment_history, [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.stop_token_outputs = stop_token_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        self.stop_token_targets = stop_token_targets

        log('Initialized Tacotron model. Dimensions: ')
        shape_report = [
            (' embedding: {}', embedded_inputs),
            (' prenet out: {}', prenet_outputs),
            (' encoder out: {}', encoder_outputs),
            (' decoder out (r frames): {}', decoder_outputs),
            (' decoder out (1 frame): {}', mel_outputs),
            (' postnet out: {}', post_outputs),
            (' linear out: {}', linear_outputs),
            (' stop token: {}', stop_token_outputs),
        ]
        for template, tensor in shape_report:
            log(template.format(tensor.shape))
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None,
               reference_mel=None):
    '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", "alignments", "encoder_outputs" and
    "style_embeddings" fields (plus the inputs/targets that were passed in).

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
        lengths of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
        number of steps in the output time series, M is num_mels, and values are entries in
        the mel spectrogram. Only needed for training (or teacher-forced generation).
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is
        number of steps in the output time series, F is num_freq, and values are entries in
        the linear spectrogram. Only needed for training.
      reference_mel: mel spectrogram fed to the reference encoder to derive the style
        embedding. Overridden by mel_targets when training. When it is None, the style
        embedding is built from randomly weighted style tokens, which requires hp.use_gst.

    Raises:
      ValueError: if no reference mel is available and hp.use_gst is disabled, because
        there is then nothing to build a style embedding from.
    '''
    with tf.variable_scope('inference'):
        is_training = linear_targets is not None
        is_teacher_force_generating = mel_targets is not None
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'text_embedding', [len(symbols), 256], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)  # [N, T_in, 256]

        # Global style tokens (GST); stays None when GST is disabled so that the
        # reference-free branch below can detect the unsupported configuration.
        gst_tokens = None
        if hp.use_gst:
            gst_tokens = tf.get_variable(
                'style_tokens', [hp.num_gst, 256 // hp.num_heads], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            self.gst_tokens = gst_tokens

        # Encoder
        prenet_outputs = prenet(embedded_inputs, is_training)
        encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training)

        # During training the target itself serves as the style reference.
        if is_training:
            reference_mel = mel_targets

        if reference_mel is not None:
            # Reference encoder
            refnet_outputs = reference_encoder(
                reference_mel,
                filters=[32, 32, 64, 64, 128, 128],
                kernel_size=(3, 3),
                strides=(2, 2),
                encoder_cell=GRUCell(128),
                is_training=is_training)
            self.refnet_outputs = refnet_outputs

            if hp.use_gst:
                # Style attention: the reference embedding queries the style tokens.
                style_attention = MultiheadAttention(
                    tf.expand_dims(refnet_outputs, axis=1),
                    tf.tile(tf.expand_dims(gst_tokens, axis=0), [batch_size, 1, 1]),
                    num_heads=hp.num_heads,
                    num_units=128,
                    attention_type=hp.style_att_type)

                # Apply tanh to compress both encoder state and style embedding to the
                # same scale.
                style_embeddings = tf.nn.tanh(style_attention.multi_head_attention())
            else:
                style_embeddings = tf.expand_dims(refnet_outputs, axis=1)
        else:
            if gst_tokens is None:
                # Previously this path failed with a NameError on `gst_tokens`.
                raise ValueError(
                    'A reference mel is required for inference when hp.use_gst is '
                    'disabled: there are no style tokens to fall back on.')
            print("Use random weight for GST.")
            random_weights = tf.random_uniform(
                [hp.num_heads, hp.num_gst], maxval=1.0, dtype=tf.float32)
            random_weights = tf.nn.softmax(random_weights, name="random_weights")
            style_embeddings = tf.nn.tanh(tf.matmul(random_weights, gst_tokens))
            style_embeddings = tf.reshape(
                style_embeddings,
                [1, 1, hp.num_heads * gst_tokens.get_shape().as_list()[1]])
            # Repeat the single style vector for every batch entry; tf.concat below does
            # not broadcast, so a batch dimension of 1 only worked for N == 1.
            style_embeddings = tf.tile(style_embeddings, [batch_size, 1, 1])

        # Add style embedding to every text encoder state
        style_embeddings = tf.tile(
            style_embeddings, [1, shape_list(encoder_outputs)[1], 1])
        encoder_outputs = tf.concat([encoder_outputs, style_embeddings], axis=-1)

        # Attention
        attention_cell = AttentionWrapper(
            DecoderPrenetWrapper(GRUCell(256), is_training),
            BahdanauAttention(256, encoder_outputs, memory_sequence_length=input_lengths),
            alignment_history=True,
            output_attention=False)

        # Concatenate attention context vector and RNN cell output.
        concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell([
            OutputProjectionWrapper(concat_cell, 256),
            ResidualWrapper(ZoneoutWrapper(LSTMCell(256), 0.1)),
            ResidualWrapper(ZoneoutWrapper(LSTMCell(256), 0.1)),
        ], state_is_tuple=True)

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(
            decoder_cell, hp.num_mels * hp.outputs_per_step)
        decoder_init_state = output_cell.zero_state(
            batch_size=batch_size, dtype=tf.float32)

        if is_training or is_teacher_force_generating:
            helper = TacoTrainingHelper(
                inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

        # Reshape outputs to be one output per entry
        mel_outputs = tf.reshape(
            decoder_outputs, [batch_size, -1, hp.num_mels])  # [N, T_out, M]

        # Add post-processing CBHG:
        post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)
        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # [N, T_out, F]

        # Grab alignments from the final decoder state:
        alignments = tf.transpose(
            final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.mel_outputs = mel_outputs
        self.encoder_outputs = encoder_outputs
        self.style_embeddings = style_embeddings
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        self.reference_mel = reference_mel

        log('Initialized Tacotron model. Dimensions: ')
        log(' text embedding: %d' % embedded_inputs.shape[-1])
        log(' style embedding: %d' % style_embeddings.shape[-1])
        log(' prenet out: %d' % prenet_outputs.shape[-1])
        log(' encoder out: %d' % encoder_outputs.shape[-1])
        log(' attention out: %d' % attention_cell.output_size)
        log(' concat attn & out: %d' % concat_cell.output_size)
        log(' decoder cell out: %d' % decoder_cell.output_size)
        log(' decoder out (%d frames): %d' %
            (hp.outputs_per_step, decoder_outputs.shape[-1]))
        log(' decoder out (1 frame): %d' % mel_outputs.shape[-1])
        log(' postnet out: %d' % post_outputs.shape[-1])
        log(' linear out: %d' % linear_outputs.shape[-1])
def initialize(self, inputs, input_lengths, num_speakers, speaker_id, mel_targets=None,
               linear_targets=None, loss_coeff=None, rnn_decoder_test_mode=False,
               is_randomly_initialized=False):
    '''Builds the (optionally multi-speaker) Tacotron inference graph.

    Sets "mel_outputs", "linear_outputs", "alignments" and "final_decoder_state" fields,
    creates the "is_manual_attention" / "manual_alignments" placeholders, and stores the
    inputs/targets that were passed in. Training mode is inferred from `linear_targets`
    being supplied.

    Args:
      inputs: Tensor of input symbol IDs; looked up in the text embedding table.
      input_lengths: lengths of the sequences in inputs; forwarded to the encoder CBHG.
      num_speakers: number of speakers; speaker conditioning is only built when > 1.
      speaker_id: speaker IDs used to look up the speaker embeddings.
      mel_targets: mel spectrogram targets; fed to the training helper when training.
      linear_targets: linear spectrogram targets; only needed for training.
      loss_coeff: stored on the model as-is; not otherwise read by this method.
      rnn_decoder_test_mode: forwarded to the training helper.
      is_randomly_initialized: stored on the model as-is.

    Raises:
      Exception: on an unknown hp.model_type (multi-speaker only), an unknown
        hp.attention_type, or a shape mismatch between a speaker-dependent initial state
        and the decoder state it replaces.
    '''
    is_training = linear_targets is not None
    self.is_randomly_initialized = is_randomly_initialized

    with tf.variable_scope('inference'):
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'embedding', [len(symbols), hp.embed_depth], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

        self.num_speakers = num_speakers
        if self.num_speakers > 1:
            if hp.speaker_embedding_size != 1:
                speaker_embed_table = tf.get_variable(
                    'speaker_embedding',
                    [self.num_speakers, hp.speaker_embedding_size],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=0.5))
                speaker_embed = tf.nn.embedding_lookup(speaker_embed_table, speaker_id)

            if hp.model_type == 'deepvoice':
                if hp.speaker_embedding_size == 1:
                    # One dedicated embedding table per conditioning site.
                    before_highway = get_embed(
                        speaker_id, self.num_speakers,
                        hp.enc_prenet_sizes[-1], "before_highway")
                    encoder_rnn_init_state = get_embed(
                        speaker_id, self.num_speakers,
                        hp.enc_rnn_size * 2, "encoder_rnn_init_state")
                    attention_rnn_init_state = get_embed(
                        speaker_id, self.num_speakers,
                        hp.attention_state_size, "attention_rnn_init_state")
                    decoder_rnn_init_states = [
                        get_embed(
                            speaker_id, self.num_speakers, hp.dec_rnn_size,
                            "decoder_rnn_init_states{}".format(idx + 1))
                        for idx in range(hp.dec_layer_num)
                    ]
                else:
                    # One shared speaker embedding, projected separately for each site.
                    def deep_dense(x, dim):
                        return tf.layers.dense(x, dim, activation=tf.nn.softsign)

                    before_highway = deep_dense(speaker_embed, hp.enc_prenet_sizes[-1])
                    encoder_rnn_init_state = deep_dense(
                        speaker_embed, hp.enc_rnn_size * 2)
                    attention_rnn_init_state = deep_dense(
                        speaker_embed, hp.attention_state_size)
                    decoder_rnn_init_states = [
                        deep_dense(speaker_embed, hp.dec_rnn_size)
                        for _ in range(hp.dec_layer_num)
                    ]

                speaker_embed = None  # deepvoice does not use speaker_embed directly
            elif hp.model_type == 'simple':
                # NOTE(review): with speaker_embedding_size == 1 no speaker_embed is
                # defined on this path, yet it is read below - confirm that combination
                # is never configured.
                before_highway = None
                encoder_rnn_init_state = None
                attention_rnn_init_state = None
                decoder_rnn_init_states = None
            else:
                raise Exception(
                    " [!] Unknown multi-speaker model type: {}".format(hp.model_type))
        else:
            speaker_embed = None
            before_highway = None
            encoder_rnn_init_state = None
            attention_rnn_init_state = None
            decoder_rnn_init_states = None

        # Encoder
        prenet_outputs = prenet(
            embedded_inputs, is_training, hp.enc_prenet_sizes, hp.dropout_prob,
            scope='prenet')
        encoder_outputs = cbhg(
            prenet_outputs, input_lengths, is_training,
            hp.enc_bank_size, hp.enc_bank_channel_size,
            hp.enc_maxpool_width, hp.enc_highway_depth, hp.enc_rnn_size,
            hp.enc_proj_sizes, hp.enc_proj_width,
            scope="encoder_cbhg",
            before_highway=before_highway,
            encoder_rnn_init_state=encoder_rnn_init_state)

        # Attention
        # Placeholders for manual control of attention.
        self.is_manual_attention = tf.placeholder(
            tf.bool, shape=(), name='is_manual_attention',
        )
        self.manual_alignments = tf.placeholder(
            tf.float32, shape=[None, None, None], name="manual_alignments",
        )

        dec_prenet_outputs = DecoderPrenetWrapper(
            GRUCell(hp.attention_state_size), speaker_embed, is_training,
            hp.dec_prenet_sizes, hp.dropout_prob)

        if hp.attention_type == 'bah_mon':
            attention_mechanism = BahdanauMonotonicAttention(
                hp.attention_size, encoder_outputs)
        elif hp.attention_type == 'bah_norm':
            attention_mechanism = BahdanauAttention(
                hp.attention_size, encoder_outputs, normalize=True)
        elif hp.attention_type == 'luong_scaled':
            attention_mechanism = LuongAttention(
                hp.attention_size, encoder_outputs, scale=True)
        elif hp.attention_type == 'luong':
            attention_mechanism = LuongAttention(hp.attention_size, encoder_outputs)
        elif hp.attention_type == 'bah':
            attention_mechanism = BahdanauAttention(hp.attention_size, encoder_outputs)
        elif hp.attention_type.startswith('ntm2'):
            # The shift width is encoded as the suffix after the last '-'.
            shift_width = int(hp.attention_type.split('-')[-1])
            attention_mechanism = NTMAttention2(
                hp.attention_size, encoder_outputs, shift_width=shift_width)
        else:
            raise Exception(" [!] Unkown attention type: {}".format(hp.attention_type))

        attention_cell = AttentionWrapper(
            dec_prenet_outputs,
            attention_mechanism,
            self.is_manual_attention,
            self.manual_alignments,
            initial_cell_state=attention_rnn_init_state,
            alignment_history=True,
            output_attention=False)

        # Concatenate attention context vector and RNN cell output (plus the speaker
        # embedding when one is used directly).
        concat_cell = ConcatOutputAndAttentionWrapper(
            attention_cell, embed_to_concat=speaker_embed)

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell(
            [
                OutputProjectionWrapper(concat_cell, hp.dec_rnn_size),
                ResidualWrapper(GRUCell(hp.dec_rnn_size)),
                ResidualWrapper(GRUCell(hp.dec_rnn_size)),
            ],
            state_is_tuple=True)

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(
            decoder_cell, hp.num_mels * hp.reduction_factor)
        decoder_init_state = output_cell.zero_state(
            batch_size=batch_size, dtype=tf.float32)

        # Speaker-dependent initial states only exist for multi-speaker deepvoice models;
        # the None guard keeps a single-speaker deepvoice configuration from failing on
        # enumerate(None).
        if hp.model_type == "deepvoice" and decoder_rnn_init_states is not None:
            # decoder_init_state[0] : AttentionWrapperState
            #  = cell_state + attention + time + alignments + alignment_history
            # decoder_init_state[0][0] = attention_rnn_init_state (already applied)
            decoder_init_state = list(decoder_init_state)

            for idx, cell in enumerate(decoder_rnn_init_states):
                shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                shape2 = cell.get_shape().as_list()
                if shape1 != shape2:
                    raise Exception(
                        " [!] Shape {} and {} should be equal".format(shape1, shape2))
                decoder_init_state[idx + 1] = cell

            decoder_init_state = tuple(decoder_init_state)

        if is_training:
            helper = TacoTrainingHelper(
                inputs, mel_targets, hp.num_mels, hp.reduction_factor,
                rnn_decoder_test_mode)
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels, hp.reduction_factor)

        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

        # Reshape outputs to be one output per entry
        mel_outputs = tf.reshape(
            decoder_outputs, [batch_size, -1, hp.num_mels])  # [N, T_out, M]

        # Add post-processing CBHG:
        post_outputs = cbhg(
            mel_outputs, None, is_training,
            hp.post_bank_size, hp.post_bank_channel_size,
            hp.post_maxpool_width, hp.post_highway_depth, hp.post_rnn_size,
            hp.post_proj_sizes, hp.post_proj_width,
            scope='post_cbhg')

        # The 'simple' model conditions the output layer by prepending the speaker
        # embedding to every post-net frame.
        if speaker_embed is not None and hp.model_type == 'simple':
            expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
            tiled_speaker_embedding = tf.tile(
                expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1])
            post_outputs = tf.concat(
                [tiled_speaker_embedding, post_outputs], axis=-1)

        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # [N, T_out, F]

        # Grab alignments from the final decoder state:
        alignments = tf.transpose(
            final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.speaker_id = speaker_id
        self.input_lengths = input_lengths
        self.loss_coeff = loss_coeff
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        self.final_decoder_state = final_decoder_state

        log('=' * 40)
        log(' model_type: %s' % hp.model_type)
        log('=' * 40)

        log('Initialized Tacotron model. Dimensions: ')
        log(' embedding: %d' % embedded_inputs.shape[-1])
        if speaker_embed is not None:
            log(' speaker embedding: %d' % speaker_embed.shape[-1])
        else:
            log(' speaker embedding: None')
        log(' prenet out: %d' % prenet_outputs.shape[-1])
        log(' encoder out: %d' % encoder_outputs.shape[-1])
        log(' attention out: %d' % attention_cell.output_size)
        log(' concat attn & out: %d' % concat_cell.output_size)
        log(' decoder cell out: %d' % decoder_cell.output_size)
        # Use the same hyper-parameter that sized the output projection above.
        log(' decoder out (%d frames): %d' %
            (hp.reduction_factor, decoder_outputs.shape[-1]))
        log(' decoder out (1 frame): %d' % mel_outputs.shape[-1])
        log(' postnet out: %d' % post_outputs.shape[-1])
        log(' linear out: %d' % linear_outputs.shape[-1])
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None,
               identities=None, id_num=0):
    '''Builds the inference graph of the model.

    Sets the "mel_outputs", "linear_outputs" and "alignments" fields (together with the
    inputs, identities and targets that were passed in). Training mode is inferred from
    `linear_targets` being supplied.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
        lengths of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
        number of steps in the output time series, M is num_mels, and values are entries in
        the mel spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is
        number of steps in the output time series, F is num_freq, and values are entries in
        the linear spectrogram. Only needed for training.
      identities: identity (speaker) IDs looked up in the identity embedding table; the
        identity embedding is only used when this is given and id_num > 1.
      id_num: number of rows in the identity embedding table.
    '''
    with tf.variable_scope('inference'):
        hp = self._hparams
        is_training = linear_targets is not None
        batch_size = tf.shape(inputs)[0]
        frames_per_step = hp.outputs_per_step

        # Text embeddings.
        embedding_num = len(symbols2)
        embedding_text_table = tf.get_variable(
            'embedding',
            [embedding_num, hp.embedding_text_channels],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_text_inputs = tf.nn.embedding_lookup(embedding_text_table, inputs)

        use_identity_embedding = identities is not None and id_num > 1
        if not use_identity_embedding:
            embedded_inputs = embedded_text_inputs
            log('single speaker')
        else:
            embedding_id_table = tf.get_variable(
                'embedding_id',
                [id_num, hp.embedding_id_channels],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            # One identity vector per batch entry: [N, embedding_id_channels].
            identity_vectors = tf.nn.embedding_lookup(embedding_id_table, identities)
            # Repeat it along the time axis so it can be appended to every text step.
            identity_vectors = tf.expand_dims(identity_vectors, 1)
            identity_vectors = tf.tile(
                identity_vectors, [1, tf.shape(inputs)[1], 1])
            embedded_inputs = tf.concat([embedded_text_inputs, identity_vectors], 2)
            log('multi-speaker')

        # Encoder: prenet followed by the CBHG module.
        prenet_outputs = prenet(embedded_inputs, is_training)
        encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training)

        # Attention over the encoder outputs, with the decoder prenet inside the wrapper.
        prenet_gru = DecoderPrenetWrapper(GRUCell(256), is_training)
        attention_mechanism = BahdanauAttention(256, encoder_outputs)
        attention_cell = AttentionWrapper(
            prenet_gru,
            attention_mechanism,
            alignment_history=True,
            output_attention=False)

        # Attention context vector and RNN cell output are concatenated into one vector.
        concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)

        # Decoder stack, bottom to top: projection of the concat cell, then two residual GRUs.
        decoder_layers = [OutputProjectionWrapper(concat_cell, 256)]
        decoder_layers.extend(ResidualWrapper(GRUCell(256)) for _ in range(2))
        decoder_cell = MultiRNNCell(decoder_layers, state_is_tuple=True)

        # Each RNN step predicts `frames_per_step` mel frames at once.
        output_cell = OutputProjectionWrapper(
            decoder_cell, hp.num_mels * frames_per_step)
        decoder_init_state = output_cell.zero_state(
            batch_size=batch_size, dtype=tf.float32)

        # Teacher forcing when training, free-running otherwise.
        if is_training:
            print('training')
            helper = TacoTrainingHelper(
                inputs, mel_targets, hp.num_mels, frames_per_step)
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels, frames_per_step)

        basic_decoder = BasicDecoder(output_cell, helper, decoder_init_state)
        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            basic_decoder, maximum_iterations=hp.max_iters)

        # Unfold the grouped frames so that there is one mel frame per time step.
        mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])

        # Post-processing CBHG and projection to the linear spectrogram.
        post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)
        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)

        # The attention state is the first entry of the MultiRNNCell state tuple.
        alignment_history = final_decoder_state[0].alignment_history.stack()
        alignments = tf.transpose(alignment_history, [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.identities = identities
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets

        log('Initialized Tacotron model. Dimensions: ')
        dimension_report = [
            ('embedding: %d', embedded_inputs.shape[-1]),
            ('prenet out: %d', prenet_outputs.shape[-1]),
            ('encoder out: %d', encoder_outputs.shape[-1]),
            ('attention out: %d', attention_cell.output_size),
            ('concat attn & out: %d', concat_cell.output_size),
            ('decoder cell out: %d', decoder_cell.output_size),
            ('decoder out (%d frames): %d',
             (hp.outputs_per_step, decoder_outputs.shape[-1])),
            ('decoder out (1 frame): %d', mel_outputs.shape[-1]),
            ('postnet out: %d', post_outputs.shape[-1]),
            ('linear out: %d', linear_outputs.shape[-1]),
        ]
        for template, values in dimension_report:
            log(template % values)
def presentation_transformer(self, inputs, inputs_actual_length):
    """Encode a batch of sequences into one pooled vector per example.

    The input runs through a bidirectional GRU, layer normalization, the
    encoder stack and finally an additive attention pooling over axis 1.

    Args:
        inputs: Tensor fed to ``tf.nn.bidirectional_dynamic_rnn``; the
            original note describes it as [batch_size, n_steps, n_input].
        inputs_actual_length: Per-example lengths, forwarded as
            ``sequence_length`` to the RNN.

    Returns:
        The attention-weighted sum of the encoder-stack output along axis 1.
    """
    with tf.variable_scope('presentation_layer', reuse=tf.AUTO_REUSE):
        with tf.name_scope('structure_presentation_layer'):
            # Forward direction: a bare cell and its dropout-wrapped twin.
            forward_gru = GRUCell(num_units=self.hidden_num)
            forward_gru_dropout = DropoutWrapper(
                forward_gru, output_keep_prob=self.keep_prob)
            # Backward direction.
            backward_gru = GRUCell(num_units=self.hidden_num)
            backward_gru_dropout = DropoutWrapper(
                backward_gru, output_keep_prob=self.keep_prob)

            # Output dropout is only applied while training and not
            # extracting; otherwise the bare cells are used.
            apply_dropout = self.is_train and not self.is_extract
            if apply_dropout:
                chosen_fw, chosen_bw = forward_gru_dropout, backward_gru_dropout
            else:
                chosen_fw, chosen_bw = forward_gru, backward_gru

            # The dynamic RNN returns a pair of output sequences, one per
            # direction.
            directional_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=chosen_fw,
                cell_bw=chosen_bw,
                inputs=inputs,
                sequence_length=inputs_actual_length,
                dtype=tf.float32)

            # Merge both directions at every time step, then normalize.
            merged_directions = tf.concat(directional_outputs, axis=2)
            structure_output = self.layer_normalization(merged_directions)

        with tf.name_scope('transformer_layer'):
            transformer_output = self.encoder_stack(structure_output,
                                                    self.is_train)

        with tf.name_scope('global_attention_layer'):
            attention_dim = self.attention_num
            w_omega = tf.get_variable(
                name='w_omega',
                shape=[self.hidden_num * 2, attention_dim],
                initializer=tf.random_normal_initializer())
            b_omega = tf.get_variable(
                name='b_omega',
                shape=[attention_dim],
                initializer=tf.random_normal_initializer())
            u_omega = tf.get_variable(
                name='u_omega',
                shape=[attention_dim],
                initializer=tf.random_normal_initializer())

            projected = tf.tensordot(transformer_output, w_omega, axes=1)
            hidden = tf.tanh(projected + b_omega)
            scores = tf.tensordot(hidden, u_omega, axes=1, name='vu')  # (B,T) shape
            alphas = tf.nn.softmax(scores, name='alphas')  # (B,T) shape

            # expand_dims appends a trailing axis so the weights broadcast
            # over the feature dimension.
            weighted = transformer_output * tf.expand_dims(alphas, -1)
            global_attention_output = tf.reduce_sum(weighted, 1)
    return global_attention_output
def initialize(self, text_inputs, input_lengths, speaker_ids,
               mel_targets=None, linear_targets=None):
    '''Builds the Tacotron graph and stores its tensors on the instance.

    Sets "inputs", "input_lengths", "mel_outputs", "linear_outputs",
    "audio", "alignments", "mel_targets" and "linear_targets". Training
    mode is selected whenever linear_targets is provided.

    Args:
      text_inputs: int32 Tensor with shape [N, T_in] where N is batch size,
        T_in is number of steps in the input time series, and values are
        character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and
        values are the lengths of each sequence in inputs.
      speaker_ids: int32 Tensor containing ids of specific speakers
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch
        size, T_out is number of steps in the output time series, M is
        num_mels, and values are entries in the mel spectrogram. Only
        needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is
        batch_size, T_out is number of steps in the output time series, F
        is num_freq, and values are entries in the linear spectrogram.
        Only needed for training.
    '''
    with tf.variable_scope('inference'):
        training = linear_targets is not None
        num_examples = tf.shape(text_inputs)[0]
        hparams = self._hparams

        char_embeddings = embedding(
            text_inputs, len(symbols), hparams.embedding_dim)  # [N, T_in, embd_size]

        # Speaker embedding is only looked up in the multi-speaker setup.
        with tf.variable_scope('speaker'):
            speaker_embd = None
            if hparams.num_speakers > 1:
                table_shape = (hparams.num_speakers, hparams.speaker_embed_dim)
                # TODO: what about special
                # initializer=tf.truncated_normal_initializer(stddev=0.5)?
                speaker_table = tf.get_variable(
                    'speaker_embed', shape=table_shape, dtype=tf.float32)
                speaker_embd = tf.nn.embedding_lookup(speaker_table,
                                                      speaker_ids)

        # Encoder: prenet followed by the CBHG module.
        prenet_drop_rate = hparams.drop_rate if training else 0.0
        prenet_outputs = prenet(
            inputs=char_embeddings,
            drop_rate=prenet_drop_rate,
            is_training=training,
            layer_sizes=hparams.encoder_prenet,
            scope="prenet")  # [N, T_in, 128]
        encoder_outputs = cbhg(
            prenet_outputs,
            input_lengths,
            speaker_embd=speaker_embd,
            is_training=training,
            K=hparams.encoder_cbhg_banks,
            c=hparams.encoder_cbhg_bank_sizes,
            scope='encoder_cbhg')  # [N, T_in, 256]

        # Attention mechanism
        attention_cell = attention_decoder(
            encoder_outputs,
            hparams.attention_dim,
            input_lengths,
            training,
            speaker_embd=speaker_embd,
            attention_type=hparams.attention_type)

        # Decoder stack, listed bottom to top.
        decoder_layers = [
            OutputProjectionWrapper(attention_cell, hparams.decoder_dim),
            ResidualWrapper(GRUCell(hparams.decoder_dim)),
            ResidualWrapper(GRUCell(hparams.decoder_dim)),
        ]
        decoder_cell = MultiRNNCell(decoder_layers,
                                    state_is_tuple=True)  # [N, T_in, 256]

        # Each RNN step predicts r mel frames at once.
        frames_per_step = hparams.outputs_per_step
        output_cell = OutputProjectionWrapper(
            decoder_cell, hparams.num_mels * frames_per_step)
        decoder_init_state = output_cell.zero_state(batch_size=num_examples,
                                                    dtype=tf.float32)

        if training:
            helper = TacoTrainingHelper(text_inputs, mel_targets,
                                        hparams.num_mels, frames_per_step)
        else:
            helper = TacoTestHelper(num_examples, hparams.num_mels,
                                    frames_per_step)

        decoder = BasicDecoder(output_cell, helper, decoder_init_state)
        (decoder_outputs, _), final_decoder_state, _ = \
            tf.contrib.seq2seq.dynamic_decode(
                decoder,
                maximum_iterations=hparams.max_iters)  # [N, T_out/r, M*r]

        # Unfold the r frames per step into one frame per entry.
        mel_outputs = tf.reshape(
            decoder_outputs,
            [num_examples, -1, hparams.num_mels])  # [N, T_out, M]

        # Post-processing CBHG and projection to the linear spectrogram.
        post_bank_sizes = hparams.post_cbhg_bank_sizes + [hparams.num_mels]
        post_outputs = cbhg(
            mel_outputs,
            None,
            speaker_embd=None,
            is_training=training,
            K=hparams.post_cbhg_banks,
            c=post_bank_sizes,
            scope='post_cbhg')  # [N, T_out, 256]
        linear_outputs = tf.layers.dense(post_outputs,
                                         hparams.num_freq)  # [N, T_out, F]

        # Alignments come from the first layer of the final decoder state.
        alignment_history = final_decoder_state[0].alignment_history
        alignments = tf.transpose(alignment_history.stack(), [1, 2, 0])

        self.inputs = text_inputs
        self.input_lengths = input_lengths
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.audio = audio.inv_spectrogram_tensorflow(linear_outputs)
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets

        log('Initialized Tacotron model. Dimensions: ')
        # TODO: later work around for getting the raw attention output size
        # back; the wrapped cell currently only reports the concatenated
        # size.
        dimension_report = [
            (' embedding: %d', char_embeddings.shape[-1]),
            (' prenet out: %d', prenet_outputs.shape[-1]),
            (' encoder out: %d', encoder_outputs.shape[-1]),
            (' concat attn & out: %d', attention_cell.output_size),
            (' decoder cell out: %d', decoder_cell.output_size),
            (' decoder out (%d frames): %d',
             (frames_per_step, decoder_outputs.shape[-1])),
            (' decoder out (1 frame): %d', mel_outputs.shape[-1]),
            (' postnet out: %d', post_outputs.shape[-1]),
            (' linear out: %d', linear_outputs.shape[-1]),
        ]
        for template, value in dimension_report:
            log(template % value)
def Generator_GRU_CL_VL_TH(n_samples, charmap_len, seq_len=None, gt=None):
    """Build the GRU generator graph under the "Generator" variable scope.

    Args:
        n_samples: Number of sequences generated per batch; used to tile the
            learned start-of-sequence input.
        charmap_len: Size of the character map (softmax width).
        seq_len: Optional sequence-length tensor. When omitted an int32
            placeholder named "ground_truth_sequence_length" is created.
        gt: Optional ground truth. When given, a training op is built and
            the inference op reuses its variables.

    Returns:
        A ``(train_pred, inference_op)`` pair; ``train_pred`` is ``None``
        when ``gt`` is ``None``.
    """

    def uniform_variable(shape):
        # All generator parameters share the same small uniform init.
        return tf.Variable(tf.random_uniform(shape, minval=-0.1, maxval=0.1))

    with tf.variable_scope("Generator"):
        noise, _ = get_noise()
        state_size = FLAGS.GEN_STATE_SIZE

        gru_stack = [GRUCell(state_size)
                     for _ in range(FLAGS.GEN_GRU_LAYERS)]

        # Train and inference get their own initial states so the two paths
        # stay decoupled.
        train_initial_states = create_initial_states(noise)
        inference_initial_states = create_initial_states(noise)

        sm_weight = uniform_variable([state_size, charmap_len])
        sm_bias = uniform_variable([charmap_len])
        embedding = uniform_variable([charmap_len, state_size])

        # Learned first input, repeated once per sample.
        start_vector = uniform_variable([state_size])
        char_input = tf.reshape(tf.tile(start_vector, [n_samples]),
                                [n_samples, 1, state_size])

        if seq_len is None:
            seq_len = tf.placeholder(tf.int32, None,
                                     name="ground_truth_sequence_length")

        # Ground truth present means we are training; the inference graph
        # then reuses the variables created by the training graph.
        has_ground_truth = gt is not None
        train_pred = None
        if has_ground_truth:
            train_pred = get_train_op(gru_stack, char_input, charmap_len,
                                      embedding, gt, n_samples, state_size,
                                      seq_len, sm_bias, sm_weight,
                                      train_initial_states)
        inference_op = get_inference_op(gru_stack, char_input, embedding,
                                        seq_len, sm_bias, sm_weight,
                                        inference_initial_states, state_size,
                                        charmap_len, reuse=has_ground_truth)
        return train_pred, inference_op
def __init__(self, src_vocab_sz, tgt_vocab_sz, size, batch_size, learn_rate,
             train=True):
    """Build the graph for the Seq2SeqModel.

    Args:
        src_vocab_sz: Number of source vocab tokens.
        tgt_vocab_sz: Number of target vocab tokens.
        size: Embedding width and encoder GRU size; the decoder GRU uses
            ``2 * size`` to match the concatenated bidirectional state.
        batch_size: Size of each training batch. Not referenced while
            building the graph; the batch size is read from ``enc_inputs``.
        learn_rate: Learning rate for the Adam optimizer.
        train: Whether or not the model is for training. When true the
            loss and optimizer ops are added.
    """
    self.PAD_ID = 0
    self.EOS_ID = 1
    self.src_vocab_sz = src_vocab_sz
    self.tgt_vocab_sz = tgt_vocab_sz
    self.embed_size = size
    self.enc_cell = GRUCell(size)
    self.dec_cell = GRUCell(size * 2)
    self.train = train

    # Initialize placeholders (time-major: [max_time, batch]).
    self.enc_inputs = tf.placeholder(shape=(None, None), dtype=tf.int32,
                                     name="enc_inputs")
    self.enc_inputs_len = tf.placeholder(shape=(None, ), dtype=tf.int32,
                                         name="enc_inputs_len")
    self.dec_targets = tf.placeholder(shape=(None, None), dtype=tf.int32,
                                      name="dec_targets")

    # Create embedding matrices. The range must be a real interval: with
    # minval == maxval every row is the same constant and all tokens start
    # out indistinguishable.
    self.src_embed_matrix = tf.Variable(
        tf.random_uniform([self.src_vocab_sz, self.embed_size], -1.0, 1.0),
        dtype=tf.float32)
    self.tgt_embed_matrix = tf.Variable(
        tf.random_uniform([self.tgt_vocab_sz, self.embed_size], -1.0, 1.0),
        dtype=tf.float32)

    # Prepare the encoder.
    # NOTE(review): the same cell instance serves both directions, as in the
    # original code; confirm whether sharing weights between the forward and
    # backward pass is intended.
    self.enc_inputs_embedded = tf.nn.embedding_lookup(
        self.src_embed_matrix, self.enc_inputs)
    enc_outputs, enc_output_state = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=self.enc_cell,
        cell_bw=self.enc_cell,
        inputs=self.enc_inputs_embedded,
        sequence_length=self.enc_inputs_len,
        dtype=tf.float32,
        time_major=True)
    self.enc_outputs = tf.concat(enc_outputs, 2)
    self.enc_state = tf.concat(enc_output_state, 1)

    # Prepare the decoder.
    self.enc_max_time, self.batch_sz = tf.unstack(
        tf.shape(self.enc_inputs))
    self.dec_len = self.enc_inputs_len
    self.W = tf.Variable(tf.random_uniform([size * 2, tgt_vocab_sz], -1, 1),
                         dtype=tf.float32)
    self.b = tf.Variable(tf.zeros([tgt_vocab_sz]), dtype=tf.float32)
    self.pad_slice = tf.zeros([self.batch_sz], dtype=tf.int32)
    self.eos_slice = tf.ones([self.batch_sz], dtype=tf.int32)
    self.pad_step_embedded = tf.nn.embedding_lookup(
        self.tgt_embed_matrix, self.pad_slice)
    self.eos_step_embedded = tf.nn.embedding_lookup(
        self.tgt_embed_matrix, self.eos_slice)

    def loop_fn(time, prev_output, prev_state, prev_loop_state):
        """Step function for ``tf.nn.raw_rnn``."""
        if prev_state is None:
            # First call: raw_rnn passes no previous state. Start from the
            # encoder state and feed the embedded EOS token.
            elems_finished = (0 >= self.dec_len)
            _input = self.eos_step_embedded
            cell_state = self.enc_state
            return (elems_finished, _input, cell_state, None, None)

        def get_next_input():
            # Greedy feedback: the prediction is a target-vocabulary id, so
            # it has to be embedded with the target matrix.
            out_logits = tf.add(tf.matmul(prev_output, self.W), self.b)
            pred = tf.argmax(out_logits, axis=1)
            return tf.nn.embedding_lookup(self.tgt_embed_matrix, pred)

        elems_finished = (time >= self.dec_len)
        finished_cond = tf.reduce_all(elems_finished)
        _input = tf.cond(finished_cond, lambda: self.pad_step_embedded,
                         get_next_input)
        cell_state = prev_state
        output = prev_output
        loop_state = None
        return (elems_finished, _input, cell_state, output, loop_state)

    self.loop_function = loop_fn
    dec_outputs_ta, dec_state, _ = tf.nn.raw_rnn(self.dec_cell, loop_fn)
    self.dec_outputs = dec_outputs_ta.stack()
    self.dec_state = dec_state

    # Project every decoder output onto the target vocabulary.
    dec_max_time, dec_batch_sz, dec_dim = tf.unstack(
        tf.shape(self.dec_outputs))
    dec_outputs_flat = tf.reshape(self.dec_outputs, (-1, dec_dim))
    dec_logits_flat = tf.add(tf.matmul(dec_outputs_flat, self.W), self.b)
    self.dec_logits = tf.reshape(
        dec_logits_flat, (dec_max_time, dec_batch_sz, self.tgt_vocab_sz))
    self.dec_prediction = tf.argmax(self.dec_logits, 2)

    # Prepare the optimizer if training.
    if self.train:
        stepwise_crossent = tf.nn.softmax_cross_entropy_with_logits(
            labels=tf.one_hot(self.dec_targets,
                              depth=self.tgt_vocab_sz,
                              dtype=tf.float32),
            logits=self.dec_logits)
        self.loss = tf.reduce_mean(stepwise_crossent)
        self.opt = tf.train.AdamOptimizer(learn_rate).minimize(self.loss)

    # Created in both modes so a checkpoint can be restored for inference.
    self.saver = tf.train.Saver(tf.global_variables())
# Graph inputs: token ids, targets, true sequence lengths and the dropout
# keep probability.
batch_ph = tf.placeholder(dtype=tf.int32,
                          shape=[None, SEQUENCE_LENGTH],
                          name='batch_ph')
target_ph = tf.placeholder(dtype=tf.float32, shape=[None], name='target_ph')
seq_len_ph = tf.placeholder(dtype=tf.int32, shape=[None], name='seq_len_ph')
keep_prob_ph = tf.placeholder(dtype=tf.float32, name='keep_prob_ph')

# Embedding layer
with tf.name_scope('Embedding_layer'):
    embeddings_var = tf.Variable(
        tf.random_uniform([vocabulary_size, EMBEDDING_DIM], -1.0, 1.0),
        trainable=True)
    tf.summary.histogram('embeddings_var', embeddings_var)
    batch_embedded = tf.nn.embedding_lookup(embeddings_var, batch_ph)

# (Bi-)RNN layer(-s)
# rnn_outputs is the pair [fw_cell, bw_cell] outputs, where
# fw_cell.shape = bw_cell.shape = [batch_size, seq_len, n_hidden]
rnn_outputs, _ = bi_rnn(
    GRUCell(HIDDEN_SIZE),
    GRUCell(HIDDEN_SIZE),
    inputs=batch_embedded,
    sequence_length=seq_len_ph,
    dtype=tf.float32)
tf.summary.histogram('RNN_outputs', rnn_outputs)

# Attention layer
with tf.name_scope('Attention_layer'):
    attention_output, alphas = attention(rnn_outputs,
                                         ATTENTION_SIZE,
                                         return_alphas=True)
    tf.summary.histogram('alphas', alphas)

# Dropout
def __init__(self, embedding_size, init_embed, hidden_size,
             attention_size, max_sent_len, keep_prob, just_embed=True):
    """Build the siamese discriminator graph.

    Sentences are embedded, encoded with a shared bidirectional GRU and
    pooled with attention; pairs are then compared with ``self.distance``
    and trained with a contrastive loss.

    Args:
        embedding_size: Not referenced while building the graph.
        init_embed: Initial value of the trainable embedding matrix "W".
        hidden_size: Number of units of each GRU direction.
        attention_size: Size handed to the ``attention`` helper.
        max_sent_len: Last dimension of the sentence placeholders.
        keep_prob: Keep probability used for dropout on the attention
            output.
        just_embed: Not referenced while building the graph.
    """
    # training inputs
    self.input_x = tf.placeholder(tf.int32, [None, None, max_sent_len],
                                  name="input_x")
    self.sequence_length = tf.placeholder(tf.int32, [None, None],
                                          name="input_len")
    # NOTE(review): this placeholder is created but dropout below uses the
    # `keep_prob` constructor argument - confirm which one is intended.
    self.dropout_keep_prob = tf.placeholder(tf.float32,
                                            name="dropout_keep_prob")
    self.input_em = tf.placeholder(tf.int32, [None, max_sent_len],
                                   name="input_x_em")
    self.sequence_len_em = tf.placeholder(tf.int32, [None],
                                          name="input_len_em")
    self.is_train = tf.placeholder(tf.int32, (), name="is_train")

    with tf.variable_scope('siamese_discriminator'):
        per_class = tf.shape(self.input_x)[1]
        class_count = tf.shape(self.input_x)[0]

        with tf.name_scope("pair_inps"):
            paired = self.all_class_flattener(
                self.input_x, self.sequence_length, self.is_train)
            self.input, self.sequence_len, self.labels = paired

        with tf.name_scope("flatten_input"):
            self.inter_inp = self.merge_sents(self.input)
            flat_length_shape = [class_count * per_class * 4]
            self.inner_lens = tf.reshape(self.sequence_len,
                                         flat_length_shape)

        with tf.name_scope("embedding"):
            # Trainable embedding, initialised from init_embed.
            embedding_matrix = tf.Variable(init_embed, name="W",
                                           dtype=tf.float32)
            self.embedded_chars = tf.nn.embedding_lookup(
                embedding_matrix, self.inter_inp)
            self.embedded_chars_em = tf.nn.embedding_lookup(
                embedding_matrix, self.input_em)

        # RNN layer + attention; both inputs go through the same two cells.
        with tf.name_scope("bi-rnn"):
            self.gru1 = GRUCell(hidden_size)
            self.gru2 = GRUCell(hidden_size)
            pair_rnn_outputs, _ = bi_rnn(
                self.gru1,
                self.gru2,
                inputs=self.embedded_chars,
                sequence_length=self.inner_lens,
                dtype=tf.float32)
            embed_rnn_outputs, _ = bi_rnn(
                self.gru1,
                self.gru2,
                inputs=self.embedded_chars_em,
                sequence_length=self.sequence_len_em,
                dtype=tf.float32)
            self.attention_outputs, self.alphas = attention(
                pair_rnn_outputs, attention_size, return_alphas=True)
            self.attention_outputs_em, self.alphas_em = attention(
                embed_rnn_outputs, attention_size, return_alphas=True)
            self.output_em = tf.reduce_mean(self.attention_outputs_em,
                                            axis=0)
            dropped = tf.nn.dropout(self.attention_outputs, keep_prob)

        with tf.name_scope('flattener'):
            pair_shape = (class_count * per_class * 2, 2, -1)
            self.drop_outputs = tf.reshape(dropped, pair_shape)  # b,2,d

        with tf.name_scope('similarity_measure'):
            left = self.drop_outputs[:, 0]
            right = self.drop_outputs[:, 1]
            self.d1 = d1 = self.distance(left, right)
            # Contrastive loss: labelled pairs are penalised by their
            # squared distance, the others by the squared margin shortfall.
            labelled_term = self.labels * tf.square(d1)
            margin_term = (1 - self.labels) * tf.square(
                tf.maximum((1 - d1), 0))
            pair_loss = labelled_term + margin_term
            self.loss = tf.div(tf.reduce_mean(pair_loss), 2)

        with tf.name_scope("accuracy"):
            # Rounding the distance acts as an automatic 0.5 threshold.
            self.temp_sim = tf.subtract(tf.ones_like(self.d1),
                                        tf.rint(self.d1),
                                        name="temp_sim")
            hits = tf.equal(self.temp_sim, self.labels)
            self.accuracy = tf.reduce_mean(tf.cast(hits, "float"),
                                           name="accuracy")

    # Only variables that belong to this discriminator are optimised.
    self.params = []
    for variable in tf.trainable_variables():
        if 'siamese_discriminator' in variable.name:
            self.params.append(variable)
    for variable in self.params:
        print(variable.name)

    sd_optimizer = tf.train.AdamOptimizer(1e-4)
    grads_and_vars = sd_optimizer.compute_gradients(self.loss,
                                                    self.params,
                                                    aggregation_method=2)
    self.train_op = sd_optimizer.apply_gradients(grads_and_vars)
def rnn_layers(x,
               seq_length,
               training,
               hidden_num=100,
               layer_num=3,
               class_n=5,
               cell='LSTM',
               dtype=tf.float32):
    """Generate RNN layers.

    A stacked bidirectional RNN is followed by a learned per-unit mix of the
    two directions and a linear classification layer.

    Args:
        x (Float): A 3D-Tensor of shape [batch_size,max_time,channel]
        seq_length (Int): A 1D-Tensor of shape [batch_size], real length of
            each sequence.
        training (Boolean): A 0D-Tensor indicating if it's in training.
        hidden_num (int, optional): Defaults to 100. Size of the hidden
            state; both directions are concatenated, so the RNN output has
            size ``2 * hidden_num``.
        layer_num (int, optional): Defaults to 3. Number of layers in RNN.
        class_n (int, optional): Defaults to 5. Number of output classes.
        cell (str): One of 'LSTM', 'GRU', 'BNLSTM', the RNN cell used.
            BNLSTM stands for batch-normalization LSTM cell.
        dtype: dtype of the RNN and of the created variables.

    Returns:
        logits: A 3D Tensor of shape [batch_size, max_time, class_n]

    Raises:
        ValueError: If ``cell`` is not one of the supported names.
    """

    def _new_cell():
        # One fresh cell of the requested type.
        if cell == 'LSTM':
            return LSTMCell(hidden_num)
        if cell == 'GRU':
            return GRUCell(hidden_num)
        if cell == 'BNLSTM':
            return BNLSTMCell(hidden_num, training=training)
        raise ValueError("Cell type unrecognized.")

    cells_fw = []
    cells_bw = []
    for _ in range(layer_num):
        cells_fw.append(_new_cell())
        cells_bw.append(_new_cell())
    multi_cells_fw = tf.nn.rnn_cell.MultiRNNCell(cells_fw)
    multi_cells_bw = tf.nn.rnn_cell.MultiRNNCell(cells_bw)

    with tf.variable_scope('BDGRU_rnn') as scope:
        outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=multi_cells_fw,
            cell_bw=multi_cells_bw,
            inputs=x,
            sequence_length=seq_length,
            dtype=dtype,
            scope=scope)
        lasth = tf.concat(outputs, 2, name='birnn_output_concat')

    # shape of lasth [batch_size,max_time,hidden_num*2]
    batch_size = tf.shape(lasth)[0]
    # Prefer the static time dimension; when it is unknown at graph
    # construction time fall back to the dynamic shape so the reshapes
    # below still receive a usable value instead of None.
    max_time = lasth.get_shape().as_list()[1]
    if max_time is None:
        max_time = tf.shape(lasth)[1]

    with tf.variable_scope('rnn_fnn_layer'):
        # One weight per (direction, hidden unit) to merge the directions.
        weight_out = _variable_on_cpu(
            name='weights',
            shape=[2, hidden_num],
            initializer=tf.truncated_normal_initializer(
                stddev=np.sqrt(2.0 / (2 * hidden_num))),
            dtype=dtype)
        biases_out = _variable_on_cpu(name='bias',
                                      shape=[hidden_num],
                                      initializer=tf.zeros_initializer(),
                                      dtype=dtype)
        weight_class = _variable_on_cpu(
            name='weights_class',
            shape=[hidden_num, class_n],
            initializer=tf.truncated_normal_initializer(
                stddev=np.sqrt(2.0 / hidden_num)),
            dtype=dtype)
        bias_class = _variable_on_cpu(name='bias_class',
                                      shape=[class_n],
                                      initializer=tf.zeros_initializer(),
                                      dtype=dtype)

        # Split the concatenated output back into its two directions and
        # take their weighted sum.
        lasth_rs = tf.reshape(lasth, [batch_size, max_time, 2, hidden_num],
                              name='lasth_rs')
        lasth_output = tf.nn.bias_add(
            tf.reduce_sum(tf.multiply(lasth_rs, weight_out), axis=2),
            biases_out,
            name='lasth_bias_add')

        # Flatten time into the batch axis for the class projection.
        lasth_output_rs = tf.reshape(lasth_output,
                                     [batch_size * max_time, hidden_num],
                                     name='lasto_rs')
        logits = tf.reshape(
            tf.nn.bias_add(tf.matmul(lasth_output_rs, weight_class),
                           bias_class),
            [batch_size, max_time, class_n],
            name="rnn_logits_rs")
    return logits
def CBHG(
        input_Pattern,
        input_Length,
        scope,
        is_Training,
        conv_Bank_Filter_Count=128,
        conv_Bank_Max_Kernal_Size=16,
        max_Pooling_Size=2,
        conv_Projection_Filter_Count_and_Kernal_Size_List=((128, 3), (128, 3)),
        highway_Layer_Count=4,
        gru_Cell_Size=128,
):
    """Build a CBHG module: conv bank, projections, highways and a bi-GRU.

    Args:
        input_Pattern: Input tensor; it is added back to the projection
            output, so the last projection must produce the same number of
            channels as the input.
        input_Length: Sequence lengths passed to the bidirectional GRU.
        scope: Name of the enclosing variable scope.
        is_Training: Forwarded to every ``Conv1D`` layer.
        conv_Bank_Filter_Count: Filters of each convolution in the bank.
        conv_Bank_Max_Kernal_Size: The bank uses kernel sizes 1..this value.
        max_Pooling_Size: Pool size of the stride-1 max pooling.
        conv_Projection_Filter_Count_and_Kernal_Size_List: Sequence of
            ``(filter_count, kernel_size)`` pairs, one per projection layer.
            The default is an immutable tuple so it cannot be shared and
            mutated across calls; lists are accepted as before.
        highway_Layer_Count: Number of highway layers.
        gru_Cell_Size: Units per GRU direction and width of the highways.

    Returns:
        Forward and backward GRU outputs concatenated on the last axis.
    """
    with tf.variable_scope(scope):
        with tf.variable_scope('conv_Bank'):
            # Convolution bank: one layer per kernel size, concatenated on
            # the channel axis.
            bank_Layer_List = [
                Conv1D(input_Pattern,
                       filter_Count=conv_Bank_Filter_Count,
                       kernel_Size=kernel_Size,
                       activation=tf.nn.relu,
                       scope="conv1D_%d" % kernel_Size,
                       is_Training=is_Training)
                for kernel_Size in range(1, conv_Bank_Max_Kernal_Size + 1)
            ]
            conv_Bank_Activation = tf.concat(bank_Layer_List, axis=-1)

        # Max pooling (stride 1 keeps the time resolution).
        max_Pooling_Activation = tf.layers.max_pooling1d(
            conv_Bank_Activation,
            pool_size=max_Pooling_Size,
            strides=1,
            padding='same')

        # Convolution projections.
        conv_Projection_Activation = max_Pooling_Activation
        for index, (filter_Count, kernel_Size) in enumerate(
                conv_Projection_Filter_Count_and_Kernal_Size_List):
            conv_Projection_Activation = Conv1D(
                conv_Projection_Activation,
                filter_Count=filter_Count,
                kernel_Size=kernel_Size,
                activation=tf.nn.relu,
                scope="projection_%d" % index,
                is_Training=is_Training)

        # Residual connection.
        residual_Activation = conv_Projection_Activation + input_Pattern

        # Cell size correction: project to the GRU width before the
        # highway layers.
        corrected_Residual_Activation = tf.layers.dense(
            residual_Activation,
            units=gru_Cell_Size,
            activation=None,
            use_bias=True,
            name="size_Correction")

        # Highways.
        highway_Activation = corrected_Residual_Activation
        for index in range(highway_Layer_Count):
            highway_Activation = Highway_Net(
                input_Pattern=highway_Activation,
                scope="highway_%d" % index)

        # Bidirectional GRU; the final states are not needed.
        output_Pattern_List, _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=GRUCell(gru_Cell_Size),
            cell_bw=GRUCell(gru_Cell_Size),
            inputs=highway_Activation,
            sequence_length=input_Length,
            dtype=tf.float32)

        return tf.concat(output_Pattern_List, axis=2)
def _build_forward(self):
    """Build the forward graph: embeddings, encoders, attention, self-match
    and a pointer decoder.

    Reads its sizes from ``self.config`` and its inputs from attributes such
    as ``self.x``, ``self.q``, ``self.cx``, ``self.cq`` and the masks.
    Intermediate tensors are stored in ``self.tensor_dict``; the decoder
    results are stored on ``self.decoder_train_logits``,
    ``self.decoder_train_softmax`` and ``self.decoder_inference``.

    NOTE(review): several helpers used here (multi_conv1d, highway_network,
    attention_layer, AttentionCell, ptr_decoder) are defined elsewhere; the
    shape comments below are the original author's and were not verified.
    NOTE(review): the nesting of the variable scopes should be confirmed
    against the original layout of this method.
    """
    config = self.config
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size
    print("VW:", VW, "N:", N, "M:", M, "JX:", JX, "JQ:", JQ)
    JA = config.max_answer_length
    # From here on JX, JQ and M are dynamic shape tensors taken from the
    # inputs and no longer the static config values.
    JX = tf.shape(self.x)[2]
    JQ = tf.shape(self.q)[1]
    M = tf.shape(self.x)[1]
    print("VW:", VW, "N:", N, "M:", M, "JX:", JX, "JQ:", JQ)
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    with tf.variable_scope("emb"):
        # Char-CNN Embedding
        if config.use_char_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat",
                                               shape=[VC, dc],
                                               dtype='float')

            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat,
                                             self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat,
                                             self.cq)  # [N, JQ, W, dc]
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                # Both settings are comma-separated integer lists.
                filter_sizes = list(
                    map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx,
                                      filter_sizes,
                                      heights,
                                      "VALID",
                                      self.is_train,
                                      config.keep_prob,
                                      scope="xx")
                    if config.share_cnn_weights:
                        # Reuse the "xx" filters for the question.
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq,
                                          filter_sizes,
                                          heights,
                                          "VALID",
                                          self.is_train,
                                          config.keep_prob,
                                          scope="xx")
                    else:
                        qq = multi_conv1d(Acq,
                                          filter_sizes,
                                          heights,
                                          "VALID",
                                          self.is_train,
                                          config.keep_prob,
                                          scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])

        # Word Embedding
        if config.use_word_emb:
            with tf.variable_scope("emb_var") as scope, tf.device(
                    "/cpu:0"):
                # Only the training graph initialises from config.emb_mat.
                if config.mode == 'train':
                    word_emb_mat = tf.get_variable(
                        "word_emb_mat",
                        dtype='float',
                        shape=[VW, dw],
                        initializer=get_initializer(config.emb_mat))
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat",
                                                   shape=[VW, dw],
                                                   dtype='float')
                tf.get_variable_scope().reuse_variables()
                self.word_emb_scope = scope
                if config.use_glove_for_unk:
                    # Append the extra embedding rows after the trained ones.
                    word_emb_mat = tf.concat(
                        [word_emb_mat, self.new_emb_mat], 0)

            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat,
                                            self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat,
                                            self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            # Concat Char-CNN Embedding and Word Embedding
            if config.use_char_emb:
                xx = tf.concat([xx, Ax], 3)  # [N, M, JX, di]
                qq = tf.concat([qq, Aq], 2)  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq
        # exact match
        if config.use_exact_match:
            # The exact-match flags become one extra feature channel.
            emx = tf.expand_dims(tf.cast(self.emx, tf.float32), -1)
            xx = tf.concat([xx, emx], 3)  # [N, M, JX, di+1]
            emq = tf.expand_dims(tf.cast(self.emq, tf.float32), -1)
            qq = tf.concat([qq, emq], 2)  # [N, JQ, di+1]

    # 2 layer highway network on Concat Embedding
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx,
                                 config.highway_num_layers,
                                 True,
                                 wd=config.wd,
                                 is_train=self.is_train)
            # Context and question share the highway weights.
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq,
                                 config.highway_num_layers,
                                 True,
                                 wd=config.wd,
                                 is_train=self.is_train)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq

    # Bidirection-LSTM (3rd layer on paper)
    cell = GRUCell(d) if config.GRU else BasicLSTMCell(d, state_is_tuple=True)
    d_cell = SwitchableDropoutWrapper(
        cell, self.is_train, input_keep_prob=config.input_keep_prob)
    # Sequence lengths are the number of set mask entries.
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

    with tf.variable_scope("prepro"):
        (fw_u, bw_u), _ = bidirectional_dynamic_rnn(
            d_cell, d_cell, qq, q_len, dtype='float',
            scope='u1')  # [N, J, d], [N, d]
        u = tf.concat([fw_u, bw_u], 2)
        if config.share_lstm_weights:
            # Same scope name 'u1' plus reuse: the context encoder shares
            # the question encoder's weights.
            tf.get_variable_scope().reuse_variables()
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float',
                scope='u1')  # [N, M, JX, 2d]
            h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
        else:
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float',
                scope='h1')  # [N, M, JX, 2d]
            h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u
        self.tensor_dict['h'] = h

    # Attention Flow Layer (4th layer on paper)
    with tf.variable_scope("main"):
        if config.dynamic_att:
            p0 = h
            # Repeat the question encoding and its mask once per sentence.
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                           [N * M, JQ, 2 * d])
            q_mask = tf.reshape(
                tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                [N * M, JQ])
            first_cell = AttentionCell(
                cell,
                u,
                size=d,
                mask=q_mask,
                mapper='sim',
                input_keep_prob=self.config.input_keep_prob,
                is_train=self.is_train)
        else:
            p0 = attention_layer(config,
                                 self.is_train,
                                 h,
                                 u,
                                 h_mask=self.x_mask,
                                 u_mask=self.q_mask,
                                 scope="p0",
                                 tensor_dict=self.tensor_dict)
            first_cell = d_cell

        # Modeling layer (5th layer on paper)
        # Keep the attention output; it is passed to the decoder below.
        tp0 = p0
        for layer_idx in range(config.LSTM_num_layers - 1):
            (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
                first_cell,
                first_cell,
                p0,
                x_len,
                dtype='float',
                scope="g_{}".format(layer_idx))  # [N, M, JX, 2d]
            p0 = tf.concat([fw_g0, bw_g0], 3)

        (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
            first_cell, first_cell, p0, x_len, dtype='float',
            scope='g1')  # [N, M, JX, 2d]
        g1 = tf.concat([fw_g1, bw_g1], 3)  # [N, M, JX, 2d]

        # Self match layer
        with tf.variable_scope("SelfMatch"):
            s0 = tf.reshape(g1, [N * M, JX, 2 * d])  # [N * M, JX, 2d]
            x_mask = tf.reshape(self.x_mask, [N * M, JX])
            # The passage attends over itself: s0 is both memory and input.
            first_cell = AttentionCell(cell,
                                       s0,
                                       size=d,
                                       mask=x_mask,
                                       is_train=self.is_train)
            (fw_s, bw_s), (fw_s_f, bw_s_f) = bidirectional_dynamic_rnn(
                first_cell, first_cell, s0, x_len, dtype='float',
                scope='s')  # [N, M, JX, 2d]
            s1 = tf.concat([fw_s, bw_s], 2)  # [N * M, JX, 2d], M == 1

        # prepare for PtrNet
        encoder_output = tf.expand_dims(s1, 1)  # [N, M, JX, 2d]
        # Multiply by the mask so masked positions become zero.
        encoder_output = tf.expand_dims(
            tf.cast(self.x_mask, tf.float32),
            -1) * encoder_output  # [N, M, JX, 2d]

        # Join the final forward/backward states; the layout depends on the
        # cell type.
        if config.GRU:
            encoder_state_final = tf.concat((fw_s_f, bw_s_f),
                                            1,
                                            name='encoder_concat')
        else:
            if isinstance(fw_s_f, LSTMStateTuple):
                encoder_state_c = tf.concat((fw_s_f.c, bw_s_f.c),
                                            1,
                                            name='encoder_concat_c')
                encoder_state_h = tf.concat((fw_s_f.h, bw_s_f.h),
                                            1,
                                            name='encoder_concat_h')
                encoder_state_final = LSTMStateTuple(c=encoder_state_c,
                                                     h=encoder_state_h)
            elif isinstance(fw_s_f, tf.Tensor):
                encoder_state_final = tf.concat((fw_s_f, bw_s_f),
                                                1,
                                                name='encoder_concat')
            else:
                encoder_state_final = None
                tf.logging.error("encoder_state_final not set")

        print("encoder_state_final:", encoder_state_final)

        with tf.variable_scope("output"):
            # eos_symbol = config.eos_symbol
            # next_symbol = config.next_symbol

            # NOTE(review): the op returned here is neither stored nor used
            # as a control dependency - confirm the check actually runs.
            tf.assert_equal(
                M,
                1)  # currently dynamic M is not supported, thus we assume M==1
            answer_string = tf.placeholder(
                shape=(N, 1, JA + 1), dtype=tf.int32,
                name='answer_string')  # [N, M, JA + 1]
            answer_string_mask = tf.placeholder(
                shape=(N, 1, JA + 1),
                dtype=tf.bool,
                name='answer_string_mask')  # [N, M, JA + 1]
            answer_string_length = tf.placeholder(
                shape=(N, 1),
                dtype=tf.int32,
                name='answer_string_length',
            )  # [N, M]
            self.tensor_dict['answer_string'] = answer_string
            self.tensor_dict['answer_string_mask'] = answer_string_mask
            self.tensor_dict['answer_string_length'] = answer_string_length
            self.answer_string = answer_string
            self.answer_string_mask = answer_string_mask
            self.answer_string_length = answer_string_length

            answer_string_flattened = tf.reshape(answer_string,
                                                 [N * M, JA + 1])
            self.answer_string_flattened = answer_string_flattened  # [N * M, JA+1]
            print("answer_string_flattened:", answer_string_flattened)

            answer_string_length_flattened = tf.reshape(
                answer_string_length, [N * M])
            self.answer_string_length_flattened = answer_string_length_flattened  # [N * M]
            print("answer_string_length_flattened:",
                  answer_string_length_flattened)

            # The decoder is twice as wide as the encoder cells.
            decoder_cell = GRUCell(2 * d) if config.GRU else BasicLSTMCell(
                2 * d, state_is_tuple=True)

            with tf.variable_scope("Decoder"):
                decoder_train_logits = ptr_decoder(
                    decoder_cell,
                    tf.reshape(tp0, [N * M, JX, 2 * d]),  # [N * M, JX, 2d]
                    tf.reshape(encoder_output,
                               [N * M, JX, 2 * d]),  # [N * M, JX, 2d]
                    encoder_final_state=encoder_state_final,
                    max_encoder_length=config.sent_size_th,
                    decoder_output_length=
                    answer_string_length_flattened,  # [N * M]
                    batch_size=N,  # N * M (M=1)
                    attention_proj_dim=self.config.decoder_proj_dim,
                    scope='ptr_decoder'
                )  # [batch_size, dec_len*, enc_seq_len + 1]

                self.decoder_train_logits = decoder_train_logits
                print("decoder_train_logits:", decoder_train_logits)
                self.decoder_train_softmax = tf.nn.softmax(
                    self.decoder_train_logits)
                self.decoder_inference = tf.argmax(
                    decoder_train_logits, axis=2,
                    name='decoder_inference')  # [N, JA + 1]

        # yp and yp2 are filled with the constant -1.
        self.yp = tf.ones([N, M, JX], dtype=tf.int32) * -1
        self.yp2 = tf.ones([N, M, JX], dtype=tf.int32) * -1
trainable=False) with tf.name_scope("word_embedding"): embeddings_eng = tf.get_variable( "embeddings_eng", [voc_size_eng, SIZE_EMBED_DIM]) embed_enc = tf.nn.embedding_lookup(embeddings_eng, enc_input, name="embed_enc") embeddings_kor = tf.get_variable( "embeddings_kor", [voc_size_kor, SIZE_EMBED_DIM]) embed_dec = tf.nn.embedding_lookup(embeddings_kor, dec_input, name="embed_dec") with tf.variable_scope("encoder_layer"): output_enc, state_enc = bi_rnn(GRUCell(SIZE_RNN_STATE), GRUCell(SIZE_RNN_STATE), inputs=embed_enc, sequence_length=enc_seq_len, dtype=tf.float32) state_enc_last = tf.concat([state_enc[0], state_enc[1]], axis=1) # [batch, state*2] output_enc = tf.concat(output_enc, axis=2) # [batch, max_eng, state*2] output_enc = tf.nn.dropout(output_enc, keep_prob=keep_prob, name="output_enc") assert output_enc.get_shape()[2] == SIZE_BiRNN_STATE assert state_enc_last.get_shape()[1] == SIZE_BiRNN_STATE
def initialize(
        self,
        inputs,
        input_lengths,
        num_speakers,
        speaker_id,
        mel_targets=None,
        linear_targets=None,
        loss_coeff=None,
        rnn_decoder_test_mode=False,
        is_randomly_initialized=False,
):
    """Build the Tacotron inference graph and publish its tensors on ``self``.

    The graph is: character embedding -> encoder prenet -> encoder CBHG ->
    attention RNN (with decoder prenet) -> residual GRU decoder -> mel
    projection -> post-processing CBHG -> linear projection.

    Args:
        inputs: ids looked up in the character embedding table; the batch
            size is read from its first dimension
            (presumably int32 [N, T_in] -- TODO confirm).
        input_lengths: passed to the encoder CBHG and, for some attention
            types, used as ``memory_sequence_length``.
        num_speakers: number of speakers; values > 1 enable the
            multi-speaker branches.
        speaker_id: ids looked up in the speaker embedding table.
        mel_targets: forwarded to ``TacoTrainingHelper``.
        linear_targets: only tested against ``None`` to choose the decoder
            helper; stored on ``self``.
        loss_coeff: stored on ``self`` unchanged.
        rnn_decoder_test_mode: ``is_training`` is its negation; it is also
            forwarded to ``TacoTrainingHelper``.
        is_randomly_initialized: stored on ``self`` unchanged.

    Raises:
        Exception: on an unknown multi-speaker ``hp.model_type``, an unknown
            ``hp.attention_type``, or when a speaker-derived decoder init
            state does not match the shape of the state it replaces.

    Note:
        With ``hp.model_type == "deepvoice"`` and a single speaker there are
        no speaker-derived decoder states; the zero state is then kept
        instead of iterating over ``None``.
    """
    # NOTE(review): this is also True in test runs whenever linear targets
    # are fed -- confirm that is intended.
    is_training2 = linear_targets is not None
    is_training = not rnn_decoder_test_mode

    self.is_randomly_initialized = is_randomly_initialized

    with tf.variable_scope('inference'):
        hp = self._hparams
        batch_size = tf.shape(inputs)[0]

        # Embeddings(256)
        char_embed_table = tf.get_variable(
            'embedding', [len(symbols), hp.embedding_size],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))

        zero_pad = True
        if zero_pad:
            # Logic borrowed from a transformer implementation.
            # Row 0 (<PAD>) is replaced by a constant zero vector, so it is
            # not updated by training; the first row of the variable created
            # above is therefore never used.
            char_embed_table = tf.concat(
                (tf.zeros(shape=[1, hp.embedding_size]),
                 char_embed_table[1:, :]), 0)

        # [N, T_in, embedding_size]
        char_embedded_inputs = tf.nn.embedding_lookup(
            char_embed_table, inputs)

        self.num_speakers = num_speakers
        if self.num_speakers > 1:
            if hp.speaker_embedding_size != 1:  # speaker_embedding_size = f(16)
                speaker_embed_table = tf.get_variable(
                    'speaker_embedding',
                    [self.num_speakers, hp.speaker_embedding_size],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(
                        stddev=0.5))
                # [N, T_in, speaker_embedding_size]
                speaker_embed = tf.nn.embedding_lookup(
                    speaker_embed_table, speaker_id)

            if hp.model_type == 'deepvoice':
                if hp.speaker_embedding_size == 1:
                    before_highway = get_embed(
                        speaker_id, self.num_speakers,
                        hp.enc_prenet_sizes[-1], "before_highway"
                    )  # 'enc_prenet_sizes': [f(256), f(128)]
                    encoder_rnn_init_state = get_embed(
                        speaker_id, self.num_speakers,
                        hp.enc_rnn_size * 2, "encoder_rnn_init_state")
                    attention_rnn_init_state = get_embed(
                        speaker_id, self.num_speakers,
                        hp.attention_state_size,
                        "attention_rnn_init_state")
                    decoder_rnn_init_states = [
                        get_embed(
                            speaker_id, self.num_speakers, hp.dec_rnn_size,
                            "decoder_rnn_init_states{}".format(idx + 1))
                        for idx in range(hp.dec_layer_num)
                    ]
                else:
                    deep_dense = lambda x, dim: tf.layers.dense(
                        x, dim, activation=tf.nn.softsign
                    )  # softsign: x / (abs(x) + 1)

                    before_highway = deep_dense(speaker_embed,
                                                hp.enc_prenet_sizes[-1])
                    encoder_rnn_init_state = deep_dense(
                        speaker_embed, hp.enc_rnn_size * 2)
                    attention_rnn_init_state = deep_dense(
                        speaker_embed, hp.attention_state_size)
                    decoder_rnn_init_states = [
                        deep_dense(speaker_embed, hp.dec_rnn_size)
                        for _ in range(hp.dec_layer_num)
                    ]

                speaker_embed = None  # deepvoice does not use speaker_embed directly
            elif hp.model_type == 'simple':
                # The simple model feeds speaker_embed into both
                # DecoderPrenetWrapper and ConcatOutputAndAttentionWrapper,
                # where it is concatenated.
                before_highway = None
                encoder_rnn_init_state = None
                attention_rnn_init_state = None
                decoder_rnn_init_states = None
            else:
                raise Exception(
                    " [!] Unkown multi-speaker model type: {}".format(
                        hp.model_type))
        else:
            # Case self.num_speakers == 1.
            speaker_embed = None
            before_highway = None
            encoder_rnn_init_state = None  # init state of the bidirectional GRU
            attention_rnn_init_state = None
            decoder_rnn_init_states = None

        ##############
        # Encoder
        ##############

        # [N, T_in, enc_prenet_sizes[-1]]
        prenet_outputs = prenet(
            char_embedded_inputs,
            is_training,
            hp.enc_prenet_sizes,
            hp.dropout_prob,
            scope='prenet'
        )  # 'enc_prenet_sizes': [f(256), f(128)], dropout_prob = 0.5
        # ==> (N, T_in, 128)

        # enc_rnn_size = 128
        encoder_outputs = cbhg(
            prenet_outputs,
            input_lengths,
            is_training,
            hp.enc_bank_size,
            hp.enc_bank_channel_size,
            hp.enc_maxpool_width,
            hp.enc_highway_depth,
            hp.enc_rnn_size,
            hp.enc_proj_sizes,
            hp.enc_proj_width,
            scope="encoder_cbhg",
            before_highway=before_highway,
            encoder_rnn_init_state=encoder_rnn_init_state)
        self.hccho = encoder_outputs

        ##############
        # Attention
        ##############

        # For manual control of attention
        self.is_manual_attention = tf.placeholder(
            tf.bool,
            shape=(),
            name='is_manual_attention',
        )
        self.manual_alignments = tf.placeholder(
            tf.float32,
            shape=[None, None, None],
            name="manual_alignments",
        )

        # single: attention_size = 128
        if hp.attention_type == 'bah_mon':
            attention_mechanism = BahdanauMonotonicAttention(
                hp.attention_size, encoder_outputs, normalize=False)
        elif hp.attention_type == 'bah_mon_norm':  # added by hccho
            attention_mechanism = BahdanauMonotonicAttention(
                hp.attention_size,
                encoder_outputs,
                memory_sequence_length=input_lengths,
                normalize=True)
        elif hp.attention_type == 'loc_sen':  # Location Sensitivity Attention
            attention_mechanism = LocationSensitiveAttention(
                hp.attention_size,
                encoder_outputs,
                memory_sequence_length=input_lengths)
        elif hp.attention_type == 'gmm':  # GMM Attention
            attention_mechanism = GmmAttention(
                hp.attention_size,
                memory=encoder_outputs,
                memory_sequence_length=input_lengths)
        elif hp.attention_type == 'bah_mon_norm_hccho':
            attention_mechanism = BahdanauMonotonicAttention_hccho(
                hp.attention_size, encoder_outputs, normalize=True)
        elif hp.attention_type == 'bah_norm':
            attention_mechanism = BahdanauAttention(hp.attention_size,
                                                    encoder_outputs,
                                                    normalize=True)
        elif hp.attention_type == 'luong_scaled':
            attention_mechanism = LuongAttention(hp.attention_size,
                                                 encoder_outputs,
                                                 scale=True)
        elif hp.attention_type == 'luong':
            attention_mechanism = LuongAttention(hp.attention_size,
                                                 encoder_outputs)
        elif hp.attention_type == 'bah':
            attention_mechanism = BahdanauAttention(
                hp.attention_size, encoder_outputs)
        elif hp.attention_type.startswith('ntm2'):
            # The shift width is encoded after the last '-' of the type name.
            shift_width = int(hp.attention_type.split('-')[-1])
            attention_mechanism = NTMAttention2(hp.attention_size,
                                                encoder_outputs,
                                                shift_width=shift_width)
        else:
            raise Exception(" [!] Unkown attention type: {}".format(
                hp.attention_type))

        # Combine the attention RNN cell and attention_mechanism into an
        # AttentionWrapper.
        # (Original author's note: carpedm20 re-implemented AttentionWrapper
        # from the TensorFlow sources, whereas Keith Ito used the stock
        # TensorFlow AttentionWrapper.)
        attention_cell = AttentionWrapper(
            GRUCell(hp.attention_state_size),
            attention_mechanism,
            self.is_manual_attention,
            self.manual_alignments,
            initial_cell_state=attention_rnn_init_state,
            alignment_history=True,
            output_attention=False
        )  # Note output_attention=False and no attention_layer_size: the
        # attention is then the context vector. attention_state_size = 256

        dec_prenet_outputs = DecoderPrenetWrapper(
            attention_cell, speaker_embed, is_training,
            hp.dec_prenet_sizes,
            hp.dropout_prob)  # dec_prenet_sizes = [f(256), f(128)]

        # Concatenate attention context vector and RNN cell output into a
        # 512D vector.
        # [N, T_in, attention_size+attention_state_size]
        # From the AttentionWrapperState members (attention, cell_state, ...)
        # handed on by dec_prenet_outputs, the attention and the output are
        # concatenated and emitted as the new output. Since the output equals
        # cell_state: concat [ output(=cell_state) | attention ].
        concat_cell = ConcatOutputAndAttentionWrapper(
            dec_prenet_outputs, embed_to_concat=speaker_embed
        )  # new output = concat(output, attention, speaker_embed)

        # Decoder (layers specified bottom to top): dec_rnn_size = 256
        cells = [OutputProjectionWrapper(concat_cell, hp.dec_rnn_size)
                 ]  # OutputProjectionWrapper does not seem to be mentioned in the paper...
        for _ in range(hp.dec_layer_num):  # hp.dec_layer_num = 2
            cells.append(ResidualWrapper(GRUCell(hp.dec_rnn_size)))

        # [N, T_in, 256]
        decoder_cell = MultiRNNCell(cells, state_is_tuple=True)

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        # (Original author's idea: a stop token could be emitted here too,
        # e.g. (hp.num_mels+1) * hp.reduction_factor.)
        output_cell = OutputProjectionWrapper(
            decoder_cell, hp.num_mels * hp.reduction_factor)
        # zero_state() already contains the initial_cell_state handed to the
        # AttentionWrapper above.
        decoder_init_state = output_cell.zero_state(
            batch_size=batch_size, dtype=tf.float32)

        # Only override the decoder states when speaker-derived states exist;
        # with a single speaker they are None and the zero state is kept.
        if (hp.model_type == "deepvoice"
                and decoder_rnn_init_states is not None):
            # decoder_init_state[0] : AttentionWrapperState
            # = cell_state + attention + time + alignments + alignment_history
            # decoder_init_state[0][0] = attention_rnn_init_state
            # (already applied through AttentionWrapper's initial_cell_state)
            decoder_init_state = list(decoder_init_state)

            for idx, cell in enumerate(decoder_rnn_init_states):
                shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                shape2 = cell.get_shape().as_list()
                if shape1 != shape2:
                    raise Exception(
                        " [!] Shape {} and {} should be equal".format(
                            shape1, shape2))
                decoder_init_state[idx + 1] = cell

            decoder_init_state = tuple(decoder_init_state)

        if is_training2:
            # rnn_decoder_test_mode is True in test mode, False in train mode.
            helper = TacoTrainingHelper(
                inputs, mel_targets, hp.num_mels, hp.reduction_factor,
                rnn_decoder_test_mode)  # inputs is only used to compute batch_size
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels,
                                    hp.reduction_factor)

        (decoder_outputs, _), final_decoder_state, _ = \
            tf.contrib.seq2seq.dynamic_decode(
                BasicDecoder(output_cell, helper, decoder_init_state),
                maximum_iterations=hp.max_iters)  # max_iters=200

        # [N, T_out, M]
        mel_outputs = tf.reshape(decoder_outputs,
                                 [batch_size, -1, hp.num_mels])

        # Add post-processing CBHG:
        # [N, T_out, 256]
        post_outputs = cbhg(mel_outputs,
                            None,
                            is_training,
                            hp.post_bank_size,
                            hp.post_bank_channel_size,
                            hp.post_maxpool_width,
                            hp.post_highway_depth,
                            hp.post_rnn_size,
                            hp.post_proj_sizes,
                            hp.post_proj_width,
                            scope='post_cbhg')

        if speaker_embed is not None and hp.model_type == 'simple':
            # Repeat the speaker embedding along the time axis and prepend
            # it to the post-net features.
            expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
            tiled_speaker_embedding = tf.tile(
                expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1])

            # [N, T_out, 256 + alpha]
            post_outputs = tf.concat(
                [tiled_speaker_embedding, post_outputs], axis=-1)

        linear_outputs = tf.layers.dense(
            post_outputs, hp.num_freq)  # [N, T_out, F(1025)]

        # Grab alignments from the final decoder state:
        # MultiRNNCell has three layers here, so final_decoder_state is a
        # length-3 tuple; element 0 is the attention layer's state.
        alignments = tf.transpose(
            final_decoder_state[0].alignment_history.stack(),
            [1, 2, 0])  # batch_size, text length(encoder), target length(decoder)

        self.inputs = inputs
        self.speaker_id = speaker_id
        self.input_lengths = input_lengths
        self.loss_coeff = loss_coeff
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        self.final_decoder_state = final_decoder_state

        log('=' * 40)
        log(' model_type: %s' % hp.model_type)
        log('=' * 40)

        log('Initialized Tacotron model. Dimensions: ')
        log(' embedding: %d' % char_embedded_inputs.shape[-1])
        if speaker_embed is not None:
            log(' speaker embedding: %d' % speaker_embed.shape[-1])
        else:
            log(' speaker embedding: None')
        log(' prenet out: %d' % prenet_outputs.shape[-1])
        log(' encoder out: %d' % encoder_outputs.shape[-1])
        log(' attention out: %d' % attention_cell.output_size)
        log(' concat attn & out: %d' % concat_cell.output_size)
        log(' decoder cell out: %d' % decoder_cell.output_size)
        log(' decoder out (%d frames): %d' %
            (hp.reduction_factor, decoder_outputs.shape[-1]))
        log(' decoder out (1 frame): %d' % mel_outputs.shape[-1])
        log(' postnet out: %d' % post_outputs.shape[-1])
        log(' linear out: %d' % linear_outputs.shape[-1])
# Different placeholders with tf.name_scope('Input'): batch_ph = tf.placeholder(tf.int32,[None,SEQUENCE_LENGTH],name='batch_ph') target_ph = tf.placeholder(tf.float32,[None],name='target_ph') seq_len_ph = tf.placeholder(tf.int32,[None],name='seq_len_ph') keep_prob_ph = tf.placeholder(tf.float32,name='keep_prob_ph') # Embedding layer with tf.name_scope('Embedding_layer'): embeddings_var = tf.Variable(tf.random_uniform([vocabulary_size,EMBEDDING_DIM],-1.0,1.0),trainable=True) tf.summary.histogram('embeddings_var',embeddings_var) batch_embedded = tf.nn.embedding_lookup(embeddings_var,batch_ph) # (Bi-)RNN layers(-s) rnn_outputs,_ = bi_rnn(GRUCell(HIDDEN_SIZE),GRUCell(HIDDEN_SIZE),inputs=batch_embedded, sequence_length=seq_len_ph,dtype=tf.float32) tf.summary.histogram('RNN_outputs',rnn_outputs) # Attention layer with tf.name_scope('Attention_layer'): attention_output,alphas = attention.attention(rnn_outputs,ATTENTION_SIZE,return_alphas=True) tf.summary.histogram('alphas',alphas) # Dropout drop = tf.nn.dropout(attention_output,keep_prob_ph) # Fully connected layer with tf.name_scope('Fully_connected_layer'): # Hidden size is multiplied by 2 for Bi-rnn W = tf.Variable(tf.truncated_normal([HIDDEN_SIZE*2,1],stddev=0.1))
# Different placeholders with tf.name_scope('Input_layer'): input_x = tf.placeholder(tf.int32, [None, maxlen], name='input_x') output_y = tf.placeholder(tf.float32, [None], name='output_y') keep_prob = tf.placeholder(tf.float32, name='keep_prob') # Embedding layer with tf.name_scope('Embedding_layer'): embeddings_var = tf.Variable(tf.random_uniform( [len(word_index) + 1, embedding_dim], -1.0, 1.0), trainable=True) tf.summary.histogram('embeddings_var', embeddings_var) batch_embedded = tf.nn.embedding_lookup(embeddings_var, input_x) # BiDirectional RNN Layer rnn_outputs, _ = bi_rnn(GRUCell(hidden_size), GRUCell(hidden_size), inputs=batch_embedded, dtype=tf.float32) tf.summary.histogram('RNN_outputs', rnn_outputs) # Attention layer with tf.name_scope('Attention_layer'): attention_output, alphas = attention(rnn_outputs, attention_size, return_alphas=True) tf.summary.histogram('alphas', alphas) # Dropout for attention layer drop = tf.nn.dropout(attention_output, keep_prob)
def build_graph(self):
    """ Build the main architecture of the graph.

    Steps, all under the 'model' variable scope:
      1. ELMo embedding of ``self.x_elmo_input`` (output size 1024).
      2. Optional GloVe lookup (``self.glove_include``), optionally
         concatenated with a POS-tag embedding (``self.pos_include``) and
         passed through a first layer: bi-LSTM, bi-GRU or conv1d.
      3. Combination of the GloVe path and/or ELMo into ``H_1``.
      4. Second layer (``self.layer_2``: 'lstm', 'gru', 'conv' or identity)
         producing ``H``.
      5. Either attention pooling (``self.attention``) or a plain average
         over time, giving ``r``; the result is stored as
         ``self.h_star = tanh(r)``.

    Side effects: seeds Python's ``random`` and the TensorFlow graph, prints
    progress, and sets ``self.x_elmo``, ``self.shape``, ``self.alpha``,
    ``self.best_example``, ``self.h_star`` (plus ``self.pos_embedding``,
    ``self.pos_embedded``, ``self.dense`` and ``self.pool`` on the branches
    that build them).

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed copy of this method -- confirm against the
    original layout, in particular which statements sit under
    ``if self.glove_include:``.
    """
    random.seed(310)
    tf.set_random_seed(902)
    print("building graph")
    with tf.variable_scope('model', reuse=self.reuse):

        ### Lookup ELMo Embedding ###
        self.x_elmo = layers.Lambda(
            lambda inputs: ElmoEmbedding(inputs, elmo_model),
            output_shape=(1024, ))(self.x_elmo_input)
        # Dynamic shape of the ELMo output; shape[0] and shape[1] are used
        # below as batch size and sequence length.
        shape = tf.shape(self.x_elmo)
        self.shape = shape

        # self.glove = tf.Variable(tf.random_uniform([tf.shape(self.glove)[0], self.embed_dimensions], -1.0, 1.0),trainable=True)
        if self.glove_include:
            ### Lookup Glove Vectors ###
            batch_embedded = tf.nn.embedding_lookup(self.glove, self.x)
            # Keep only the last shape[1] time steps so the length matches
            # the ELMo output.
            batch_embedded = batch_embedded[:, -shape[1]:, :]

            ### Include POS ###
            if self.pos_include:
                ### POS-TAG Embedding ###
                # 12 rows: presumably one per POS tag id -- TODO confirm.
                embeddings_var = tf.Variable(tf.random_uniform(
                    [12, self.pos_dimensions], -1.0, 1.0),
                                             trainable=True)
                self.pos_embedding = tf.nn.embedding_lookup(
                    embeddings_var, self.pos)
                self.pos_embedded = self.pos_embedding[:, -shape[1]:, :]
                batch_embedded = tf.concat(
                    [batch_embedded, self.pos_embedded], axis=2)

            if self.layer_1_include:
                # `hid` tracks the feature size of the current layer output.
                hid = 2 * self.hidden_size
                if self.layer_1 == 'lstm':
                    rnn_outputs, _ = bi_rnn(
                        LSTMCell(self.hidden_size,
                                 use_peepholes=self.peephole_1),
                        LSTMCell(self.hidden_size,
                                 use_peepholes=self.peephole_2),
                        inputs=batch_embedded,
                        dtype=tf.float32,
                        scope='rnn_1')
                    fw_outputs, bw_outputs = rnn_outputs
                    layer = tf.concat([fw_outputs, bw_outputs], axis=2)
                elif self.layer_1 == 'gru':
                    rnn_outputs, _ = bi_rnn(GRUCell(self.hidden_size),
                                            GRUCell(self.hidden_size),
                                            inputs=batch_embedded,
                                            dtype=tf.float32,
                                            scope='rnn_1')
                    fw_outputs, bw_outputs = rnn_outputs
                    layer = tf.concat([fw_outputs, bw_outputs], axis=2)
                else:
                    # Any other value of layer_1 selects a 1-D convolution.
                    conv_layer = tf.layers.conv1d(
                        inputs=batch_embedded,
                        filters=self.hidden_size * 2,
                        kernel_size=self.kernel_size,
                        strides=1,
                        padding="same",
                        activation=tf.nn.relu)
                    layer = conv_layer
            else:
                layer = batch_embedded
                hid = self.hidden_size
                if self.pos_include:
                    hid += self.pos_dimensions

        print(self.hidden_size)

        # FLAGS Including ELMO and Glove
        if self.glove_include and self.elmo:
            H_1 = tf.concat([layer, self.x_elmo], axis=2)
            hid += 1024
        elif self.glove_include:
            H_1 = layer
        elif self.elmo:
            H_1 = self.x_elmo
            hid = 1024
        # NOTE(review): H_1 and hid stay undefined when neither
        # glove_include nor elmo is set -- confirm callers never do that.

        if self.layer_2 == 'lstm':
            rnn_outputs_2, _ = bi_rnn(
                LSTMCell(hid, use_peepholes=self.peephole_3),
                LSTMCell(hid, use_peepholes=self.peephole_4),
                inputs=H_1,
                dtype=tf.float32,
                scope='rnn_2')
            fw_outputs_2, bw_outputs_2 = rnn_outputs_2
            H = tf.concat([fw_outputs_2, bw_outputs_2], axis=2)
        elif self.layer_2 == 'gru':
            rnn_outputs_2, _ = bi_rnn(GRUCell(hid),
                                      GRUCell(hid),
                                      inputs=H_1,
                                      dtype=tf.float32,
                                      scope='rnn_2')
            fw_outputs_2, bw_outputs_2 = rnn_outputs_2
            H = tf.concat([fw_outputs_2, bw_outputs_2], axis=2)
        elif self.layer_2 == 'conv':
            conv_layer = tf.layers.conv1d(inputs=H_1,
                                          filters=hid,
                                          kernel_size=self.kernel_size,
                                          strides=1,
                                          padding="same",
                                          activation=tf.nn.relu)
            H = conv_layer
            # Halve here so the doubling below restores the conv width.
            hid = tf.cast(hid / 2, tf.int32)
        else:
            H = H_1
            # Halve here so the doubling below restores the input width.
            hid = tf.cast(hid / 2, tf.int32)

        # The bidirectional layers concatenate two `hid`-wide outputs.
        hid *= 2

        ### Ask whether there is a sequence with length 0 ###
        condition = tf.equal(tf.reduce_min(self.seq_len), 0)

        ### FLAG Including attention ###
        if self.attention:
            with tf.variable_scope('attention', reuse=self.reuse):
                M = tf.tanh(
                    H)  # M = tanh(H) (batch_size, seq_len, HIDDEN_SIZE)
                # Flatten to rows of width `hid` before dropout and the
                # bias-free dense scoring layer.
                dropout_layer_attention = tf.layers.dropout(
                    inputs=tf.reshape(M, [-1, hid]),
                    rate=self.attention_prob,
                    training=self.is_training,
                    seed=847)
                self.dense = tf.layers.dense(
                    inputs=dropout_layer_attention,
                    units=self.num_attention,
                    use_bias=False)

                ### Pool - Max or Mean ###
                # Reduce the num_attention scores of each row to one value.
                if self.pool_mean:
                    self.pool = tf.reduce_mean(self.dense, axis=1)
                else:
                    self.pool = tf.reduce_max(self.dense, axis=1)

                ### Setting for stride 2 ###
                #self.alpha = tf.exp(tf.reshape(self.pool,
                #    [-1, tf.cast(tf.round(tf.add(tf.div(tf.cast(shape[1], dtype = tf.float32), 2.0), 0.1)),
                #    dtype = tf.int32)]))
                # Unnormalised attention weights, one per time step.
                self.alpha = tf.exp(tf.reshape(self.pool, [-1, shape[1]]))

                ### Masking the sequences ###
                if self.mask:
                    with tf.variable_scope('mask', reuse=self.reuse):
                        # The weights are reversed, multiplied by the
                        # sequence mask, and reversed back (presumably
                        # because the inputs are left-padded -- TODO
                        # confirm). Masking is skipped when some sequence
                        # has length 0.
                        self.alpha = tf.reverse(self.alpha, axis=[1])
                        mask = tf.sequence_mask(self.seq_len)
                        mask = tf.to_float(mask)
                        self.alpha = tf.cond(condition, lambda: self.alpha,
                                             lambda: self.alpha * mask)
                        self.alpha = tf.reverse(self.alpha, axis=[1])

                #### Softmax ####
                # Normalise the exponentiated scores over the time axis.
                self.alpha = self.alpha / tf.expand_dims(
                    tf.reduce_sum(self.alpha, axis=1), 1)

                ### Derive the word with the highest attention ###
                pos = tf.argmax(self.alpha, axis=1)
                sparse_tensor = tf.string_split(self.x_elmo_input)
                dense_tensor = tf.sparse_tensor_to_dense(sparse_tensor, '')
                rg = tf.range(0, shape[0])
                indices = tf.transpose([rg, tf.cast(pos, tf.int32)], [1, 0])
                self.best_example = tf.gather_nd(dense_tensor, indices)

                ### Computing weighted average ###
                # r = tf.matmul(tf.transpose(H, [0, 2, 1]), tf.reshape(self.alpha,
                #                                                      [-1, tf.cast(tf.round(tf.add(
                #                                                          tf.div(tf.cast(shape[1], dtype=tf.float32),
                #                                                                 2.0), 0.1)),
                #                                                          dtype=tf.int32), 1]))
                r = tf.matmul(tf.transpose(H, [0, 2, 1]),
                              tf.reshape(self.alpha, [-1, shape[1], 1]))
                r = tf.squeeze(r, axis=2)
        else:
            with tf.variable_scope('rnn_average', reuse=self.reuse):
                ### Take a simple mean of all the words (INCLUDING padding) ###

                ### Masking the sequences ###
                if self.mask:
                    with tf.variable_scope('mask', reuse=self.reuse):
                        # Per-example divisor: the true length, or the full
                        # sequence length when some sequence has length 0.
                        self.alpha = tf.cond(
                            condition,
                            lambda: tf.tile(tf.expand_dims(shape[1], 0),
                                            tf.expand_dims(shape[0], 0)),
                            lambda: self.seq_len)
                        self.alpha = tf.reciprocal(tf.to_float(self.alpha))
                        self.alpha = tf.tile(tf.expand_dims(self.alpha, 1),
                                             [1, shape[1]])

                        # Same reverse / mask / reverse sequence as in the
                        # attention branch.
                        self.alpha = tf.reverse(self.alpha, axis=[1])
                        mask = tf.sequence_mask(self.seq_len)
                        mask = tf.to_float(mask)
                        self.alpha = tf.cond(condition, lambda: self.alpha,
                                             lambda: self.alpha * mask)
                        self.alpha = tf.reverse(self.alpha, axis=[1])
                else:
                    # Uniform weights 1 / sequence length for every step.
                    self.alpha = tf.tile(tf.expand_dims(shape[1], 0),
                                         tf.expand_dims(shape[0], 0))
                    self.alpha = tf.reciprocal(tf.to_float(self.alpha))
                    self.alpha = tf.tile(tf.expand_dims(self.alpha, 1),
                                         [1, shape[1]])

                ### Necessarily here but serves no purpose - Derive the word with the highest attention ###
                pos = tf.argmax(self.alpha, axis=1)
                sparse_tensor = tf.string_split(self.x_elmo_input)
                dense_tensor = tf.sparse_tensor_to_dense(sparse_tensor, '')
                rg = tf.range(0, shape[0])
                indices = tf.transpose([rg, tf.cast(pos, tf.int32)], [1, 0])
                self.best_example = tf.gather_nd(dense_tensor, indices)

                ### Computing average ###
                r = tf.matmul(tf.transpose(H, [0, 2, 1]),
                              tf.reshape(self.alpha, [-1, shape[1], 1]))
                r = tf.squeeze(r, axis=2)

        self.h_star = tf.tanh(r)  # (batch , HIDDEN_SIZE)
def define_sequence_model(self):
    # Build the sequence model inside self.graph: an input placeholder
    # (optionally followed by dropout), the hidden stack described by
    # self.hidden_layer_type / self.hidden_layer_size ("tanh" dense layers
    # with batch norm, "lstm" and "gru" recurrent cells), a dense output
    # layer and the optimizer.
    #
    # Tensors needed later are registered in graph collections:
    # "utt_length", "input_layer", "is_training_drop" (dropout only),
    # "is_training_batch" (only when a "tanh" layer exists) and
    # "output_layer".
    #
    # NOTE(review): block nesting was reconstructed from a
    # whitespace-collapsed copy of this method -- confirm against the
    # original layout.
    seed = 12345
    np.random.seed(12345)
    layer_list = []
    with self.graph.as_default() as g:
        # Per-utterance lengths, fed to dynamic_rnn as sequence_length.
        utt_length = tf.placeholder(tf.int32, shape=(None))
        g.add_to_collection(name="utt_length", value=utt_length)
        with tf.name_scope("input"):
            input_layer = tf.placeholder(dtype=tf.float32,
                                         shape=(None, None, self.n_in),
                                         name="input_layer")
            if self.dropout_rate != 0.0:
                print "Using dropout to avoid overfitting and the dropout rate is", self.dropout_rate
                is_training_drop = tf.placeholder(dtype=tf.bool,
                                                  shape=(),
                                                  name="is_training_drop")
                input_layer_drop = dropout(input_layer,
                                           self.dropout_rate,
                                           is_training=is_training_drop)
                layer_list.append(input_layer_drop)
                g.add_to_collection(name="is_training_drop",
                                    value=is_training_drop)
            else:
                layer_list.append(input_layer)
        # layer_list[0] is the (possibly dropped-out) input tensor.
        g.add_to_collection("input_layer", layer_list[0])
        with tf.name_scope("hidden_layer"):
            # Recurrent cells are collected here and stacked after the loop.
            basic_cell = []
            if "tanh" in self.hidden_layer_type:
                # Batch-norm settings are only needed by the dense layers.
                is_training_batch = tf.placeholder(
                    dtype=tf.bool, shape=(), name="is_training_batch")
                bn_params = {
                    "is_training": is_training_batch,
                    "decay": 0.99,
                    "updates_collections": None
                }
                g.add_to_collection("is_training_batch", is_training_batch)
            for i in xrange(len(self.hidden_layer_type)):
                if self.dropout_rate != 0.0:
                    # With dropout: dense layers get a dropout op, recurrent
                    # cells are wrapped in MyDropoutWrapper.
                    if self.hidden_layer_type[i] == "tanh":
                        new_layer = fully_connected(
                            layer_list[-1],
                            self.hidden_layer_size[i],
                            activation_fn=tf.nn.tanh,
                            normalizer_fn=batch_norm,
                            normalizer_params=bn_params)
                        new_layer_drop = dropout(
                            new_layer,
                            self.dropout_rate,
                            is_training=is_training_drop)
                        layer_list.append(new_layer_drop)
                    if self.hidden_layer_type[i] == "lstm":
                        basic_cell.append(
                            MyDropoutWrapper(BasicLSTMCell(
                                num_units=self.hidden_layer_size[i]),
                                             self.dropout_rate,
                                             self.dropout_rate,
                                             is_training=is_training_drop))
                    if self.hidden_layer_type[i] == "gru":
                        basic_cell.append(
                            MyDropoutWrapper(GRUCell(
                                num_units=self.hidden_layer_size[i]),
                                             self.dropout_rate,
                                             self.dropout_rate,
                                             is_training=is_training_drop))
                else:
                    # Without dropout: layer-normalised recurrent cells.
                    if self.hidden_layer_type[i] == "tanh":
                        new_layer = fully_connected(
                            layer_list[-1],
                            self.hidden_layer_size[i],
                            activation_fn=tf.nn.tanh,
                            normalizer_fn=batch_norm,
                            normalizer_params=bn_params)
                        layer_list.append(new_layer)
                    if self.hidden_layer_type[i] == "lstm":
                        basic_cell.append(
                            LayerNormBasicLSTMCell(
                                num_units=self.hidden_layer_size[i]))
                    if self.hidden_layer_type[i] == "gru":
                        basic_cell.append(
                            LayerNormGRUCell(
                                num_units=self.hidden_layer_size[i]))
            # All recurrent cells are stacked and run on the last dense
            # output (or on the input when there are no dense layers).
            # NOTE(review): basic_cell is empty when no "lstm"/"gru" layer
            # is configured -- confirm that case cannot occur.
            multi_cell = MultiRNNCell(basic_cell)
            rnn_outputs, rnn_states = tf.nn.dynamic_rnn(
                multi_cell,
                layer_list[-1],
                dtype=tf.float32,
                sequence_length=utt_length)
            layer_list.append(rnn_outputs)
        with tf.name_scope("output_layer"):
            if self.output_type == "linear":
                output_layer = tf.layers.dense(rnn_outputs, self.n_out)
            # stacked_rnn_outputs=tf.reshape(rnn_outputs,[-1,self.n_out])
            # stacked_outputs=tf.layers.dense(stacked_rnn_outputs,self.n_out)
            # output_layer=tf.reshape(stacked_outputs,[-1,utt_length,self.n_out])
            # NOTE(review): output_layer is only bound for the "linear"
            # output type -- confirm no other type is used.
            g.add_to_collection(name="output_layer", value=output_layer)
        with tf.name_scope("training_op"):
            if self.optimizer == "adam":
                self.training_op = tf.train.AdamOptimizer()
def _build_graph(self):
    """Build the bi-GRU + multi-row self-attention classifier graph.

    Sets the layer sizes, then builds: input placeholders, word embedding,
    bidirectional GRU encoder, a self-attention layer with ``self.rows``
    attention rows and an ``A^T A - I`` Frobenius penalty (``self.P``), a
    two-layer dense classifier, the loss (cross entropy + L2 + penalty),
    an Adam train op with clipped and noised gradients, summaries, summary
    writers and a saver. Everything is stored on ``self``.

    Reads ``self.dev``, ``self.voc_size``, ``self.input_len_max``,
    ``self.target_size``, ``self.lr``, ``self.sess`` and ``self.makedir``.

    NOTE(review): block nesting was reconstructed from a
    whitespace-collapsed copy of this method; in particular, whether the
    summary / writer / saver section sits inside ``tf.device`` could not be
    determined -- confirm against the original layout.
    """
    now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(now)
    print("Build Graph...")
    print()

    self.xavier_init = tf.contrib.layers.xavier_initializer()

    # Layer sizes.
    self.embed_dim = 100
    self.state_dim = 100
    self.bi_state_dim = self.state_dim * 2  # forward + backward GRU state
    self.attend_dim = 250
    self.feat_dim = self.bi_state_dim
    self.fc_dim = 150

    print("embed_dim : %d" % self.embed_dim)
    print("state_dim : %d" % self.state_dim)
    print("bi_state_dim : %d" % self.bi_state_dim)
    print("attend_dim : %d" % self.attend_dim)
    print("feat_dim : %d" % self.feat_dim)
    print("fc_dim : %d" % self.fc_dim)
    print()

    with tf.device(self.dev):
        with tf.variable_scope("input_placeholders"):
            self.enc_input = tf.placeholder(tf.int32,
                                            shape=[None, None],
                                            name="enc_input")
            self.enc_seq_len = tf.placeholder(tf.int32,
                                              shape=[
                                                  None,
                                              ],
                                              name="enc_seq_len")
            self.targets = tf.placeholder(tf.int32,
                                          shape=[
                                              None,
                                          ],
                                          name="targets")
            # The batch size is fed explicitly; it is used in reshapes below.
            self.batch_size = tf.placeholder(tf.int32,
                                             shape=[],
                                             name="batch_size")
            self.keep_prob = tf.placeholder(tf.float32, name="keep_prob")

        with tf.variable_scope("words_embedding"):
            self.embeddings = tf.get_variable(
                "embeddings", [self.voc_size, self.embed_dim],
                initializer=self.xavier_init)
            self.embed_in = tf.nn.embedding_lookup(self.embeddings,
                                                   self.enc_input,
                                                   name="embed_in")
            # 1.0 for real tokens, 0.0 for padding, up to input_len_max.
            self.pad_mask = tf.sequence_mask(self.enc_seq_len,
                                             self.input_len_max,
                                             dtype=tf.float32,
                                             name="pad_mask1")

        with tf.variable_scope("rnn_encoder_layer"):
            self.output_enc, self.state_enc = bi_rnn(
                GRUCell(self.state_dim),
                GRUCell(self.state_dim),
                inputs=self.embed_in,
                sequence_length=self.enc_seq_len,
                dtype=tf.float32)

            # Concatenate forward and backward final states.
            self.state_enc = tf.concat(
                [self.state_enc[0], self.state_enc[1]],
                axis=1,
                name="state_enc1")
            assert self.state_enc.get_shape()[1] == self.bi_state_dim

            self.output_enc = tf.concat(
                self.output_enc, axis=2)  # [batch, max_eng, state*2]
            self.output_enc = tf.nn.dropout(self.output_enc,
                                            keep_prob=self.keep_prob,
                                            name="output_enc1")
            print("output_enc.get_shape() : %s" %
                  (self.output_enc.get_shape()))
            assert self.output_enc.get_shape()[2] == self.bi_state_dim

        with tf.variable_scope("attention_layer"):
            # Number of attention rows.
            self.rows = 30
            self.W_s1 = tf.get_variable(
                "W_s1", [1, 1, self.feat_dim, self.attend_dim],
                initializer=self.xavier_init)
            self.bias_s1 = tf.get_variable("bias_s1", [self.attend_dim])
            self.W_s2 = tf.get_variable("W_s2",
                                        [self.attend_dim, self.rows],
                                        initializer=self.xavier_init)

            # One rows x rows identity matrix per example, for the penalty.
            self.identity = tf.reshape(
                tf.tile(tf.diag(tf.ones(self.rows)), [self.batch_size, 1]),
                [self.batch_size, self.rows, self.rows],
                name="identity")

            # W_s1 is applied as a 1x1 convolution over the time axis.
            self.output_enc_ex = tf.reshape(
                self.output_enc, [-1, self.input_len_max, 1, self.feat_dim])
            self.context_att = tf.nn.conv2d(self.output_enc_ex,
                                            self.W_s1,
                                            strides=[1, 1, 1, 1],
                                            padding="SAME")
            self.context_att = tf.tanh(tf.nn.bias_add(
                self.context_att, self.bias_s1),
                                       name="context_att")
            print("context_att.get_shape() : %s" %
                  (self.context_att.get_shape()))

            # attention
            self.attention_tot = tf.matmul(
                tf.reshape(self.context_att, [-1, self.attend_dim]),
                self.W_s2)
            self.attention_tot = tf.reshape(
                self.attention_tot, [-1, self.input_len_max, self.rows])
            # Softmax over time, zero the padded steps, softmax again.
            # NOTE(review): zeroed entries still receive non-zero weight
            # from the second softmax -- confirm this is intended.
            self.attention_tot = tf.nn.softmax(
                self.attention_tot, dim=1) * tf.reshape(
                    self.pad_mask, [-1, self.input_len_max, 1])
            self.attention_tot = tf.nn.softmax(self.attention_tot, dim=1)
            print("attention_tot.get_shape() : %s" %
                  (self.attention_tot.get_shape()))

            # Collapse the attention rows into one weight per time step.
            self.attention = tf.reduce_sum(self.attention_tot, axis=2)
            self.attention = tf.reshape(
                self.attention,
                [self.batch_size, self.input_len_max]) * self.pad_mask
            self.attention = tf.nn.softmax(self.attention)
            print("attention.get_shape() : %s" %
                  (self.attention.get_shape()))

            self.attention_tot_T = tf.transpose(self.attention_tot,
                                                [0, 2, 1],
                                                name="attention_tot_T")
            self.AA_t = tf.matmul(self.attention_tot_T,
                                  self.attention_tot) - self.identity
            print("AA_t.get_shape() : %s" % (self.AA_t.get_shape()))

            # penalty: squared Frobenius norm of (A^T A - I), batch mean.
            self.P = tf.square(tf.norm(self.AA_t, axis=[-2, -1], ord="fro"))
            self.P = tf.reduce_mean(self.P, name="P")

            # context: attention-weighted sum of encoder outputs over time.
            self.context = tf.reduce_sum(
                self.output_enc *
                tf.reshape(self.attention, [-1, self.input_len_max, 1]),
                axis=1,
                name="context")
            print("context.get_shape() : %s" % (self.context.get_shape()))
            assert self.context.get_shape()[1] == self.feat_dim

        with tf.variable_scope("dense_layer"):
            self.W_out1 = tf.get_variable("W_out1",
                                          [self.feat_dim, self.fc_dim],
                                          initializer=self.xavier_init)
            self.bias_out1 = tf.get_variable("bias_out1", [self.fc_dim])
            self.W_out2 = tf.get_variable("W_out2",
                                          [self.fc_dim, self.target_size],
                                          initializer=self.xavier_init)
            self.bias_out2 = tf.get_variable("bias_out2",
                                             [self.target_size])

            self.fc = tf.nn.xw_plus_b(self.context, self.W_out1,
                                      self.bias_out1)
            self.fc = tf.tanh(self.fc)
            print("fc.get_shape() : %s" % (self.fc.get_shape()))

            # Class logits.
            self.y_hat = tf.nn.xw_plus_b(self.fc,
                                         self.W_out2,
                                         self.bias_out2,
                                         name="y_hat")
            print("y_hat.get_shape() : %s" % (self.y_hat.get_shape()))

        with tf.variable_scope("train_optimization"):
            self.train_vars = tf.trainable_variables()

            print()
            print("trainable_variables")
            for varvar in self.train_vars:
                print(varvar)
            print()

            self.loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.y_hat, labels=self.targets)
            self.loss = tf.reduce_mean(self.loss, name="loss")
            # L2 regularisation on every trainable variable whose name does
            # not contain "bias".
            self.loss_l2 = tf.add_n([
                tf.nn.l2_loss(v)
                for v in self.train_vars if "bias" not in v.name
            ]) * 0.0001
            self.loss = self.loss + self.loss_l2 + self.P

            self.predict = tf.argmax(tf.nn.softmax(self.y_hat), 1)
            self.predict = tf.cast(tf.reshape(self.predict,
                                              [self.batch_size, 1]),
                                   tf.int32,
                                   name="predict")
            self.target_label = tf.cast(
                tf.reshape(self.targets, [self.batch_size, 1]), tf.int32)
            self.correct = tf.equal(self.predict, self.target_label)
            self.accuracy = tf.reduce_mean(tf.cast(self.correct,
                                                   tf.float32),
                                           name="accuracy")

            self.global_step = tf.Variable(0,
                                           name="global_step",
                                           trainable=False)
            # Staircase exponential decay (x0.9 every 1000 steps), floored
            # at 0.00007.
            self.decay_rate = tf.maximum(0.00007,
                                         tf.train.exponential_decay(
                                             self.lr,
                                             self.global_step,
                                             1000,
                                             0.9,
                                             staircase=True),
                                         name="decay_rate")

            self.opt = tf.train.AdamOptimizer(
                learning_rate=self.decay_rate)
            self.grads_and_vars = self.opt.compute_gradients(
                self.loss, self.train_vars)
            # Clip each gradient to norm 0.5, then add Gaussian noise.
            self.grads_and_vars = [(tf.clip_by_norm(g, 0.5), v)
                                   for g, v in self.grads_and_vars]
            self.grads_and_vars = [
                (tf.add(g, tf.random_normal(tf.shape(g), stddev=0.001)), v)
                for g, v in self.grads_and_vars
            ]

            self.train_op = self.opt.apply_gradients(
                self.grads_and_vars,
                global_step=self.global_step,
                name="train_op")

    # Summaries for loss and lr
    self.loss_summary = tf.summary.scalar("loss", self.loss)
    self.accuracy_summary = tf.summary.scalar("accuracy", self.accuracy)
    self.lr_summary = tf.summary.scalar("lr", self.decay_rate)

    # Output directory for models and summaries
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
    self.out_dir = os.path.abspath(
        os.path.join("./model/rnn_self_att", timestamp))
    print("LOGDIR = %s" % self.out_dir)
    print()

    # Train Summaries
    self.train_summary_op = tf.summary.merge(
        [self.loss_summary, self.accuracy_summary, self.lr_summary])
    self.train_summary_dir = os.path.join(self.out_dir, "summary", "train")
    self.train_summary_writer = tf.summary.FileWriter(
        self.train_summary_dir, self.sess.graph)

    # Test summaries
    self.test_summary_op = tf.summary.merge(
        [self.loss_summary, self.accuracy_summary, self.lr_summary])
    self.test_summary_dir = os.path.join(self.out_dir, "summary", "test")
    self.test_summary_writer = tf.summary.FileWriter(
        self.test_summary_dir, self.sess.graph)

    # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
    self.checkpoint_dir = os.path.abspath(
        os.path.join(self.out_dir, "checkpoints"))
    self.checkpoint_prefix = os.path.join(self.checkpoint_dir, "model-step")
    if self.makedir:
        if not os.path.exists(self.checkpoint_dir):
            os.makedirs(self.checkpoint_dir)
    # max_to_keep=None: the saver does not delete old checkpoints.
    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=None)
for i in r: sample = data[i: i + long] a.append(sample[:-1, :5]) b.append(sample[:-1, 5:10]) c.append(sample[-1][1]) return a, b, c x = tf.placeholder(shape=[batch_size, long - 1, 5], dtype=tf.float16) y = tf.placeholder(shape=[batch_size, long - 1, 5], dtype=tf.float16) z_ = tf.placeholder(shape=[batch_size], dtype=tf.float16) X = tf.nn.sigmoid(x) - 0.5 Y = tf.nn.sigmoid(y) - 0.5 gru_x = GRUCell(num_units=8, reuse=tf.AUTO_REUSE, activation=tf.nn.elu) state_x = gru_x.zero_state(batch_size, dtype=tf.float16) with tf.variable_scope('RNN_x'): for timestep in range(long - 1): if timestep == 1: tf.get_variable_scope().reuse_variables() (cell_output_x, state_x) = gru_x(X[:, timestep], state_x) out_put_x = state_x gru_y = GRUCell(num_units=8, reuse=tf.AUTO_REUSE, activation=tf.nn.elu) state_y = gru_y.zero_state(batch_size, dtype=tf.float16) with tf.variable_scope('RNN_y'): for timestep in range(long - 1): # be careful if timestep == 1: tf.get_variable_scope().reuse_variables() (cell_output_y, state_y) = gru_y(Y[:, timestep], state_y)
num_samples = tf.shape(inputs)[0] # useful for later # embedding We = np.random.randn(V, embedding_dim).astype(np.float32) # output layer Wo = init_weight(hidden_layer_size, K).astype(np.float32) bo = np.zeros(K).astype(np.float32) # make them tensorflow variables tfWe = tf.Variable(We) tfWo = tf.Variable(Wo) tfbo = tf.Variable(bo) # make the rnn unit rnn_unit = GRUCell(num_units=hidden_layer_size, activation=tf.nn.relu) # get the output x = tf.nn.embedding_lookup(tfWe, inputs) # converts x from a tensor of shape N x T x D # into a list of length T, where each element is a tensor of shape N x D x = tf.unstack(x, sequence_length, 1) # get the rnn output outputs, states = get_rnn_output(rnn_unit, x, dtype=tf.float32) # outputs are now of size (T, N, M) # so make it (N, T, M)
def cbhg(inputs,
         input_lengths,
         is_training,
         bank_size,
         bank_channel_size,
         maxpool_width,
         highway_depth,
         rnn_size,
         proj_sizes,
         proj_width,
         scope,
         before_highway=None,
         encoder_rnn_init_state=None):
    """Build a CBHG module.

    Layers are stacked in this order: 1-D convolution bank -> max pooling ->
    convolutional projections -> residual connection -> highway layers ->
    bidirectional GRU.

    Args:
      inputs: input tensor. It is indexed as rank 3 below (axis 1 is tiled
        over, axis 2 is compared with ``rnn_size``); presumably
        [batch, time, channels] -- TODO confirm against callers.
      input_lengths: passed as ``sequence_length`` to the bidirectional RNN.
      is_training: forwarded to every ``conv1d`` call.
      bank_size: number of convolutions in the bank; kernel widths run from
        1 to ``bank_size`` inclusive.
      bank_channel_size: channel argument given to each bank convolution.
      maxpool_width: ``pool_size`` of the stride-1 max pooling.
      highway_depth: number of ``highwaynet`` layers.
      rnn_size: number of units of each GRU direction; also the width the
        highway input is projected to when its last dimension differs.
      proj_sizes: sequence with the channel argument of each projection
        convolution. The last entry must be compatible with the last
        dimension of ``inputs`` because the two are added together.
      proj_width: kernel width of the projection convolutions.
      scope: name of the enclosing variable scope.
      before_highway: optional tensor that is expanded on axis 1, tiled along
        axis 1 of the projection output and added to the residual sum.
      encoder_rnn_init_state: optional tensor split into two halves along
        axis 1 that become the forward / backward initial states.

    Returns:
      The forward and backward RNN outputs concatenated on axis 2.
    """
    with tf.variable_scope(scope):
        with tf.variable_scope('conv_bank'):
            # Convolution bank: one conv per kernel width, concatenated on
            # the last axis to stack the channels of all convolutions.
            # The channel count used to be hard-coded to 128, which silently
            # ignored the `bank_channel_size` argument.
            conv_outputs = tf.concat(
                [
                    conv1d(inputs, k, bank_channel_size, tf.nn.relu,
                           is_training, 'conv1d_%d' % k)
                    for k in range(1, bank_size + 1)
                ],
                axis=-1)

        # Max pooling with stride 1.
        maxpool_output = tf.layers.max_pooling1d(
            conv_outputs, pool_size=maxpool_width, strides=1, padding='same')

        # Projection layers. Every projection uses ReLU except the last one,
        # which is linear because its output feeds the residual sum. The
        # scopes stay 'proj_1', 'proj_2', ... as before.
        proj_output = maxpool_output
        last_idx = len(proj_sizes) - 1
        for idx, proj_size in enumerate(proj_sizes):
            activation = None if idx == last_idx else tf.nn.relu
            proj_output = conv1d(proj_output, proj_width, proj_size,
                                 activation, is_training,
                                 'proj_%d' % (idx + 1))

        # Residual connection.
        if before_highway is not None:
            expanded_before_highway = tf.expand_dims(before_highway, [1])
            tiled_before_highway = tf.tile(
                expanded_before_highway, [1, tf.shape(proj_output)[1], 1])
            highway_input = proj_output + inputs + tiled_before_highway
        else:
            highway_input = proj_output + inputs

        # Handle dimensionality mismatch.
        if highway_input.shape[2] != rnn_size:
            highway_input = tf.layers.dense(highway_input, rnn_size)

        # Highway layers.
        for idx in range(highway_depth):
            highway_input = highwaynet(highway_input,
                                       'highway_%d' % (idx + 1))
        rnn_input = highway_input

        # Bidirectional RNN.
        if encoder_rnn_init_state is not None:
            initial_state_fw, initial_state_bw = tf.split(
                encoder_rnn_init_state, 2, 1)
        else:
            initial_state_fw, initial_state_bw = None, None

        outputs, states = tf.nn.bidirectional_dynamic_rnn(
            GRUCell(rnn_size),
            GRUCell(rnn_size),
            rnn_input,
            sequence_length=input_lengths,
            initial_state_fw=initial_state_fw,
            initial_state_bw=initial_state_bw,
            dtype=tf.float32)

        # Concatenate the forward and the backward sequence.
        return tf.concat(outputs, axis=2)
def GNN(label, data, batch_size, hidden_size, n_steps, num_category, graph):
    """Run `n_steps` rounds of GRU-based state propagation over `graph`.

    Every category slice ``data[:, i, :]`` is multiplied by its own input
    weight (obtained from ``weights``) to form the initial node states. At
    each step ``message_pass`` produces the GRU inputs, one shared GRU cell
    updates every sample separately, and the new states are multiplied by a
    0/1 mask derived from the sum of `graph` over axis 1.

    Returns:
      A pair ``(state, ini)``: the node states after the last step and the
      initial projection before the tanh, both reshaped to
      [batch_size, num_category, hidden_size].
    """
    gru_cell = GRUCell(hidden_size)

    # One input projection per category, laid side by side on axis 1.
    projections = []
    for category in range(num_category):
        w_in = weights('in_' + label, hidden_size, category)
        projections.append(
            tf.reshape(tf.matmul(data[:, category, :], w_in),
                       [batch_size, hidden_size]))
    flat_h0 = projections[0]
    if num_category > 1:
        flat_h0 = tf.concat(projections, 1)

    # ini: [batch_size, num_category, hidden_size]
    ini = tf.reshape(flat_h0, [batch_size, num_category, hidden_size])
    state = tf.nn.tanh(ini)

    # Mask value is 1.0 where the summed graph entry is non-zero, else 0.0.
    sum_graph = tf.reduce_sum(graph, reduction_indices=1)
    enable_node = tf.cast(tf.cast(sum_graph, dtype=bool), dtype=tf.float32)

    with tf.variable_scope("gnn"):
        for step in range(n_steps):
            if step > 0:
                tf.get_variable_scope().reuse_variables()

            x = message_pass(label, state, hidden_size, batch_size,
                             num_category, graph)

            # GRUCell needs rank-2 inputs, so samples are processed one at a
            # time and stitched back together afterwards.
            masked_states = []
            for sample in range(batch_size):
                _, sample_state = gru_cell(x[sample], state[sample])
                # Transpose so the mask broadcasts along the last axis,
                # apply it, then transpose back.
                sample_state = tf.transpose(sample_state, (1, 0))
                sample_state = tf.multiply(sample_state, enable_node[sample])
                sample_state = tf.transpose(sample_state, (1, 0))
                masked_states.append(sample_state)

            stacked = masked_states[0]
            if batch_size > 1:
                stacked = tf.concat(masked_states, 0)

            # Restore rank 3 from the rank-2 stack.
            state = tf.reshape(stacked,
                               [batch_size, num_category, hidden_size])

    return state, ini
def __graph__():
    """Assemble the bidirectional-GRU + attention + SVM-loss graph.

    Takes no arguments: `self`, `sequence_width`, `sequence_height`,
    `num_classes` and `ATTENTION_SIZE` are read from the enclosing scope.
    The placeholders, loss, train op, predictions, accuracy and merged
    summaries are published as attributes of `self` at the end.
    """
    with tf.name_scope('input'):
        x_input = tf.placeholder(
            dtype=tf.float32,
            shape=[None, sequence_width, sequence_height],
            name='x_input')
        y_input = tf.placeholder(
            dtype=tf.float32, shape=[None, num_classes], name='y_input')

    p_keep = tf.placeholder(dtype=tf.float32, name='p_keep')
    learning_rate = tf.placeholder(dtype=tf.float32, name='learning_rate')

    hidden_size = int(sequence_width)

    forward_cell = GRUCell(hidden_size)
    backward_cell = GRUCell(hidden_size)
    rnn_outputs, _ = bi_rnn(forward_cell,
                            backward_cell,
                            inputs=x_input,
                            sequence_length=None,
                            dtype=tf.float32)
    tf.summary.histogram('RNN_outputs', rnn_outputs)

    # Attention layer
    with tf.name_scope('Attention_layer'):
        attention_output, alphas = attention(
            input=rnn_outputs,
            hidden_size=self.sequence_width,
            attention_size=ATTENTION_SIZE,
            return_alpha=True)
        tf.summary.histogram('alphas', alphas)

    # Dropout on the attention output
    dropped = tf.nn.dropout(attention_output, keep_prob=p_keep)

    # Fully connected layer; its input width is 2 * hidden_size
    with tf.name_scope('Fully_connected_layer'):
        initial_weight = tf.truncated_normal(
            [hidden_size * 2, self.num_classes], stddev=0.1)
        W = tf.Variable(initial_weight, name='W')
        initial_bias = tf.constant(0.0, shape=[self.num_classes])
        b = tf.Variable(initial_bias, name='b')
        y_hat = tf.nn.xw_plus_b(dropped, W, b)
        tf.summary.histogram('W', W)

    with tf.name_scope('loss'):
        loss = svm_loss(labels=y_input,
                        logits=y_hat,
                        num_classes=self.num_classes,
                        penalty_parameter=self.svm_c,
                        weight=W)
    tf.summary.scalar('loss', loss)

    adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
    optimizer = adam.minimize(loss=loss)

    with tf.name_scope('accuracy'):
        predicted_class = tf.identity(tf.sign(y_hat), name='predicted_class')
        with tf.name_scope('correct_prediction'):
            predicted_index = tf.argmax(predicted_class, 1)
            expected_index = tf.argmax(y_input, 1)
            correct = tf.equal(predicted_index, expected_index)
        with tf.name_scope('accuracy'):
            accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
    tf.summary.scalar('accuracy', accuracy)

    merged = tf.summary.merge_all()

    # Publish the graph handles on the enclosing object.
    self.x_input = x_input
    self.y_input = y_input
    self.p_keep = p_keep
    self.loss = loss
    self.optimizer = optimizer
    self.learning_rate = learning_rate
    self.predicted_class = predicted_class
    self.accuracy = accuracy
    self.merged = merged
def _ensure_dir(path):
    """Create directory `path` (and parents) unless it already exists."""
    if os.path.exists(path):
        return
    try:
        os.makedirs(path)
    except OSError as exc:
        # Guard against a race: another process may create it first.
        if exc.errno != errno.EEXIST:
            raise


def _find_global_variable(name):
    """Return the first TensorFlow global variable named exactly `name`."""
    return [v for v in tf.global_variables() if v.name == name][0]


def main(model, T, n_iter, n_batch, n_hidden, capacity, comp, FFT,
         learning_rate, decay, learning_rate_decay, norm, grid_name):
    """Train and evaluate one recurrent model on the copying task.

    Args:
      model: one of "LSTM", "GRU", "RUM", "ARUM", "EUNN", "GORU", "RNN".
      T: forwarded to ``copying_data``; sequences have ``T + 20`` steps.
      n_iter: number of training iterations (one batch each).
      n_batch: batch size; also the size of the test set.
      n_hidden: number of hidden units of the recurrent cell.
      capacity, comp, FFT: forwarded to ``EUNNCell`` / ``GORUCell``.
      learning_rate: RMSProp learning rate (converted with ``float``).
      decay: RMSProp decay (converted with ``float``).
      learning_rate_decay: accepted for interface compatibility; not used.
      norm: forwarded as ``T_norm`` to ``RUMCell`` / ``ARUMCell`` and, when
        not None, appended to the result file name.
      grid_name: optional sub-directory of "./output/copying/".

    Raises:
      ValueError: if `model` is not one of the supported names.

    Side effects: writes a text log under "./output/copying/", TensorBoard
    summaries under "./logs/", periodic checkpoints and, for GRU / RUM /
    ARUM, ``.npy`` dumps of the cell's kernels and biases.
    """
    learning_rate = float(learning_rate)
    decay = float(decay)

    # --- Set data params ----------------
    n_input = 10
    n_output = 9
    n_sequence = 10
    n_train = n_iter * n_batch
    n_test = n_batch
    n_steps = T + 20
    n_classes = 9

    # --- Create data --------------------
    train_x, train_y = copying_data(T, n_train, n_sequence)
    test_x, test_y = copying_data(T, n_test, n_sequence)

    # --- Create graph and compute gradients ----------------------
    with tf.name_scope('inputs'):
        x = tf.placeholder("int32", [None, n_steps], name='x_input')
        y = tf.placeholder("int64", [None, n_steps], name='y_input')
    input_data = tf.one_hot(x, n_input, dtype=tf.float32)

    # --- Input to hidden layer ----------------------
    if model == "LSTM":
        cell = BasicLSTMCell(n_hidden, state_is_tuple=True, forget_bias=1)
    elif model == "GRU":
        cell = GRUCell(n_hidden,
                       kernel_initializer=tf.orthogonal_initializer())
    elif model == "RUM":
        cell = RUMCell(n_hidden, T_norm=norm)
    elif model == "ARUM":
        cell = ARUMCell(n_hidden, T_norm=norm)
    elif model == "EUNN":
        cell = EUNNCell(n_hidden, capacity, FFT, comp)
    elif model == "GORU":
        cell = GORUCell(n_hidden, capacity, FFT)
    elif model == "RNN":
        cell = BasicRNNCell(n_hidden)
    else:
        # Fail early with a clear message instead of a NameError on
        # `hidden_out` further down.
        raise ValueError("Unknown model: %r" % (model,))
    hidden_out, _ = tf.nn.dynamic_rnn(cell, input_data, dtype=tf.float32)

    # --- Hidden Layer to Output ----------------------
    V_init_val = np.sqrt(6.) / np.sqrt(n_output + n_input)
    V_weights = tf.get_variable(
        "V_weights",
        shape=[n_hidden, n_classes],
        dtype=tf.float32,
        initializer=tf.random_uniform_initializer(-V_init_val, V_init_val))
    V_bias = tf.get_variable("V_bias",
                             shape=[n_classes],
                             dtype=tf.float32,
                             initializer=tf.constant_initializer(0.01))

    hidden_out_list = tf.unstack(hidden_out, axis=1)
    temp_out = tf.stack([tf.matmul(i, V_weights) for i in hidden_out_list])
    output_data = tf.nn.bias_add(tf.transpose(temp_out, [1, 0, 2]), V_bias)

    # --- evaluate process ----------------------
    with tf.name_scope('evaluate'):
        with tf.name_scope('cost'):
            cost = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=output_data, labels=y))
            tf.summary.scalar('cost', cost)
        with tf.name_scope('correnct_pred'):
            correct_pred = tf.equal(tf.argmax(output_data, 2), y)
        with tf.name_scope('accuracy'):
            accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
            tf.summary.scalar('accuracy', accuracy)

    # --- Initialization ----------------------
    optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
                                          decay=decay).minimize(cost)
    init = tf.global_variables_initializer()

    print("\n###")
    sumz = 0
    for i in tf.global_variables():
        n_params = np.prod(np.array(i.get_shape().as_list()))
        print(i.name, i.shape, n_params)
        sumz += n_params
    print("# parameters: ", sumz)
    print("###\n")

    # --- save result ----------------------
    filename = "./output/copying/"
    if grid_name is not None:
        filename += grid_name + "/"
    filename += "T=" + str(T) + "/"
    research_filename = filename + "researchModels" + "/" + model + "_N=" + str(
        n_hidden) + "_lambda=" + str(learning_rate) + "_decay=" + str(
            decay) + "/"
    filename += model + "_N=" + str(n_hidden) + "_lambda=" + str(
        learning_rate) + "_decay=" + str(decay)
    if norm is not None:
        filename += "_norm=" + str(norm)
    filename = filename + ".txt"

    _ensure_dir(os.path.dirname(filename))
    _ensure_dir(os.path.dirname(research_filename))
    _ensure_dir(os.path.dirname(research_filename + "/modelCheckpoint/"))

    # Summaries and the saver are built once, before the loop. Creating
    # summary ops or FileWriters per iteration grows the graph and leaks
    # file handles.
    merged = tf.summary.merge_all()
    saver = tf.train.Saver()

    # The context manager guarantees the result file is flushed and closed
    # even if training raises.
    with open(filename, 'w') as f:
        f.write("########\n\n")
        f.write("## \tModel: %s with N=%d" % (model, n_hidden))
        f.write("\n\n")
        f.write("########\n\n")

        # --- Training Loop ----------------------
        step = 0
        with tf.Session(config=tf.ConfigProto(
                log_device_placement=False,
                allow_soft_placement=False)) as sess:
            writer = tf.summary.FileWriter("./logs/", sess.graph)
            try:
                sess.run(init)

                while step < n_iter:
                    batch_x = train_x[step * n_batch:(step + 1) * n_batch]
                    batch_y = train_y[step * n_batch:(step + 1) * n_batch]
                    feed = {x: batch_x, y: batch_y}

                    sess.run(optimizer, feed_dict=feed)

                    # Metrics are evaluated after the update (as before),
                    # but in one run and with one summary per step.
                    summary, loss, acc = sess.run([merged, cost, accuracy],
                                                  feed_dict=feed)
                    writer.add_summary(summary, step)

                    print("Iter " + str(step) + ", Minibatch Loss= " + \
                          "{:.6f}".format(loss) + ", Training Accuracy= " + \
                          "{:.5f}".format(acc))

                    if step == 0:
                        f.write("%d\t%f\t%f\n" % (step, loss, acc))
                    step += 1
                    if step % 200 == 199:
                        f.write("%d\t%f\t%f\n" % (step, loss, acc))

                    if step % 10000 == 0:
                        saver.save(sess,
                                   research_filename + "/modelCheckpoint/")

                    if step % 1000 == 0:
                        if model in ("GRU", "RUM", "ARUM"):
                            prefix = "rnn/" + model.lower() + "_cell/"
                            kernel = _find_global_variable(
                                prefix + "gates/kernel:0")
                            bias = _find_global_variable(
                                prefix + "gates/bias:0")
                            k, b = sess.run([kernel, bias])
                            np.save(
                                research_filename + "/kernel_" + str(step),
                                k)
                            np.save(research_filename + "/bias_" + str(step),
                                    b)
                        if model in ("RUM", "ARUM"):
                            prefix = "rnn/" + model.lower() + "_cell/"
                            kernel_emb = _find_global_variable(
                                prefix + "candidate/kernel:0")
                            bias_emb = _find_global_variable(
                                prefix + "candidate/bias:0")
                            k_emb, b_emb = sess.run([kernel_emb, bias_emb])
                            np.save(
                                research_filename + "/kernel_emb_" +
                                str(step), k_emb)
                            np.save(
                                research_filename + "/bias_emb_" + str(step),
                                b_emb)

                print("Optimization Finished!")

                # --- test ----------------------
                test_feed = {x: test_x, y: test_y}
                test_acc = sess.run(accuracy, feed_dict=test_feed)
                test_loss = sess.run(cost, feed_dict=test_feed)
                f.write("Test result: Loss= " + "{:.6f}".format(test_loss) + \
                        ", Accuracy= " + "{:.5f}".format(test_acc))
            finally:
                # Flush pending summary events and release the event file.
                writer.close()
def __init__(self,
             sequence_length,
             num_classes,
             text_vocab_size,
             text_embedding_size,
             hidden_size=800,
             l2_reg_lambda=0.0):
    """Build a bidirectional-GRU text classifier graph.

    Args:
      sequence_length: width of the int32 `input_text` placeholder.
      num_classes: width of the `input_y` placeholder and of the scores.
      text_vocab_size: number of rows of the embedding matrix.
      text_embedding_size: number of columns of the embedding matrix.
      hidden_size: number of units of each GRU direction.
      l2_reg_lambda: weight of the L2 penalty on the output layer.
    """
    # Placeholders for input, output and dropout
    self.input_text = tf.placeholder(
        tf.int32, shape=[None, sequence_length], name='input_text')
    self.input_y = tf.placeholder(
        tf.float32, shape=[None, num_classes], name='input_y')
    self.dropout_keep_prob = tf.placeholder(
        tf.float32, name='dropout_keep_prob')
    # NOTE(review): this placeholder requests the same name as the one
    # above; the name is kept because changing it would rename the graph op.
    self.dropout_keep_prob_lstm = tf.placeholder(
        tf.float32, name='dropout_keep_prob')

    # Running L2 regularization term (optional)
    l2_loss = tf.constant(0.0)

    # Embedding layer
    with tf.device('/cpu:0'), tf.name_scope("text-embedding"):
        initial_embedding = tf.random_uniform(
            [text_vocab_size, text_embedding_size], -1.0, 1.0)
        self.W_text = tf.Variable(initial_embedding, name="W_text")
        self.text_embedded_chars = tf.nn.embedding_lookup(
            self.W_text, self.input_text)

    # Bidirectional RNN layer; each direction is a GRU wrapped in dropout
    forward_cell = tf.nn.rnn_cell.DropoutWrapper(
        GRUCell(hidden_size), self.dropout_keep_prob_lstm)
    backward_cell = tf.nn.rnn_cell.DropoutWrapper(
        GRUCell(hidden_size), self.dropout_keep_prob_lstm)
    self.rnn_outputs, _ = bi_rnn(forward_cell,
                                 backward_cell,
                                 inputs=self.text_embedded_chars,
                                 dtype=tf.float32)
    print(self.rnn_outputs)
    tf.summary.histogram('RNN_outputs', self.rnn_outputs)

    # Concatenate the forward and backward output tensors on axis 2
    both_directions = [self.rnn_outputs[0], self.rnn_outputs[1]]
    rnn_outputs = tf.concat(both_directions, 2)
    # Reduce the rank by summing over axis 1
    rnn_outputs = tf.reduce_sum(rnn_outputs, 1)

    # Dropout
    self.drop = tf.nn.dropout(rnn_outputs, self.dropout_keep_prob)

    # Fully connected layer
    with tf.name_scope('Fully_connected_layer'):
        # Hidden size is multiplied by 2 for Bi-RNN
        W = tf.Variable(
            tf.truncated_normal([hidden_size * 2, num_classes], stddev=0.1))
        b = tf.Variable(tf.constant(0., shape=[num_classes]))
        l2_loss = l2_loss + tf.nn.l2_loss(W)
        l2_loss = l2_loss + tf.nn.l2_loss(b)
        self.scores = tf.nn.xw_plus_b(self.drop, W, b, name="scores")
        self.predictions = tf.argmax(self.scores, 1, name="predictions")

    # Mean cross-entropy loss plus the weighted L2 penalty
    with tf.name_scope("loss"):
        losses = tf.nn.softmax_cross_entropy_with_logits(
            logits=self.scores, labels=self.input_y)
        mean_loss = tf.reduce_mean(losses)
        self.loss = mean_loss + l2_reg_lambda * l2_loss

    # Accuracy
    with tf.name_scope("accuracy"):
        expected = tf.argmax(self.input_y, 1)
        correct_predictions = tf.equal(self.predictions, expected)
        self.accuracy = tf.reduce_mean(
            tf.cast(correct_predictions, "float"), name="accuracy")
def __init__(self, config, is_training=False):
    """Build a language-model graph around an ACT cell.

    Args:
      config: object providing `batch_size`, `num_steps`, `hidden_size`,
        `vocab_size`, `max_grad_norm`, `use_lstm`, `epsilon`,
        `max_computation`, `ponder_time_penalty` and `learning_rate`.
      is_training: when True, gradient clipping and the Adam train op
        (`self.train_op`) are added to the graph.
    """
    self.config = config
    self.batch_size = batch_size = config.batch_size
    self.num_steps = num_steps = config.num_steps
    self.hidden_size = hidden_size = config.hidden_size
    self.num_layers = 1
    vocab_size = config.vocab_size
    self.max_grad_norm = config.max_grad_norm
    self.use_lstm = config.use_lstm

    # Placeholders for inputs.
    self.input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
    self.targets = tf.placeholder(tf.int32, [batch_size, num_steps])

    # `Tensor.set_shape` works in place and returns None, so it must not be
    # chained onto the assignment; doing so left `initial_state` as None.
    initial_state = array_ops.zeros(
        tf.stack([self.batch_size, self.num_steps]), dtype=tf.float32)
    initial_state.set_shape([None, self.num_steps])
    self.initial_state = initial_state

    embedding = tf.get_variable(
        'embedding', [self.config.vocab_size, self.config.hidden_size])

    # Set up ACT cell and inner rnn-type cell for use inside the ACT cell.
    with tf.variable_scope("rnn"):
        if self.use_lstm:
            inner_cell = BasicLSTMCell(self.config.hidden_size)
        else:
            inner_cell = GRUCell(self.config.hidden_size)

    with tf.variable_scope("ACT"):
        act = ACTCell(self.config.hidden_size,
                      inner_cell,
                      config.epsilon,
                      max_computation=config.max_computation,
                      batch_size=self.batch_size)

    # Turn the embedded batch into a list with one tensor per time step
    # (axis 1 is split and squeezed away).
    inputs = tf.nn.embedding_lookup(embedding, self.input_data)
    inputs = [
        tf.squeeze(single_input, [1])
        for single_input in tf.split(inputs, self.config.num_steps, 1)
    ]

    self.outputs, _ = static_rnn(act, inputs, dtype=tf.float32)

    # Softmax to get probability distribution over vocab.
    output = tf.reshape(tf.concat(self.outputs, 1), [-1, hidden_size])
    softmax_w = tf.get_variable("softmax_w", [hidden_size, vocab_size])
    softmax_b = tf.get_variable("softmax_b", [vocab_size])
    # dim (numsteps*batchsize, vocabsize)
    self.logits = tf.matmul(output, softmax_w) + softmax_b

    loss = sequence_loss_by_example([self.logits],
                                    [tf.reshape(self.targets, [-1])],
                                    [tf.ones([batch_size * num_steps])],
                                    vocab_size)

    # Add up loss and retrieve batch-normalised ponder cost:
    # sum N + sum Remainder.
    ponder_cost = act.calculate_ponder_cost(
        time_penalty=self.config.ponder_time_penalty)
    self.cost = (tf.reduce_sum(loss) / batch_size) + ponder_cost
    self.final_state = self.outputs[-1]

    if is_training:
        self.lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                          self.max_grad_norm)
        optimizer = tf.train.AdamOptimizer(self.config.learning_rate)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))