def BLSTM(self, x_p, x_s, x_d, x_b, x_len, dropout, activation=tf.nn.tanh):
        '''p=pitch, s=start, d=duration, b=beat_type'''
        with tf.name_scope('Input_embedding'):
            x_p_onehot = tf.one_hot(x_p - self.lowest_pitch,
                                    depth=self.n_p_classes)
            x_b_onehot = tf.one_hot(x_b, depth=self.n_b_classes)
            inputs = tf.concat(
                [x_p_onehot, x_s[:, :, None], x_d[:, :, None], x_b_onehot],
                axis=2)  # avoid shadowing the built-in `input`
            input_embedded = tf.layers.dense(inputs, self.embedding_size)
            input_embedded = self.normalize(input_embedded, scope='input_ln')
            input_embedded = activation(input_embedded)
            input_embedded = tf.nn.dropout(input_embedded,
                                           keep_prob=1 - dropout)

        with tf.name_scope('BLSTM_cells'):
            cell_fw = LSTMBlockCell(
                num_units=self.hidden_size, name='cell_fw'
            )  # LSTMCell(num_units=hidden_size, name='cell_fw')
            cell_bw = LSTMBlockCell(
                num_units=self.hidden_size, name='cell_bw'
            )  # LSTMCell(num_units=hidden_size, name='cell_bw')

        with tf.name_scope('RNN'):
            # bi-LSTM
            (output_fw, output_bw), (_, _) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=cell_fw,
                cell_bw=cell_bw,
                inputs=input_embedded,
                sequence_length=x_len,
                dtype=tf.float32,
                time_major=False)
            hidden_states = tf.concat((output_fw, output_bw), axis=-1)

            hidden_states = self.normalize(hidden_states, scope='hidden_ln')
            hidden_states = activation(hidden_states)
            hidden_states = tf.nn.dropout(hidden_states, keep_prob=1 - dropout)

        with tf.name_scope('Output'):
            s_logits = tf.layers.dense(hidden_states,
                                       self.n_str_classes,
                                       name='string_out')
            p_logits = tf.layers.dense(hidden_states,
                                       self.n_pos_classes,
                                       name='position_out')

        return s_logits, p_logits
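
A minimal sketch of how this method might be wired up. The `model` object and the placeholder names are assumptions for illustration; they stand in for whatever class defines `lowest_pitch`, `n_p_classes`, `n_b_classes`, `embedding_size`, `hidden_size`, `normalize`, `n_str_classes` and `n_pos_classes`.

# Hypothetical usage; `model` is an instance of the class defining BLSTM above.
x_p = tf.placeholder(tf.int32, [None, None], name='pitch')       # note pitches
x_s = tf.placeholder(tf.float32, [None, None], name='start')     # onset times
x_d = tf.placeholder(tf.float32, [None, None], name='duration')  # durations
x_b = tf.placeholder(tf.int32, [None, None], name='beat_type')   # beat classes
x_len = tf.placeholder(tf.int32, [None], name='seq_len')         # valid lengths
drop = tf.placeholder_with_default(0.0, [], name='drop_rate')    # 0 at inference
s_logits, p_logits = model.BLSTM(x_p, x_s, x_d, x_b, x_len, drop)
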
Example #2
    def __init__(self, hparams, is_training=False, with_target=True, reuse=False):
        self.with_target = with_target
        self.hparams = hparams
        self.inputs = tf.placeholder(tf.int32, (None, None), name='graphemes_ph')
        self.input_lengths = tf.placeholder(tf.int32, [None], name='grapheme_seq_len_ph')
        if with_target:
            self.targets = tf.sparse_placeholder(tf.int32, name='phonemes_ph')

        with tf.variable_scope('g2p', reuse=reuse):
            embedding_table = tf.get_variable('embedding',
                                              [hparams.graphemes_num, hparams.embedding_dim],
                                              dtype=tf.float32,
                                              initializer=tf.truncated_normal_initializer(stddev=0.5))
            outputs = tf.nn.embedding_lookup(embedding_table, self.inputs)

            if hparams.with_conv:
                for i in range(hparams.conv_num):
                    outputs = conv1d(outputs, hparams.conv_width,
                                     hparams.conv_channels, tf.nn.relu,
                                     is_training, hparams.dropout_rate,
                                     'conv_%d' % i)

            forward_cell = rnn_cell(hparams.lstm_units1//2, hparams, is_training)
            backward_cell = rnn_cell(hparams.lstm_units1//2, hparams, is_training)
            bi_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                forward_cell, backward_cell,
                outputs, sequence_length=self.input_lengths, dtype=tf.float32,
                scope='bilstm')

            # Concatenate forward and backward outputs:
            bi_outputs = tf.concat(bi_outputs, axis=2)

            uni_cell = rnn_cell(hparams.lstm_units1, hparams, is_training)

            uni_outputs, _ = tf.nn.dynamic_rnn(uni_cell, outputs,
                                               sequence_length=self.input_lengths,
                                               dtype=tf.float32,
                                               scope='unilstm')
            outputs = tf.concat([bi_outputs, uni_outputs], axis=2)

            outputs = tf.layers.dropout(outputs, rate=hparams.dropout_rate,
                                        training=is_training)

            outputs, _ = tf.nn.dynamic_rnn(LSTMBlockCell(hparams.lstm_units2),
                                           outputs,
                                           sequence_length=self.input_lengths,
                                           dtype=tf.float32,
                                           scope='lstm')
            self.logits = tf.layers.dense(outputs, hparams.phonemes_num)
            self.probs = tf.nn.softmax(self.logits, name='probs')

            logits_transp = tf.transpose(self.logits, (1, 0, 2))
            self.decoded, self.seq_probs = tf.nn.ctc_beam_search_decoder(
                logits_transp, self.input_lengths, top_paths=self.hparams.nbest)
            self.decoded_best = tf.to_int32(tf.sparse_tensor_to_dense(
                self.decoded[0], default_value=hparams.phonemes_num-1),
                name='predicted_1best')
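
An inference-time usage sketch. The class name `G2P` and the grapheme id arrays are assumptions; the constructor arguments and fetch names come from the snippet above.

# Hypothetical usage of the graph built above.
model = G2P(hparams, is_training=False, with_target=False)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    phoneme_ids = sess.run(model.decoded_best, feed_dict={
        model.inputs: grapheme_ids,          # [batch, max_len] int32 ids
        model.input_lengths: grapheme_lens,  # [batch] int32 lengths
    })
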
Example #3
def _build_single_cell(cell_type,
                       num_units,
                       use_dropout,
                       mode,
                       dropout_probability,
                       dtype,
                       device=None):
    r"""

    :param num_units: `int`
    :return:
    """
    if cell_type == 'lstm':
        cells = LSTMCell(num_units=num_units,
                         use_peepholes=False,
                         cell_clip=1.0,
                         initializer=tf.variance_scaling_initializer(),
                         dtype=dtype)
    elif cell_type == 'layernorm_lstm':
        cells = LayerNormLSTMCell(num_units=num_units, cell_clip=1.0)
    elif cell_type == 'layernorm_basiclstm':
        cells = LayerNormBasicLSTMCell(num_units=num_units)
    elif cell_type == 'gru':
        cells = GRUCell(num_units=num_units,
                        kernel_initializer=tf.variance_scaling_initializer(),
                        bias_initializer=tf.variance_scaling_initializer(),
                        dtype=dtype)
    elif cell_type == 'ugrnn':
        cells = UGRNNCell(num_units)
    elif cell_type == 'lstm_block':
        cells = LSTMBlockCell(num_units=num_units,
                              use_peephole=True,
                              cell_clip=None)
    elif cell_type == 'gru_block':
        cells = GRUBlockCellV2(num_units=num_units)
    elif cell_type == 'nas':
        cells = NASCell(num_units=num_units)
    elif cell_type == 'lstm_masked':
        from tensorflow.contrib.model_pruning import MaskedLSTMCell
        cells = MaskedLSTMCell(num_units=num_units)
    else:
        raise ValueError('cell type not supported: {}'.format(cell_type))

    if use_dropout and mode == 'train':
        cells = DropoutWrapper(
            cells,
            input_keep_prob=dropout_probability[0],
            state_keep_prob=dropout_probability[1],
            output_keep_prob=dropout_probability[2],
            variational_recurrent=False,
            dtype=dtype,
            # input_size=self._inputs.get_shape()[1:],
        )
    if device is not None:
        cells = DeviceWrapper(cells, device=device)

    return cells
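
A sketch of stacking several such cells into a deep RNN with the standard TF1 `MultiRNNCell` wrapper; the parameter values are illustrative only and reuse the imports assumed by the snippet above.

# Two stacked LSTMBlockCells with dropout applied during training.
cells = [_build_single_cell(cell_type='lstm_block',
                            num_units=256,
                            use_dropout=True,
                            mode='train',
                            dropout_probability=(0.9, 0.9, 0.9),
                            dtype=tf.float32)
         for _ in range(2)]
stacked_cell = tf.nn.rnn_cell.MultiRNNCell(cells, state_is_tuple=True)
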
Example #4
def RNN(x, weights, biases):
    # Split [batch, timesteps, features] into a list of `timesteps` tensors.
    x = tf.unstack(x, timesteps, 1)
    # Variational dropout needs the per-step input depth, i.e. the feature
    # dimension of x_train, not the number of samples.
    lstm_cell = DropoutWrapper(LSTMBlockCell(n_hidden, forget_bias=1.0),
                               variational_recurrent=True,
                               input_size=x_train.shape[-1],
                               state_keep_prob=.7,
                               output_keep_prob=.7,
                               dtype=tf.float32)
    outputs, states = tf.contrib.rnn.static_rnn(lstm_cell, x, dtype=tf.float32)
    # Classify on the output of the final time step.
    return tf.matmul(outputs[-1], weights['out']) + biases['out']
Example #5
def rnn_cell(dim, hparams, is_training):
    if hparams.rnn_type == 'ln_lstm':
        keep_prob = (1 - hparams.dropout_rate) if is_training else 1.0
        cell = LayerNormBasicLSTMCell(dim,
                                      dropout_keep_prob=keep_prob,
                                      layer_norm=True)
    elif hparams.rnn_type == 'zn_lstm':
        cell = LSTMBlockCell(dim)
        cell = ZoneoutWrapper(cell, hparams.zonout_prob, is_training)
    else:
        raise ValueError('Unknown rnn_type: {}'.format(hparams.rnn_type))
    return cell
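
`ZoneoutWrapper` is referenced here (and in the Tacotron examples below) but never defined in these snippets. A minimal sketch of zoneout (Krueger et al., 2017) for cells with `LSTMStateTuple` state, assuming a single scalar probability; note the GST example below instead passes a `(c, h)` pair, so this is an approximation, not the original implementation.

import tensorflow as tf

class ZoneoutWrapper(tf.nn.rnn_cell.RNNCell):
    """Zoneout: randomly preserve previous state units during training."""

    def __init__(self, cell, zoneout_prob, is_training):
        super(ZoneoutWrapper, self).__init__()
        self._cell = cell
        self._prob = zoneout_prob
        self._is_training = is_training

    @property
    def state_size(self):
        return self._cell.state_size

    @property
    def output_size(self):
        return self._cell.output_size

    def __call__(self, inputs, state, scope=None):
        output, new_state = self._cell(inputs, state, scope)

        def zoneout(new, old):
            if self._is_training:
                # Keep each old unit with probability zoneout_prob.
                keep = tf.floor(tf.random_uniform(tf.shape(new)) + self._prob)
                return keep * old + (1. - keep) * new
            # At inference, use the expected value of the stochastic update.
            return self._prob * old + (1. - self._prob) * new

        zoned = tf.contrib.rnn.LSTMStateTuple(zoneout(new_state.c, state.c),
                                              zoneout(new_state.h, state.h))
        return output, zoned
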
Example #6
def attention_decoder(inputs,
                      num_units,
                      input_lengths,
                      is_training,
                      speaker_embd=None,
                      attention_type="bah",
                      scope="attention_decoder",
                      reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        if attention_type == 'bah_mon':
            attention_mechanism = tf.contrib.seq2seq.BahdanauMonotonicAttention(
                num_units, inputs)
        elif attention_type == 'bah_norm':
            attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                num_units, inputs, normalize=True)
        elif attention_type == 'luong_scaled':
            attention_mechanism = tf.contrib.seq2seq.LuongAttention(num_units,
                                                                    inputs,
                                                                    scale=True)
        elif attention_type == 'luong':
            attention_mechanism = tf.contrib.seq2seq.LuongAttention(
                num_units, inputs)
        elif attention_type == 'bah':
            # Bahdanau et al. attention mechanism
            attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                num_units,  # attention units
                inputs,
                memory_sequence_length=input_lengths)
        elif attention_type == "location_sensitive":
            attention_mechanism = LocationSensitiveAttention(
                num_units, inputs, memory_sequence_length=input_lengths)
        else:
            raise Exception("Unknown attention type ")

        # Attention
        if attention_type == "location_sensitive":
            pre_mechanism_cell = LSTMBlockCell(num_units)
        else:
            pre_mechanism_cell = GRUCell(num_units)

        # bottleneck prenet as in paper
        pre_mechanism = neural_speech.models.utils.rnn_wrappers.PrenetWrapper(
            pre_mechanism_cell, [256, 128],
            is_training,
            speaker_embd=speaker_embd)
        attention_cell = tf.contrib.seq2seq.AttentionWrapper(
            pre_mechanism,  # 256
            attention_mechanism,  # 256
            alignment_history=True,
            output_attention=False)  # [N, T_in, 256]
        #  Concatenate attention context vector and RNN cell output into a 512D vector.
        concat_cell = neural_speech.models.utils.rnn_wrappers.ConcatOutputAndAttentionWrapper(
            attention_cell)  # [N, T_in, 512]
        return concat_cell
Example #7
def conv_and_lstm(inputs, input_lengths, conv_layers, conv_width,
                  conv_channels, lstm_units, is_training, scope):
    # Convolutional layers
    with tf.variable_scope(scope):
        x = inputs
        for i in range(conv_layers):
            activation = tf.nn.relu if i < conv_layers - 1 else None
            x = conv1d(x, conv_width, conv_channels, activation, is_training,
                       'conv_%d' % i)

        # Bidirectional LSTM (one forward and one backward cell):
        outputs, states = tf.nn.bidirectional_dynamic_rnn(
            LSTMBlockCell(lstm_units),
            LSTMBlockCell(lstm_units),
            x,
            sequence_length=input_lengths,
            dtype=tf.float32,
            scope='encoder_lstm')

        # Concatenate forward and backward outputs:
        return tf.concat(outputs, axis=2)
Example #8
def BiLSTM_classifier(X,
                      num_hidden,
                      num_classes,
                      seq_lens,
                      fold,
                      istate_fw=None,
                      istate_bw=None,
                      dtype=tf.float32):
    # X has shape [batch_size, max_seq_len, num_feats].
    # A variable scope keeps each fold's variables in a separate namespace.
    with tf.variable_scope(str(fold)):
        lstm_fw_cell = LSTMBlockCell(num_hidden)
        lstm_bw_cell = LSTMBlockCell(num_hidden)

        outputs, _ = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell,
                                                     lstm_bw_cell,
                                                     X,
                                                     sequence_length=seq_lens,
                                                     dtype=tf.float32)
        # Concatenate the forward and backward BiLSTM outputs; this yields
        # 2*num_hidden features (e.g. 1024 for a hidden size of 512).
        X = tf.concat(outputs, axis=2)

        # Take the first output and the last *valid* output of each sequence;
        # the last one must be gathered per-example based on seq_lens.
        first = X[:, 0, :]  # [batch_size, 2*num_hidden]
        last = last_relevant(
            X, seq_lens
        )  # from https://danijar.com/variable-sequence-lengths-in-tensorflow
        X = tf.concat([first, last], axis=1)  # [batch_size, 4*num_hidden]
        weights, biases = weight_and_bias(X.get_shape().as_list()[1],
                                          num_classes)
        # Linear projection from the concatenated features to class logits.
        prediction = tf.matmul(X, weights) + biases
        return prediction
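
The snippet relies on a `last_relevant` helper that is not shown; a sketch following the linked article, which gathers the output at each sequence's final valid time step:

import tensorflow as tf

def last_relevant(output, length):
    # output: [batch, max_time, depth] float tensor; length: [batch] int32.
    batch_size = tf.shape(output)[0]
    max_length = tf.shape(output)[1]
    out_size = int(output.get_shape()[2])
    # Flatten time into the batch axis, then gather one row per sequence.
    index = tf.range(0, batch_size) * max_length + (length - 1)
    flat = tf.reshape(output, [-1, out_size])
    return tf.gather(flat, index)
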
Example #9
def conv_and_lstm(inputs, input_lengths, conv_layers, conv_width,
                  conv_channels, lstm_units_unidirectional, is_training,
                  scope):
    with tf.variable_scope(scope):
        # convolutional layers
        convolved_inputs = inputs

        for i in range(conv_layers):
            activation = tf.nn.relu if i < conv_layers - 1 else None
            convolved_inputs = conv1d(convolved_inputs, conv_width,
                                      conv_channels, activation, is_training,
                                      'conv_{}'.format(i))

        # bidirectional LSTM
        outputs, states = tf.nn.bidirectional_dynamic_rnn(
            LSTMBlockCell(lstm_units_unidirectional),
            LSTMBlockCell(lstm_units_unidirectional),
            convolved_inputs,
            sequence_length=input_lengths,
            dtype=tf.float32,
            scope='{}_lstm'.format(scope))

        # concatenate forward and backward outputs
        return tf.concat(outputs, axis=2)
Example #10
        def make_cell(num_units, residual):
            if self.rnn_type == 'gru':
                print("GRU")
                cell = GRUCell(num_units)
            else:
                if self.layer_norm:
                    print("LSTM With layer norm")
                    cell = LayerNormBasicLSTMCell(num_units, layer_norm=True)
                else:
                    print("LSTM Without layer norm")
                    #cell = LSTMCell(num_units)
                    cell = LSTMBlockCell(num_units)

            if residual:
                cell = ResidualWrapper(cell)
            return cell
Example #11
def get_cell(cell_type, size, layers=1, direction='unidirectional'):
    if cell_type == "layer_norm_basic":
        cell = LayerNormBasicLSTMCell(size)
    elif cell_type == "lstm_block_fused":
        cell = tf.contrib.rnn.LSTMBlockFusedCell(size)
    elif cell_type == "cudnn_lstm":
        cell = CudnnLSTM(layers, size, direction=direction)
    elif cell_type == "cudnn_gru":
        cell = CudnnGRU(layers, size, direction=direction)
    elif cell_type == "lstm_block":
        cell = LSTMBlockCell(size)
    elif cell_type == "gru_block":
        cell = GRUBlockCell(size)
    elif cell_type == "rnn":
        cell = BasicRNNCell(size)
    elif cell_type == "cudnn_rnn":
        cell = CudnnRNNTanh(layers, size)
    else:
        cell = BasicLSTMCell(size)
    return cell
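
One caveat: not everything `get_cell` returns is a drop-in `RNNCell`. `LSTMBlockFusedCell` and the Cudnn classes consume an entire time-major batch themselves rather than being driven step-by-step by `dynamic_rnn`. A sketch of the fused call convention (shapes and sizes are illustrative):

import tensorflow as tf

fused = tf.contrib.rnn.LSTMBlockFusedCell(128)
inputs = tf.placeholder(tf.float32, [50, None, 64])  # [time, batch, depth]
seq_len = tf.placeholder(tf.int32, [None])

# The fused cell runs over the full sequence in one call.
outputs, final_state = fused(inputs, dtype=tf.float32,
                             sequence_length=seq_len)  # outputs: [50, ?, 128]
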
Example #12
    def build(self):
        self.lstm_cell = LSTMBlockCell(
            self.units,
            #use_peepholes=self.peephole,
            use_peephole=True)
        #initializer=tf.initializers.random_uniform(minval=self.minval,
        #                                           maxval=self.maxval))

        self.va_lstm_cell = DropoutWrapper(self.lstm_cell,
                                           variational_recurrent=True,
                                           input_keep_prob=0.7,
                                           output_keep_prob=0.7,
                                           state_keep_prob=0.7,
                                           dtype=tf.float32,
                                           input_size=self.inputSize)

        tf.nn.dynamic_rnn(self.va_lstm_cell,
                          tf.random_normal((1, 1, self.inputSize)),
                          dtype=tf.float32)

        self._trainable_weights = self.lstm_cell.variables
Example #13
def get_rnn_cell_list(config, name, reuse=False, seed=123, dtype=tf.float32):
    cell_list = []
    for i, units in enumerate(config['num_units']):
        cell = None
        if config['cell_type'] == 'clstm':
            cell = CustomLSTMCell(units, layer_norm=config['layer_norm'], activation=config['activation'], seed=seed,
                                  reuse=reuse, dtype=dtype, name='{}_{}'.format(name, i))
        elif config['cell_type'] == 'tflstm':

            act = get_activation(config['activation'])

            if config['layer_norm']:
                cell = LayerNormBasicLSTMCell(num_units=units, activation=act, layer_norm=config['layer_norm'],
                                              reuse=reuse)
            elif not config['layer_norm'] and config['activation'] != 'tanh':
                cell = LSTMCell(num_units=units, activation=act, reuse=reuse)
            else:
                cell = LSTMBlockCell(num_units=units)
        cell_list.append(cell)

    return cell_list
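
An illustrative way to stack the returned cells into one multi-layer RNN. The config values are assumptions chosen to hit the `LSTMBlockCell` branch, and `inputs`/`seq_lens` stand in for the caller's tensors.

config = {'num_units': [256, 256], 'cell_type': 'tflstm',
          'layer_norm': False, 'activation': 'tanh'}
cell_list = get_rnn_cell_list(config, name='encoder')
stacked = tf.nn.rnn_cell.MultiRNNCell(cell_list, state_is_tuple=True)
outputs, state = tf.nn.dynamic_rnn(stacked, inputs,
                                   sequence_length=seq_lens,
                                   dtype=tf.float32)
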
Example #14
    def build_model(self):
        self.X = tf.placeholder(
            tf.float32,
            [None, self.config.sequence_length, self.config.num_inputs])
        self.Y = tf.placeholder(tf.float32, [None, self.config.num_outputs])

        # x = tf.unstack(self.X, self.config.sequence_length, axis=1)

        dense1 = tf.layers.dense(self.X,
                                 self.config.E,
                                 activation=tf.math.tanh)

        m = tf.unstack(dense1, self.config.sequence_length, axis=1)

        attentive_lstm = AttentionCellWrapper(
            LSTMBlockCell(self.config.hidden_units),
            self.config.sequence_length)
        rnn_outputs, rnn_states = tf.contrib.rnn.static_rnn(attentive_lstm,
                                                            m,
                                                            dtype=tf.float32)

        dense2 = tf.layers.dense(rnn_outputs[-1], self.config.num_outputs)

        self.prediction = dense2

        with tf.name_scope("loss"):
            self.l2_regularization = self.config.lambda_l2_reg * sum(
                tf.nn.l2_loss(v) for v in tf.trainable_variables()
                if "bias" not in v.name.lower())  # exclude bias terms
            self.mean_squared_error = tf.losses.mean_squared_error(
                self.Y, self.prediction)
            # self.loss = tf.losses.log_loss(self.Y, self.prediction)
            self.loss = self.mean_squared_error + self.l2_regularization

            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

            with tf.control_dependencies(update_ops):
                self.train_step = tf.train.AdamOptimizer(
                    self.config.learning_rate).minimize(
                        self.loss, global_step=self.global_step_tensor)
Example #15
    def initialize(self, inputs, input_lengths, num_speakers, speaker_id,
            mel_targets=None, linear_targets=None, loss_coeff=None,
            rnn_decoder_test_mode=False, is_randomly_initialized=False):
        '''Initializes the model for inference.

        Sets "mel_outputs", "linear_outputs", and "alignments" fields.

        Args:
          inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
          linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
            of steps in the output time series, F is num_freq, and values are entries in the linear
            spectrogram. Only needed for training.
        '''
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            self.batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)  # [N, T_in, 512]

            # Encoder
            encoder_outputs = conv_and_lstm(
                embedded_inputs,
                input_lengths,
                conv_layers=hp.encoder_conv_layers,
                conv_width=hp.encoder_conv_width,
                conv_channels=hp.encoder_conv_channels,
                lstm_units=hp.encoder_lstm_units,
                is_training=is_training,
                scope='encoder')  # [N, T_in, 512]

            # Attention
            # For manual control of attention
            self.is_manual_attention = tf.placeholder(
                tf.bool, shape=(), name='is_manual_attention',
            )
            self.manual_alignments = tf.placeholder(
                tf.float32, shape=[None, None, None], name="manual_alignments",
            )

            attention_cell = AttentionWrapper(
                DecoderPrenetWrapper(LSTMBlockCell(hp.attention_depth), is_training),
                LocationSensitiveAttention(hp.attention_depth, encoder_outputs),
                alignment_history=True,
                output_attention=False)  # [N, T_in, 128]

            # Concatenate attention context vector and RNN cell output into a 512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)  # [N, T_in, 512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell([
                concat_cell,
                LSTMBlockCell(hp.decoder_lstm_units),
                LSTMBlockCell(hp.decoder_lstm_units)
            ], state_is_tuple=True)  # [N, T_in, 1024]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
            else:
                helper = TacoTestHelper(self.batch_size, hp.num_mels, hp.outputs_per_step)

            decoder_init_state = output_cell.zero_state(batch_size=self.batch_size, dtype=tf.float32)
            (multi_decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                BasicDecoder(output_cell, helper, decoder_init_state),
                maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry                                [N, T_out, M]
            decoder_outputs = tf.reshape(multi_decoder_outputs, [self.batch_size, -1, hp.num_mels])

            # Postnet: predicts a residual
            postnet_outputs = postnet(
                decoder_outputs,
                layers=hp.postnet_conv_layers,
                conv_width=hp.postnet_conv_width,
                channels=hp.postnet_conv_channels,
                is_training=is_training)
            mel_outputs = decoder_outputs + postnet_outputs

            # Convert to linear using a similar architecture as the encoder:
            expand_outputs = conv_and_lstm(
                mel_outputs,
                None,
                conv_layers=hp.expand_conv_layers,
                conv_width=hp.expand_conv_width,
                conv_channels=hp.expand_conv_channels,
                lstm_units=hp.expand_lstm_units,
                is_training=is_training,
                scope='expand')  # [N, T_in, 512]
            linear_outputs = tf.layers.dense(expand_outputs, hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.decoder_outputs = decoder_outputs
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:               %d' % embedded_inputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames):  %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  expand out:              %d' % expand_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
Example #16
  def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None):
    '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
    '''
    with tf.variable_scope('inference') as scope:
      is_training = linear_targets is not None
      batch_size = tf.shape(inputs)[0]
      hp = self._hparams

      # Embeddings
      embedding_table = tf.get_variable(
        'embedding', [len(symbols), 256], dtype=tf.float32,
        initializer=tf.truncated_normal_initializer(stddev=0.5))
      embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)           # [N, T_in, 256]

      # Global style tokens (GST). When using h attention heads, we set
      # the token embedding size to 256 // h and concatenate the attention
      # outputs of each head.
      gst_tokens = tf.get_variable(
        'style_tokens', [hp.num_gst, 256 // hp.num_heads], dtype=tf.float32,
        initializer=tf.truncated_normal_initializer(stddev=0.5))

      # Encoder
      prenet_outputs = prenet(embedded_inputs, is_training)                       # [N, T_in, 128]
      encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training)  # [N, T_in, 256]

      if is_training:
        # Reference encoder
        reference_embedding = reference_encoder(
          mel_targets,
          filters=[32, 32, 64, 64, 128, 128],
          kernel_size=(3, 3),
          strides=(2, 2),
          is_training=is_training)

        # Style token layer
        style_embedding = multi_head_attention(
          num_heads=hp.num_heads,
          queries=tf.expand_dims(reference_embedding, axis=1),                    # [N, 1, 128]
          memory=tf.tile(tf.expand_dims(gst_tokens, axis=0), [batch_size, 1, 1]), # [N, hp.num_gst, 256 // hp.num_heads]
          num_units=128)
      else:
        # TODO Add support for reference mode and more effective style control during inference.
        # Randomly select style embedding from gst_tokens for simplicity.
        random_index = tf.random_uniform([batch_size], maxval=hp.num_gst, dtype=tf.int32)
        style_embedding = tf.nn.embedding_lookup(gst_tokens, random_index)

      # Add style embedding to every text encoder state, applying tanh to
      # compress both encoder state and style embedding to the same scale.
      encoder_outputs += tf.nn.tanh(style_embedding)

      # Attention
      attention_cell = AttentionWrapper(
        DecoderPrenetWrapper(GRUCell(256), is_training),
        BahdanauAttention(256, encoder_outputs, memory_sequence_length=input_lengths),
        alignment_history=True,
        output_attention=False)                                                  # [N, T_in, 256]

      # Concatenate attention context vector and RNN cell output into a 512D vector.
      concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)              # [N, T_in, 512]

      # Decoder (layers specified bottom to top):
      decoder_cell = MultiRNNCell([
        OutputProjectionWrapper(concat_cell, 256),
        ResidualWrapper(ZoneoutWrapper(LSTMBlockCell(256), (0.1, 0.1), is_training)),
        ResidualWrapper(ZoneoutWrapper(LSTMBlockCell(256), (0.1, 0.1), is_training)),
      ], state_is_tuple=True)                                                  # [N, T_in, 256]

      # Project onto r mel spectrograms (predict r outputs at each RNN step):
      output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)
      decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

      if is_training:
        helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
      else:
        helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

      if is_training:
        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
          BasicDecoder(output_cell, helper, decoder_init_state))
      else:
        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
          BasicDecoder(output_cell, helper, decoder_init_state),
          maximum_iterations=hp.max_iters)                                      # [N, T_out/r, M*r]

      # Reshape outputs to be one output per entry
      mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])  # [N, T_out, M]

      # Add post-processing CBHG:
      post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)           # [N, T_out, 256]
      linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)               # [N, T_out, F]

      # Grab alignments from the final decoder state:
      alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

      self.inputs = inputs
      self.input_lengths = input_lengths
      self.mel_outputs = mel_outputs
      self.linear_outputs = linear_outputs
      self.alignments = alignments
      self.mel_targets = mel_targets
      self.linear_targets = linear_targets
      tf.logging.info('Initialized Tacotron model. Dimensions: ')
      tf.logging.info('  embedding:               %d' % embedded_inputs.shape[-1])
      tf.logging.info('  prenet out:              %d' % prenet_outputs.shape[-1])
      tf.logging.info('  encoder out:             %d' % encoder_outputs.shape[-1])
      tf.logging.info('  attention out:           %d' % attention_cell.output_size)
      tf.logging.info('  concat attn & out:       %d' % concat_cell.output_size)
      tf.logging.info('  decoder cell out:        %d' % decoder_cell.output_size)
      tf.logging.info('  decoder out (%d frames):  %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
      tf.logging.info('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
      tf.logging.info('  postnet out:             %d' % post_outputs.shape[-1])
      tf.logging.info('  linear out:              %d' % linear_outputs.shape[-1])
Example #17
def main():
    args = parser.parse_args()

    input_size = 10 * args.total_digits
    output_size = 11 * (args.total_digits + 1)
    batch_size = args.batch_size
    hidden_size = args.hidden_size
    use_act = args.use_act

    # Placeholders for inputs.
    x = tf.placeholder(tf.float32, [batch_size, args.sequence_length, input_size])
    y = tf.placeholder(tf.int64, [batch_size*(args.sequence_length-1)*(args.total_digits+1)])

    rnn = LSTMBlockCell(hidden_size)
    if use_act:
        inputs = [tf.squeeze(xx, axis=1) for xx in tf.split(x, args.sequence_length, 1)]
        act = ACTCell(num_units=args.hidden_size, cell=rnn,
                      max_computation=20, batch_size=batch_size, state_is_tuple=True, return_ponders=args.return_ponders)
        outputs, final_state = static_rnn(act, inputs, dtype=tf.float32, initial_state=act.zero_state(args.batch_size, tf.float32))
        outputs = tf.stack(outputs, 1)
    else:
        outputs, final_state = tf.nn.dynamic_rnn(rnn, x, dtype=tf.float32)

    output = tf.reshape(outputs[:, 1:, :], [-1, hidden_size])
    softmax_w = tf.get_variable("softmax_w", [hidden_size, output_size])
    softmax_b = tf.get_variable("softmax_b", [output_size])
    logits = tf.reshape(tf.matmul(output, softmax_w) + softmax_b, [-1, 11])

    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)

    loss = tf.reduce_mean(tf.reshape(loss, [batch_size, -1]), axis=1)

    if use_act:
        if args.return_ponders:
            ponder, ponders_tensor = act.calculate_ponder_cost()
            ponders_tensor = tf.reduce_mean(ponders_tensor, axis=0)
        else:
            ponder = act.calculate_ponder_cost()
        ponder_mean = tf.reduce_mean(ponder)
        tf.summary.scalar('Ponder', ponder_mean)
        loss += args.tau*ponder

    loss = tf.reduce_mean(loss)
    tf.summary.scalar('Loss', loss)

    train_step = tf.train.AdamOptimizer(args.lr).minimize(loss)

    predicted = tf.argmax(logits, 1)
    target = tf.cast(y, tf.int64)
    correct_sequences = tf.cast(tf.reduce_all(tf.reshape(tf.equal(predicted, target),
                                [args.batch_size, (args.sequence_length-1)*(args.total_digits+1)]), axis=1), tf.float32)
    accuracy = tf.reduce_mean(correct_sequences)
    tf.summary.scalar('Accuracy', accuracy)

    merged = tf.summary.merge_all()
    logdir = './logs/addition/LR={}'.format(args.lr)
    if args.use_act:
        logdir += '_Tau={}'.format(args.tau)
    else:
        logdir += '_NoACT'
    while os.path.isdir(logdir):
        logdir += '_'
    if args.log:
        writer = tf.summary.FileWriter(logdir)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.vram_fraction)
    if args.return_ponders:
        ponders_list = list()
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        sess.run(tf.global_variables_initializer())
        loop = trange(args.steps)
        for i in loop:
            batch = generate(args)

            if i % args.log_interval == 0:
                if use_act:
                    if args.return_ponders:
                        summary, step_accuracy, step_loss, step_ponder, step_ponders_tensor \
                            = sess.run([merged, accuracy, loss, ponder_mean, ponders_tensor],
                                       feed_dict={x: batch[0], y: batch[1]})
                        ponders_list.append(step_ponders_tensor)
                        stack = np.stack(ponders_list, axis=0)
                        np.savetxt('ponders_addition.txt', stack)
                    else:
                        summary, step_accuracy, step_loss, step_ponder \
                            = sess.run([merged, accuracy, loss, ponder_mean],
                                       feed_dict={x: batch[0], y: batch[1]})
                    if args.print_results:
                        loop.set_postfix(Loss='{:0.3f}'.format(step_loss),
                                         Accuracy='{:0.3f}'.format(step_accuracy),
                                         Ponder='{:0.3f}'.format(step_ponder))
                else:
                    summary, step_accuracy, step_loss = sess.run([merged, accuracy, loss],
                                                                 feed_dict={
                                                                     x: batch[0], y: batch[1]})
                    if args.print_results:
                        loop.set_postfix(Loss='{:0.3f}'.format(step_loss),
                                         Accuracy='{:0.3f}'.format(step_accuracy))
                if args.log:
                    writer.add_summary(summary, i)
            train_step.run(feed_dict={x: batch[0], y: batch[1]})

    if args.return_ponders:
        stack = np.stack(ponders_list, axis=0)
        np.savetxt('ponders_addition.txt', stack)
Example #18
def main():
    args = parser.parse_args()

    input_size = 10 * args.total_digits
    output_size = 11 * (args.total_digits + 1)
    batch_size = args.batch_size
    hidden_size = args.hidden_size

    # Placeholders for inputs.
    x = tf.placeholder(
        tf.float32,
        [batch_size, args.ponder * args.sequence_length, 1 + input_size])
    y = tf.placeholder(
        tf.int64,
        [batch_size * (args.sequence_length - 1) * (args.total_digits + 1)])

    rnn = LSTMBlockCell(hidden_size)
    output, final_state = tf.nn.dynamic_rnn(rnn, x, dtype=tf.float32)

    output = output[:, args.ponder - 1::args.ponder, :]
    output = tf.reshape(output[:, 1:, :], [-1, hidden_size])
    softmax_w = tf.get_variable("softmax_w", [hidden_size, output_size])
    softmax_b = tf.get_variable("softmax_b", [output_size])
    logits = tf.reshape(tf.matmul(output, softmax_w) + softmax_b, [-1, 11])

    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y,
                                                          logits=logits)

    loss = tf.reduce_mean(tf.reshape(loss, [batch_size, -1]), axis=1)

    loss = tf.reduce_mean(loss)
    tf.summary.scalar('Loss', loss)

    train_step = tf.train.AdamOptimizer(args.lr).minimize(loss)

    predicted = tf.argmax(logits, 1)
    target = tf.cast(y, tf.int64)
    correct_sequences = tf.cast(
        tf.reduce_all(
            tf.reshape(tf.equal(predicted, target),
                       [args.batch_size,
                        (args.sequence_length - 1) * (args.total_digits + 1)]),
            axis=1),
        tf.float32)
    accuracy = tf.reduce_mean(correct_sequences)
    tf.summary.scalar('Accuracy', accuracy)

    merged = tf.summary.merge_all()
    logdir = './logs/addition_test/LR={}_Pond={}'.format(args.lr, args.ponder)
    while os.path.isdir(logdir):
        logdir += '_'
    if args.log:
        writer = tf.summary.FileWriter(logdir)

    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=args.vram_fraction)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        sess.run(tf.global_variables_initializer())
        loop = trange(args.steps)
        for i in loop:
            batch = generate(args)

            if i % args.log_interval == 0:
                summary, step_accuracy, step_loss = sess.run(
                    [merged, accuracy, loss],
                    feed_dict={
                        x: batch[0],
                        y: batch[1]
                    })
                if args.print_results:
                    loop.set_postfix(Loss='{:0.3f}'.format(step_loss),
                                     Accuracy='{:0.3f}'.format(step_accuracy))
                if args.log:
                    writer.add_summary(summary, i)
            train_step.run(feed_dict={x: batch[0], y: batch[1]})
Example #19
    def initialize(self, text_inputs, input_lengths, speaker_ids, mel_targets=None, linear_targets=None):
        '''Initializes the model for inference.

        Sets "mel_outputs", "linear_outputs", and "alignments" fields.

        Args:
          text_inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          speaker_ids: int32 Tensor containing ids of specific speakers
          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
          linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
            of steps in the output time series, F is num_freq, and values are entries in the linear
            spectrogram. Only needed for training.
        '''
        with tf.variable_scope('inference'):
            is_training = linear_targets is not None
            batch_size = tf.shape(text_inputs)[0]
            hp = self._hparams
            vocab_size = len(symbols)
            embedded_inputs = embedding(text_inputs, vocab_size, hp.embedding_dim)  # [N, T_in, embd_size]

            # extract speaker embedding if multi-speaker
            with tf.variable_scope('speaker'):
                if hp.num_speakers > 1:
                    speaker_embedding = tf.get_variable('speaker_embed',
                                                        shape=(hp.num_speakers, hp.speaker_embed_dim),
                                                        dtype=tf.float32)
                    # TODO: what about special initializer=tf.truncated_normal_initializer(stddev=0.5)?
                    speaker_embd = tf.nn.embedding_lookup(speaker_embedding, speaker_ids)
                else:
                    speaker_embd = None

            # Encoder
            encoder_outputs = conv_and_lstm(
                    embedded_inputs,
                    input_lengths,
                    conv_layers=hp.encoder_conv_layers,
                    conv_width=hp.encoder_conv_width,
                    conv_channels=hp.encoder_conv_channels,
                    lstm_units=hp.encoder_lstm_units,
                    is_training=is_training,
                    scope='encoder')  # [N, T_in, 512]

            # Attention Mechanism
            attention_cell = attention_decoder(encoder_outputs, hp.attention_dim, input_lengths, is_training,
                                               speaker_embd=speaker_embd, attention_type="location_sensitive")

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell([
                attention_cell,
                LSTMBlockCell(hp.decoder_lstm_units),
                LSTMBlockCell(hp.decoder_lstm_units)], state_is_tuple=True)  # [N, T_in, 1024]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

            if is_training:
                helper = TacoTrainingHelper(text_inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

            (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                    BasicDecoder(output_cell, helper, decoder_init_state),
                    maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            decoder_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Postnet: predicts a residual
            postnet_outputs = postnet(
                    decoder_outputs,
                    layers=hp.postnet_conv_layers,
                    conv_width=hp.postnet_conv_width,
                    channels=hp.postnet_conv_channels,
                    is_training=is_training)
            mel_outputs = decoder_outputs + postnet_outputs

            # Convert to linear using a similar architecture as the encoder:
            expand_outputs = conv_and_lstm(
                    mel_outputs,
                    None,
                    conv_layers=hp.expand_conv_layers,
                    conv_width=hp.expand_conv_width,
                    conv_channels=hp.expand_conv_channels,
                    lstm_units=hp.expand_lstm_units,
                    is_training=is_training,
                    scope='expand')  # [N, T_in, 512]
            linear_outputs = tf.layers.dense(expand_outputs, hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            # TODO: seems not to work?!?
            alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = text_inputs
            self.input_lengths = input_lengths
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:               %d' % embedded_inputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  concat attn & out:       %d' % attention_cell.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames):  %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % postnet_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
Example #20
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None,
                   pml_targets=None,
                   is_training=False,
                   gta=False,
                   locked_alignments=None):
        """
        Initializes the model for inference.

        Sets "mel_outputs", "linear_outputs", and "alignments" fields.

        Args:
          inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
          linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
            of steps in the output time series, F is num_freq, and values are entries in the linear
            spectrogram. Only needed for training.
          pml_targets: float32 Tensor with shape [N, T_out, P] where N is batch_size, T_out is number of
            steps in the PML vocoder features trajectories, P is pml_dimension, and values are PML vocoder
            features. Only needed for training.
          is_training: boolean flag that is set to True during training
          gta: boolean flag that is set to True when ground truth alignment is required
          locked_alignments: when explicit attention alignment is required, the locked alignments are passed in this
            parameter and the attention alignments are locked to these values
        """
        # fix the alignments shape to (batch_size, encoder_steps, decoder_steps)
        # if it does not already include the batch dimension
        locked_alignments_ = locked_alignments

        if locked_alignments_ is not None:
            if np.ndim(locked_alignments_) < 3:
                locked_alignments_ = np.expand_dims(locked_alignments_, 0)

        with tf.variable_scope('inference') as scope:
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embedding_dim],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(
                embedding_table, inputs)  # [N, T_in, embed_depth=256]

            # Encoder
            encoder_outputs = conv_and_lstm(  # [N, T_in, 2*encoder_gru_units=512]
                embedded_inputs,
                input_lengths,
                conv_layers=hp.encoder_conv_layers,
                conv_width=hp.encoder_conv_width,
                conv_channels=hp.encoder_conv_channels,
                lstm_units_unidirectional=hp.encoder_gru_units,
                is_training=is_training,
                scope='encoder',
            )

            # Attention
            attention_cell = AttentionWrapper(  # [N, T_in, attention_depth=256]
                DecoderPrenetWrapper(LSTMBlockCell(hp.attention_depth),
                                     is_training, hp.prenet_depths),
                LocationSensitiveAttention(hp.attention_depth,
                                           encoder_outputs),
                alignment_history=True,
                output_attention=False)

            # Concatenate attention context vector and RNN cell output into a
            # 2*attention_depth=512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell)  # [N, T_in, 2*attention_depth=512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell(
                [
                    concat_cell,
                    LSTMBlockCell(hp.decoder_gru_units),
                    LSTMBlockCell(hp.decoder_gru_units)
                ],
                state_is_tuple=True)  # [N, T_in, decoder_depth=1024]

            # Project onto r PML feature vectors (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.pml_dimension * hp.outputs_per_step)

            if is_training or gta:
                helper = TacoTrainingHelper(inputs, pml_targets,
                                            hp.pml_dimension,
                                            hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.pml_dimension,
                                        hp.outputs_per_step)

            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

            # Reshape outputs to be one output per entry
            pml_intermediates = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

            # Add Post-Processing Conv and LSTM layer:
            expand_outputs = conv_and_lstm(  # [N, T_in, 2*expand_gru_units=512]
                pml_intermediates,
                None,
                conv_layers=hp.expand_conv_layers,
                conv_width=hp.expand_conv_width,
                conv_channels=hp.expand_conv_channels,
                lstm_units_unidirectional=hp.expand_gru_units,
                is_training=is_training,
                scope='expand',
            )

            pml_outputs = tf.layers.dense(expand_outputs,
                                          hp.pml_dimension)  # [N, T_out, P]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.pml_intermediates = pml_intermediates
            self.pml_outputs = pml_outputs
            self.alignments = alignments
            self.pml_targets = pml_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  Train mode:              {}'.format(is_training))
            log('  GTA mode:                {}'.format(gta))
            log('  Embedding:               {}'.format(
                embedded_inputs.shape[-1]))
            log('  Encoder out:             {}'.format(
                encoder_outputs.shape[-1]))
            log('  Attention out:           {}'.format(
                attention_cell.output_size))
            log('  Concat attn & out:       {}'.format(
                concat_cell.output_size))
            log('  Decoder cell out:        {}'.format(
                decoder_cell.output_size))
            log('  Decoder out ({} frames):  {}'.format(
                hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  Decoder out (1 frame):   {}'.format(
                pml_intermediates.shape[-1]))
            log('  Expand out:              {}'.format(
                expand_outputs.shape[-1]))
            log('  PML out:                 {}'.format(pml_outputs.shape[-1]))