def embedding(self, x, is_training=False):
    """Encode a batch of feature sequences into one embedding vector per sequence.

    Shape annotations use n = batch, t = time steps, h = self.hidden_units,
    k = self.num_banks and e = self.num_classes; they are carried over from the
    original annotations and assume the helpers keep the time axis intact.

    :param x: input features. shape=(n, t, n_mels)
    :param is_training: forwarded to ``conv1d_banks`` and ``normalize``.
    :return: embedding. shape=(n, e)
    """
    # frame-level embedding
    x = tf.layers.dense(x, units=self.hidden_units, activation=tf.nn.relu)  # (n, t, h)

    out = conv1d_banks(x, K=self.num_banks, num_units=self.hidden_units,
                       norm_type=self.norm_type,
                       is_training=is_training)  # (n, t, k * h)

    out = tf.layers.max_pooling1d(out, 2, 1, padding="same")  # (n, t, k * h)

    out = conv1d(out, self.hidden_units, 3, scope="conv1d_1")  # (n, t, h)
    out = normalize(out, type=self.norm_type, is_training=is_training,
                    activation_fn=tf.nn.relu)
    out = conv1d(out, self.hidden_units, 3, scope="conv1d_2")  # (n, t, h)
    out += x  # (n, t, h) # residual connections

    for i in range(self.num_highway):
        out = highwaynet(out, num_units=self.hidden_units,
                         scope='highwaynet_{}'.format(i))  # (n, t, h)

    out = gru(out, self.hidden_units, False)  # (n, t, h)

    # Take the output of the last time step. The previous `out[..., -1]`
    # selected the last *feature channel* of every step, producing (n, t)
    # instead of the documented (n, h).
    # NOTE(review): this changes the input width of the 'projection' layer, so
    # checkpoints trained with the old indexing will not restore into it.
    out = out[:, -1, :]  # (n, h)

    # embedding
    out = tf.layers.dense(out, self.num_classes, name='projection')  # (n, e)
    out = tf.identity(out, name="embedding")
    return out
def decode2(inputs, is_training=True, scope="decoder2", reuse=None):
    '''Post-processing decoder: a pre-net, a CBHG-style stack and a dense projection.

    Args:
      inputs: A 3d float32 tensor of shape [N, T', C'] with C' = hp.n_mels*hp.r.
        Log magnitude spectrogram of sound files.
      is_training: Whether or not the layers run in training mode.
      scope: Name passed to `tf.variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer by the
        same name.

    Returns:
      Predicted magnitude spectrogram tensor with shape [N, T', C''], where
      C'' = (1+hp.n_fft//2)*hp.r.
    '''
    half_embed = hp.embed_size // 2

    with tf.variable_scope(scope, reuse=reuse):
        # Pre-net output doubles as the residual branch of the CBHG stack.
        residual = mod.prenet(inputs, is_training=is_training)  # (N, T'', E/2)

        # Convolution bank followed by stride-1 max pooling.
        banks = mod.conv1d_banks(residual,
                                 K=hp.decoder_num_banks,
                                 is_training=is_training)  # (N, T', E*K/2)
        pooled = tf.layers.max_pooling1d(banks, 2, 1, padding="same")  # (N, T', E*K/2)

        # Two width-3 projections, each followed by normalisation.
        proj = mod.conv1d(pooled, hp.embed_size, 3, scope="conv1d_1")  # (N, T', E)
        proj = mod.normalize(proj,
                             type=hp.norm_type,
                             is_training=is_training,
                             activation_fn=tf.nn.relu,
                             scope="norm1")
        proj = mod.conv1d(proj, half_embed, 3, scope="conv1d_2")  # (N, T', E/2)
        proj = mod.normalize(proj,
                             type=hp.norm_type,
                             is_training=is_training,
                             activation_fn=None,
                             scope="norm2")

        hidden = proj + residual

        # Four highway layers (count is fixed here, not read from hp).
        for block_idx in range(4):
            hidden = mod.highwaynet(hidden,
                                    num_units=half_embed,
                                    scope='highwaynet_{}'.format(block_idx))  # (N, T, E/2)

        # Recurrent layer; the original comments label this call as bidirectional.
        hidden = mod.gru(hidden, half_embed, True)  # (N, T', E)

        # Final projection => (N, T', (1+hp.n_fft//2)*hp.r)
        n_outputs = (1 + hp.n_fft // 2) * hp.r
        return tf.layers.dense(hidden, n_outputs)
def dilated_convolution(input, Local_condition, Global_condition, filter_width,
                        dilation_rate, output_width, index, dilation, reuse):
    '''One conditioned dilated-convolution layer with a gated activation.

    Args:
      input: Layer input. The trailing part of it is reshaped to
        [hp.batch_size, T_out, hp.Q] and added to the layer output.
      Local_condition: Local conditioning signal. It is projected with a
        width-1 convolution and cropped along axis 1, starting at
        hp.size**(index + 1) - 1.
      Global_condition: Global conditioning signal. It is projected with a
        width-1 convolution and added to the other terms (the original
        comments note this relies on broadcasting).
      filter_width: Filter width of the dilated convolution.
      dilation_rate: Passed to `conv1d` as `rate`.
      output_width: Number of trailing time steps kept for the skip branch.
      index: Layer index; used in scope names and in the local-condition crop.
      dilation: Used only to build unique scope names.
      reuse: Forwarded to every `conv1d` call.

    Returns:
      A tuple `(skip_contribution, residual_output)` where `residual_output`
      is the cropped input plus the 1x1-transformed activation.
    '''
    # Each projection yields 2 * hp.Q channels: one half feeds the filter
    # branch, the other half the gate branch.
    conv = conv1d(input, hp.Q * 2, filter_width, rate=dilation_rate,
                  padding="causal",
                  scope='conv_{}_{}'.format(index, dilation), reuse=reuse)
    local_cond = conv1d(Local_condition, hp.Q * 2, 1, rate=1, padding="SAME",
                        scope='local_cond_{}_{}'.format(index, dilation),
                        reuse=reuse)
    # Drop the leading steps of the local condition.
    # NOTE(review): presumably this aligns it with `conv`'s time axis - confirm.
    local_cond = local_cond[:, hp.size**(index + 1) - 1:, :]
    global_cond = conv1d(Global_condition, hp.Q * 2, 1, rate=1, padding="SAME",
                         scope='global_cond_{}_{}'.format(index, dilation),
                         reuse=reuse)

    conv_filter, conv_gate = tf.split(conv, 2, -1)
    local_cond_filter, local_cond_gate = tf.split(local_cond, 2, -1)
    global_cond_filter, global_cond_gate = tf.split(global_cond, 2, -1)

    conv_filter = conv_filter + local_cond_filter + global_cond_filter  # broadcast
    conv_gate = conv_gate + local_cond_gate + global_cond_gate  # broadcast

    # Gated activation unit: the sigmoid branch must *scale* the tanh branch.
    # The previous code added the two terms, so the gate never gated anything.
    # NOTE(review): weights trained with the additive form need retraining.
    out = tf.tanh(conv_filter) * tf.sigmoid(conv_gate)

    transformed = conv1d(out, filters=hp.Q, padding="SAME",
                         scope='transformed_{}_{}'.format(index, dilation),
                         onebyone=True, reuse=reuse)

    # Skip branch: keep only the last `output_width` steps of the activation.
    _, x, __ = out.get_shape().as_list()
    skip_cut = x - output_width
    out_skip = tf.slice(out, [0, skip_cut, 0], [-1, -1, -1], name='out_skip')
    # Re-assert the static shape (requires a fixed hp.batch_size).
    out_skip = tf.reshape(out_skip, [hp.batch_size, output_width, hp.Q])
    skip_contribution = conv1d(out_skip, filters=hp.Q, padding="SAME",
                               scope='skip_contribution_{}_{}'.format(
                                   index, dilation),
                               onebyone=True, reuse=reuse)

    # Residual branch: crop the input to the length of `transformed` so the
    # two can be summed.
    transformed_cut = transformed.get_shape().as_list()[1]
    input_cut = input.get_shape().as_list()[1] - transformed_cut
    input_batch = tf.slice(input, [0, input_cut, 0], [-1, -1, -1])
    input_batch = tf.reshape(input_batch,
                             [hp.batch_size, transformed_cut, hp.Q])

    return skip_contribution, input_batch + transformed
def encode(inputs, is_training=True, scope='encoder', reuse=None):
    """Run `inputs` through a pre-net and a CBHG-style stack.

    The layer sizes come from the module-level names `encoder_num_banks`,
    `embed_size` and `num_highway_blocks`.

    Args:
      inputs: Tensor fed to `prenet`.
      is_training: Forwarded to `prenet` and `conv1d_banks`.
      scope: Name passed to `tf.variable_scope`.
      reuse: Reuse flag passed to `tf.variable_scope`.

    Returns:
      The output of the final `gru` call.
    """
    half_embed = embed_size // 2

    with tf.variable_scope(scope, reuse=reuse):
        # The pre-net output is also the residual branch.
        residual = prenet(inputs, scope='prenet', is_training=is_training)

        # Convolution bank + stride-1 max pooling.
        banks = conv1d_banks(residual, K=encoder_num_banks,
                             is_training=is_training)
        pooled = tf.layers.max_pooling1d(banks, 2, 1, padding='same')

        # Two width-3 projections, both normalised with a ReLU activation.
        proj = conv1d(pooled, half_embed, 3, scope='conv1d_1')
        proj = normalize_in(proj, activation_fn=tf.nn.relu)
        proj = conv1d(proj, half_embed, 3, scope='conv1d_2')
        proj = normalize_in(proj, activation_fn=tf.nn.relu)

        hidden = proj + residual

        for block_idx in range(num_highway_blocks):
            hidden = highwaynet(hidden, units=half_embed,
                                scope='highwaynet_%d' % block_idx)

        return gru(hidden, half_embed, True)
def decoder(decoder_inputs, speaker_emb, z_q):
    '''Wavenet decoder.

    Args:
      decoder_inputs: [B, T, 1].
      speaker_emb: [B, len(speaker)]. One-hot. Global condition.
      z_q: [B, T', D]. Local condition.

    Returns:
      Wave logits, annotated as (B, T, Q) in the original comments.
    '''
    with tf.variable_scope("decoder"):
        # Input convolution.
        hidden = conv1d(decoder_inputs,
                        hp.num_units,
                        activation_fn=tf.tanh,
                        padding="causal",
                        bn=True,
                        scope='conv_in')  # (B, T, H)

        # Residual stack: every block walks through all dilation rates, and
        # the skip outputs are accumulated as the blocks are built.
        schedule = ((block_idx, rate)
                    for block_idx in range(hp.num_blocks)
                    for rate in hp.dilations)
        skip_total = 0
        for block_idx, rate in schedule:
            hidden, skip_out = residual_block(
                hidden,
                size=hp.size,
                rate=rate,
                speaker_emb=speaker_emb,
                z_q=z_q,
                scope="res_block_{}_{}".format(block_idx, rate))
            skip_total = skip_total + skip_out

        # Post-net: ReLU, then two convolutions down to hp.Q logits.
        post = tf.nn.relu(skip_total)
        post = conv1d(post,
                      padding="causal",
                      activation_fn=tf.nn.relu,
                      bn=True,
                      scope="one_by_one_1")  # (B, T, H)
        return conv1d(post,
                      filters=hp.Q,
                      padding="causal",
                      scope="one_by_one_2")  # (B, T, Q) wave logits.
def encoder(x):
    '''Encode a waveform with a stack of padded, strided convolutions.

    Args:
      x: waveform. [B, T, Q]

    Returns:
      z_e: encoded variable. [B, T', D]
    '''
    hidden = x
    with tf.variable_scope("encoder"):
        n_layers = hp.encoder_layers
        for layer_idx in range(n_layers):
            # Every layer but the last is followed by a ReLU.
            if layer_idx == n_layers - 1:
                activation = None
            else:
                activation = tf.nn.relu

            # One zero step on each side of the time axis before the
            # "valid" convolution.
            padded = tf.pad(hidden, [[0, 0], [1, 1], [0, 0]])
            hidden = conv1d(padded,
                            filters=hp.D,
                            size=hp.winsize,
                            strides=hp.stride,
                            padding="valid",
                            bn=True,
                            activation_fn=activation,
                            scope="conv1d_{}".format(layer_idx))
    return hidden
def init_inference(self, config, is_training=False):
    """Build the inference graph and store its endpoints on the instance.

    Reads 'num_banks', 'hidden_units', 'num_highway', 'norm_type',
    'batch_size', 'num_rnn_layer', 'input_dim' and 'alphabet_size' from
    `config`.

    Sets `_input_dim`, `_output_dim`, `_inputs`, `_seq_lens`, `_out_lens`,
    `_mean`, `_std`, `_initial_state`, `_rnn_state`, `_logits`, `_probas`
    and finally `_init_inference = True`.

    :param config: mapping with the keys listed above.
    :param is_training: forwarded to `conv1d_banks`, `normalize` and `gru`.
    """
    num_banks = config['num_banks']
    hidden_units = config['hidden_units']
    num_highway = config['num_highway']
    norm_type = config['norm_type']
    batch_size = config['batch_size']
    num_rnn_layer = config['num_rnn_layer']

    self._input_dim = input_dim = config['input_dim']
    self._output_dim = output_dim = config['alphabet_size']

    self._inputs = tf.placeholder(tf.float32, [batch_size, None, input_dim])
    self._seq_lens = tf.placeholder(tf.int32, shape=batch_size)
    # Output lengths alias the input lengths.
    self._out_lens = self._seq_lens

    # TODO, awni, for now on the client to remember to initialize these.
    self._mean = tf.get_variable("mean", shape=input_dim, trainable=False)
    self._std = tf.get_variable("std", shape=input_dim, trainable=False)

    std_inputs = (self._inputs - self._mean) / self._std

    # Feed the standardised features into the network. The previous code
    # computed `std_inputs` and then passed the raw `self._inputs` to the
    # first convolution, so the mean/std statistics were silently ignored.
    # NOTE(review): `_mean` / `_std` must now be assigned real statistics
    # before running the graph (see the TODO above).
    x = conv1d(std_inputs, hidden_units, 1, scope="conv1d")

    out = conv1d_banks(x, K=num_banks, num_units=hidden_units,
                       norm_type=norm_type,
                       is_training=is_training)  # (n, t, k * h)

    out = tf.layers.max_pooling1d(out, 2, 1, padding="same")  # (n, t, k * h)

    out = conv1d(out, hidden_units, 3, scope="conv1d_1")  # (n, t, h)
    out = normalize(out, type=norm_type, is_training=is_training,
                    activation_fn=tf.nn.relu)
    out = conv1d(out, hidden_units, 3, scope="conv1d_2")  # (n, t, h)
    out += x  # (n, t, h) # residual connections

    for i in range(num_highway):
        out = highwaynet(out, num_units=hidden_units,
                         scope='highwaynet_{}'.format(i))  # (n, t, h)

    rnn_out, state, initial_state = gru(
        out, hidden_units, False, seqlens=self._seq_lens,
        num_layers=num_rnn_layer, is_training=is_training)  # (n, t, h)

    self._initial_state = initial_state
    self._rnn_state = state

    # Swap the first two axes (batch-major -> time-major per the annotations).
    rnn_out = tf.transpose(rnn_out, [1, 0, 2])

    # Collapse time and batch dims pre softmax.
    rnn_out = tf.reshape(rnn_out, (-1, hidden_units))
    logits, probas = _add_softmax_linear(
        rnn_out, hidden_units, output_dim,
        initializer=tf.contrib.layers.xavier_initializer())

    # Reshape to time-major.
    self._logits = tf.reshape(logits, (-1, batch_size, output_dim))
    self._probas = tf.reshape(probas, (-1, batch_size, output_dim))

    self._init_inference = True
def decoder(decoder_inputs, speaker_emb, z_q, is_training=True):
    '''Wavenet decoder built from conditioned dilated convolutions.

    Args:
      decoder_inputs: raw waveform, [B, T, 1].
      speaker_emb: global condition, [B, len(speaker)] one-hot; the original
        notes also mention a speaker i-vector of shape [B, ivec_size].
      z_q: local condition, [B, T', D].
      is_training: only the exact value `True` selects the training path, in
        which `z_q` goes through `transposed_conv`. Any other value uses `z_q`
        as is and sets `reuse` to `tf.AUTO_REUSE`.

    Returns:
      output: [B, T-receptive_field+1, Q]
    '''
    with tf.variable_scope("decoder"):
        receptive_field = hp.dilations[-1] * hp.size
        time_steps = decoder_inputs.get_shape().as_list()[1]
        output_width = time_steps - receptive_field + 1

        # Local condition: processed by `transposed_conv` only when training.
        if is_training is True:
            reuse = None
            local_cond = transposed_conv(z_q)
        else:
            reuse = tf.AUTO_REUSE
            local_cond = z_q

        # Global condition: insert a singleton axis at position 1, (B, 1, L).
        global_cond = tf.expand_dims(speaker_emb, 1)

        # Dilated stack: each layer returns a skip term and the input of the
        # next layer.
        residual = decoder_inputs
        skip_terms = []
        for index, dilation in enumerate(hp.dilations):
            skip, residual = dilated_convolution(residual, local_cond,
                                                 global_cond, hp.size,
                                                 dilation, output_width,
                                                 index, dilation, reuse)
            skip_terms.append(skip)

        # Post-net: sum of skips -> ReLU -> 1x1 conv -> ReLU -> 1x1 conv.
        summed = sum(skip_terms)
        hidden = conv1d(tf.nn.relu(summed),
                        hp.Q,
                        scope='transformed1',
                        onebyone=True,
                        reuse=reuse)
        return conv1d(tf.nn.relu(hidden),
                      hp.Q,
                      scope='transformed2',
                      onebyone=True,
                      reuse=reuse)
def encode(inputs, is_training=True, scope="encoder", reuse=None):
    '''Encode `inputs` with `mod.pre_spectro`, a pre-net and a CBHG-style stack.

    Args:
      inputs: Tensor handed to `mod.pre_spectro`.
        NOTE(review): the character-embedding path is disabled here, so this
        is presumably a spectrogram-like float tensor rather than int32
        character ids - confirm against the caller.
      is_training: Whether or not the layers run in training mode.
      scope: Name passed to `tf.variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer by the
        same name.

    Returns:
      The output of the final `mod.gru` call, one hidden vector per time step.
      NOTE(review): the original notes give the shape as (N, T, E) and call
      the layer bidirectional, but the flag passed to `mod.gru` is False -
      confirm the actual output width.
    '''
    half_embed = hp.embed_size // 2

    with tf.variable_scope(scope, reuse=reuse):
        # Input front end, then pre-net: dense(E)--dropout--dense(E/2)--dropout.
        features = mod.pre_spectro(inputs, is_training=is_training)  # (N, T, E)
        residual = mod.prenet(features, is_training=is_training)  # (N, T, E/2)

        # CBHG: convolution bank and stride-1 max pooling.
        banks = mod.conv1d_banks(residual,
                                 K=hp.encoder_num_banks,
                                 is_training=is_training)  # (N, T, K * E / 2)
        pooled = tf.layers.max_pooling1d(banks, 2, 1, padding="same")  # (N, T, K * E / 2)

        # CBHG: two width-3 projections, each followed by normalisation.
        proj = mod.conv1d(pooled, half_embed, 3, scope="conv1d_1")  # (N, T, E/2)
        proj = mod.normalize(proj,
                             type=hp.norm_type,
                             is_training=is_training,
                             activation_fn=tf.nn.relu,
                             scope="norm1")
        proj = mod.conv1d(proj, half_embed, 3, scope="conv1d_2")  # (N, T, E/2)
        proj = mod.normalize(proj,
                             type=hp.norm_type,
                             is_training=is_training,
                             activation_fn=None,
                             scope="norm2")

        # Residual connection back to the pre-net output.
        hidden = proj + residual  # (N, T, E/2)

        # CBHG: highway layers.
        for block_idx in range(hp.num_highwaynet_blocks):
            hidden = mod.highwaynet(hidden,
                                    num_units=half_embed,
                                    scope='highwaynet_{}'.format(block_idx))  # (N, T, E/2)

        # CBHG: recurrent layer.
        return mod.gru(hidden, half_embed, False)