def decode2(inputs, is_training=True, scope="decoder2", reuse=None):
    '''Run the post-processing decoder: pre-net, CBHG stack, dense projection.

    Args:
      inputs: Tensor fed to the pre-net. The original documentation describes
        it as a float32 tensor of shape [N, T', C'] with C' = hp.n_mels*hp.r;
        nothing in this function checks that.
      is_training: Forwarded to the pre-net, the conv bank and the
        normalization layers.
      scope: Name handed to `tf.variable_scope`.
      reuse: `reuse` flag handed to `tf.variable_scope`.

    Returns:
      The output of the final dense layer; its last dimension is
      (1 + hp.n_fft//2) * hp.r.
    '''
    half_units = hp.embed_size // 2

    with tf.variable_scope(scope, reuse=reuse):
        # Decoder pre-net; its output is also the residual added back below.
        prenet_out = mod.prenet(inputs, is_training=is_training)

        # --- CBHG ---
        # Bank of 1-D convolutions.
        net = mod.conv1d_banks(prenet_out,
                               K=hp.decoder_num_banks,
                               is_training=is_training)

        # Max pooling with pool size 2 and stride 1 ("same" padding).
        net = tf.layers.max_pooling1d(net, pool_size=2, strides=1, padding="same")

        # Two conv projections, each followed by normalization; only the
        # first one applies a ReLU.
        net = mod.conv1d(net, hp.embed_size, 3, scope="conv1d_1")
        net = mod.normalize(net,
                            type=hp.norm_type,
                            is_training=is_training,
                            activation_fn=tf.nn.relu,
                            scope="norm1")
        net = mod.conv1d(net, half_units, 3, scope="conv1d_2")
        net = mod.normalize(net,
                            type=hp.norm_type,
                            is_training=is_training,
                            activation_fn=None,
                            scope="norm2")

        # Residual connection with the pre-net output.
        net = net + prenet_out

        # Four highway blocks (count is fixed here, not read from hp).
        for block in range(4):
            net = mod.highwaynet(net,
                                 num_units=half_units,
                                 scope='highwaynet_{}'.format(block))

        # GRU with the third positional argument set to True
        # (bidirectional, per the original comment).
        net = mod.gru(net, half_units, True)

        # Final linear projection to (1 + n_fft//2) * r features.
        outputs = tf.layers.dense(net, (1 + hp.n_fft // 2) * hp.r)

    return outputs
def __init__(self, sess, n_mid, embedding_dim, hidden_size, batch_size, num_interest,
             dropout_rate=0.2, seq_len=256, num_blocks=2):
    """Build the Model_SAKmeans graph.

    A stack of self-attention blocks encodes the behaviour sequence, then
    `getKVector` turns the encoded sequence into `num_interest` user vectors.
    The vector that scores highest against the target item is fed to the
    sampled-softmax loss.

    :param sess: session object forwarded to `getKVector`.
    :param n_mid, embedding_dim, hidden_size, batch_size, seq_len:
        forwarded to the parent constructor.
    :param num_interest: number of attention heads and of user vectors.
    :param dropout_rate: dropout rate for the input and the attention blocks.
    :param num_blocks: number of self-attention / feed-forward blocks.
    """
    super(Model_SAKmeans, self).__init__(n_mid, embedding_dim, hidden_size,
                                         batch_size, seq_len, flag="Model_SAKmeans")

    with tf.variable_scope("Model_SAKmeans", reuse=tf.AUTO_REUSE):
        # Add the positional encoding (broadcast over the batch axis).
        pos_enc = tf.expand_dims(positional_encoding(embedding_dim, seq_len), axis=0)
        self.mid_his_batch_embedded = self.mid_his_batch_embedded + pos_enc

        # Dropout is always active here: `training` is a constant True tensor.
        self.seq = tf.layers.dropout(self.mid_his_batch_embedded,
                                     rate=dropout_rate,
                                     training=tf.convert_to_tensor(True))
        # Zero out the padded positions.
        self.seq = self.seq * tf.reshape(self.mask, (-1, seq_len, 1))

        for block_idx in range(num_blocks):
            with tf.variable_scope("num_blocks_%d" % block_idx):
                # Self-attention over the normalized sequence.
                attn_queries = normalize(self.seq)
                self.seq = multihead_attention(queries=attn_queries,
                                               keys=self.seq,
                                               num_units=hidden_size,
                                               num_heads=num_interest,
                                               dropout_rate=dropout_rate,
                                               is_training=True,
                                               causality=True,
                                               scope="self_attention")

                # Position-wise feed-forward network.
                ff_inputs = normalize(self.seq)
                self.seq = feedforward(ff_inputs,
                                       num_units=[hidden_size, hidden_size],
                                       dropout_rate=dropout_rate,
                                       is_training=True)
                # Re-apply the padding mask after every block.
                self.seq = self.seq * tf.reshape(self.mask, (-1, seq_len, 1))

        # (b, seq_len, dim) per the original annotation.
        self.seq = normalize(self.seq)

    num_heads = num_interest
    self.user_eb = getKVector(sess, self.seq, num_heads)

    self.dim = embedding_dim
    item_list_emb = tf.reshape(self.seq, [-1, seq_len, embedding_dim])

    # Score each user vector against the target item:
    # (batch, num_heads, dim) x (batch, dim, 1) -> (batch, num_heads, 1).
    target_col = tf.reshape(self.item_eb, [get_shape(item_list_emb)[0], self.dim, 1])
    atten = tf.matmul(self.user_eb, target_col)
    atten = tf.reshape(atten, [get_shape(item_list_emb)[0], num_heads])
    atten = tf.nn.softmax(tf.pow(atten, 1))

    # Pick the user interest vector most similar to the target item: the
    # per-row argmax is offset into the flattened (batch * num_heads, dim) table.
    flat_interests = tf.reshape(self.user_eb, [-1, self.dim])
    best_head = tf.argmax(atten, axis=1, output_type=tf.int32)
    row_offsets = tf.range(tf.shape(item_list_emb)[0]) * num_heads
    readout = tf.gather(flat_interests, best_head + row_offsets)

    self.build_sampled_softmax_loss(self.item_eb, readout)
def embedding(self, x, is_training=False):
    """
    Encode a sequence of frames into one fixed-size embedding.

    The frames go through a dense layer, a CBHG-style stack (conv bank,
    max pooling, two conv projections with a residual connection, highway
    blocks, GRU) and a final dense projection applied to the last time step.

    :param x: shape=(n, t, n_mels)
    :param is_training: forwarded to the conv bank, normalization and GRU-free
        layers that take a training flag.
    :return: embedding. shape=(n, e)
    """
    # Frame-level embedding.
    x = tf.layers.dense(x, units=self.hidden_units, activation=tf.nn.relu)  # (n, t, h)

    out = conv1d_banks(x, K=self.num_banks, num_units=self.hidden_units,
                       norm_type=self.norm_type,
                       is_training=is_training)  # (n, t, k * h)

    out = tf.layers.max_pooling1d(out, 2, 1, padding="same")  # (n, t, k * h)

    out = conv1d(out, self.hidden_units, 3, scope="conv1d_1")  # (n, t, h)
    out = normalize(out, type=self.norm_type, is_training=is_training,
                    activation_fn=tf.nn.relu)
    out = conv1d(out, self.hidden_units, 3, scope="conv1d_2")  # (n, t, h)
    out += x  # (n, t, h), residual connection

    for i in range(self.num_highway):
        out = highwaynet(out, num_units=self.hidden_units,
                         scope='highwaynet_{}'.format(i))  # (n, t, h)

    out = gru(out, self.hidden_units, False)  # (n, t, h)

    # Take the output of the last time step. The previous `out[..., -1]`
    # indexed the last *feature* instead, yielding (n, t) rather than (n, h).
    # NOTE(review): this changes the input width of 'projection', so
    # checkpoints trained with the old slice will not restore into it.
    out = out[:, -1, :]  # (n, h)

    # Embedding.
    out = tf.layers.dense(out, self.num_classes, name='projection')  # (n, e)
    out = tf.identity(out, name="embedding")

    return out
def __init__(self, n_mid, embedding_dim, hidden_size, batch_size, num_interest,
             dropout_rate=0.2, seq_len=256, num_blocks=2):
    """Build the Model_SASRec graph.

    A stack of self-attention blocks encodes the behaviour sequence; the
    encoded sequence is sum-pooled over time and passed through four dense
    ReLU layers to produce a single user embedding for the sampled-softmax
    loss.

    :param n_mid, embedding_dim, hidden_size, batch_size, seq_len:
        forwarded to the parent constructor.
    :param num_interest: used as the number of attention heads.
    :param dropout_rate: dropout rate for the input and the attention blocks.
    :param num_blocks: number of self-attention / feed-forward blocks.
    """
    super(Model_SASRec, self).__init__(n_mid, embedding_dim, hidden_size,
                                       batch_size, seq_len, flag="Model_SASRec")

    with tf.variable_scope("Model_SASRec", reuse=tf.AUTO_REUSE):
        # Add the positional encoding (broadcast over the batch axis).
        pos_enc = tf.expand_dims(positional_encoding(embedding_dim, seq_len), axis=0)
        self.mid_his_batch_embedded = self.mid_his_batch_embedded + pos_enc

        # Dropout is always active here: `training` is a constant True tensor.
        self.seq = tf.layers.dropout(self.mid_his_batch_embedded,
                                     rate=dropout_rate,
                                     training=tf.convert_to_tensor(True))
        # Zero out the padded positions.
        self.seq = self.seq * tf.reshape(self.mask, (-1, seq_len, 1))

        for block_idx in range(num_blocks):
            with tf.variable_scope("num_blocks_%d" % block_idx):
                # Self-attention over the normalized sequence.
                attn_queries = normalize(self.seq)
                self.seq = multihead_attention(queries=attn_queries,
                                               keys=self.seq,
                                               num_units=hidden_size,
                                               num_heads=num_interest,
                                               dropout_rate=dropout_rate,
                                               is_training=True,
                                               causality=True,
                                               scope="self_attention")

                # Position-wise feed-forward network.
                ff_inputs = normalize(self.seq)
                self.seq = feedforward(ff_inputs,
                                       num_units=[hidden_size, hidden_size],
                                       dropout_rate=dropout_rate,
                                       is_training=True)
                # Re-apply the padding mask after every block.
                self.seq = self.seq * tf.reshape(self.mask, (-1, seq_len, 1))

        # (b, seq_len, dim) per the original annotation.
        self.seq = normalize(self.seq)

    # Collapse the time axis by summation, then refine with an MLP.
    self.sum_pooling = tf.reduce_sum(self.seq, 1)

    hidden = tf.layers.dense(self.sum_pooling, 1024, activation=tf.nn.relu)
    hidden = tf.layers.dense(hidden, 512, activation=tf.nn.relu)
    hidden = tf.layers.dense(hidden, 256, activation=tf.nn.relu)
    self.user_eb = tf.layers.dense(hidden, hidden_size, activation=tf.nn.relu)

    self.build_sampled_softmax_loss(self.item_eb, self.user_eb)
def network(self, ppgs, is_training):
    """Map `ppgs` to the parameters of a per-bin mixture distribution.

    Pipeline: pre-net -> dense -> CBHG ("cbhg_linear") -> dense, whose output
    is split into three equal chunks (mu, log-variance, mixture logits), each
    reshaped to (-1, T, n_bins, n_mixtures), where T and n_bins come from the
    static shape of `self.y_spec`.

    :param ppgs: input tensor handed to the pre-net.
    :param is_training: forwarded to the pre-net and the CBHG.
    :return: tuple `(mu, log_var, log_pi)`; `mu` is squashed by a sigmoid,
        `log_var` is floored at -7.0 and `log_pi` is a log-softmax.
    """
    cfg = hp.train2
    half_units = cfg.hidden_units // 2

    # Pre-net.
    prenet_out = prenet(ppgs,
                        num_units=[cfg.hidden_units, half_units],
                        dropout_rate=cfg.dropout_rate,
                        is_training=is_training)

    # The mel-scale CBHG stage ("cbhg_mel") is disabled; the pre-net output
    # is passed straight through in its place.
    pred_mel = prenet_out

    # Linear-scale CBHG. The dense layer keeps hidden_units // 2 features.
    out = tf.layers.dense(pred_mel, half_units)
    out = cbhg(out, cfg.num_banks, half_units,
               cfg.num_highway_blocks, cfg.norm_type, is_training,
               scope="cbhg_linear")

    _, n_timesteps, n_bins = self.y_spec.get_shape().as_list()
    n_units = n_bins * cfg.n_mixtures
    mixture_shape = (-1, n_timesteps, n_bins, cfg.n_mixtures)

    # One projection produces all three parameter groups side by side.
    bias_init = tf.random_uniform_initializer(minval=-3., maxval=3.)
    out = tf.layers.dense(out, n_units * 3, bias_initializer=bias_init)

    # First third: means, squashed into (0, 1).
    mu = tf.nn.sigmoid(out[..., :n_units])
    mu = tf.reshape(mu, shape=mixture_shape)

    # Second third: log-variances, clipped from below at -7.
    log_var = tf.maximum(out[..., n_units:2 * n_units], -7.0)
    log_var = tf.reshape(log_var, shape=mixture_shape)

    # Last third: mixture logits, normalized and turned into log-probabilities.
    log_pi = tf.reshape(out[..., 2 * n_units:3 * n_units], shape=mixture_shape)
    # NOTE(review): this reads the training flag from the tower context
    # rather than from the `is_training` argument used elsewhere.
    log_pi = normalize(log_pi,
                       type='ins',
                       is_training=get_current_tower_context().is_training,
                       scope='normalize_pi')
    log_pi = tf.nn.log_softmax(log_pi)

    return mu, log_var, log_pi
def _upsample_cond(self, melspec, is_training, strides):
    """Upsample the mel-spectrogram condition along time by the hop length.

    The method is chosen by `hp.model.cond_upsample_method`:

    * ``'transposed_conv'``: one transposed convolution per entry of
      `strides`, each followed by ReLU and normalization.
    * ``'repeat'``: a width-1 convolution plus ReLU, then every frame is
      repeated `hop_length` times.

    In both cases the result is trimmed at both ends of the time axis.

    :param melspec: condition tensor, (n, t_mel, n_mel) per the original note.
    :param is_training: forwarded to `normalize`.
    :param strides: per-layer upsampling factors; their product must equal
        `hp.signal.hop_length`.
    :return: the upsampled condition, or None for any other method.
    """
    assert (np.prod(np.array(strides)) == hp.signal.hop_length)

    method = hp.model.cond_upsample_method
    hop = hp.signal.hop_length
    channels = hp.model.condition_channels

    if method == 'transposed_conv':
        # Treat time as the width of a height-1 image so conv2d_transpose
        # can be used.
        cond = tf.expand_dims(melspec, 1)
        length = self.t_mel
        in_channels = hp.signal.n_mels
        for layer, stride in enumerate(strides):
            kernel = tf.get_variable(
                'transposed_conv_{}_weights'.format(layer),
                shape=(1, stride, channels, in_channels))
            in_channels = channels
            length *= stride
            cond = tf.nn.conv2d_transpose(
                cond, kernel,
                output_shape=(self.batch_size, 1, length, channels),
                strides=[1, 1, stride, 1])
            cond = tf.nn.relu(cond)
            cond = normalize(cond,
                             method=hp.model.normalize_cond,
                             is_training=is_training,
                             name='normalize_transposed_conv_{}'.format(layer))
        cond = tf.squeeze(cond, 1)
        # `-hop // 2` parses as `(-hop) // 2`; kept exactly as written.
        return cond[:, hop // 2:-hop // 2, :]

    if method == 'repeat':
        kernel = tf.get_variable('dense', [1, hp.signal.n_mels, channels])
        cond = tf.nn.conv1d(melspec, kernel, stride=1, padding="SAME")
        cond = tf.nn.relu(cond)
        # Tile along the channel axis, then fold the copies into the time
        # axis so that each frame appears `hop` times in a row.
        tiled = tf.tile(cond, [1, 1, hop])
        cond = tf.reshape(tiled, shape=[-1, self.t_mel * hop, channels])
        return cond[:, hop // 2:-hop // 2, :]

    # Unknown method: no conditioning.
    return None
def __call__(self, wav, melspec, is_training, name='iaf_vocoder'):
    """Build the IAF vocoder graph and return its output tensor.

    Logistic noise of shape (batch_size, length, 1) is pushed through
    `hp.model.n_iaf` linear IAF layers. Each layer owns two WaveNets (a
    scaler and a shifter) and is followed by normalization. The upsampled
    mel-spectrogram conditions the first layer, or every layer when
    `hp.model.condition_all_iaf` is set.

    When `hp.train.use_ema` is set, an exponential moving average over the
    trainable variables of this network is created, stored in `self.ema`
    and registered in the UPDATE_OPS collection.

    :param wav: unused by this method.
    :param melspec: condition passed to `_upsample_cond`.
    :param is_training: forwarded to the sub-networks and normalization.
    :param name: variable scope of the whole network.
    :return: output of the last IAF layer after normalization.
    """
    with tf.variable_scope(name, reuse=tf.AUTO_REUSE) as vocoder_scope:
        with tf.variable_scope('cond'):
            condition = self._upsample_cond(
                melspec, is_training=is_training, strides=[4, 4, 5])  # (n, t, h)
            if hp.model.normalize_cond:
                with tf.variable_scope('normalize'):
                    condition = normalize(condition,
                                          method=hp.model.normalize_cond,
                                          is_training=is_training)

        # Sample the initial noise from a standard logistic distribution.
        logistic_dist = tf.contrib.distributions.Logistic(loc=0., scale=1.)
        out = logistic_dist.sample([self.batch_size, self.length, 1])

        for i in range(hp.model.n_iaf):
            with tf.variable_scope('iaf{}'.format(i)):
                # Scaler and shifter share every setting except their name.
                wavenet_kwargs = dict(
                    batch_size=self.batch_size,
                    dilations=hp.model.dilations[i],
                    filter_width=hp.model.filter_width,
                    residual_channels=hp.model.residual_channels,
                    dilation_channels=hp.model.dilation_channels,
                    quantization_channels=1,
                    skip_channels=hp.model.skip_channels,
                    use_biases=hp.model.use_biases,
                    condition_channels=hp.model.condition_channels,
                    use_skip_connection=hp.model.use_skip_connection,
                    is_training=is_training,
                    normalize=hp.model.normalize_wavenet,
                )
                scaler = WaveNet(name='scalar', **wavenet_kwargs)
                shifter = WaveNet(name='shifter', **wavenet_kwargs)

                # NOTE(review): this uses hp.train.batch_size while the
                # WaveNets use self.batch_size; confirm they always agree.
                iaf = LinearIAFLayer(batch_size=hp.train.batch_size,
                                     scaler=scaler, shifter=shifter)

                # `==` instead of the former `is 0`: identity comparison
                # with an int literal is implementation-dependent.
                use_condition = hp.model.condition_all_iaf or i == 0
                out = iaf(out, condition if use_condition else None)  # (n, t, h)

                # Normalization.
                out = normalize(out,
                                method=hp.model.normalize,
                                is_training=is_training,
                                name='normalize{}'.format(i))

    if hp.train.use_ema:
        self.ema = tf.train.ExponentialMovingAverage(decay=hp.train.ema_decay)
        # Filter by the scope actually used above rather than the literal
        # 'iaf_vocoder', so a non-default `name` still gets its EMA.
        var_class = tf.trainable_variables(vocoder_scope.name)
        ema_op = self.ema.apply(var_class)
        tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, ema_op)

    return out
def init_inference(self, config, is_training=False):
    """Build the inference graph: input standardisation, CBHG-style encoder,
    GRU and a softmax output layer.

    Creates the placeholders `self._inputs` / `self._seq_lens`, the
    non-trainable `mean` / `std` variables, and stores the RNN states and the
    time-major `self._logits` / `self._probas`.

    :param config: dict providing 'num_banks', 'hidden_units', 'num_highway',
        'norm_type', 'batch_size', 'num_rnn_layer', 'input_dim' and
        'alphabet_size'.
    :param is_training: forwarded to the conv bank, normalization and GRU.
    """
    num_banks = config['num_banks']
    hidden_units = config['hidden_units']
    num_highway = config['num_highway']
    norm_type = config['norm_type']
    batch_size = config['batch_size']
    num_rnn_layer = config['num_rnn_layer']

    self._input_dim = input_dim = config['input_dim']
    self._output_dim = output_dim = config['alphabet_size']

    self._inputs = tf.placeholder(tf.float32, [batch_size, None, input_dim])
    self._seq_lens = tf.placeholder(tf.int32, shape=batch_size)
    self._out_lens = self._seq_lens

    # TODO, awni, for now on the client to remember to initialize these.
    self._mean = tf.get_variable("mean", shape=input_dim, trainable=False)
    self._std = tf.get_variable("std", shape=input_dim, trainable=False)

    # Standardise the features and feed the result to the network. The
    # standardised tensor used to be computed and then dropped, so the model
    # saw raw inputs and `mean` / `std` had no effect.
    std_inputs = (self._inputs - self._mean) / self._std

    x = conv1d(std_inputs, hidden_units, 1, scope="conv1d")

    out = conv1d_banks(x, K=num_banks, num_units=hidden_units,
                       norm_type=norm_type,
                       is_training=is_training)  # (n, t, k * h)

    out = tf.layers.max_pooling1d(out, 2, 1, padding="same")  # (n, t, k * h)

    out = conv1d(out, hidden_units, 3, scope="conv1d_1")  # (n, t, h)
    out = normalize(out, type=norm_type, is_training=is_training,
                    activation_fn=tf.nn.relu)
    out = conv1d(out, hidden_units, 3, scope="conv1d_2")  # (n, t, h)
    out += x  # (n, t, h), residual connection

    for i in range(num_highway):
        out = highwaynet(out, num_units=hidden_units,
                         scope='highwaynet_{}'.format(i))  # (n, t, h)

    rnn_out, state, initial_state = gru(
        out, hidden_units, False,
        seqlens=self._seq_lens,
        num_layers=num_rnn_layer,
        is_training=is_training)  # (n, t, h)

    self._initial_state = initial_state
    self._rnn_state = state

    # Switch to time-major layout.
    rnn_out = tf.transpose(rnn_out, [1, 0, 2])

    # Collapse time and batch dims pre softmax.
    rnn_out = tf.reshape(rnn_out, (-1, hidden_units))
    logits, probas = _add_softmax_linear(
        rnn_out, hidden_units, output_dim,
        initializer=tf.contrib.layers.xavier_initializer())

    # Reshape to time-major.
    self._logits = tf.reshape(logits, (-1, batch_size, output_dim))
    self._probas = tf.reshape(probas, (-1, batch_size, output_dim))

    self._init_inference = True
def __init__(self, n_mid, embedding_dim, hidden_size, batch_size, num_interest,
             dropout_rate=0.2, seq_len=256, num_blocks=2):
    """Build the Model_MSARec graph.

    A stack of self-attention blocks encodes the behaviour sequence; three
    dense layers then turn the encoded sequence into `num_interest` interest
    vectors. The vector that scores highest against the target item is fed to
    the sampled-softmax loss.

    :param n_mid, embedding_dim, hidden_size, batch_size, seq_len:
        forwarded to the parent constructor.
    :param num_interest: number of attention heads and of interest vectors.
    :param dropout_rate: dropout rate for the input and the attention blocks.
    :param num_blocks: number of self-attention / feed-forward blocks.
    """
    super(Model_MSARec, self).__init__(n_mid, embedding_dim, hidden_size,
                                       batch_size, seq_len, flag="MSARec")

    with tf.variable_scope("MSARec", reuse=tf.AUTO_REUSE):
        # Add the positional encoding (broadcast over the batch axis).
        pos_enc = tf.expand_dims(positional_encoding(embedding_dim, seq_len), axis=0)
        self.mid_his_batch_embedded = self.mid_his_batch_embedded + pos_enc

        # Dropout is always active here: `training` is a constant True tensor.
        self.seq = tf.layers.dropout(self.mid_his_batch_embedded,
                                     rate=dropout_rate,
                                     training=tf.convert_to_tensor(True))
        # Zero out the padded positions.
        self.seq = self.seq * tf.reshape(self.mask, (-1, seq_len, 1))

        for block_idx in range(num_blocks):
            with tf.variable_scope("num_blocks_%d" % block_idx):
                # Self-attention over the normalized sequence.
                attn_queries = normalize(self.seq)
                self.seq = multihead_attention(queries=attn_queries,
                                               keys=self.seq,
                                               num_units=hidden_size,
                                               num_heads=num_interest,
                                               dropout_rate=dropout_rate,
                                               is_training=True,
                                               causality=True,
                                               scope="self_attention")

                # Position-wise feed-forward network.
                ff_inputs = normalize(self.seq)
                self.seq = feedforward(ff_inputs,
                                       num_units=[hidden_size, hidden_size],
                                       dropout_rate=dropout_rate,
                                       is_training=True)
                # Re-apply the padding mask after every block.
                self.seq = self.seq * tf.reshape(self.mask, (-1, seq_len, 1))

        # (b, seq_len, dim) per the original annotation.
        self.seq = normalize(self.seq)

    self.dim = embedding_dim
    item_list_emb = tf.reshape(self.seq, [-1, seq_len, embedding_dim])

    num_heads = num_interest

    # Interest extraction with dense layers. An earlier variant, now
    # disabled, computed masked-softmax attention weights over the sequence
    # positions (padded positions pushed to a large negative value) and
    # multiplied them with `item_list_emb`.
    hidden = tf.layers.dense(item_list_emb, hidden_size * 4, activation=tf.nn.relu)
    head_scores = tf.layers.dense(hidden, num_heads, activation=tf.nn.tanh)
    # (b, num_heads, seq_len)
    head_scores = tf.transpose(head_scores, [0, 2, 1])
    interest_emb = tf.layers.dense(head_scores, embedding_dim, activation=tf.nn.relu)

    self.user_eb = interest_emb

    # Score each interest vector against the target item:
    # (batch, num_heads, dim) x (batch, dim, 1) -> (batch, num_heads, 1).
    target_col = tf.reshape(self.item_eb, [get_shape(item_list_emb)[0], self.dim, 1])
    atten = tf.matmul(self.user_eb, target_col)
    atten = tf.reshape(atten, [get_shape(item_list_emb)[0], num_heads])
    atten = tf.nn.softmax(tf.pow(atten, 1))

    # Pick the user interest vector most similar to the target item: the
    # per-row argmax is offset into the flattened (batch * num_heads, dim) table.
    flat_interests = tf.reshape(self.user_eb, [-1, self.dim])
    best_head = tf.argmax(atten, axis=1, output_type=tf.int32)
    row_offsets = tf.range(tf.shape(item_list_emb)[0]) * num_heads
    readout = tf.gather(flat_interests, best_head + row_offsets)

    self.build_sampled_softmax_loss(self.item_eb, readout)
def encode(inputs, is_training=True, scope="encoder", reuse=None):
    '''Encode the inputs with a spectrogram pre-stage, a pre-net and a CBHG.

    Args:
      inputs: Tensor handed to `mod.pre_spectro`. Character embedding is
        disabled in this version, so this is presumably a spectrogram-like
        float tensor rather than int32 character ids (TODO confirm).
      is_training: Forwarded to the pre-stage, pre-net, conv bank and
        normalization layers.
      scope: Name handed to `tf.variable_scope`.
      reuse: `reuse` flag handed to `tf.variable_scope`.

    Returns:
      The GRU output ("memory"): one hidden vector per input time step.
    '''
    half_units = hp.embed_size // 2

    with tf.variable_scope(scope, reuse=reuse):
        # Vocabulary loading and character embedding are disabled; the raw
        # inputs go through `pre_spectro` instead.
        features = mod.pre_spectro(inputs, is_training=is_training)  # (N, T, E)

        # Encoder pre-net; its output is also the residual added back below.
        prenet_out = mod.prenet(features, is_training=is_training)  # (N, T, E/2)

        # --- CBHG ---
        # Bank of 1-D convolutions.
        net = mod.conv1d_banks(prenet_out,
                               K=hp.encoder_num_banks,
                               is_training=is_training)  # (N, T, K * E / 2)

        # Max pooling with pool size 2 and stride 1 ("same" padding).
        net = tf.layers.max_pooling1d(net, pool_size=2, strides=1, padding="same")

        # Two conv projections, each followed by normalization; only the
        # first one applies a ReLU.
        net = mod.conv1d(net, half_units, 3, scope="conv1d_1")  # (N, T, E/2)
        net = mod.normalize(net,
                            type=hp.norm_type,
                            is_training=is_training,
                            activation_fn=tf.nn.relu,
                            scope="norm1")
        net = mod.conv1d(net, half_units, 3, scope="conv1d_2")  # (N, T, E/2)
        net = mod.normalize(net,
                            type=hp.norm_type,
                            is_training=is_training,
                            activation_fn=None,
                            scope="norm2")

        # Residual connection with the pre-net output.
        net = net + prenet_out  # (N, T, E/2)

        # Highway blocks.
        for block in range(hp.num_highwaynet_blocks):
            net = mod.highwaynet(net,
                                 num_units=half_units,
                                 scope='highwaynet_{}'.format(block))  # (N, T, E/2)

        # GRU over the sequence.
        # NOTE(review): the original comment called this bidirectional with
        # an (N, T, E) output, but the third argument is False here while
        # decode2 passes True. Confirm which is intended.
        memory = mod.gru(net, half_units, False)

    return memory