def __init__(self, config=None, training=True, train_form='Both'):
    """Assemble the TensorFlow graph for training or evaluation.

    Args:
        config: Forwarded unchanged to ``get_batch`` when ``training`` is
            true; not used otherwise.
        training: If true, inputs come from ``get_batch`` and the loss,
            optimizer, clipped train op and summaries are added. If false,
            placeholders with a leading dimension of 1 are created instead.
        train_form: Selects the sub-networks. 'Converter' skips the
            encoder/decoder and feeds ``y1`` to the converter; 'Both'
            builds encoder, decoder and a converter fed with the decoder's
            mel output; 'Encoder' builds the encoder and decoder only.
    """
    # Load vocabulary
    self.char2idx, self.idx2char = load_vocab()

    self.graph = tf.Graph()
    with self.graph.as_default():
        # ---- Inputs -------------------------------------------------
        if training:
            batch = get_batch(config, train_form)
            (self.origx, self.x, self.y1, self.y2, self.y3,
             self.num_batch) = batch
            # NOTE(review): the second dimension is taken from
            # self.num_batch as returned by get_batch -- confirm that this
            # is the intended size.
            self.prev_max_attentions_li = tf.ones(
                shape=(hp.dec_layers, self.num_batch), dtype=tf.int32)
        else:
            # Evaluation: one example at a time, fed through placeholders.
            self.x = tf.placeholder(tf.int32, shape=(1, hp.T_x))
            self.y1 = tf.placeholder(
                tf.float32, shape=(1, hp.T_y // hp.r, hp.n_mels * hp.r))
            self.prev_max_attentions_li = tf.placeholder(
                tf.int32, shape=(hp.dec_layers, 1))

        uses_encoder_decoder = train_form != 'Converter'

        # ---- Encoder / decoder --------------------------------------
        if uses_encoder_decoder:
            # Decoder input: the last n_mels features of y1, shifted one
            # step along axis 1 with a zero frame placed in front.
            zero_frame = tf.zeros_like(self.y1[:, :1, -hp.n_mels:])
            earlier_frames = self.y1[:, :-1, -hp.n_mels:]
            self.decoder_input = tf.concat((zero_frame, earlier_frames), 1)

            with tf.variable_scope("encoder"):
                self.encoded = encoder(self.x, training=training)

            with tf.variable_scope("decoder"):
                decoder_result = decoder(self.decoder_input,
                                         self.encoded,
                                         self.prev_max_attentions_li,
                                         training=training)
                (self.mel_logits, self.done_output,
                 self.max_attentions_li) = decoder_result
                self.mel_output = tf.nn.sigmoid(self.mel_logits)

        # ---- Converter ----------------------------------------------
        if train_form in ('Both', 'Converter'):
            with tf.variable_scope("converter"):
                # 'Both' chains the converter after the decoder;
                # 'Converter' feeds the mel input y1 directly.
                if train_form == 'Both':
                    self.converter_input = self.mel_output
                else:
                    self.converter_input = self.y1
                self.mag_logits = converter(self.converter_input,
                                            training=training)
                self.mag_output = tf.nn.sigmoid(self.mag_logits)

        self.global_step = tf.Variable(0, name='global_step',
                                       trainable=False)

        if training:
            # ---- Losses ---------------------------------------------
            if uses_encoder_decoder:
                mel_abs_error = tf.abs(self.mel_output - self.y1)
                self.loss1 = tf.reduce_mean(mel_abs_error)
                if hp.include_dones:
                    done_xent = \
                        tf.nn.sparse_softmax_cross_entropy_with_logits(
                            logits=self.done_output, labels=self.y2)
                    self.loss2 = tf.reduce_mean(done_xent)
            if train_form != 'Encoder':
                mag_abs_error = tf.abs(self.mag_output - self.y3)
                self.loss3 = tf.reduce_mean(mag_abs_error)

            # Total loss, accumulated left to right: loss1 [+ loss2]
            # [+ loss3] for 'Both'/'Encoder', loss3 alone otherwise.
            if train_form == 'Both' or train_form == 'Encoder':
                total_loss = self.loss1
                if hp.include_dones:
                    total_loss = total_loss + self.loss2
                if train_form == 'Both':
                    total_loss = total_loss + self.loss3
            else:
                total_loss = self.loss3
            self.loss = total_loss

            # ---- Training scheme ------------------------------------
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr)

            ## gradient clipping: by value first, then by norm.
            self.gvs = self.optimizer.compute_gradients(self.loss)
            self.clipped = []
            for gradient, variable in self.gvs:
                # Pairs whose gradient is None are passed through as-is.
                if gradient is not None:
                    gradient = tf.clip_by_value(
                        gradient, -1. * hp.max_grad_val, hp.max_grad_val)
                    gradient = tf.clip_by_norm(gradient, hp.max_grad_norm)
                self.clipped.append((gradient, variable))
            self.train_op = self.optimizer.apply_gradients(
                self.clipped, global_step=self.global_step)

            # ---- Summaries ------------------------------------------
            tf.summary.scalar('loss', self.loss)
            if uses_encoder_decoder:
                tf.summary.histogram('mel_output', self.mel_output)
                tf.summary.histogram('mel_actual', self.y1)
                tf.summary.scalar('loss1', self.loss1)
                if hp.include_dones:
                    tf.summary.histogram('done_output', self.done_output)
                    tf.summary.histogram('done_actual', self.y2)
                    tf.summary.scalar('loss2', self.loss2)
            if train_form != 'Encoder':
                tf.summary.histogram('mag_output', self.mag_output)
                tf.summary.histogram('mag_actual', self.y3)
                tf.summary.scalar('loss3', self.loss3)
            self.merged = tf.summary.merge_all()
def __init__(self, config=None, training=True):
    """Build the encoder, decoder and converter graph.

    Args:
        config: Forwarded unchanged to ``get_batch`` when ``training`` is
            true; not used otherwise.
        training: If true, inputs come from ``get_batch`` and the loss,
            optimizer, clipped train op and summaries are added. If false,
            placeholders with a leading dimension of 1 are created instead.
    """
    # Load vocabulary
    self.char2idx, self.idx2char = load_vocab()

    self.graph = tf.Graph()
    with self.graph.as_default():
        # Data Feeding
        ## x: Text. (N, T_x), int32
        ## y1: Reduced melspectrogram. (N, T_y//r, n_mels*r) float32
        ## y2: Reduced dones. (N, T_y//r,) int32
        ## y3: Magnitude. (N, T_y, n_fft//2+1) float32
        if training:
            batch = get_batch(config)
            (self.origx, self.x, self.y1, self.y2, self.y3,
             self.num_batch) = batch
            self.prev_max_attentions_li = tf.ones(
                shape=(hp.dec_layers, hp.batch_size), dtype=tf.int32)
        else:
            # Evaluation: one example at a time, fed through placeholders.
            self.x = tf.placeholder(tf.int32, shape=(1, hp.T_x))
            self.y1 = tf.placeholder(
                tf.float32, shape=(1, hp.T_y // hp.r, hp.n_mels * hp.r))
            self.prev_max_attentions_li = tf.placeholder(
                tf.int32, shape=(hp.dec_layers, 1))

        # Decoder input (N, Ty//r, n_mels): the last n_mels features of
        # y1, shifted one step along axis 1 with a zero frame in front.
        zero_frame = tf.zeros_like(self.y1[:, :1, -hp.n_mels:])
        earlier_frames = self.y1[:, :-1, -hp.n_mels:]
        self.decoder_input = tf.concat((zero_frame, earlier_frames), 1)

        # Networks
        with tf.variable_scope("encoder"):
            encoder_result = encoder(self.x, training=training)  # (N, Tx, e)
            self.keys, self.vals = encoder_result

        with tf.variable_scope("decoder"):
            decoder_result = decoder(self.decoder_input,
                                     self.keys,
                                     self.vals,
                                     self.prev_max_attentions_li,
                                     training=training)
            (self.mel_logits, self.done_output, self.decoder_output,
             self.alignments_li, self.max_attentions_li) = decoder_result
            self.mel_output = tf.nn.sigmoid(self.mel_logits)

        with tf.variable_scope("converter"):
            # Restore shape, then project through fc_block.
            restored = tf.reshape(
                self.decoder_output, (-1, hp.T_y, hp.embed_size // hp.r))
            self.converter_input = restored
            self.converter_input = fc_block(
                self.converter_input,
                hp.converter_channels,
                activation_fn=tf.nn.relu,
                training=training)  # (N, Ty, v)

            # Converter
            self.mag_logits = converter(self.converter_input,
                                        training=training)
            self.mag_output = tf.nn.sigmoid(self.mag_logits)

        self.global_step = tf.Variable(0, name='global_step',
                                       trainable=False)

        if training:
            # Loss: mean absolute error on mels and magnitudes, sparse
            # softmax cross-entropy on the done flags.
            mel_abs_error = tf.abs(self.mel_output - self.y1)
            self.loss1 = tf.reduce_mean(mel_abs_error)
            done_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.done_output, labels=self.y2)
            self.loss2 = tf.reduce_mean(done_xent)
            mag_abs_error = tf.abs(self.mag_output - self.y3)
            self.loss3 = tf.reduce_mean(mag_abs_error)
            self.loss = self.loss1 + self.loss2 + self.loss3

            # Training Scheme
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr)

            ## gradient clipping: by value first, then by norm.
            self.gvs = self.optimizer.compute_gradients(self.loss)
            self.clipped = []
            for gradient, variable in self.gvs:
                # Pairs whose gradient is None are passed through as-is.
                if gradient is not None:
                    gradient = tf.clip_by_value(
                        gradient, -1. * hp.max_grad_val, hp.max_grad_val)
                    gradient = tf.clip_by_norm(gradient, hp.max_grad_norm)
                self.clipped.append((gradient, variable))
            self.train_op = self.optimizer.apply_gradients(
                self.clipped, global_step=self.global_step)

            # Summary
            tf.summary.histogram('mel_output', self.mel_output)
            tf.summary.histogram('mel_actual', self.y1)
            tf.summary.histogram('done_output', self.done_output)
            tf.summary.histogram('done_actual', self.y2)
            tf.summary.histogram('mag_output', self.mag_output)
            tf.summary.histogram('mag_actual', self.y3)
            tf.summary.scalar('loss', self.loss)
            tf.summary.scalar('loss1', self.loss1)
            tf.summary.scalar('loss2', self.loss2)
            tf.summary.scalar('loss3', self.loss3)
            self.merged = tf.summary.merge_all()
def __init__(self, training=True):
    """Build the encoder, decoder and converter graph.

    Args:
        training: If true, inputs come from ``get_batch`` and the loss,
            optimizer, clipped train op and summaries are added. If false,
            placeholders sized by ``hp.batch_size`` are created for the
            text, the mel input and the previous max attentions.
    """
    # Load vocabulary
    self.char2idx, self.idx2char = load_vocab()

    # Graph
    self.graph = tf.Graph()
    with self.graph.as_default():
        # Data Feeding
        ## x: Text. (N, Tx), int32
        ## y1: Reduced melspectrogram. (N, Ty//r, n_mels*r) float32
        ## y2: Reduced dones. (N, Ty//r,) int32
        ## z: Magnitude. (N, Ty, n_fft//2+1) float32
        if training:
            self.x, self.y1, self.y2, self.z, self.num_batch = get_batch()
            self.prev_max_attentions_li = tf.ones(
                shape=(hp.dec_layers, hp.batch_size), dtype=tf.int32)
        else:  # Inference
            self.x = tf.placeholder(tf.int32, shape=(hp.batch_size, hp.Tx))
            self.y1 = tf.placeholder(
                tf.float32,
                shape=(hp.batch_size, hp.Ty // hp.r, hp.n_mels * hp.r))
            self.prev_max_attentions_li = tf.placeholder(
                tf.int32, shape=(hp.dec_layers, hp.batch_size,))

        # Get decoder inputs: feed last frames only (N, Ty//r, n_mels).
        # A zero frame is prepended and the final step dropped, so the
        # input is y1's last n_mels features shifted by one step.
        self.decoder_input = tf.concat(
            (tf.zeros_like(self.y1[:, :1, -hp.n_mels:]),
             self.y1[:, :-1, -hp.n_mels:]), 1)

        # Networks
        with tf.variable_scope("encoder"):
            self.keys, self.vals = encoder(self.x, training=training)  # (N, Tx, e)

        with tf.variable_scope("decoder"):
            # mel_logits: (N, Ty/r, n_mels*r)
            # done_output: (N, Ty/r, 2),
            # decoder_output: (N, Ty/r, e)
            # alignments_li: dec_layers*(Tx, Ty/r)
            # max_attentions_li: dec_layers*(N, T_y/r)
            self.mel_logits, self.done_output, self.decoder_output, self.alignments_li, self.max_attentions_li \
                = decoder(self.decoder_input,
                          self.keys,
                          self.vals,
                          self.prev_max_attentions_li,
                          training=training)
            self.mel_output = tf.nn.sigmoid(self.mel_logits)

        with tf.variable_scope("converter"):
            # Restore shape
            self.converter_input = tf.reshape(
                self.decoder_output, (-1, hp.Ty, hp.embed_size // hp.r))
            self.converter_input = fc_block(
                self.converter_input,
                hp.converter_channels,
                activation_fn=tf.nn.relu,
                training=training)  # (N, Ty, v)

            # Converter
            self.mag_logits = converter(self.converter_input,
                                        training=training)  # (N, Ty, 1+n_fft//2)
            self.mag_output = tf.nn.sigmoid(self.mag_logits)

        self.global_step = tf.Variable(0, name='global_step', trainable=False)

        if training:
            # Loss: mean absolute error on mels and magnitudes, sparse
            # softmax cross-entropy on the done flags.
            self.loss_mels = tf.reduce_mean(tf.abs(self.mel_output - self.y1))
            self.loss_dones = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.done_output, labels=self.y2))
            self.loss_mags = tf.reduce_mean(tf.abs(self.mag_output - self.z))
            self.loss = self.loss_mels + self.loss_dones + self.loss_mags

            # Training Scheme
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr)

            ## gradient clipping
            self.gvs = self.optimizer.compute_gradients(self.loss)
            self.clipped = []
            for grad, var in self.gvs:
                # compute_gradients may pair a variable with a None
                # gradient; the clip ops cannot take None, so such pairs
                # are passed through unchanged.
                if grad is not None:
                    grad = tf.clip_by_value(
                        grad, -1. * hp.max_grad_val, hp.max_grad_val)
                    grad = tf.clip_by_norm(grad, hp.max_grad_norm)
                self.clipped.append((grad, var))
            self.train_op = self.optimizer.apply_gradients(
                self.clipped, global_step=self.global_step)

            # Summary
            tf.summary.scalar('Train_Loss/LOSS', self.loss)
            tf.summary.scalar('Train_Loss/mels', self.loss_mels)
            tf.summary.scalar('Train_Loss/dones', self.loss_dones)
            tf.summary.scalar('Train_Loss/mags', self.loss_mags)
            self.merged = tf.summary.merge_all()
def __init__(self, training=True):
    """Build the encoder, decoder and converter graph under the "net" scope.

    Args:
        training: If true, inputs come from ``get_batch`` and the loss,
            global step, optimizer, clipped train op and summaries are
            added. If false, placeholders sized by ``hp.batch_size`` are
            created for the text, the mel input and the previous max
            attentions.
    """
    # Load vocabulary
    self.char2idx, self.idx2char = load_vocab()

    self.graph = tf.Graph()
    with self.graph.as_default():
        # Data Feeding
        ## x: Text. (N, T_x), int32
        ## y1: Reduced melspectrogram. (N, T_y//r, n_mels*r) float32
        ## y2: Reduced dones. (N, T_y//r,) int32
        ## z: Magnitude. (N, T_y, n_fft//2+1) float32
        if training:
            self.x, self.y1, self.y2, self.z, self.num_batch = get_batch()
            self.prev_max_attentions = tf.constant([0] * hp.batch_size)
        else:  # Evaluation
            self.x = tf.placeholder(tf.int32, shape=(hp.batch_size, hp.T_x))
            self.y1 = tf.placeholder(
                tf.float32,
                shape=(hp.batch_size, hp.T_y // hp.r, hp.n_mels * hp.r))
            self.prev_max_attentions = tf.placeholder(
                tf.int32, shape=(hp.batch_size, ))

        # Get decoder inputs: feed last frames only (N, T_y//r, n_mels).
        # A zero frame is prepended and the final step dropped, so the
        # input is y1's last n_mels features shifted by one step.
        self.decoder_input = tf.concat(
            (tf.zeros_like(self.y1[:, :1, -hp.n_mels:]),
             self.y1[:, :-1, -hp.n_mels:]), 1)

        # Networks
        with tf.variable_scope("net"):
            # Encoder. keys: (N, T_x, e), vals: (N, T_x, e)
            self.keys, self.vals, self.masks = encoder(
                self.x, training=training, scope="encoder")

            # Decoder. mel_output: (N, T_y/r, n_mels*r), done_output: (N, T_y/r, 2),
            # decoder_output: (N, T_y/r, e), alignments: (N, T_y, T_x)
            self.mel_output, self.done_output, self.decoder_output, self.alignments, self.max_attentions = decoder(
                self.decoder_input,
                self.keys,
                self.vals,
                self.masks,
                self.prev_max_attentions,
                training=training,
                scope="decoder",
                reuse=None)

            # Restore shape. converter_input: (N, T_y, e/r)
            self.converter_input = tf.reshape(
                self.decoder_output, (hp.batch_size, hp.T_y, -1))
            self.converter_input = normalize(
                self.converter_input,
                type=hp.norm_type,
                training=training,
                activation_fn=tf.nn.relu)

            # Converter. mag_output: (N, T_y, 1+n_fft//2)
            self.mag_output = converter(
                self.converter_input, training=training, scope="converter")

        if training:
            # Loss: mean absolute error on mels and magnitudes, sparse
            # softmax cross-entropy on the done flags.
            self.loss1_mae = tf.reduce_mean(tf.abs(self.mel_output - self.y1))
            self.loss1_ce = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.done_output, labels=self.y2))
            self.loss2 = tf.reduce_mean(tf.abs(self.mag_output - self.z))
            self.loss = self.loss1_mae + self.loss1_ce + self.loss2

            # Training Scheme
            # NOTE: global_step is only created on the training graph, so
            # it is not available as an attribute when training is False.
            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr)

            ## gradient clipping
            self.gvs = self.optimizer.compute_gradients(self.loss)
            self.clipped = []
            for grad, var in self.gvs:
                # compute_gradients may pair a variable with a None
                # gradient; the clip ops cannot take None, so such pairs
                # are passed through unchanged.
                if grad is not None:
                    grad = tf.clip_by_value(
                        grad, -1. * hp.max_grad_val, hp.max_grad_val)
                    grad = tf.clip_by_norm(grad, hp.max_grad_norm)
                self.clipped.append((grad, var))
            self.train_op = self.optimizer.apply_gradients(
                self.clipped, global_step=self.global_step)

            # Summary
            tf.summary.scalar('loss', self.loss)
            tf.summary.scalar('loss1_mae', self.loss1_mae)
            tf.summary.scalar('loss1_ce', self.loss1_ce)
            tf.summary.scalar('loss2', self.loss2)
            self.merged = tf.summary.merge_all()
def __init__(self, training=True):
    """Build the encoder, decoder and converter graph.

    Args:
        training: If true, inputs come from ``get_batch`` and the loss,
            optimizer, clipped train op and summaries are added. If false,
            placeholders sized by ``hp.batch_size`` are created for the
            text, the mel input and the previous max attentions.
    """
    # Load vocabulary
    self.char2idx, self.idx2char = load_vocab()

    # Graph
    self.graph = tf.Graph()
    with self.graph.as_default():
        # Data Feeding
        ## x: Text. (N, Tx), int32
        ## y1: Melspectrogram. (N, Ty, n_mels) float32
        ## y2: Dones. (N, Ty) int32
        ## z: Magnitude. (N, Ty, n_fft//2+1) float32
        if training:
            self.x, self.y1, self.y2, self.z = get_batch()
            self.prev_max_attentions_li = tf.ones(
                shape=(hp.dec_layers, hp.batch_size), dtype=tf.int32)
        else:  # Inference
            self.x = tf.placeholder(tf.int32, shape=(hp.batch_size, hp.Tx))
            # NOTE(review): the shape notes above give y1 as
            # (N, Ty, n_mels), but this placeholder is declared as
            # (N, Ty // r, n_mels * r) -- confirm which is intended.
            self.y1 = tf.placeholder(
                tf.float32,
                shape=(hp.batch_size, hp.Ty // hp.r, hp.n_mels * hp.r))
            self.prev_max_attentions_li = tf.placeholder(
                tf.int32, shape=(hp.dec_layers, hp.batch_size, ))

        # Get decoder inputs: feed last frames only (N, Ty, n_mels).
        # A zero frame is prepended and the final step dropped, so the
        # input is y1's last n_mels features shifted by one step.
        self.decoder_input = tf.concat(
            (tf.zeros_like(self.y1[:, :1, -hp.n_mels:]),
             self.y1[:, :-1, -hp.n_mels:]), 1)

        # Networks
        with tf.variable_scope("encoder"):
            self.keys, self.vals = encoder(self.x, training=training)  # (N, Tx, e)

        with tf.variable_scope("decoder"):
            # mel_logits: (N, Ty, n_mels)
            # done_output: (N, Ty, 2),
            # decoder_output: (N, Ty, e)
            # alignments_li: dec_layers*(Tx, Ty)
            # max_attentions_li: dec_layers*(N, T_y)
            self.mel_logits, self.done_output, self.decoder_output, self.alignments_li, self.max_attentions_li = decoder(
                self.decoder_input,
                self.keys,
                self.vals,
                self.prev_max_attentions_li,
                training=training)
            self.mel_output = tf.nn.sigmoid(self.mel_logits)

        with tf.variable_scope("converter"):
            # Restore shape
            self.converter_input = tf.reshape(
                self.decoder_output, (-1, hp.Ty, hp.embed_size))
            self.converter_input = fc_block(
                self.converter_input,
                hp.converter_channels,
                activation_fn=tf.nn.relu,
                training=training)  # (N, Ty, v)

            # Converter
            self.mag_logits = converter(
                self.converter_input,
                training=training)  # (N, Ty, 1+n_fft//2)
            self.mag_output = tf.nn.sigmoid(self.mag_logits)

        self.global_step = tf.Variable(0, name='global_step', trainable=False)

        if training:
            # Loss: mean absolute error on mels and magnitudes, sparse
            # softmax cross-entropy on the done flags.
            self.loss_mels = tf.reduce_mean(tf.abs(self.mel_output - self.y1))
            self.loss_dones = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.done_output, labels=self.y2))
            self.loss_mags = tf.reduce_mean(tf.abs(self.mag_output - self.z))
            self.loss = self.loss_mels + self.loss_dones + self.loss_mags

            # Training Scheme
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr)

            ## gradient clipping
            self.gvs = self.optimizer.compute_gradients(self.loss)
            self.clipped = []
            for grad, var in self.gvs:
                # compute_gradients may pair a variable with a None
                # gradient; the clip ops cannot take None, so such pairs
                # are passed through unchanged.
                if grad is not None:
                    grad = tf.clip_by_value(
                        grad, -1. * hp.max_grad_val, hp.max_grad_val)
                    grad = tf.clip_by_norm(grad, hp.max_grad_norm)
                self.clipped.append((grad, var))
            self.train_op = self.optimizer.apply_gradients(
                self.clipped, global_step=self.global_step)

            # Summary
            tf.summary.scalar('Train_Loss/LOSS', self.loss)
            tf.summary.scalar('Train_Loss/mels', self.loss_mels)
            tf.summary.scalar('Train_Loss/dones', self.loss_dones)
            tf.summary.scalar('Train_Loss/mags', self.loss_mags)
            self.merged = tf.summary.merge_all()