Example #1
0
    def __init__(self, config=None, training=True, train_form='Both'):
        """Build the TensorFlow graph for the selected sub-networks.

        Args:
            config: Forwarded to ``get_batch`` when ``training`` is true;
                not used otherwise.
            training: When true, inputs come from ``get_batch`` and the
                loss, optimizer, train op and summaries are built. When
                false, placeholders are created for the inputs instead.
            train_form: Which parts of the model to build. ``'Both'`` builds
                encoder, decoder and converter; ``'Encoder'`` builds encoder
                and decoder only; ``'Converter'`` builds only the converter
                and feeds it ``self.y1`` directly.

        Raises:
            ValueError: If ``training`` is true and ``train_form`` is not
                one of ``'Both'``, ``'Encoder'`` or ``'Converter'``.
        """
        # Any other value used to fail much later with an AttributeError on
        # self.mag_output while the loss was being built; reject it up front.
        # Evaluation mode is left permissive so existing callers keep working.
        if training and train_form not in ('Both', 'Encoder', 'Converter'):
            raise ValueError(
                "train_form must be 'Both', 'Encoder' or 'Converter', "
                "got {!r}".format(train_form))

        build_mel = train_form != 'Converter'
        build_mag = train_form in ('Both', 'Converter')

        # Load vocabulary
        self.char2idx, self.idx2char = load_vocab()
        self.graph = tf.Graph()
        with self.graph.as_default():
            if training:
                (self.origx, self.x, self.y1, self.y2, self.y3,
                 self.num_batch) = get_batch(config, train_form)
                # NOTE(review): self.num_batch is used as the second (batch)
                # dimension here; whether get_batch returns a batch size or
                # a number of batches is not visible from this method --
                # confirm against get_batch.
                self.prev_max_attentions_li = tf.ones(
                    shape=(hp.dec_layers, self.num_batch), dtype=tf.int32)
            else:  # Evaluation
                self.x = tf.placeholder(tf.int32, shape=(1, hp.T_x))
                self.y1 = tf.placeholder(
                    tf.float32,
                    shape=(1, hp.T_y // hp.r, hp.n_mels * hp.r))
                self.prev_max_attentions_li = tf.placeholder(
                    tf.int32, shape=(hp.dec_layers, 1))

            if build_mel:
                # Get decoder inputs: feed last frames only. A zero frame is
                # prepended and the final step dropped, shifting y1 by one
                # step along axis 1.
                self.decoder_input = tf.concat(
                    (tf.zeros_like(self.y1[:, :1, -hp.n_mels:]),
                     self.y1[:, :-1, -hp.n_mels:]), 1)

                # Networks
                with tf.variable_scope("encoder"):
                    self.encoded = encoder(self.x, training=training)

                with tf.variable_scope("decoder"):
                    (self.mel_logits, self.done_output,
                     self.max_attentions_li) = decoder(
                         self.decoder_input,
                         self.encoded,
                         self.prev_max_attentions_li,
                         training=training)
                    self.mel_output = tf.nn.sigmoid(self.mel_logits)

            if build_mag:
                with tf.variable_scope("converter"):
                    # 'Both' chains the converter onto the decoder output;
                    # 'Converter' feeds it self.y1 directly.
                    if train_form == 'Both':
                        self.converter_input = self.mel_output
                    else:
                        self.converter_input = self.y1
                    self.mag_logits = converter(self.converter_input,
                                                training=training)
                    self.mag_output = tf.nn.sigmoid(self.mag_logits)

            self.global_step = tf.Variable(0,
                                           name='global_step',
                                           trainable=False)

            if training:
                # Loss: collect whichever terms apply to this train_form.
                loss_terms = []
                if build_mel:
                    self.loss1 = tf.reduce_mean(
                        tf.abs(self.mel_output - self.y1))
                    loss_terms.append(self.loss1)
                    if hp.include_dones:
                        self.loss2 = tf.reduce_mean(
                            tf.nn.sparse_softmax_cross_entropy_with_logits(
                                logits=self.done_output, labels=self.y2))
                        loss_terms.append(self.loss2)
                if build_mag:
                    self.loss3 = tf.reduce_mean(
                        tf.abs(self.mag_output - self.y3))
                    loss_terms.append(self.loss3)

                # Sum left to right (loss1 [+ loss2] [+ loss3]) without a
                # leading constant, so a single term is used as-is.
                self.loss = loss_terms[0]
                for term in loss_terms[1:]:
                    self.loss = self.loss + term

                # Training Scheme
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr)
                ## gradient clipping
                self.gvs = self.optimizer.compute_gradients(self.loss)
                self.clipped = []
                for grad, var in self.gvs:
                    # Variables the loss does not depend on come back with a
                    # None gradient; those pairs are passed through as-is.
                    if grad is not None:
                        grad = tf.clip_by_value(
                            grad, -1. * hp.max_grad_val, hp.max_grad_val)
                        grad = tf.clip_by_norm(grad, hp.max_grad_norm)
                    self.clipped.append((grad, var))

                self.train_op = self.optimizer.apply_gradients(
                    self.clipped, global_step=self.global_step)

                # Summary
                tf.summary.scalar('loss', self.loss)

                if build_mel:
                    tf.summary.histogram('mel_output', self.mel_output)
                    tf.summary.histogram('mel_actual', self.y1)
                    tf.summary.scalar('loss1', self.loss1)
                    if hp.include_dones:
                        tf.summary.histogram('done_output', self.done_output)
                        tf.summary.histogram('done_actual', self.y2)
                        tf.summary.scalar('loss2', self.loss2)
                if build_mag:
                    tf.summary.histogram('mag_output', self.mag_output)
                    tf.summary.histogram('mag_actual', self.y3)
                    tf.summary.scalar('loss3', self.loss3)

                self.merged = tf.summary.merge_all()
Example #2
0
    def __init__(self, config=None, training=True):
        """Assemble the encoder / decoder / converter graph.

        Args:
            config: Forwarded to ``get_batch`` when ``training`` is true;
                not used otherwise.
            training: When true, inputs are taken from ``get_batch`` and the
                losses, clipped-gradient train op and summaries are added.
                When false, ``x``, ``y1`` and ``prev_max_attentions_li`` are
                placeholders and graph construction stops after the
                ``global_step`` variable.
        """
        # Vocabulary lookup tables.
        vocab = load_vocab()
        self.char2idx, self.idx2char = vocab

        graph = tf.Graph()
        self.graph = graph
        with graph.as_default():
            # Inputs (shapes as noted by the original author):
            #   x:  text,                    (N, T_x),                int32
            #   y1: reduced mel-spectrogram, (N, T_y//r, n_mels*r),   float32
            #   y2: reduced dones,           (N, T_y//r),             int32
            #   y3: magnitude,               (N, T_y, n_fft//2+1),    float32
            if training:
                batch = get_batch(config)
                (self.origx, self.x, self.y1, self.y2, self.y3,
                 self.num_batch) = batch
                attention_shape = (hp.dec_layers, hp.batch_size)
                self.prev_max_attentions_li = tf.ones(shape=attention_shape,
                                                      dtype=tf.int32)
            else:
                # Evaluation: a single utterance is fed through placeholders.
                text_shape = (1, hp.T_x)
                mel_shape = (1, hp.T_y // hp.r, hp.n_mels * hp.r)
                attention_shape = (hp.dec_layers, 1)
                self.x = tf.placeholder(tf.int32, shape=text_shape)
                self.y1 = tf.placeholder(tf.float32, shape=mel_shape)
                self.prev_max_attentions_li = tf.placeholder(
                    tf.int32, shape=attention_shape)

            # Decoder input: the last n_mels channels of y1, shifted one step
            # along axis 1 with a zero frame in front.
            n_mels = hp.n_mels
            go_frame = tf.zeros_like(self.y1[:, :1, -n_mels:])
            shifted_frames = self.y1[:, :-1, -n_mels:]
            self.decoder_input = tf.concat((go_frame, shifted_frames), 1)

            # Encoder
            with tf.variable_scope("encoder"):
                encoder_out = encoder(self.x, training=training)
                self.keys, self.vals = encoder_out

            # Decoder
            with tf.variable_scope("decoder"):
                decoder_out = decoder(self.decoder_input,
                                      self.keys,
                                      self.vals,
                                      self.prev_max_attentions_li,
                                      training=training)
                (self.mel_logits, self.done_output, self.decoder_output,
                 self.alignments_li, self.max_attentions_li) = decoder_out
                self.mel_output = tf.nn.sigmoid(self.mel_logits)

            # Converter
            with tf.variable_scope("converter"):
                # Undo the reduction factor before the converter stack.
                restored = tf.reshape(
                    self.decoder_output,
                    (-1, hp.T_y, hp.embed_size // hp.r))
                self.converter_input = restored
                self.converter_input = fc_block(restored,
                                                hp.converter_channels,
                                                activation_fn=tf.nn.relu,
                                                training=training)
                self.mag_logits = converter(self.converter_input,
                                            training=training)
                self.mag_output = tf.nn.sigmoid(self.mag_logits)

            self.global_step = tf.Variable(0,
                                           name='global_step',
                                           trainable=False)

            # Everything below is only needed for training.
            if not training:
                return

            # Losses: L1 on mels and magnitudes, cross-entropy on dones.
            mel_error = tf.abs(self.mel_output - self.y1)
            self.loss1 = tf.reduce_mean(mel_error)
            done_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.done_output, labels=self.y2)
            self.loss2 = tf.reduce_mean(done_xent)
            mag_error = tf.abs(self.mag_output - self.y3)
            self.loss3 = tf.reduce_mean(mag_error)
            self.loss = self.loss1 + self.loss2 + self.loss3

            # Optimizer with per-gradient value clipping, then norm clipping.
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr)
            self.gvs = self.optimizer.compute_gradients(self.loss)
            clipped_pairs = []
            for gradient, variable in self.gvs:
                if gradient is not None:
                    gradient = tf.clip_by_value(gradient,
                                                -1. * hp.max_grad_val,
                                                hp.max_grad_val)
                    gradient = tf.clip_by_norm(gradient, hp.max_grad_norm)
                clipped_pairs.append((gradient, variable))
            self.clipped = clipped_pairs

            self.train_op = self.optimizer.apply_gradients(
                self.clipped, global_step=self.global_step)

            # Summaries: histograms first, then scalars.
            histograms = (
                ('mel_output', self.mel_output),
                ('mel_actual', self.y1),
                ('done_output', self.done_output),
                ('done_actual', self.y2),
                ('mag_output', self.mag_output),
                ('mag_actual', self.y3),
            )
            for tag, tensor in histograms:
                tf.summary.histogram(tag, tensor)

            scalars = (
                ('loss', self.loss),
                ('loss1', self.loss1),
                ('loss2', self.loss2),
                ('loss3', self.loss3),
            )
            for tag, value in scalars:
                tf.summary.scalar(tag, value)

            self.merged = tf.summary.merge_all()
Example #3
0
    def __init__(self, training=True):
        """Build the encoder, decoder and converter graph.

        Args:
            training: When true, inputs come from ``get_batch`` and the
                losses, clipped-gradient train op and summaries are added.
                When false, ``x``, ``y1`` and ``prev_max_attentions_li`` are
                placeholders to be fed by the caller.
        """
        # Load vocabulary
        self.char2idx, self.idx2char = load_vocab()

        # Graph
        self.graph = tf.Graph()
        with self.graph.as_default():
            # Data Feeding
            ## x: Text. (N, Tx), int32
            ## y1: Reduced melspectrogram. (N, Ty//r, n_mels*r) float32
            ## y2: Reduced dones. (N, Ty//r,) int32
            ## z: Magnitude. (N, Ty, n_fft//2+1) float32
            if training:
                self.x, self.y1, self.y2, self.z, self.num_batch = get_batch()
                self.prev_max_attentions_li = tf.ones(shape=(hp.dec_layers, hp.batch_size), dtype=tf.int32)
            else: # Inference
                self.x = tf.placeholder(tf.int32, shape=(hp.batch_size, hp.Tx))
                self.y1 = tf.placeholder(tf.float32, shape=(hp.batch_size, hp.Ty//hp.r, hp.n_mels*hp.r))
                self.prev_max_attentions_li = tf.placeholder(tf.int32, shape=(hp.dec_layers, hp.batch_size,))

            # Get decoder inputs: feed last frames only (N, Ty//r, n_mels).
            # A zero frame is prepended and the final step dropped, so the
            # input is y1 shifted by one step along axis 1.
            self.decoder_input = tf.concat((tf.zeros_like(self.y1[:, :1, -hp.n_mels:]), self.y1[:, :-1, -hp.n_mels:]), 1)

            # Networks
            with tf.variable_scope("encoder"):
                self.keys, self.vals = encoder(self.x, training=training) # (N, Tx, e)

            with tf.variable_scope("decoder"):
                # mel_logits: (N, Ty/r, n_mels*r)
                # done_output: (N, Ty/r, 2),
                # decoder_output: (N, Ty/r, e)
                # alignments_li: dec_layers*(Tx, Ty/r)
                # max_attentions_li: dec_layers*(N, T_y/r)
                self.mel_logits, self.done_output, self.decoder_output, self.alignments_li, self.max_attentions_li \
                    = decoder(self.decoder_input,
                             self.keys,
                             self.vals,
                             self.prev_max_attentions_li,
                             training=training)
                self.mel_output = tf.nn.sigmoid(self.mel_logits)

            with tf.variable_scope("converter"):
                # Restore shape
                self.converter_input = tf.reshape(self.decoder_output, (-1, hp.Ty, hp.embed_size//hp.r))
                self.converter_input = fc_block(self.converter_input,
                                                hp.converter_channels,
                                                activation_fn=tf.nn.relu,
                                                training=training) # (N, Ty, v)

                # Converter
                self.mag_logits = converter(self.converter_input, training=training) # (N, Ty, 1+n_fft//2)
                self.mag_output = tf.nn.sigmoid(self.mag_logits)

            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            if training:
                # Loss: L1 on mels and magnitudes, cross-entropy on dones.
                self.loss_mels = tf.reduce_mean(tf.abs(self.mel_output - self.y1))
                self.loss_dones = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.done_output, labels=self.y2))
                self.loss_mags = tf.reduce_mean(tf.abs(self.mag_output - self.z))
                self.loss = self.loss_mels + self.loss_dones + self.loss_mags

                # Training Scheme
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr)
                ## gradient clipping
                self.gvs = self.optimizer.compute_gradients(self.loss)
                self.clipped = []
                for grad, var in self.gvs:
                    # compute_gradients yields None for variables the loss
                    # does not depend on; the clip ops cannot take None, so
                    # such pairs are passed through untouched.
                    if grad is not None:
                        grad = tf.clip_by_value(grad, -1. * hp.max_grad_val, hp.max_grad_val)
                        grad = tf.clip_by_norm(grad, hp.max_grad_norm)
                    self.clipped.append((grad, var))
                self.train_op = self.optimizer.apply_gradients(self.clipped, global_step=self.global_step)

                # Summary
                tf.summary.scalar('Train_Loss/LOSS', self.loss)
                tf.summary.scalar('Train_Loss/mels', self.loss_mels)
                tf.summary.scalar('Train_Loss/dones', self.loss_dones)
                tf.summary.scalar('Train_Loss/mags', self.loss_mags)

                self.merged = tf.summary.merge_all()
Example #4
0
    def __init__(self, training=True):
        """Build the encoder, decoder and converter graph under scope "net".

        Args:
            training: When true, inputs come from ``get_batch`` and the
                losses, ``global_step``, clipped-gradient train op and
                summaries are added. When false, ``x``, ``y1`` and
                ``prev_max_attentions`` are placeholders to be fed by the
                caller, and ``global_step`` is not created.
        """
        # Load vocabulary
        self.char2idx, self.idx2char = load_vocab()
        self.graph = tf.Graph()
        with self.graph.as_default():
            # Data Feeding
            ## x: Text. (N, T_x), int32
            ## y1: Reduced melspectrogram. (N, T_y//r, n_mels*r) float32
            ## y2: Reduced dones. (N, T_y//r,) int32
            ## z: Magnitude. (N, T_y, n_fft//2+1) float32
            if training:
                self.x, self.y1, self.y2, self.z, self.num_batch = get_batch()
                self.prev_max_attentions = tf.constant([0] * hp.batch_size)
            else:  # Evaluation
                self.x = tf.placeholder(tf.int32,
                                        shape=(hp.batch_size, hp.T_x))
                self.y1 = tf.placeholder(tf.float32,
                                         shape=(hp.batch_size, hp.T_y // hp.r,
                                                hp.n_mels * hp.r))
                self.prev_max_attentions = tf.placeholder(
                    tf.int32, shape=(hp.batch_size, ))

            # Get decoder inputs: feed last frames only (N, T_y//r, n_mels).
            # A zero frame is prepended and the final step dropped, so the
            # input is y1 shifted by one step along axis 1.
            self.decoder_input = tf.concat((tf.zeros_like(
                self.y1[:, :1, -hp.n_mels:]), self.y1[:, :-1, -hp.n_mels:]), 1)

            # Networks
            with tf.variable_scope("net"):
                # Encoder. keys: (N, T_x, e), vals: (N, T_x, e)
                self.keys, self.vals, self.masks = encoder(self.x,
                                                           training=training,
                                                           scope="encoder")

                # Decoder. mel_output: (N, T_y/r, n_mels*r), done_output: (N, T_y/r, 2),
                # decoder_output: (N, T_y/r, e), alignments: (N, T_y, T_x)
                self.mel_output, self.done_output, self.decoder_output, self.alignments, self.max_attentions = decoder(
                    self.decoder_input,
                    self.keys,
                    self.vals,
                    self.masks,
                    self.prev_max_attentions,
                    training=training,
                    scope="decoder",
                    reuse=None)
                # Restore shape. converter_input: (N, T_y, e/r)
                self.converter_input = tf.reshape(self.decoder_output,
                                                  (hp.batch_size, hp.T_y, -1))
                self.converter_input = normalize(self.converter_input,
                                                 type=hp.norm_type,
                                                 training=training,
                                                 activation_fn=tf.nn.relu)

                # Converter. mag_output: (N, T_y, 1+n_fft//2)
                self.mag_output = converter(self.converter_input,
                                            training=training,
                                            scope="converter")
            if training:
                # Loss: L1 on mels and magnitudes, cross-entropy on dones.
                self.loss1_mae = tf.reduce_mean(
                    tf.abs(self.mel_output - self.y1))
                self.loss1_ce = tf.reduce_mean(
                    tf.nn.sparse_softmax_cross_entropy_with_logits(
                        logits=self.done_output, labels=self.y2))
                self.loss2 = tf.reduce_mean(tf.abs(self.mag_output - self.z))
                self.loss = self.loss1_mae + self.loss1_ce + self.loss2

                # Training Scheme
                self.global_step = tf.Variable(0,
                                               name='global_step',
                                               trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr)
                ## gradient clipping
                self.gvs = self.optimizer.compute_gradients(self.loss)
                self.clipped = []
                for grad, var in self.gvs:
                    # compute_gradients yields None for variables the loss
                    # does not depend on; the clip ops cannot take None, so
                    # such pairs are passed through untouched.
                    if grad is not None:
                        grad = tf.clip_by_value(grad, -1. * hp.max_grad_val,
                                                hp.max_grad_val)
                        grad = tf.clip_by_norm(grad, hp.max_grad_norm)
                    self.clipped.append((grad, var))
                self.train_op = self.optimizer.apply_gradients(
                    self.clipped, global_step=self.global_step)

                # Summary
                tf.summary.scalar('loss', self.loss)
                tf.summary.scalar('loss1_mae', self.loss1_mae)
                tf.summary.scalar('loss1_ce', self.loss1_ce)
                tf.summary.scalar('loss2', self.loss2)

                self.merged = tf.summary.merge_all()
Example #5
0
    def __init__(self, training=True):
        """Build the encoder, decoder and converter graph.

        Args:
            training: When true, inputs come from ``get_batch`` and the
                losses, clipped-gradient train op and summaries are added.
                When false, ``x``, ``y1`` and ``prev_max_attentions_li`` are
                placeholders to be fed by the caller.
        """
        # Load vocabulary
        self.char2idx, self.idx2char = load_vocab()

        # Graph
        self.graph = tf.Graph()
        with self.graph.as_default():
            # Data Feeding
            ## x: Text. (N, Tx), int32
            ## y1: Melspectrogram. (N, Ty, n_mels) float32
            ## y2: Dones. (N, Ty) int32
            ## z: Magnitude. (N, Ty, n_fft//2+1) float32
            # NOTE(review): the shapes above describe unreduced frames, but
            # the inference placeholder for y1 below is
            # (batch_size, Ty // r, n_mels * r) -- confirm which one
            # get_batch actually produces.
            if training:
                self.x, self.y1, self.y2, self.z = get_batch()
                self.prev_max_attentions_li = tf.ones(shape=(hp.dec_layers,
                                                             hp.batch_size),
                                                      dtype=tf.int32)
            else:  # Inference
                self.x = tf.placeholder(tf.int32, shape=(hp.batch_size, hp.Tx))
                self.y1 = tf.placeholder(tf.float32,
                                         shape=(hp.batch_size, hp.Ty // hp.r,
                                                hp.n_mels * hp.r))
                self.prev_max_attentions_li = tf.placeholder(tf.int32,
                                                             shape=(
                                                                 hp.dec_layers,
                                                                 hp.batch_size,
                                                             ))

            # Get decoder inputs: feed last frames only (N, Ty, n_mels).
            # A zero frame is prepended and the final step dropped, so the
            # input is y1 shifted by one step along axis 1.
            self.decoder_input = tf.concat((tf.zeros_like(
                self.y1[:, :1, -hp.n_mels:]), self.y1[:, :-1, -hp.n_mels:]), 1)

            # Networks
            with tf.variable_scope("encoder"):
                self.keys, self.vals = encoder(self.x,
                                               training=training)  # (N, Tx, e)

            with tf.variable_scope("decoder"):
                # mel_logits: (N, Ty, n_mels)
                # done_output: (N, Ty, 2),
                # decoder_output: (N, Ty, e)
                # alignments_li: dec_layers*(Tx, Ty)
                # max_attentions_li: dec_layers*(N, T_y)
                self.mel_logits, self.done_output, self.decoder_output, self.alignments_li, self.max_attentions_li = decoder(
                    self.decoder_input,
                    self.keys,
                    self.vals,
                    self.prev_max_attentions_li,
                    training=training)
                self.mel_output = tf.nn.sigmoid(self.mel_logits)

            with tf.variable_scope("converter"):
                # Restore shape
                self.converter_input = tf.reshape(self.decoder_output,
                                                  (-1, hp.Ty, hp.embed_size))
                self.converter_input = fc_block(
                    self.converter_input,
                    hp.converter_channels,
                    activation_fn=tf.nn.relu,
                    training=training)  # (N, Ty, v)

                # Converter
                self.mag_logits = converter(
                    self.converter_input,
                    training=training)  # (N, Ty, 1+n_fft//2)
                self.mag_output = tf.nn.sigmoid(self.mag_logits)

            self.global_step = tf.Variable(0,
                                           name='global_step',
                                           trainable=False)
            if training:
                # Loss: L1 on mels and magnitudes, cross-entropy on dones.
                self.loss_mels = tf.reduce_mean(
                    tf.abs(self.mel_output - self.y1))
                self.loss_dones = tf.reduce_mean(
                    tf.nn.sparse_softmax_cross_entropy_with_logits(
                        logits=self.done_output, labels=self.y2))
                self.loss_mags = tf.reduce_mean(
                    tf.abs(self.mag_output - self.z))
                self.loss = self.loss_mels + self.loss_dones + self.loss_mags

                # Training Scheme
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr)
                ## gradient clipping
                self.gvs = self.optimizer.compute_gradients(self.loss)
                self.clipped = []
                for grad, var in self.gvs:
                    # compute_gradients yields None for variables the loss
                    # does not depend on; the clip ops cannot take None, so
                    # such pairs are passed through untouched.
                    if grad is not None:
                        grad = tf.clip_by_value(grad, -1. * hp.max_grad_val,
                                                hp.max_grad_val)
                        grad = tf.clip_by_norm(grad, hp.max_grad_norm)
                    self.clipped.append((grad, var))
                self.train_op = self.optimizer.apply_gradients(
                    self.clipped, global_step=self.global_step)

                # Summary
                tf.summary.scalar('Train_Loss/LOSS', self.loss)
                tf.summary.scalar('Train_Loss/mels', self.loss_mels)
                tf.summary.scalar('Train_Loss/dones', self.loss_dones)
                tf.summary.scalar('Train_Loss/mags', self.loss_mags)

                self.merged = tf.summary.merge_all()