Code Example #1
File: rnn_ax_open_plus2.py    Project: liangoy/bphs
        a.append(
            np.concatenate([sample[:-1, :5], [[sample[-1][0]]] * (long - 1)],
                           axis=-1))
        b.append(sample[:-1, 5:10])
        c.append(sample[-1][1])
    return a, b, c


x = tf.placeholder(shape=[batch_size, long - 1, 6], dtype=tf.float16)
y = tf.placeholder(shape=[batch_size, long - 1, 5], dtype=tf.float16)
z_ = tf.placeholder(shape=[batch_size], dtype=tf.float16)

X = tf.nn.sigmoid(x) - 0.5
Y = tf.nn.sigmoid(y) - 0.5

gru_x = GRUCell(num_units=8, reuse=tf.AUTO_REUSE, activation=tf.nn.elu)
state_x = gru_x.zero_state(batch_size, dtype=tf.float16)
with tf.variable_scope('RNN_x'):
    for timestep in range(long - 1):
        if timestep == 1:
            tf.get_variable_scope().reuse_variables()
        (cell_output_x, state_x) = gru_x(X[:, timestep], state_x)
    out_put_x = state_x

gru_y = GRUCell(num_units=8, reuse=tf.AUTO_REUSE, activation=tf.nn.elu)
state_y = gru_y.zero_state(batch_size, dtype=tf.float16)
with tf.variable_scope('RNN_y'):
    for timestep in range(long - 1):  # be careful
        if timestep == 1:
            tf.get_variable_scope().reuse_variables()
        (cell_output_y, state_y) = gru_y(Y[:, timestep], state_y)
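
The two loops above are the classic TF1 pattern of manually unrolling a cell and calling reuse_variables() after the first step. A minimal sketch of the equivalent rollout with tf.nn.dynamic_rnn, assuming the same X tensor of shape [batch_size, long - 1, 6]; the scope and variable names here are illustrative, not from the original file:

with tf.variable_scope('RNN_x_dyn', reuse=tf.AUTO_REUSE):
    # dynamic_rnn unrolls the GRU over the time axis of X and returns the
    # per-step outputs plus the final state.
    outputs_x, final_state_x = tf.nn.dynamic_rnn(
        GRUCell(num_units=8, activation=tf.nn.elu), inputs=X, dtype=tf.float16)
# final_state_x plays the role of out_put_x above.
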
Code Example #2
File: model.py    Project: ustcsky/PURS
    def __init__(self, user_count, item_count, batch_size):
        hidden_size = 128
        long_memory_window = 10
        short_memory_window = 3
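        # hist holds the user's last long_memory_window clicked items; only its final
        # short_memory_window entries feed the unexpectedness/annoyance branches below.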

        self.u = tf.placeholder(tf.int32, [
            batch_size,
        ])  # [B]
        self.i = tf.placeholder(tf.int32, [
            batch_size,
        ])  # [B]
        self.y = tf.placeholder(tf.float32, [
            batch_size,
        ])  # [B]
        self.hist = tf.placeholder(tf.int32,
                                   [batch_size, long_memory_window])  # [B, T]
        self.lr = tf.placeholder(tf.float64, [])

        user_emb_w = tf.get_variable("user_emb_w",
                                     [user_count, hidden_size // 2])
        item_emb_w = tf.get_variable("item_emb_w",
                                     [item_count, hidden_size // 2])
        user_b = tf.get_variable(
            "user_b",
            [user_count],
            initializer=tf.constant_initializer(0.0),
        )
        item_b = tf.get_variable("item_b", [item_count],
                                 initializer=tf.constant_initializer(0.0))

        item_emb = tf.concat([
            tf.nn.embedding_lookup(item_emb_w, self.i),
            tf.nn.embedding_lookup(user_emb_w, self.u),
        ],
                             axis=1)
        item_b = tf.gather(item_b, self.i)
        user_b = tf.gather(user_b, self.u)
        h_emb = tf.concat([
            tf.nn.embedding_lookup(
                item_emb_w,
                tf.slice(self.hist, [0, 0], [batch_size, long_memory_window])),
            tf.tile(
                tf.expand_dims(tf.nn.embedding_lookup(user_emb_w, self.u), 1),
                [1, long_memory_window, 1]),
        ],
                          axis=2)
        unexp_emb = tf.concat([
            tf.nn.embedding_lookup(
                item_emb_w,
                tf.slice(self.hist,
                         [0, long_memory_window - short_memory_window],
                         [batch_size, short_memory_window])),
            tf.tile(
                tf.expand_dims(tf.nn.embedding_lookup(user_emb_w, self.u), 1),
                [1, short_memory_window, 1]),
        ],
                              axis=2)
        h_long_emb = tf.nn.embedding_lookup(
            item_emb_w,
            tf.slice(self.hist, [0, 0], [batch_size, long_memory_window]))
        h_short_emb = tf.nn.embedding_lookup(
            item_emb_w,
            tf.slice(self.hist, [0, long_memory_window - short_memory_window],
                     [batch_size, short_memory_window]))

        # Long-Short-Term User Preference
        #with tf.variable_scope('rnn', reuse=tf.AUTO_REUSE):
        long_output, _ = tf.nn.dynamic_rnn(GRUCell(hidden_size),
                                           inputs=h_emb,
                                           dtype=tf.float32)
        long_preference, _ = self.seq_attention(long_output, hidden_size,
                                                long_memory_window)
        long_preference = tf.nn.dropout(long_preference, 0.1)
        #short_output, _ = tf.nn.dynamic_rnn(GRUCell(hidden_size), inputs=unexp_emb, dtype=tf.float32)
        #short_preference, _ = self.seq_attention(short_output, hidden_size, long_memory_window)
        #short_preference = tf.nn.dropout(short_preference, 0.1)

        #Combine Long-Short-Term-User-Preferences
        concat = tf.concat([long_preference, item_emb], axis=1)
        concat = tf.layers.batch_normalization(inputs=concat)
        concat = tf.layers.dense(concat,
                                 80,
                                 activation=tf.nn.sigmoid,
                                 name='f1')
        concat = tf.layers.dense(concat,
                                 40,
                                 activation=tf.nn.sigmoid,
                                 name='f2')
        concat = tf.layers.dense(concat, 1, activation=None, name='f3')
        concat = tf.reshape(concat, [-1])

        #Personalized & Contextualized Unexpected Factor
        unexp_factor = self.unexp_attention(item_emb, unexp_emb,
                                            [long_memory_window] * batch_size)
        unexp_factor = tf.layers.batch_normalization(inputs=unexp_factor)
        unexp_factor = tf.reshape(unexp_factor, [-1, hidden_size])
        unexp_factor = tf.layers.dense(unexp_factor, hidden_size)
        unexp_factor = tf.layers.dense(unexp_factor, 1, activation=None)
        #If we choose to use binary values
        #unexp_gate = tf.to_float(tf.reshape(unexp_gate, [-1]) > 0.5)
        unexp_factor = tf.reshape(unexp_factor, [-1])

        #Unexpectedness (with clustering of user interests)
        self.center = self.mean_shift(h_long_emb)
        unexp = tf.reduce_mean(self.center, axis=1)
        unexp = tf.norm(unexp - tf.nn.embedding_lookup(item_emb_w, self.i),
                        ord='euclidean',
                        axis=1)
        self.unexp = unexp
        unexp = tf.exp(-1.0 * unexp) * unexp  #Unexpected Activation Function
        unexp = tf.stop_gradient(unexp)
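        # Note: f(u) = u * exp(-u) peaks at u = 1, so items moderately far from the
        # centers of the user's interest clusters get the largest unexpectedness weight.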

        #Relevance (for future exploration)
        relevance = tf.reduce_mean(h_long_emb, axis=1)
        relevance = tf.norm(relevance -
                            tf.nn.embedding_lookup(item_emb_w, self.i),
                            ord='euclidean',
                            axis=1)

        #Annoyance/Diversification (for future exploration)
        annoyance = tf.reduce_mean(h_short_emb, axis=1)
        annoyance = tf.norm(annoyance -
                            tf.nn.embedding_lookup(item_emb_w, self.i),
                            ord='euclidean',
                            axis=1)

        #Estimation of user preference by combining different components
        self.logits = item_b + concat + user_b + unexp_factor * unexp  # [B]
        self.score = tf.sigmoid(self.logits)

        # Step variable
        self.global_step = tf.Variable(0, trainable=False, name='global_step')
        self.global_epoch_step = tf.Variable(0,
                                             trainable=False,
                                             name='global_epoch_step')
        self.global_epoch_step_op = tf.assign(self.global_epoch_step,
                                              self.global_epoch_step + 1)
        self.loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits,
                                                    labels=self.y))
        trainable_params = tf.trainable_variables()
        self.opt = tf.train.GradientDescentOptimizer(learning_rate=self.lr)
        gradients = tf.gradients(self.loss, trainable_params)
        clip_gradients, _ = tf.clip_by_global_norm(gradients, 1)
        self.train_op = self.opt.apply_gradients(zip(clip_gradients,
                                                     trainable_params),
                                                 global_step=self.global_step)
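
A minimal training-step sketch for the graph above. The enclosing class name Model, the counts, and the feed values are assumptions for illustration; seq_attention, unexp_attention, and mean_shift must also be defined on the class (they are not shown in this snippet):

import numpy as np
import tensorflow as tf

model = Model(user_count=1000, item_count=5000, batch_size=32)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    loss, _ = sess.run(
        [model.loss, model.train_op],
        feed_dict={
            model.u: np.zeros(32, dtype=np.int32),           # user ids      [B]
            model.i: np.zeros(32, dtype=np.int32),           # item ids      [B]
            model.y: np.zeros(32, dtype=np.float32),         # click labels  [B]
            model.hist: np.zeros((32, 10), dtype=np.int32),  # history       [B, T]
            model.lr: 0.1,
        })
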
Code Example #3
def main(model, T, n_iter, n_batch, n_hidden, capacity, comp, FFT,
         learning_rate, norm, update_gate, activation, lambd, layer_norm,
         zoneout, visualization_experiment):
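    # Trains the selected recurrent cell (LSTM / GRU / RUM / EUNN / GORU / RNN) on the
    # recall task; when visualization_experiment is set it instead samples the loss along
    # a line and over a 2-D grid of weight settings, saves the results, and exits.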

    learning_rate = float(learning_rate)

    # data params
    n_input = int(T / 2) + 10 + 1

    n_output = 10
    n_train = 100000
    n_valid = 10000
    n_test = 20000

    n_steps = T + 3
    n_classes = 10

    # graph and gradients
    x = tf.placeholder("int32", [None, n_steps])
    y = tf.placeholder("int64", [None])

    input_data = tf.one_hot(x, n_input, dtype=tf.float32)

    # input to hidden
    if model == "LSTM":
        cell = BasicLSTMCell(n_hidden, state_is_tuple=True, forget_bias=1)
    elif model == "GRU":
        cell = GRUCell(n_hidden,
                       kernel_initializer=tf.orthogonal_initializer())
    elif model == "RUM":
        if activation == "relu":
            act = tf.nn.relu
        elif activation == "sigmoid":
            act = tf.nn.sigmoid
        elif activation == "tanh":
            act = tf.nn.tanh
        elif activation == "softsign":
            act = tf.nn.softsign
        if visualization_experiment:
            # placeholder
            temp_target = tf.placeholder("float32",
                                         [n_hidden + n_input, n_hidden])
            temp_target_bias = tf.placeholder("float32", [n_hidden])
            temp_embed = tf.placeholder("float32", [n_input, n_hidden])

        cell = RUMCell(
            n_hidden,
            eta_=norm,
            update_gate=update_gate,
            lambda_=lambd,
            activation=act,
            use_layer_norm=layer_norm,
            use_zoneout=zoneout,
            visualization=visualization_experiment,
            temp_target=temp_target if visualization_experiment else None,
            temp_target_bias=temp_target_bias
            if visualization_experiment else None,
            temp_embed=temp_embed if visualization_experiment else None)

    elif model == "EUNN":
        cell = EUNNCell(n_hidden, capacity, FFT, comp)
    elif model == "GORU":
        if visualization_experiment:
            # placeholder
            temp_theta0 = tf.placeholder("float32", [n_hidden // 2])
            temp_theta1 = tf.placeholder("float32", [n_hidden // 2 - 1])
        cell = GORUCell(n_hidden,
                        capacity,
                        FFT,
                        temp_theta0=temp_theta0 if visualization_experiment else None,
                        temp_theta1=temp_theta1 if visualization_experiment else None)
    elif model == "RNN":
        cell = BasicRNNCell(n_hidden)

    hidden_out, _ = tf.nn.dynamic_rnn(cell, input_data, dtype=tf.float32)

    # RESEARCH RELATED
    # hidden_out = hidden_out[:,:,:50]
    # costh = hidden_out[:,:,-1]
    # print(colored(hidden_out,'red'))
    # print(colored(costh, 'green'))

    # costh_mean_dist = tf.reduce_mean(costh, axis=0)
    # costh_hist = tf.summary.histogram('costh',costh_mean_dist)
    # print(colored(costh_normalized_dist,'yellow'))

    # hidden to output
    V_init_val = np.sqrt(6.) / np.sqrt(n_output + n_input)

    V_weights = tf.get_variable("V_weights",
                                shape=[n_hidden, n_classes],
                                dtype=tf.float32,
                                initializer=tf.random_uniform_initializer(
                                    -V_init_val, V_init_val))
    V_bias = tf.get_variable("V_bias",
                             shape=[n_classes],
                             dtype=tf.float32,
                             initializer=tf.constant_initializer(0.01))

    hidden_out = tf.unstack(hidden_out, axis=1)[-1]
    temp_out = tf.matmul(hidden_out, V_weights)
    output_data = tf.nn.bias_add(temp_out, V_bias)

    # evaluate process
    cost = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=output_data,
                                                       labels=y))
    tf.summary.scalar('cost', cost)
    correct_pred = tf.equal(tf.argmax(output_data, 1), y)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    tf.summary.scalar('accuracy', accuracy)

    # initialization
    optimizer = tf.train.AdamOptimizer(
        learning_rate=learning_rate).minimize(cost)
    init = tf.global_variables_initializer()

    # save
    filename = model + "_H" + str(n_hidden) + "_" + \
        ("L" + str(lambd) + "_" if lambd else "") + \
        ("E" + str(eta) + "_" if norm else "") + \
        ("A" + activation + "_" if activation else "") + \
        ("U_" if update_gate else "") + \
        ("Z_" if zoneout and model == "RUM" else "") + \
        ("ln_" if layer_norm and model == "RUM" else "") + \
        (str(capacity) if model in ["EUNN", "GORU"] else "") + \
        ("FFT_" if model in ["EUNN", "GORU"] and FFT else "") + \
        "B" + str(n_batch)
    save_path = os.path.join('../../train_log', 'recall', 'T' + str(T),
                             filename)

    file_manager(save_path)

    # what follows is task specific
    filepath = os.path.join(save_path, "eval.txt")
    if not os.path.exists(os.path.dirname(filepath)):
        try:
            os.makedirs(os.path.dirname(filepath))
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise
    f = open(filepath, 'w')
    f.write(col("validation \n", 'r'))

    log(kwargs, save_path)

    merged_summary = tf.summary.merge_all()
    saver = tf.train.Saver()

    parameters_profiler()

    # train
    saver = tf.train.Saver()
    step = 0

    train_x, train_y = recall_data(T, n_train)
    val_x, val_y = recall_data(T, n_valid)
    test_x, test_y = recall_data(T, n_test)

    with tf.Session() as sess:
        sess.run(init)
        train_writer = tf.summary.FileWriter(save_path, sess.graph)

        steps = []
        losses = []
        accs = []

        while step < n_iter:
            batch_x, batch_y = next_batch(train_x, train_y, step, n_batch)

            # RESEARCH RELATED
            # acc, loss = \
            # sess.run([accuracy, cost], feed_dict={x: batch_x, y: batch_y})
            # costh_val = sess.run([costh], feed_dict={x: batch_x, y: batch_y})
            # print(colored(costh_val,'green'))
            # print(colored("###",'yellow'))
            # acc, loss, costh_h = \
            # sess.run([accuracy, cost, costh_hist], feed_dict={x: batch_x, y:
            # batch_y})

            ##############

            if visualization_experiment:
                """ initiative to write simpler code """

                if model == "RUM":
                    number_of_weights = (n_hidden + n_input) * \
                        n_hidden + n_hidden + n_input * n_hidden
                elif model in ["GORU", "EUNN"]:
                    # assuming that n_hidden is even.
                    number_of_weights = n_hidden - 1
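                # number_of_weights counts the parameters perturbed for the plots: for RUM,
                # temp_target + temp_target_bias + temp_embed; for GORU/EUNN, the rotation
                # angles temp_theta0 (n_hidden // 2) and temp_theta1 (n_hidden // 2 - 1).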

                print(col("strating linear visualization", 'b'))
                num_points = 200

                coord, weights = generate_points_for_visualization(
                    number_of_weights, num_points)

                processed_placeholders = process_vis(weights,
                                                     num_points,
                                                     n_hidden=n_hidden,
                                                     cell=model)
                if model == "RUM":
                    feed_temp_target, feed_temp_target_bias, feed_temp_embed = processed_placeholders

                else:
                    feed_temp_theta0, feed_temp_theta1 = processed_placeholders

                collect_losses = []
                for i in range(num_points):
                    if model == "RUM":
                        loss = sess.run(cost,
                                        feed_dict={
                                            x:
                                            batch_x,
                                            y:
                                            batch_y,
                                            temp_target:
                                            feed_temp_target[i],
                                            temp_target_bias:
                                            feed_temp_target_bias[i],
                                            temp_embed:
                                            feed_temp_embed[i]
                                        })
                    elif model in ["EUNN", "GORU"]:
                        loss = sess.run(cost,
                                        feed_dict={
                                            x: batch_x,
                                            y: batch_y,
                                            temp_theta0: feed_temp_theta0[i],
                                            temp_theta1: feed_temp_theta1[i]
                                        })

                    print(col("iter: " + str(i) + " loss: " + str(loss), 'y'))
                    collect_losses.append(loss)
                np.save(os.path.join(save_path, "linear_height"),
                        np.array(collect_losses))
                np.save(os.path.join(save_path, "linear_coord"),
                        np.array(coord))
                print(col("done with linear visualization", 'b'))

                #####################

                print(col("strating contour visualization", 'b'))
                num_points = 20
                coord, weights = generate_points_for_visualization(
                    number_of_weights, num_points, type_vis="contour")
                np.save(os.path.join(save_path, "contour_coord"),
                        np.array(coord))
                processed_placeholders = process_vis(weights,
                                                     num_points**2,
                                                     n_hidden=n_hidden,
                                                     cell=model)
                if model == "RUM":
                    feed_temp_target, feed_temp_target_bias, feed_temp_embed = processed_placeholders
                else:
                    feed_temp_theta0, feed_temp_theta1 = processed_placeholders

                collect_contour = np.empty((num_points, num_points))
                for i in range(num_points):
                    for j in range(num_points):
                        if model == "RUM":
                            loss = sess.run(
                                cost,
                                feed_dict={
                                    x:
                                    batch_x,
                                    y:
                                    batch_y,
                                    temp_target:
                                    feed_temp_target[i * num_points + j],
                                    temp_target_bias:
                                    feed_temp_target_bias[i * num_points + j],
                                    temp_embed:
                                    feed_temp_embed[i * num_points + j]
                                })
                        elif model in ["GORU", "EUNN"]:
                            loss = sess.run(
                                cost,
                                feed_dict={
                                    x:
                                    batch_x,
                                    y:
                                    batch_y,
                                    temp_theta0:
                                    feed_temp_theta0[i * num_points + j],
                                    temp_theta1:
                                    feed_temp_theta1[i * num_points + j]
                                })
                        collect_contour[i, j] = loss
                        print(
                            col(
                                "iter: " + str(i) + "," + str(j) + " loss: " +
                                str(loss), 'y'))
                np.save(os.path.join(save_path, "contour_height"),
                        np.array(collect_contour))

                print(col("exiting visualization experiment", 'r'))
                exit()

            ##############

            acc, loss = sess.run([accuracy, cost],
                                 feed_dict={
                                     x: batch_x,
                                     y: batch_y
                                 })
            # writer.add_summary(costh_h, step) # RESEARCH RELATED

            print(
                col(
                    "Iter " + str(step) + ", Minibatch Loss= " +
                    "{:.6f}".format(loss) + ", Training Accuracy= " +
                    "{:.5f}".format(acc), 'g'))
            sess.run(optimizer, feed_dict={x: batch_x, y: batch_y})

            steps.append(step)
            losses.append(loss)
            accs.append(acc)
            if step % 1000 == 0:
                summ = sess.run(merged_summary, feed_dict={x: val_x, y: val_y})
                acc = sess.run(accuracy, feed_dict={x: val_x, y: val_y})
                loss = sess.run(cost, feed_dict={x: val_x, y: val_y})
                train_writer.add_summary(summ, step)

                print("Validation Loss= " + "{:.6f}".format(loss) +
                      ", Validation Accuracy= " + "{:.5f}".format(acc))
                f.write(col("%d\t%f\t%f\n" % (step, loss, acc), 'y'))
                f.flush()

            if step % 1000 == 1:
                print(col("saving graph and metadata in " + save_path, "b"))
                saver.save(sess, os.path.join(save_path, "model"))

            step += 1

        print(col("Optimization Finished!", 'b'))

        # test
        test_acc = sess.run(accuracy, feed_dict={x: test_x, y: test_y})
        test_loss = sess.run(cost, feed_dict={x: test_x, y: test_y})
        f.write(
            col(
                "Test result: Loss= " + "{:.6f}".format(test_loss) +
                ", Accuracy= " + "{:.5f}".format(test_acc), 'g'))

        f.close()
Code Example #4
    def initialize(self,
                   inputs,
                   input_lengths,
                   lpc_targets=None,
                   stop_token_targets=None,
                   is_training=True):
        '''Initializes the model for inference.

        Sets "mel_outputs", "linear_outputs", and "alignments" fields.

        Args:
          inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          lpc_targets: float32 Tensor with shape [N, T_out, M], where M is feature dim
        '''
        with tf.variable_scope('inference'):
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            # [N, T_in, embed_depth=256]
            embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

            # Encoder
            # [N, T_in, prenet_depths[-1]=128]
            prenet_outputs = prenet(embedded_inputs, is_training,
                                    hp.prenet_depths)
            # [N, T_in, encoder_depth=256]
            encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths,
                                           is_training, hp.encoder_depth)

            # Location sensitive attention
            attention_mechanism = LocationSensitiveAttention(
                hp.attention_depth,
                encoder_outputs)  # [N, T_in, attention_depth=256]

            # Decoder (layers specified bottom to top):
            multi_rnn_cell = MultiRNNCell(
                [
                    ResidualWrapper(GRUCell(hp.decoder_depth)),
                    ResidualWrapper(GRUCell(hp.decoder_depth))
                ],
                state_is_tuple=True)  # [N, T_in, decoder_depth=256]

            # Frames Projection layer
            frame_projection = FrameProjection(
                hp.num_lpcs * hp.outputs_per_step)  # [N, T_out/r, M*r]

            # <stop_token> projection layer
            stop_projection = StopProjection(
                is_training, shape=hp.outputs_per_step)  # [N, T_out/r, r]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            decoder_cell = TacotronDecoderWrapper(is_training,
                                                  attention_mechanism,
                                                  multi_rnn_cell,
                                                  frame_projection,
                                                  stop_projection)

            if is_training:
                helper = TacoTrainingHelper(inputs, lpc_targets, hp.num_lpcs,
                                            hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.num_lpcs,
                                        hp.outputs_per_step)

            decoder_init_state = decoder_cell.zero_state(batch_size=batch_size,
                                                         dtype=tf.float32)

            (decoder_outputs, stop_token_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 CustomDecoder(decoder_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            # [N, T_out, M]
            lpc_outputs = tf.reshape(decoder_outputs,
                                     [batch_size, -1, hp.num_lpcs])
            stop_token_outputs = tf.reshape(stop_token_outputs,
                                            [batch_size, -1])  # [N, T_out, M]

            # # Add post-processing CBHG:
            # # [N, T_out, postnet_depth=256]
            # post_outputs = post_cbhg(
            #     , hp.num_mels, is_training, hp.postnet_depth)
            # # [N, T_out, F]
            # linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.stop_token_outputs = stop_token_outputs
            self.alignments = alignments
            self.lpc_outputs = lpc_outputs
            self.lpc_targets = lpc_targets
            self.stop_token_targets = stop_token_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:               {}'.format(embedded_inputs.shape))
            log('  prenet out:              {}'.format(prenet_outputs.shape))
            log('  encoder out:             {}'.format(encoder_outputs.shape))
            log('  decoder out (r frames):  {}'.format(decoder_outputs.shape))
            log('  decoder out (1 frame):   {}'.format(lpc_outputs.shape))
            # log('  postnet out:             {}'.format(post_outputs.shape))
            # log('  linear out:              {}'.format(linear_outputs.shape))
            log('  stop token:              {}'.format(
                stop_token_outputs.shape))
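
A minimal sketch of driving initialize() above; the class name Tacotron, the hparams object, and the placeholder names are assumptions for illustration:

inputs = tf.placeholder(tf.int32, [None, None], name='inputs')            # [N, T_in]
input_lengths = tf.placeholder(tf.int32, [None], name='input_lengths')    # [N]
lpc_targets = tf.placeholder(tf.float32,
                             [None, None, hparams.num_lpcs])              # [N, T_out, M]

model = Tacotron(hparams)
model.initialize(inputs, input_lengths, lpc_targets=lpc_targets, is_training=True)
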
Code Example #5
File: tacotron.py    Project: jeehyun100/si_tacotron
    def initialize(
        self,
        inputs,
        input_lengths,
        num_speakers,
        speaker_id,
        mel_targets=None,
        linear_targets=None,
        loss_coeff=None,
        rnn_decoder_test_mode=False,
        is_randomly_initialized=False,
    ):

        is_training2 = linear_targets is not None  # this also ends up True at test time -- is that intended???
        is_training = not rnn_decoder_test_mode

        self.is_randomly_initialized = is_randomly_initialized

        with tf.variable_scope('inference') as scope:
            hp = self._hparams
            batch_size = tf.shape(inputs)[0]

            # Embeddings(256)
            char_embed_table = tf.get_variable(
                'embedding', [len(symbols), hp.embedding_size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))

            zero_pad = True
            if zero_pad:  # logic borrowed from a transformer implementation
                # <PAD> (index 0) keeps a fixed all-zero embedding that is never updated during training,
                # i.e. the first row (<PAD>) of the variable created by get_variable above is not used.
                char_embed_table = tf.concat(
                    (tf.zeros(shape=[1, hp.embedding_size]),
                     char_embed_table[1:, :]), 0)

            # [N, T_in, embedding_size]
            char_embedded_inputs = tf.nn.embedding_lookup(
                char_embed_table, inputs)

            self.num_speakers = num_speakers
            if self.num_speakers > 1:
                if hp.speaker_embedding_size != 1:  # speaker_embedding_size = f(16)
                    speaker_embed_table = tf.get_variable(
                        'speaker_embedding',
                        [self.num_speakers, hp.speaker_embedding_size],
                        dtype=tf.float32,
                        initializer=tf.truncated_normal_initializer(
                            stddev=0.5))
                    # [N, T_in, speaker_embedding_size]
                    speaker_embed = tf.nn.embedding_lookup(
                        speaker_embed_table, speaker_id)

                if hp.model_type == 'deepvoice':
                    if hp.speaker_embedding_size == 1:
                        before_highway = get_embed(
                            speaker_id, self.num_speakers,
                            hp.enc_prenet_sizes[-1], "before_highway"
                        )  # 'enc_prenet_sizes': [f(256), f(128)]
                        encoder_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers, hp.enc_rnn_size * 2,
                            "encoder_rnn_init_state")

                        attention_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers,
                            hp.attention_state_size,
                            "attention_rnn_init_state")
                        decoder_rnn_init_states = [
                            get_embed(
                                speaker_id, self.num_speakers, hp.dec_rnn_size,
                                "decoder_rnn_init_states{}".format(idx + 1))
                            for idx in range(hp.dec_layer_num)
                        ]
                    else:
                        deep_dense = lambda x, dim: tf.layers.dense(
                            x, dim, activation=tf.nn.softsign
                        )  # softsign: x / (abs(x) + 1)

                        before_highway = deep_dense(speaker_embed,
                                                    hp.enc_prenet_sizes[-1])
                        encoder_rnn_init_state = deep_dense(
                            speaker_embed, hp.enc_rnn_size * 2)

                        attention_rnn_init_state = deep_dense(
                            speaker_embed, hp.attention_state_size)
                        decoder_rnn_init_states = [
                            deep_dense(speaker_embed, hp.dec_rnn_size)
                            for _ in range(hp.dec_layer_num)
                        ]

                    speaker_embed = None  # deepvoice does not use speaker_embed directly
                elif hp.model_type == 'simple':
                    # The 'simple' model feeds speaker_embed into DecoderPrenetWrapper and ConcatOutputAndAttentionWrapper and concatenates it there.
                    before_highway = None
                    encoder_rnn_init_state = None
                    attention_rnn_init_state = None
                    decoder_rnn_init_states = None
                else:
                    raise Exception(
                        " [!] Unkown multi-speaker model type: {}".format(
                            hp.model_type))
            else:
                # case where self.num_speakers == 1
                speaker_embed = None
                before_highway = None
                encoder_rnn_init_state = None  # init state of the bidirectional GRU
                attention_rnn_init_state = None
                decoder_rnn_init_states = None

            ##############
            # Encoder
            ##############

            # [N, T_in, enc_prenet_sizes[-1]]
            prenet_outputs = prenet(
                char_embedded_inputs,
                is_training,
                hp.enc_prenet_sizes,
                hp.dropout_prob,
                scope='prenet'
            )  # 'enc_prenet_sizes': [f(256), f(128)],  dropout_prob = 0.5
            # ==> (N, T_in, 128)

            # enc_rnn_size = 128
            encoder_outputs = cbhg(
                prenet_outputs,
                input_lengths,
                is_training,
                hp.enc_bank_size,
                hp.enc_bank_channel_size,
                hp.enc_maxpool_width,
                hp.enc_highway_depth,
                hp.enc_rnn_size,
                hp.enc_proj_sizes,
                hp.enc_proj_width,
                scope="encoder_cbhg",
                before_highway=before_highway,
                encoder_rnn_init_state=encoder_rnn_init_state)

            ##############
            # Attention
            ##############

            # For manual control of attention
            self.is_manual_attention = tf.placeholder(
                tf.bool,
                shape=(),
                name='is_manual_attention',
            )
            self.manual_alignments = tf.placeholder(
                tf.float32,
                shape=[None, None, None],
                name="manual_alignments",
            )
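            # These placeholders let a caller override the computed alignments at synthesis
            # time (feed is_manual_attention=True plus a manual_alignments array); how they
            # are consumed lives inside the custom AttentionWrapper constructed below.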

            # single: attention_size = 128
            if hp.attention_type == 'bah_mon':
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=False)
            elif hp.attention_type == 'bah_mon_norm':  # added by hccho
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=True)
            elif hp.attention_type == 'loc_sen':  # Location Sensitive Attention
                attention_mechanism = LocationSensitiveAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'gmm':  # GMM Attention
                attention_mechanism = GmmAttention(
                    hp.attention_size,
                    memory=encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'bah_mon_norm_hccho':
                attention_mechanism = BahdanauMonotonicAttention_hccho(
                    hp.attention_size, encoder_outputs, normalize=True)
            elif hp.attention_type == 'bah_norm':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=True)
            elif hp.attention_type == 'luong_scaled':
                attention_mechanism = LuongAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    scale=True)
            elif hp.attention_type == 'luong':
                attention_mechanism = LuongAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'bah':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths)
            else:
                raise Exception(" [!] Unkown attention type: {}".format(
                    hp.attention_type))

            # Combine the DecoderPrenetWrapper and attention_mechanism into an AttentionWrapper.
            # carpedm20 re-implemented AttentionWrapper from the TensorFlow source, whereas Keith Ito simply used TensorFlow's stock AttentionWrapper.
            attention_cell = AttentionWrapper(
                GRUCell(hp.attention_state_size),
                attention_mechanism,
                self.is_manual_attention,
                self.manual_alignments,
                initial_cell_state=attention_rnn_init_state,
                alignment_history=True,
                output_attention=False
            )  # note output_attention=False; attention_layer_size is not set, so attention == the context vector.

            # attention_state_size = 256
            dec_prenet_outputs = DecoderPrenetWrapper(
                attention_cell, speaker_embed, is_training,
                hp.dec_prenet_sizes,
                hp.dropout_prob)  # dec_prenet_sizes =  [f(256), f(128)]

            # Concatenate attention context vector and RNN cell output into a 512D vector.
            # [N, T_in, attention_size+attention_state_size]

            # From the AttentionWrapperState that dec_prenet_outputs passes to the next cell (members: attention, cell_state, ...), attention and output are concatenated and emitted as the new output.
            # Since the output equals cell_state, the concat is [ output(=cell_state) | attention ].
            concat_cell = ConcatOutputAndAttentionWrapper(
                dec_prenet_outputs, embed_to_concat=speaker_embed
            )  # builds a new output as concat(output, attention, speaker_embed).

            # Decoder (layers specified bottom to top):  dec_rnn_size= 256
            cells = [OutputProjectionWrapper(concat_cell, hp.dec_rnn_size)
                     ]  # OutputProjectionWrapper does not seem to be mentioned in the paper...
            for _ in range(hp.dec_layer_num):  # hp.dec_layer_num = 2
                cells.append(ResidualWrapper(GRUCell(hp.dec_rnn_size)))

            # [N, T_in, 256]
            decoder_cell = MultiRNNCell(cells, state_is_tuple=True)

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.reduction_factor
            )  # could this be modified so that a stop token is also emitted, i.e. (hp.num_mels+1) * hp.reduction_factor ???
            decoder_init_state = output_cell.zero_state(
                batch_size=batch_size, dtype=tf.float32
            )  # calling zero_state here already includes the values supplied to the AttentionWrapper above.

            if hp.model_type == "deepvoice":
                # decoder_init_state[0] : AttentionWrapperState
                # = cell_state + attention + time + alignments + alignment_history
                # decoder_init_state[0][0] = attention_rnn_init_state (already applied: it was passed as the AttentionWrapper's initial_cell_state)
                decoder_init_state = list(decoder_init_state)

                for idx, cell in enumerate(decoder_rnn_init_states):
                    shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                    shape2 = cell.get_shape().as_list()
                    if shape1 != shape2:
                        raise Exception(
                            " [!] Shape {} and {} should be equal".format(
                                shape1, shape2))
                    decoder_init_state[idx + 1] = cell

                decoder_init_state = tuple(decoder_init_state)

            if is_training2:
                # rnn_decoder_test_mode = True in test mode, False in train mode
                helper = TacoTrainingHelper(
                    inputs, mel_targets, hp.num_mels, hp.reduction_factor,
                    rnn_decoder_test_mode)  # inputs is only used to compute batch_size
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.reduction_factor)

            (decoder_outputs, _), final_decoder_state, _ = \
                    tf.contrib.seq2seq.dynamic_decode(BasicDecoder(output_cell, helper, decoder_init_state),maximum_iterations=hp.max_iters)  # max_iters=200

            # [N, T_out, M]
            mel_outputs = tf.reshape(decoder_outputs,
                                     [batch_size, -1, hp.num_mels])

            # Add post-processing CBHG:
            # [N, T_out, 256]
            #post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)
            post_outputs = cbhg(mel_outputs,
                                None,
                                is_training,
                                hp.post_bank_size,
                                hp.post_bank_channel_size,
                                hp.post_maxpool_width,
                                hp.post_highway_depth,
                                hp.post_rnn_size,
                                hp.post_proj_sizes,
                                hp.post_proj_width,
                                scope='post_cbhg')

            if speaker_embed is not None and hp.model_type == 'simple':
                expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
                tiled_speaker_embedding = tf.tile(
                    expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1])

                # [N, T_out, 256 + alpha]
                post_outputs = tf.concat(
                    [tiled_speaker_embedding, post_outputs], axis=-1)

            linear_outputs = tf.layers.dense(
                post_outputs, hp.num_freq)  # [N, T_out, F(1025)]

            # Grab alignments from the final decoder state:
            # Because the MultiRNNCell has 3 layers, final_decoder_state is a tuple of length 3  ==> final_decoder_state[0]
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(),
                [1, 2, 0
                 ])  # batch_size, text length(encoder), target length(decoder)

            self.inputs = inputs
            self.speaker_id = speaker_id
            self.input_lengths = input_lengths
            self.loss_coeff = loss_coeff
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.final_decoder_state = final_decoder_state

            log('=' * 40)
            log(' model_type: %s' % hp.model_type)
            log('=' * 40)

            log('Initialized Tacotron model. Dimensions: ')
            log('    embedding:                %d' %
                char_embedded_inputs.shape[-1])
            if speaker_embed is not None:
                log('    speaker embedding:        %d' %
                    speaker_embed.shape[-1])
            else:
                log('    speaker embedding:        None')
            log('    prenet out:               %d' % prenet_outputs.shape[-1])
            log('    encoder out:              %d' % encoder_outputs.shape[-1])
            log('    attention out:            %d' %
                attention_cell.output_size)
            log('    concat attn & out:        %d' % concat_cell.output_size)
            log('    decoder cell out:         %d' % decoder_cell.output_size)
            log('    decoder out (%d frames):  %d' %
                (hp.reduction_factor, decoder_outputs.shape[-1]))
            log('    decoder out (1 frame):    %d' % mel_outputs.shape[-1])
            log('    postnet out:              %d' % post_outputs.shape[-1])
            log('    linear out:               %d' % linear_outputs.shape[-1])
Code Example #6
    def _build_forward(self):
        config = self.config
        N, M, JX, JQ, VW, VC, d, W = \
            config.batch_size, config.max_num_sents, config.max_sent_size, \
            config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
            config.max_word_size
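        # N: batch size, M: max #sentences per passage, JX: max sentence length,
        # JQ: max question length, VW / VC: word / char vocab sizes, d: hidden size,
        # W: max word length in characters.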
        print("VW:", VW, "N:", N, "M:", M, "JX:", JX, "JQ:", JQ)
        JA = config.max_answer_length
        JX = tf.shape(self.x)[2]
        JQ = tf.shape(self.q)[1]
        M = tf.shape(self.x)[1]
        print("VW:", VW, "N:", N, "M:", M, "JX:", JX, "JQ:", JQ)
        dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

        with tf.variable_scope("emb"):
            # Char-CNN Embedding
            if config.use_char_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    char_emb_mat = tf.get_variable("char_emb_mat",
                                                   shape=[VC, dc],
                                                   dtype='float')

                with tf.variable_scope("char"):
                    Acx = tf.nn.embedding_lookup(char_emb_mat,
                                                 self.cx)  # [N, M, JX, W, dc]
                    Acq = tf.nn.embedding_lookup(char_emb_mat,
                                                 self.cq)  # [N, JQ, W, dc]
                    Acx = tf.reshape(Acx, [-1, JX, W, dc])
                    Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                    filter_sizes = list(
                        map(int, config.out_channel_dims.split(',')))
                    heights = list(map(int, config.filter_heights.split(',')))
                    assert sum(filter_sizes) == dco, (filter_sizes, dco)
                    with tf.variable_scope("conv"):
                        xx = multi_conv1d(Acx,
                                          filter_sizes,
                                          heights,
                                          "VALID",
                                          self.is_train,
                                          config.keep_prob,
                                          scope="xx")
                        if config.share_cnn_weights:
                            tf.get_variable_scope().reuse_variables()
                            qq = multi_conv1d(Acq,
                                              filter_sizes,
                                              heights,
                                              "VALID",
                                              self.is_train,
                                              config.keep_prob,
                                              scope="xx")
                        else:
                            qq = multi_conv1d(Acq,
                                              filter_sizes,
                                              heights,
                                              "VALID",
                                              self.is_train,
                                              config.keep_prob,
                                              scope="qq")
                        xx = tf.reshape(xx, [-1, M, JX, dco])
                        qq = tf.reshape(qq, [-1, JQ, dco])

            # Word Embedding
            if config.use_word_emb:
                with tf.variable_scope("emb_var") as scope, tf.device(
                        "/cpu:0"):
                    if config.mode == 'train':
                        word_emb_mat = tf.get_variable(
                            "word_emb_mat",
                            dtype='float',
                            shape=[VW, dw],
                            initializer=get_initializer(config.emb_mat))
                    else:
                        word_emb_mat = tf.get_variable("word_emb_mat",
                                                       shape=[VW, dw],
                                                       dtype='float')
                    tf.get_variable_scope().reuse_variables()
                    self.word_emb_scope = scope
                    if config.use_glove_for_unk:
                        word_emb_mat = tf.concat(
                            [word_emb_mat, self.new_emb_mat], 0)

                with tf.name_scope("word"):
                    Ax = tf.nn.embedding_lookup(word_emb_mat,
                                                self.x)  # [N, M, JX, d]
                    Aq = tf.nn.embedding_lookup(word_emb_mat,
                                                self.q)  # [N, JQ, d]
                    self.tensor_dict['x'] = Ax
                    self.tensor_dict['q'] = Aq
                # Concat Char-CNN Embedding and Word Embedding
                if config.use_char_emb:
                    xx = tf.concat([xx, Ax], 3)  # [N, M, JX, di]
                    qq = tf.concat([qq, Aq], 2)  # [N, JQ, di]
                else:
                    xx = Ax
                    qq = Aq

            # exact match
            if config.use_exact_match:
                emx = tf.expand_dims(tf.cast(self.emx, tf.float32), -1)
                xx = tf.concat([xx, emx], 3)  # [N, M, JX, di+1]
                emq = tf.expand_dims(tf.cast(self.emq, tf.float32), -1)
                qq = tf.concat([qq, emq], 2)  # [N, JQ, di+1]

        # 2 layer highway network on Concat Embedding
        if config.highway:
            with tf.variable_scope("highway"):
                xx = highway_network(xx,
                                     config.highway_num_layers,
                                     True,
                                     wd=config.wd,
                                     is_train=self.is_train)
                tf.get_variable_scope().reuse_variables()
                qq = highway_network(qq,
                                     config.highway_num_layers,
                                     True,
                                     wd=config.wd,
                                     is_train=self.is_train)

        self.tensor_dict['xx'] = xx
        self.tensor_dict['qq'] = qq

        # Bidirectional LSTM (3rd layer in the paper)
        cell = GRUCell(d) if config.GRU else BasicLSTMCell(d,
                                                           state_is_tuple=True)
        d_cell = SwitchableDropoutWrapper(
            cell, self.is_train, input_keep_prob=config.input_keep_prob)
        x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
        q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]
        flat_x_len = flatten(x_len, 0)  # [N * M]

        with tf.variable_scope("prepro"):
            if config.use_fused_lstm:
                with tf.variable_scope("u1"):
                    fw_inputs = tf.transpose(
                        qq, [1, 0, 2])  #[time_len, batch_size, input_size]
                    bw_inputs = tf.reverse_sequence(fw_inputs,
                                                    q_len,
                                                    batch_dim=1,
                                                    seq_dim=0)
                    fw_inputs = tf.nn.dropout(fw_inputs,
                                              config.input_keep_prob)
                    bw_inputs = tf.nn.dropout(bw_inputs,
                                              config.input_keep_prob)
                    prep_fw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                    prep_bw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                    fw_outputs, fw_final = prep_fw_cell(fw_inputs,
                                                        dtype=tf.float32,
                                                        sequence_length=q_len,
                                                        scope="fw")
                    bw_outputs, bw_final = prep_bw_cell(bw_inputs,
                                                        dtype=tf.float32,
                                                        sequence_length=q_len,
                                                        scope="bw")
                    bw_outputs = tf.reverse_sequence(bw_outputs,
                                                     q_len,
                                                     batch_dim=1,
                                                     seq_dim=0)
                    current_inputs = tf.concat((fw_outputs, bw_outputs), 2)
                    output = tf.transpose(current_inputs, [1, 0, 2])
                    u = output
                flat_xx = flatten(xx, 2)  # [N * M, JX, d]
                if config.share_lstm_weights:
                    tf.get_variable_scope().reuse_variables()
                    with tf.variable_scope("u1"):
                        fw_inputs = tf.transpose(
                            flat_xx,
                            [1, 0, 2])  #[time_len, batch_size, input_size]
                        bw_inputs = tf.reverse_sequence(fw_inputs,
                                                        flat_x_len,
                                                        batch_dim=1,
                                                        seq_dim=0)
                        # fw_inputs = tf.nn.dropout(fw_inputs, config.input_keep_prob)
                        # bw_inputs = tf.nn.dropout(bw_inputs, config.input_keep_prob)
                        fw_outputs, fw_final = prep_fw_cell(
                            fw_inputs,
                            dtype=tf.float32,
                            sequence_length=flat_x_len,
                            scope="fw")
                        bw_outputs, bw_final = prep_bw_cell(
                            bw_inputs,
                            dtype=tf.float32,
                            sequence_length=flat_x_len,
                            scope="bw")
                        bw_outputs = tf.reverse_sequence(bw_outputs,
                                                         flat_x_len,
                                                         batch_dim=1,
                                                         seq_dim=0)
                        current_inputs = tf.concat((fw_outputs, bw_outputs), 2)
                        output = tf.transpose(current_inputs, [1, 0, 2])
                else:
                    with tf.variable_scope("h1"):
                        fw_inputs = tf.transpose(
                            flat_xx,
                            [1, 0, 2])  #[time_len, batch_size, input_size]
                        bw_inputs = tf.reverse_sequence(fw_inputs,
                                                        flat_x_len,
                                                        batch_dim=1,
                                                        seq_dim=0)
                        # fw_inputs = tf.nn.dropout(fw_inputs, config.input_keep_prob)
                        # bw_inputs = tf.nn.dropout(bw_inputs, config.input_keep_prob)
                        prep_fw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                        prep_bw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                        fw_outputs, fw_final = prep_fw_cell(
                            fw_inputs,
                            dtype=tf.float32,
                            sequence_length=flat_x_len,
                            scope="fw")
                        bw_outputs, bw_final = prep_bw_cell(
                            bw_inputs,
                            dtype=tf.float32,
                            sequence_length=flat_x_len,
                            scope="bw")
                        bw_outputs = tf.reverse_sequence(bw_outputs,
                                                         flat_x_len,
                                                         batch_dim=1,
                                                         seq_dim=0)
                        current_inputs = tf.concat((fw_outputs, bw_outputs), 2)
                        output = tf.transpose(current_inputs, [1, 0, 2])
                h = tf.expand_dims(output, 1)  # [N, M, JX, 2d]
            else:
                (fw_u, bw_u), _ = bidirectional_dynamic_rnn(
                    d_cell, d_cell, qq, q_len, dtype='float',
                    scope='u1')  # [N, J, d], [N, d]
                u = tf.concat([fw_u, bw_u], 2)
                if config.share_lstm_weights:
                    tf.get_variable_scope().reuse_variables()
                    (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                        cell, cell, xx, x_len, dtype='float',
                        scope='u1')  # [N, M, JX, 2d]
                    h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
                else:
                    (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                        cell, cell, xx, x_len, dtype='float',
                        scope='h1')  # [N, M, JX, 2d]
                    h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
            self.tensor_dict['u'] = u
            self.tensor_dict['h'] = h

        # Attention Flow Layer (4th layer on paper)
        with tf.variable_scope("main"):
            if config.dynamic_att:
                p0 = h
                u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                               [N * M, JQ, 2 * d])
                q_mask = tf.reshape(
                    tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                    [N * M, JQ])
                first_cell = AttentionCell(
                    cell,
                    u,
                    size=d,
                    mask=q_mask,
                    mapper='sim',
                    input_keep_prob=self.config.input_keep_prob,
                    is_train=self.is_train)
            else:
                p0 = attention_layer(config,
                                     self.is_train,
                                     h,
                                     u,
                                     h_mask=self.x_mask,
                                     u_mask=self.q_mask,
                                     scope="p0",
                                     tensor_dict=self.tensor_dict)
                first_cell = d_cell
            tp0 = p0

        # Modeling layer (5th layer on paper)
        with tf.variable_scope('modeling_layer'):
            if config.use_fused_lstm:
                g1, encoder_state_final = build_fused_bidirectional_rnn(
                    inputs=p0,
                    num_units=config.hidden_size,
                    num_layers=config.num_modeling_layers,
                    inputs_length=flat_x_len,
                    input_keep_prob=config.input_keep_prob,
                    scope='modeling_layer_g')

            else:
                for layer_idx in range(config.num_modeling_layers - 1):
                    (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
                        first_cell,
                        first_cell,
                        p0,
                        x_len,
                        dtype='float',
                        scope="g_{}".format(layer_idx))  # [N, M, JX, 2d]
                    p0 = tf.concat([fw_g0, bw_g0], 3)
                (fw_g1, bw_g1), (fw_s_f, bw_s_f) = bidirectional_dynamic_rnn(
                    first_cell,
                    first_cell,
                    p0,
                    x_len,
                    dtype='float',
                    scope='g1')  # [N, M, JX, 2d]
                g1 = tf.concat([fw_g1, bw_g1], 3)  # [N, M, JX, 2d]

        # Self match layer
        if config.use_self_match:
            s0 = tf.reshape(g1, [N * M, JX, 2 * d])  # [N * M, JX, 2d]
            x_mask = tf.reshape(self.x_mask, [N * M, JX])  # [N * M, JX]
            if config.use_static_self_match:
                with tf.variable_scope(
                        "StaticSelfMatch"
                ):  # implemented following R-Net, section 3.3
                    W_x_Vj = tf.contrib.layers.fully_connected(  # [N * M, JX, d]
                        s0,
                        int(d / 2),
                        scope='row_first',
                        activation_fn=None,
                        biases_initializer=None)
                    W_x_Vt = tf.contrib.layers.fully_connected(  # [N * M, JX, d]
                        s0,
                        int(d / 2),
                        scope='col_first',
                        activation_fn=None,
                        biases_initializer=None)
                    sum_rc = tf.add(  # [N * M, JX, JX, d]
                        tf.expand_dims(W_x_Vj, 1), tf.expand_dims(W_x_Vt, 2))
                    v = tf.get_variable('second',
                                        shape=[1, 1, 1, int(d / 2)],
                                        dtype=tf.float32)
                    Sj = tf.reduce_sum(tf.multiply(v, tf.tanh(sum_rc)),
                                       -1)  # [N * M, JX, JX]
                    Ai = softmax(Sj, mask=tf.expand_dims(x_mask,
                                                         1))  # [N * M, JX, JX]
                    Ai = tf.expand_dims(Ai, -1)  # [N * M, JX, JX, 1]
                    Vi = tf.expand_dims(s0, 1)  # [N * M, 1, JX, 2d]
                    Ct = tf.reduce_sum(  # [N * M, JX, 2d]
                        tf.multiply(Ai, Vi), axis=2)
                    inputs_Vt_Ct = tf.concat([s0, Ct], 2)  # [N * M, JX, 4d]
                    if config.use_fused_lstm:
                        fw_inputs = tf.transpose(
                            inputs_Vt_Ct,
                            [1, 0, 2])  # [time_len, batch_size, input_size]
                        bw_inputs = tf.reverse_sequence(fw_inputs,
                                                        flat_x_len,
                                                        batch_dim=1,
                                                        seq_dim=0)
                        fw_inputs = tf.nn.dropout(fw_inputs,
                                                  config.input_keep_prob)
                        bw_inputs = tf.nn.dropout(bw_inputs,
                                                  config.input_keep_prob)
                        prep_fw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                        prep_bw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                        fw_outputs, fw_s_f = prep_fw_cell(
                            fw_inputs,
                            dtype=tf.float32,
                            sequence_length=flat_x_len,
                            scope="fw")
                        bw_outputs, bw_s_f = prep_bw_cell(
                            bw_inputs,
                            dtype=tf.float32,
                            sequence_length=flat_x_len,
                            scope="bw")
                        fw_s_f = LSTMStateTuple(c=fw_s_f[0], h=fw_s_f[1])
                        bw_s_f = LSTMStateTuple(c=bw_s_f[0], h=bw_s_f[1])
                        bw_outputs = tf.reverse_sequence(bw_outputs,
                                                         flat_x_len,
                                                         batch_dim=1,
                                                         seq_dim=0)
                        current_inputs = tf.concat((fw_outputs, bw_outputs), 2)
                        s1 = tf.transpose(current_inputs, [1, 0, 2])
                    else:
                        (fw_s, bw_s), (fw_s_f,
                                       bw_s_f) = bidirectional_dynamic_rnn(
                                           first_cell,
                                           first_cell,
                                           inputs_Vt_Ct,
                                           flat_x_len,
                                           dtype='float',
                                           scope='s')  # [N, M, JX, 2d]
                        s1 = tf.concat([fw_s, bw_s],
                                       2)  # [N * M, JX, 2d], M == 1
            else:
                with tf.variable_scope("DynamicSelfMatch"):
                    first_cell = AttentionCell(cell,
                                               s0,
                                               size=d,
                                               mask=x_mask,
                                               is_train=self.is_train)
                    (fw_s, bw_s), (fw_s_f, bw_s_f) = bidirectional_dynamic_rnn(
                        first_cell,
                        first_cell,
                        s0,
                        x_len,
                        dtype='float',
                        scope='s')  # [N, M, JX, 2d]
                    s1 = tf.concat([fw_s, bw_s], 2)  # [N * M, JX, 2d], M == 1
            g1 = tf.expand_dims(s1, 1)  # [N, M, JX, 2d]

        # prepare for PtrNet
        encoder_output = g1  # [N, M, JX, 2d]
        encoder_output = tf.expand_dims(tf.cast(self.x_mask, tf.float32),
                                        -1) * encoder_output  # [N, M, JX, 2d]

        if config.use_self_match or not config.use_fused_lstm:
            if config.GRU:
                encoder_state_final = tf.concat((fw_s_f, bw_s_f),
                                                1,
                                                name='encoder_concat')
            else:
                if isinstance(fw_s_f, LSTMStateTuple):
                    encoder_state_c = tf.concat((fw_s_f.c, bw_s_f.c),
                                                1,
                                                name='encoder_concat_c')
                    encoder_state_h = tf.concat((fw_s_f.h, bw_s_f.h),
                                                1,
                                                name='encoder_concat_h')
                    encoder_state_final = LSTMStateTuple(c=encoder_state_c,
                                                         h=encoder_state_h)
                elif isinstance(fw_s_f, tf.Tensor):
                    encoder_state_final = tf.concat((fw_s_f, bw_s_f),
                                                    1,
                                                    name='encoder_concat')
                else:
                    encoder_state_final = None
                    tf.logging.error("encoder_state_final not set")

        print("encoder_state_final:", encoder_state_final)

        with tf.variable_scope("output"):
            # eos_symbol = config.eos_symbol
            # next_symbol = config.next_symbol

            tf.assert_equal(
                M,
                1)  # currently dynamic M is not supported, thus we assume M==1
            answer_string = tf.placeholder(
                shape=(N, 1, JA + 1), dtype=tf.int32,
                name='answer_string')  # [N, M, JA + 1]
            answer_string_mask = tf.placeholder(
                shape=(N, 1, JA + 1), dtype=tf.bool,
                name='answer_string_mask')  # [N, M, JA + 1]
            answer_string_length = tf.placeholder(
                shape=(N, 1),
                dtype=tf.int32,
                name='answer_string_length',
            )  # [N, M]
            self.tensor_dict['answer_string'] = answer_string
            self.tensor_dict['answer_string_mask'] = answer_string_mask
            self.tensor_dict['answer_string_length'] = answer_string_length
            self.answer_string = answer_string
            self.answer_string_mask = answer_string_mask
            self.answer_string_length = answer_string_length

            answer_string_flattened = tf.reshape(answer_string,
                                                 [N * M, JA + 1])
            self.answer_string_flattened = answer_string_flattened  # [N * M, JA+1]
            print("answer_string_flattened:", answer_string_flattened)

            answer_string_length_flattened = tf.reshape(
                answer_string_length, [N * M])
            self.answer_string_length_flattened = answer_string_length_flattened  # [N * M]
            print("answer_string_length_flattened:",
                  answer_string_length_flattened)

            decoder_cell = GRUCell(2 * d) if config.GRU else BasicLSTMCell(
                2 * d, state_is_tuple=True)

            with tf.variable_scope("Decoder"):
                decoder_train_logits = ptr_decoder(
                    decoder_cell,
                    tf.reshape(tp0, [N * M, JX, 2 * d]),  # [N * M, JX, 2d]
                    tf.reshape(encoder_output,
                               [N * M, JX, 2 * d]),  # [N * M, JX, 2d]
                    flat_x_len,
                    encoder_final_state=encoder_state_final,
                    max_encoder_length=config.sent_size_th,
                    decoder_output_length=
                    answer_string_length_flattened,  # [N * M]
                    batch_size=N,  # N * M (M=1)
                    attention_proj_dim=self.config.decoder_proj_dim,
                    scope='ptr_decoder'
                )  # [batch_size, dec_len*, enc_seq_len + 1]

                self.decoder_train_logits = decoder_train_logits
                print("decoder_train_logits:", decoder_train_logits)
                self.decoder_train_softmax = tf.nn.softmax(
                    self.decoder_train_logits)
                self.decoder_inference = tf.argmax(
                    decoder_train_logits, axis=2,
                    name='decoder_inference')  # [N, JA + 1]

            self.yp = tf.ones([N, M, JX], dtype=tf.int32) * -1
            self.yp2 = tf.ones([N, M, JX], dtype=tf.int32) * -1
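The snippet above only builds `decoder_train_logits`; below is a minimal sketch (an illustrative assumption, not the repository's actual loss code) of how those logits and the `answer_string` placeholders could be combined into a masked pointer-decoder training loss.

import tensorflow as tf  # TF 1.x

def ptr_decoder_loss(decoder_train_logits, answer_string, answer_string_mask, N, M, JA):
    # decoder_train_logits: [N * M, dec_len, enc_seq_len + 1]
    targets = tf.reshape(answer_string, [N * M, JA + 1])                      # [N*M, JA+1]
    mask = tf.cast(tf.reshape(answer_string_mask, [N * M, JA + 1]), tf.float32)
    # the decoder may unroll fewer than JA + 1 steps, so trim targets/mask to match
    dec_len = tf.shape(decoder_train_logits)[1]
    targets = targets[:, :dec_len]
    mask = mask[:, :dec_len]
    xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=targets, logits=decoder_train_logits)                          # [N*M, dec_len]
    return tf.reduce_sum(xent * mask) / tf.maximum(tf.reduce_sum(mask), 1.0)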
Code example #7
0
File: modules.py Project: MLCogUP/nspeech
def cbhg(inputs,
         input_lengths,
         activation=tf.nn.relu,
         speaker_embd=None,
         is_training=True,
         K=16,
         c=(128, 128),
         gru_units=128,
         num_highways=4,
         scope="cbhg"):
    with tf.variable_scope(scope):
        conv_bank = conv1d_banks(inputs,
                                 K=K,
                                 activation=activation,
                                 is_training=is_training)  # (N, T_x, K*E/2)

        # Maxpooling:
        conv_proj = tf.layers.max_pooling1d(conv_bank,
                                            pool_size=2,
                                            strides=1,
                                            padding='same')

        # Projection layers:
        for i, layer_size in enumerate(c[:-1]):
            conv_proj = conv1d(conv_proj, 3, layer_size, activation,
                               is_training, 'proj_{}'.format(i + 1))
        conv_proj = conv1d(conv_proj, 3, c[-1], None, is_training,
                           'proj_{}'.format(len(c)))

        # Residual connection:
        highway_input = conv_proj + inputs

        # Handle dimensionality mismatch:
        if highway_input.shape[2] != 128:
            highway_input = tf.layers.dense(highway_input, 128)

        # 4-layer HighwayNet:
        h = highway_input
        for i in range(num_highways):
            with tf.variable_scope('highway_' + str(i)):
                # site specific speaker embedding
                if speaker_embd is not None:
                    s = tf.layers.dense(speaker_embd,
                                        h.shape[-1],
                                        activation=tf.nn.softsign)
                    s = tf.tile(tf.expand_dims(s, 1), [1, tf.shape(h)[1], 1])
                    h = tf.concat([h, s], -1)
                h = highwaynet(h)

        # site specific speaker embedding
        if speaker_embd is not None:
            # TODO: what about two different s1, s2 for forwards and backwards
            s = tf.layers.dense(speaker_embd,
                                gru_units,
                                activation=tf.nn.softsign)
        else:
            s = None

        # Bidirectional RNN
        outputs, states = tf.nn.bidirectional_dynamic_rnn(
            GRUCell(gru_units),
            GRUCell(gru_units),
            h,
            initial_state_fw=s,
            initial_state_bw=s,
            sequence_length=input_lengths,
            dtype=tf.float32)
        encoded = tf.concat(outputs, axis=2)  # Concat forward and backward

        return encoded
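A hypothetical usage sketch for the `cbhg` module above; the placeholder names and sizes are illustrative only, and `conv1d_banks`, `conv1d` and `highwaynet` are assumed to come from the same modules.py.

import tensorflow as tf  # TF 1.x

prenet_out = tf.placeholder(tf.float32, [None, None, 128], name='prenet_out')   # (N, T_x, E/2)
input_lengths = tf.placeholder(tf.int32, [None], name='input_lengths')          # (N,)
speaker_embd = tf.placeholder(tf.float32, [None, 64], name='speaker_embd')      # optional

encoded = cbhg(prenet_out,
               input_lengths,
               speaker_embd=speaker_embd,
               is_training=True,
               K=16,
               c=(128, 128),
               gru_units=128,
               scope='encoder_cbhg')   # (N, T_x, 2 * gru_units)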
Code example #8
0
    def __init__(self,
                 num_items,
                 num_embed_units,
                 num_units,
                 num_layers,
                 embed=None,
                 learning_rate=1e-4,
                 action_num=10,
                 learning_rate_decay_factor=0.95,
                 max_gradient_norm=5.0,
                 use_lstm=True):

        self.epoch = tf.Variable(0, trainable=False, name='agn/epoch')
        self.epoch_add_op = self.epoch.assign(self.epoch + 1)

        self.sessions_input = tf.placeholder(tf.int32, shape=(None, None))
        self.rec_lists = tf.placeholder(tf.int32, shape=(None, None, None))
        self.rec_mask = tf.placeholder(tf.float32, shape=(None, None, None))
        self.aims_idx = tf.placeholder(tf.int32, shape=(None, None))
        self.sessions_length = tf.placeholder(tf.int32, shape=(None,))
        self.reward = tf.placeholder(tf.float32, shape=(None,))

        if embed is None:
            self.embed = tf.get_variable(
                'agn/embed', [num_items, num_embed_units],
                tf.float32,
                initializer=tf.truncated_normal_initializer(0, 1))
        else:
            self.embed = tf.get_variable('agn/embed',
                                         dtype=tf.float32,
                                         initializer=embed)

        batch_size, encoder_length, rec_length = tf.shape(
            self.sessions_input)[0], tf.shape(
                self.sessions_input)[1], tf.shape(self.rec_lists)[2]

        encoder_mask = tf.reshape(
            tf.cumsum(tf.one_hot(self.sessions_length - 2, encoder_length),
                      reverse=True,
                      axis=1), [-1, encoder_length])
        # [batch_size, length]
        self.sessions_target = tf.concat([
            self.sessions_input[:, 1:],
            tf.ones([batch_size, 1], dtype=tf.int32) * PAD_ID
        ], 1)
        # [batch_size, length, embed_units]
        self.encoder_input = tf.nn.embedding_lookup(self.embed,
                                                    self.sessions_input)
        # [batch_size, length, rec_length]
        self.aims = tf.one_hot(self.aims_idx, rec_length)

        if use_lstm:
            cell = MultiRNNCell(
                [LSTMCell(num_units) for _ in range(num_layers)])
        else:
            cell = MultiRNNCell(
                [GRUCell(num_units) for _ in range(num_layers)])

        # Training
        with tf.variable_scope("agn"):
            output_fn, sampled_sequence_loss = output_projection_layer(
                num_units, num_items)
            self.encoder_output, self.encoder_state = dynamic_rnn(
                cell,
                self.encoder_input,
                self.sessions_length,
                dtype=tf.float32,
                scope="encoder")

            tmp_dim_1 = tf.tile(
                tf.reshape(tf.range(batch_size), [batch_size, 1, 1, 1]),
                [1, encoder_length, rec_length, 1])
            tmp_dim_2 = tf.tile(
                tf.reshape(tf.range(encoder_length),
                           [1, encoder_length, 1, 1]),
                [batch_size, 1, rec_length, 1])
            # [batch_size, length, rec_length, 3]
            gather_idx = tf.concat(
                [tmp_dim_1, tmp_dim_2,
                 tf.expand_dims(self.rec_lists, 3)], 3)

            # [batch_size, length, num_items], [batch_size*length]
            y_prob, local_loss, total_size = sampled_sequence_loss(
                self.encoder_output, self.sessions_target, encoder_mask)

            # Compute recommendation rank given rec_list
            # [batch_size, length, num_items]
            y_prob = tf.reshape(y_prob, [batch_size, encoder_length, num_items]) * \
                tf.concat([tf.zeros([batch_size, encoder_length, 2], dtype=tf.float32),
                            tf.ones([batch_size, encoder_length, num_items-2], dtype=tf.float32)], 2)
            # [batch_size, length, rec_len]
            ini_prob = tf.reshape(tf.gather_nd(y_prob, gather_idx),
                                  [batch_size, encoder_length, rec_length])
            # [batch_size, length, rec_len]
            mul_prob = ini_prob * self.rec_mask

            # [batch_size, length, action_num]
            _, self.index = tf.nn.top_k(mul_prob, k=action_num)
            # [batch_size, length, metric_num]
            _, self.metric_index = tf.nn.top_k(mul_prob, k=(FLAGS.metric + 1))

            self.loss = tf.reduce_sum(
                tf.reshape(self.reward, [-1]) * local_loss) / total_size

        # Inference
        with tf.variable_scope("agn", reuse=True):
            # tf.get_variable_scope().reuse_variables()
            self.lstm_state = tf.placeholder(tf.float32,
                                             shape=(2, 2, None, num_units))
            self.ini_state = (tf.contrib.rnn.LSTMStateTuple(
                self.lstm_state[0, 0, :, :], self.lstm_state[0, 1, :, :]),
                              tf.contrib.rnn.LSTMStateTuple(
                                  self.lstm_state[1, 0, :, :],
                                  self.lstm_state[1, 1, :, :]))
            # [batch_size, length, num_units]
            self.encoder_output_predict, self.encoder_state_predict = dynamic_rnn(
                cell,
                self.encoder_input,
                self.sessions_length,
                initial_state=self.ini_state,
                dtype=tf.float32,
                scope="encoder")

            # [batch_size, num_units]
            self.final_output_predict = tf.reshape(
                self.encoder_output_predict[:, -1, :], [-1, num_units])
            # [batch_size, num_items]
            self.rec_logits = output_fn(self.final_output_predict)
            # [batch_size, action_num]
            _, self.rec_index = tf.nn.top_k(
                self.rec_logits[:, len(_START_VOCAB):], action_num)
            self.rec_index += len(_START_VOCAB)

            def gumbel_max(inp, alpha, beta):
                # assert len(tf.shape(inp)) == 2
                g = tf.random_uniform(tf.shape(inp), 0.0001, 0.9999)
                g = -tf.log(-tf.log(g))
                inp_g = tf.nn.softmax(
                    (tf.nn.log_softmax(inp / 1.0) + g * alpha) * beta)
                return inp_g

            # [batch_size, action_num]
            _, self.random_rec_index = tf.nn.top_k(
                gumbel_max(self.rec_logits[:, len(_START_VOCAB):], 1, 1),
                action_num)
            self.random_rec_index += len(_START_VOCAB)

        # initialize the training process
        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False,
                                         dtype=tf.float32)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)

        self.global_step = tf.Variable(0, trainable=False)
        self.params = tf.trainable_variables()
        gradients = tf.gradients(self.loss, self.params)
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
            gradients, max_gradient_norm)
        self.update = tf.train.AdamOptimizer(
            self.learning_rate).apply_gradients(zip(clipped_gradients,
                                                    self.params),
                                                global_step=self.global_step)

        self.saver = tf.train.Saver(tf.global_variables(),
                                    write_version=tf.train.SaverDef.V2,
                                    max_to_keep=100,
                                    pad_step_number=True,
                                    keep_checkpoint_every_n_hours=1.0)
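Assuming the model above is built with num_layers=2 LSTM layers (which is what the fixed [2, 2, None, num_units] shape of `lstm_state` implies) and is available as `model` inside an active session `sess`, inference over one user session could look roughly like the following sketch (illustrative only, names invented).

import numpy as np

num_units = 64                                               # must match the built graph
sessions = np.array([[3, 7, 12]], dtype=np.int32)            # one user session of 3 items
zero_state = np.zeros((2, 2, sessions.shape[0], num_units), dtype=np.float32)

rec_index, next_state = sess.run(
    [model.rec_index, model.encoder_state_predict],
    feed_dict={model.sessions_input: sessions,
               model.sessions_length: [sessions.shape[1]],
               model.lstm_state: zero_state})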
Code example #9
0
    def initialize(self, txt_targets_A, txt_lenth_A, txt_targets_B, txt_lenth_B, mel_targets, image_targets):
        #with tf.variable_scope('inference') as scope:
        is_training = mel_targets is not None
        #is_teacher_force_generating = mel_targets is not None
        batch_size = tf.shape(mel_targets)[0]
        hp = self._hparams
        
        # Embeddings for text
        embedding_table = tf.get_variable(
          'text_embedding', [len(symbols), hp.embed_depth], dtype=tf.float32,
          initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_txt_inputs_A = tf.nn.embedding_lookup(embedding_table, txt_targets_A)            #[N, T_in, 128]
        embedded_txt_inputs_B = tf.nn.embedding_lookup(embedding_table, txt_targets_B)
        
        
    #------------------------ Encoder Scope----------------------------------------------
        # 'e space': outputs from Modality Encoders

        # Text Encoder
        with tf.variable_scope('E_text', reuse = tf.AUTO_REUSE) as scope: 
            prenet_outputs_A = prenet(embedded_txt_inputs_A, is_training)                       # [N, T_in, 128]
            prenet_outputs_B = prenet(embedded_txt_inputs_B, is_training)                       # [N, T_in, 128]
        
            cbhg_outputs_A = encoder_cbhg(prenet_outputs_A, txt_lenth_A, is_training)
            cbhg_outputs_B = encoder_cbhg(prenet_outputs_B, txt_lenth_B, is_training)
            
            txt_encoder_outputs_A = text_encoder(cbhg_outputs_A, is_training)
            txt_encoder_outputs_B = text_encoder(cbhg_outputs_B, is_training)

            self.e_txt_A = txt_encoder_outputs_A
            self.e_txt_B = txt_encoder_outputs_B

        # Speech Encoder 
        with tf.variable_scope('E_speech', reuse = tf.AUTO_REUSE) as scope:
            speech_outputs = reference_encoder(
                mel_targets, 
                filters=hp.reference_filters, 
                kernel_size=(3,3),
                strides=(2,2),
                encoder_cell=GRUCell(hp.reference_depth),
                is_training=is_training)                                                 # [N, 256]
            self.e_speech = speech_outputs                                       

        # Image Encoder
        with tf.variable_scope('E_image', reuse = tf.AUTO_REUSE) as scope:
            img_outputs = image_encoder( 
                is_training=is_training,
                norm='batch',
                image_size = 128)
            self.e_img = img_outputs  
    
     #-------------------------Universal Computing Body------------------------------------

        # Modality Transformer T
        with tf.variable_scope('T', reuse = tf.AUTO_REUSE) as scope:
            # 'z space': output from Modality Transformer
            self.z_img = modality_transformer(self.e_img, is_training = is_training)
            self.z_txt_A = modality_transformer(self.e_txt_A, is_training = is_training)
            self.z_txt_B = modality_transformer(self.e_txt_B, is_training = is_training)
            self.z_speech = modality_transformer(self.e_speech, is_training = is_training)

        # Modality Classifier C
        with tf.variable_scope('C', reuse = tf.AUTO_REUSE) as scope:
            self.c_logit_img = modality_classifier(self.z_img, is_training = is_training)
            c_logit_txt_A = modality_classifier(self.z_txt_A, is_training = is_training)
            c_logit_txt_B = modality_classifier(self.z_txt_B, is_training = is_training)
            self.c_logit_txt = c_logit_txt_A + c_logit_txt_B
            self.c_logit_speech = modality_classifier(self.z_speech, is_training =is_training)
             
        # Memory Fusion Module M
        with tf.variable_scope('M', reuse = tf.AUTO_REUSE) as scope:
            # Global tokens
            tokens = tf.get_variable(
            'global_tokens', [hp.num_gst, hp.style_embed_depth // hp.num_heads], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
            self.tokens = tokens
                
          # Multi-head Attention
            attention_img = MultiheadAttention(
            tf.expand_dims(self.z_img,axis=1),                                   # [N, 1, 256]
            tf.tanh(tf.tile(tf.expand_dims(tokens, axis=0), [batch_size,1,1])),            # [N, hp.num_gst, 256/hp.num_heads]   
            num_heads=hp.num_heads,
            num_units=hp.style_att_dim,
            attention_type=hp.style_att_type)

            attention_speech = MultiheadAttention(
            tf.expand_dims(self.z_speech,axis=1),                                   # [N, 1, 256]
            tf.tanh(tf.tile(tf.expand_dims(tokens, axis=0), [batch_size,1,1])),            # [N, hp.num_gst, 256/hp.num_heads]   
            num_heads=hp.num_heads,
            num_units=hp.style_att_dim,
            attention_type=hp.style_att_type)

            attention_txt_A = MultiheadAttention(
            tf.expand_dims(self.z_txt_A, axis=1),                                
            tf.tanh(tf.tile(tf.expand_dims(tokens, axis=0), [batch_size,1,1])),            # [N, hp.num_gst, 256/hp.num_heads]   
            num_heads=hp.num_heads,
            num_units=hp.style_att_dim,
            attention_type=hp.style_att_type)

            attention_txt_B = MultiheadAttention(
            tf.expand_dims(self.z_txt_B, axis=1),                   
            tf.tanh(tf.tile(tf.expand_dims(tokens, axis=0), [batch_size,1,1])),            # [N, hp.num_gst, 256/hp.num_heads]   
            num_heads=hp.num_heads,
            num_units=hp.style_att_dim,
            attention_type=hp.style_att_type)

            output_img = attention_img.multi_head_attention()                   # [N, 1, 256]
            output_txt_A = attention_txt_A.multi_head_attention()
            output_txt_B = attention_txt_B.multi_head_attention()
            output_speech = attention_speech.multi_head_attention()

            # 'u space': output from the Memory Fusion Module
            self.u_img = output_img
            self.u_speech = output_speech
            self.u_txt_A = output_txt_A
            self.u_txt_B = output_txt_B   

        
        #---------------Decoder Scope---------------------------------------------------------
           
        # Image Decoder scope
        with tf.variable_scope('D_img') as scope:
            fake_img = image_decoder(self.u_img, is_train=is_training)
            self.fake_img = fake_img

   
        # Speech Decoder scope
        with tf.variable_scope('D_speech') as scope: 
            # Attention
            attention_cell = AttentionWrapper(
              GRUCell(hp.attention_depth),
              BahdanauAttention(hp.attention_depth, self.u_speech),  # the fused memory is a single token, so memory_sequence_length is omitted
              alignment_history=True,
              output_attention=False)                                                  # [N, T_in, 256]

            # Concatenate attention context vector and RNN cell output.
            concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)              

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell([
                OutputProjectionWrapper(concat_cell, hp.rnn_depth),
                ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1)),
                ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1))
              ], state_is_tuple=True)                                                  # [N, T_in, 256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

            if is_training:
              helper = TacoTrainingHelper(inputs, mel_targets, hp)
            
            (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
              BasicDecoder(output_cell, helper, decoder_init_state),
              maximum_iterations=hp.max_iters)                                        # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            fake_mel = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels]) # [N, T_out, M]
            self.fake_mel = fake_mel

          
        self.txt_targets_A = txt_targets_A
        self.txt_lenth_B = txt_lenth_B
        self.mel_targets = mel_targets
        self.image_targets = image_targets
Code example #10
0
File: tacotron.py Project: qingyundou/tacotron_qdou
    def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None, pml_targets=None,
                   gta=False, locked_alignments=None):
        '''Initializes the model for inference.

        Sets "mel_outputs", "linear_outputs", and "alignments" fields.

        Args:
          inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
          linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
            of steps in the output time series, F is num_freq, and values are entries in the linear
            spectrogram. Only needed for training.
          pml_targets: float32 Tensor with shape [N, T_out, P] where N is batch_size, T_out is number of
            steps in the PML vocoder features trajectories, P is pml_dimension, and values are PML vocoder
            features. Only needed for training.
          gta: boolean flag that is set to True when ground truth alignment is required
          locked_alignments: when explicit attention alignment is required, the locked alignments are passed in this
            parameter and the attention alignments are locked to these values
        '''
        # fix the alignments shape to (batch_size, encoder_steps, decoder_steps) if not already including
        # batch dimension
        locked_alignments_ = locked_alignments

        if locked_alignments_ is not None:
            if np.ndim(locked_alignments_) < 3:
                locked_alignments_ = np.expand_dims(locked_alignments_, 0)

        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embed_depth], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)  # [N, T_in, embed_depth=256]

            # Encoder
            prenet_outputs = prenet(embedded_inputs, is_training, hp.prenet_depths)  # [N, T_in, prenet_depths[-1]=128]
            encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training,  # [N, T_in, encoder_depth=256]
                                           hp.encoder_depth)

            # Attention
            attention_mechanism = BahdanauAttention(hp.attention_depth, encoder_outputs)

            attention_cell = LockableAttentionWrapper(
                GRUCell(hp.attention_depth),
                attention_mechanism,
                alignment_history=True,
                locked_alignments=locked_alignments_,
                output_attention=False,
                name='attention_wrapper')  # [N, T_in, attention_depth=256]

            # Apply prenet before concatenation in AttentionWrapper.
            prenet_cell = DecoderPrenetWrapper(attention_cell, is_training, hp.prenet_depths)

            # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(prenet_cell)  # [N, T_in, 2*attention_depth=512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell([
                OutputProjectionWrapper(concat_cell, hp.decoder_depth),
                ResidualWrapper(GRUCell(hp.decoder_depth)),
                ResidualWrapper(GRUCell(hp.decoder_depth))
            ], state_is_tuple=True)  # [N, T_in, decoder_depth=256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

            (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                BasicDecoder(output_cell, helper, decoder_init_state),
                maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing CBHG:
            post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training,  # [N, T_out, postnet_depth=256]
                                     hp.postnet_depth)
            linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            #   original shape is: (decoder_steps, time_steps, encoder_steps)
            #   end shape is: (time_steps, encoder_steps, decoder_steps)
            alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.attention_mechanism = attention_mechanism
            self.attention_cell = attention_cell
            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:               %d' % embedded_inputs.shape[-1])
            log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames):  %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
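A hypothetical call sketch for `initialize` above, assuming `model` is an instance of the repository's Tacotron class and `hp` its hyperparameter object; the placeholder shapes follow the docstring, everything else is illustrative.

import tensorflow as tf  # TF 1.x

inputs = tf.placeholder(tf.int32, [None, None], name='inputs')               # [N, T_in]
input_lengths = tf.placeholder(tf.int32, [None], name='input_lengths')       # [N]
mel_targets = tf.placeholder(tf.float32, [None, None, hp.num_mels])          # [N, T_out, M]
linear_targets = tf.placeholder(tf.float32, [None, None, hp.num_freq])       # [N, T_out, F]

model.initialize(inputs, input_lengths,
                 mel_targets=mel_targets,
                 linear_targets=linear_targets)   # is_training is inferred from linear_targets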
Code example #11
0
if __name__ == '__main__':
  try:
    from tensorflow.contrib.rnn import LSTMCell, LSTMStateTuple, GRUCell
  except ImportError:
    LSTMCell = tf.nn.rnn_cell.LSTMCell
    LSTMStateTuple = tf.nn.rnn_cell.LSTMStateTuple
    GRUCell = tf.nn.rnn_cell.GRUCell

  tf.reset_default_graph()
  with tf.Session() as session:
    model = HANClassifierModel(
      vocab_size=10,
      embedding_size=5,
      classes=2,
      fw_word_cell=GRUCell(10),
      bw_word_cell=GRUCell(10),
      fw_sentence_cell=GRUCell(10),
      bw_sentence_cell=GRUCell(10),
      word_output_size=10,
      sentence_output_size=10,
      max_grad_norm=5.0,
      dropout_keep_proba=0.5,
    )
    session.run(tf.global_variables_initializer())

    fd = {
      model.is_training: False,
      model.inputs: [[
        [5, 4, 1, 0],
        [3, 3, 6, 7],
Code example #12
0
X_test = zero_pad(X_test, SEQUENCE_LENGTH)

# Different placeholders
batch_ph = tf.placeholder(tf.int32, [None, SEQUENCE_LENGTH])
target_ph = tf.placeholder(tf.float32, [None])
seq_len_ph = tf.placeholder(tf.int32, [None])
keep_prob_ph = tf.placeholder(tf.float32)

# Embedding layer
embeddings_var = tf.Variable(tf.random_uniform(
    [vocabulary_size, EMBEDDING_DIM], -1.0, 1.0),
                             trainable=True)
batch_embedded = tf.nn.embedding_lookup(embeddings_var, batch_ph)

# (Bi-)RNN layer(-s)
rnn_outputs, _ = tf.nn.bidirectional_dynamic_rnn(GRUCell(HIDDEN_SIZE),
                                                 GRUCell(HIDDEN_SIZE),
                                                 inputs=batch_embedded,
                                                 sequence_length=seq_len_ph,
                                                 dtype=tf.float32)

# Attention layer
attention_output, alphas = attention(rnn_outputs,
                                     ATTENTION_SIZE,
                                     return_alphas=True)

# Dropout
drop = tf.nn.dropout(attention_output, keep_prob_ph)

# Fully connected layer
W = tf.Variable(tf.truncated_normal(
Code example #13
0
    def presentation_transformer(self, inputs, inputs_actual_length):
        with tf.variable_scope('presentation_layer', reuse=tf.AUTO_REUSE):
            with tf.name_scope('structure_presentation_layer'):
                # forward direction
                fw_cell = GRUCell(num_units=self.hidden_num)
                fw_drop_cell = DropoutWrapper(fw_cell,
                                              output_keep_prob=self.dropout)
                # backward direction
                bw_cell = GRUCell(num_units=self.hidden_num)
                bw_drop_cell = DropoutWrapper(bw_cell,
                                              output_keep_prob=self.dropout)

                # bidirectional_dynamic_rnn takes a 3-D tensor [batch_size, n_steps, n_input]; the output is a tuple whose two elements have the same shape
                if self.is_train and not self.is_extract:
                    output, _ = tf.nn.bidirectional_dynamic_rnn(
                        cell_fw=fw_drop_cell,
                        cell_bw=bw_drop_cell,
                        inputs=inputs,
                        sequence_length=inputs_actual_length,
                        dtype=tf.float32)
                else:
                    output, _ = tf.nn.bidirectional_dynamic_rnn(
                        cell_fw=fw_cell,
                        cell_bw=bw_cell,
                        inputs=inputs,
                        sequence_length=inputs_actual_length,
                        dtype=tf.float32)

                # the output tuple has two elements, one hidden-state sequence per direction; concatenate the per-timestep outputs into a single output
                structure_output = tf.concat(output, axis=2)
                structure_output = self.layer_normalization(structure_output)

            with tf.name_scope('transformer_layer'):
                transformer_output = self.encoder_stack(
                    structure_output, self.is_train)

            with tf.name_scope('global_attention_layer'):
                w_omega = tf.get_variable(
                    name='w_omega',
                    shape=[self.hidden_num * 2, self.attention_num],
                    initializer=tf.random_normal_initializer())
                b_omega = tf.get_variable(
                    name='b_omega',
                    shape=[self.attention_num],
                    initializer=tf.random_normal_initializer())
                u_omega = tf.get_variable(
                    name='u_omega',
                    shape=[self.attention_num],
                    initializer=tf.random_normal_initializer())

                v = tf.tanh(
                    tf.tensordot(transformer_output, w_omega, axes=1) +
                    b_omega)

                vu = tf.tensordot(v, u_omega, axes=1, name='vu')  # (B,T) shape
                alphas = tf.nn.softmax(vu, name='alphas')  # (B,T) shape

                # tf.expand_dims adds one dimension at the given axis
                global_attention_output = tf.reduce_sum(
                    transformer_output * tf.expand_dims(alphas, -1), 1)

        return global_attention_output
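The global attention block above implements a standard additive (Bahdanau-style) pooling over time: score each timestep with a small MLP, softmax over time, then sum the weighted hidden states. The following NumPy sketch (illustrative only, names invented) spells out the same computation.

import numpy as np

def global_attention(H, w_omega, b_omega, u_omega):
    # H: [B, T, 2*hidden], w_omega: [2*hidden, A], b_omega/u_omega: [A]
    v = np.tanh(np.einsum('bth,ha->bta', H, w_omega) + b_omega)   # [B, T, A]
    vu = np.einsum('bta,a->bt', v, u_omega)                       # [B, T] scores
    alphas = np.exp(vu - vu.max(axis=1, keepdims=True))
    alphas = alphas / alphas.sum(axis=1, keepdims=True)           # softmax over time
    return (H * alphas[..., None]).sum(axis=1)                    # [B, 2*hidden]

B, T, H_dim, A = 2, 5, 8, 4
out = global_attention(np.random.randn(B, T, H_dim),
                       np.random.randn(H_dim, A),
                       np.random.randn(A),
                       np.random.randn(A))
assert out.shape == (B, H_dim)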
Code example #14
0
File: mainMulti.py Project: ZRZn/sst_classify
input_x = tf.placeholder(tf.int32, [BATCH_SIZE, None])
input_y = tf.placeholder(tf.int32, [BATCH_SIZE, Y_Class])
input_s = tf.placeholder(tf.int32, [BATCH_SIZE, None, SEN_CLASS])
sen_len_ph = tf.placeholder(tf.int32)
keep_prob_ph = tf.placeholder(tf.float32)

#Embedding Layer
emd_file = open(all_path + "emb_array.pkl", "rb")
emb_array = pickle.load(emd_file)
emd_file.close()
embeddings = tf.Variable(emb_array, trainable=True)
input_emd = tf.nn.embedding_lookup(embeddings, input_x)  #shape= (B, None, E)

#normal bi_GRU
(f_out, b_out), _ = bi_rnn(GRUCell(HIDDEN_SIZE),
                           GRUCell(HIDDEN_SIZE),
                           input_emd,
                           sequence_length=length(input_emd),
                           dtype=tf.float32)
gru_out = tf.concat((f_out, b_out), axis=2)

#RNN
# gru_out, _ = dynamic_rnn(BasicRNNCell(HIDDEN_SIZE), input_emd, sequence_length=length(input_emd), dtype=tf.float32)

#Attention Layer
# attention_output, alphas = attentionMulti(gru_out, ATTENTION_SIZE, input_s, BATCH_SIZE, sen_len_ph)

attention_output, w_a, b_omega, u_omega = attention(gru_out, ATTENTION_SIZE)

hidden_size = input_emd.shape[2].value
Code example #15
0
def p_cbhg(inputs, input_lengths, is_training, scope, K, projections, depth):
    """
    Args:
        inputs: input tensor
        input_lengths: length of input tensor
        is_training: Batch Normalization option in Conv1D
        scope: network or model name
        K: kernel size range
        projections: projection layers option
        depth: output dimensionality of the highway net and the bidirectional GRU
    The layers in the code are stacked in the order in which they appear.
    """
    with tf.variable_scope(scope):
        with tf.variable_scope('p_conv_bank'):

            conv_outputs = tf.concat(
                [
                    conv1d(inputs, k, 128, tf.nn.relu, is_training,
                           'p_conv1d_%d' % k) for k in range(1, K + 1)
                ],  #1D Convolution layers using multiple types of Convolution Kernel.
                axis=-1  #Iterate K with increasing filter size by 1.
            )  # Convolution bank: concatenate on the last axis to stack channels from all convolutions

        # Maxpooling:
        maxpool_output = tf.layers.max_pooling1d(
            conv_outputs, pool_size=2, strides=1,
            padding='same')  #1D Maxpooling layer(strides=1, width=2)

        # Two projection layers:
        proj1_output = conv1d(maxpool_output, 3, projections[0], tf.nn.relu,
                              is_training, 'p_proj_1')  #1st Conv1D projections
        proj2_output = conv1d(proj1_output, 3, projections[1], None,
                              is_training, 'p_proj_2')  #2nd Conv1D projections

        # Residual connection:
        highway_input = proj2_output + inputs  #Highway net input with residual connection

        half_depth = depth // 2
        assert half_depth * 2 == depth, 'encoder and postnet depths must be even.'  #assert depth to be even

        # Handle dimensionality mismatch:
        if highway_input.shape[
                2] != half_depth:  #check input's dimensionality and output's dimensionality are the same
            highway_input = tf.layers.dense(
                highway_input, half_depth
            )  #change input's channel size to Highway net output's  size

        # 4-layer HighwayNet:
        for i in range(4):
            highway_input = highwaynet(highway_input, 'p_highway_%d' % (i + 1),
                                       half_depth)  #make 4 Highway net layers
        rnn_input = highway_input

        # Bidirectional GRU
        outputs, states = tf.nn.bidirectional_dynamic_rnn(  #make Bidirectional GRU
            GRUCell(half_depth),
            GRUCell(half_depth),
            rnn_input,
            sequence_length=input_lengths,
            dtype=tf.float32)
        return tf.concat(
            outputs, axis=2)  # Concat forward sequence and backward sequence
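A hypothetical usage sketch for `p_cbhg` above; argument values follow the docstring (K is the kernel-size range, `projections` the two projection widths, `depth` must be even), and the tensor names are illustrative.

import tensorflow as tf  # TF 1.x

prenet_out = tf.placeholder(tf.float32, [None, None, 128])        # [N, T_in, 128]
input_lengths = tf.placeholder(tf.int32, [None])                  # [N]

encoder_out = p_cbhg(prenet_out,
                     input_lengths,
                     is_training=True,
                     scope='encoder_cbhg',
                     K=16,
                     projections=[128, 128],
                     depth=256)                                   # [N, T_in, depth]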
Code example #16
0
File: affine_bilstm.py Project: jiyeon5/Fake_news
    def __init__(self,
                 sequence_length_head,
                 sequence_length_body,
                 num_classes,
                 vocab_size_head,
                 vocab_size_body,
                 embedding_size,
                 filter_sizes,
                 num_filters,
                 l2_reg_lambda=0.1):

        self.input_y = tf.placeholder(tf.float32, [None, num_classes],
                                      name="input_y")
        self.dropout_keep_prob = tf.placeholder(tf.float32,
                                                name="dropout_keep_prob")
        self.input_x_head = tf.placeholder(tf.int32,
                                           [None, sequence_length_head],
                                           name="input_x_head")
        self.input_x_body = tf.placeholder(tf.int32,
                                           [None, sequence_length_body],
                                           name="input_x_body")

        # Embedding layer
        self.embeddings_head = tf.Variable(tf.random_uniform(
            [vocab_size_head, embedding_size], -1.0, 1.0),
                                           trainable=False)  #trainable=false
        self.embedded_chars_head = tf.nn.embedding_lookup(
            self.embeddings_head, self.input_x_head)
        self.embedded_chars_expanded_head = tf.expand_dims(
            self.embedded_chars_head, -1)

        self.embeddings_body = tf.Variable(tf.random_uniform(
            [vocab_size_body, embedding_size], -1.0, 1.0),
                                           trainable=False)  #trainable=false
        self.embedded_chars_body = tf.nn.embedding_lookup(
            self.embeddings_body, self.input_x_body)
        self.embedded_chars_expanded_body = tf.expand_dims(
            self.embedded_chars_body, -1)

        #2. LSTM LAYER ######################################################################
        with tf.variable_scope("lstm-head") as scope:
            #self.lstm_cell_head = tf.contrib.rnn.LSTMCell(embedding_size,state_is_tuple=True)
            #self.lstm_out_head,self.lstm_state_head = tf.nn.dynamic_rnn(self.lstm_cell_head,self.embedded_chars_head,dtype=tf.float32)
            #self.lstm_out_expanded_head = tf.expand_dims(self.lstm_out_head, -1)
            self.lstm_out_head, self.lstm_state_head = bi_rnn(
                GRUCell(embedding_size),
                GRUCell(embedding_size),
                inputs=self.embedded_chars_head,
                dtype=tf.float32)
            self.lstm_out_merge_head = tf.concat(self.lstm_out_head, axis=2)
            #self.lstm_out_head_fw = self.lstm_out_head[0]
            #self.lstm_out_head_bw = self.lstm_out_head[1]
            #self.lstm_out_merge_head = tf.concat([self.lstm_out_head_fw[-1], self.lstm_out_head_bw[-1]], axis=1)
            self.lstm_out_expanded_head = tf.expand_dims(
                self.lstm_out_merge_head, -1)
            print(self.lstm_out_expanded_head.shape)

#output = tf.stack(output, axis=1)
#output = tf.reshape(output, [-1, FLAGS.num_units * 2])

        with tf.variable_scope("lstm-body") as scope:
            #self.lstm_cell_body = tf.contrib.rnn.LSTMCell(embedding_size,state_is_tuple=True)
            #self.lstm_out_body,self.lstm_state_body = tf.nn.dynamic_rnn(self.lstm_cell_body,self.embedded_chars_body,dtype=tf.float32)
            #self.lstm_out_expanded_body = tf.expand_dims(self.lstm_out_body, -1)
            self.lstm_out_body, self.lstm_state_body = bi_rnn(
                GRUCell(embedding_size),
                GRUCell(embedding_size),
                inputs=self.embedded_chars_body,
                dtype=tf.float32)
            self.lstm_out_merge_body = tf.concat(self.lstm_out_body, axis=2)
            #self.lstm_out_body_fw = self.lstm_out_body[0]
            #self.lstm_out_body_bw = self.lstm_out_body[1]
            #self.lstm_out_merge_body = tf.concat([self.lstm_out_body_fw[-1], self.lstm_out_body_bw[-1]], axis=1)
            self.lstm_out_expanded_body = tf.expand_dims(
                self.lstm_out_merge_body, -1)

            print(self.lstm_out_expanded_body.shape)

        self.pooled_outputs_head = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv-maxpool-head-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, embedding_size * 2, 1, 256]
                W_head = tf.Variable(tf.truncated_normal(filter_shape,
                                                         stddev=0.1),
                                     name="W_head")
                b_head = tf.Variable(tf.constant(0.1, shape=[256]),
                                     name="b_head")
                conv_head = tf.nn.conv2d(self.lstm_out_expanded_head,
                                         W_head,
                                         strides=[1, 1, 1, 1],
                                         padding="VALID",
                                         name="conv")
                # Apply nonlinearity
                h_head = tf.nn.relu(tf.nn.bias_add(conv_head, b_head),
                                    name="relu_head")
                # Maxpooling over the outputs
                pooled_head = tf.nn.max_pool(
                    h_head,
                    ksize=[1, sequence_length_head - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
                self.pooled_outputs_head.append(pooled_head)

        self.pooled_outputs_body = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv-maxpool-body-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, embedding_size * 2, 1, 1024]
                W_body = tf.Variable(tf.truncated_normal(filter_shape,
                                                         stddev=0.1),
                                     name="W_body")
                b_body = tf.Variable(tf.constant(0.1, shape=[1024]),
                                     name="b_body")
                conv_body = tf.nn.conv2d(self.lstm_out_expanded_body,
                                         W_body,
                                         strides=[1, 1, 1, 1],
                                         padding="VALID",
                                         name="conv")
                # Apply nonlinearity
                h_body = tf.nn.relu(tf.nn.bias_add(conv_body, b_body),
                                    name="relu_body")
                # Maxpooling over the outputs
                pooled_body = tf.nn.max_pool(
                    h_body,
                    ksize=[1, sequence_length_body - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
                self.pooled_outputs_body.append(pooled_body)

        l2_loss = tf.constant(0.0)

        pooled_outputs = tf.concat(
            [self.pooled_outputs_head, self.pooled_outputs_body],
            -1,
            name='preconcat')
        print(pooled_outputs.shape)
        num_filters_total = num_filters * len(filter_sizes)
        self.h_pool = tf.concat(pooled_outputs, 3, name='concat')
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

        W_fc1 = tf.Variable(tf.truncated_normal([1280, 1024], stddev=0.1),
                            name="W_fc1")
        b_fc1 = tf.Variable(tf.constant(0.1, shape=[1024]), name="b_fc1")
        h_fc1 = tf.nn.relu(tf.matmul(self.h_pool_flat, W_fc1) + b_fc1)

        W_fc2 = tf.Variable(tf.truncated_normal([1024, 1024], stddev=0.1),
                            name="W_fc2")
        b_fc2 = tf.Variable(tf.constant(0.1, shape=[1024]), name="b_fc2")
        h_fc2 = tf.nn.relu(tf.matmul(h_fc1, W_fc2) + b_fc2)

        W_fc3 = tf.Variable(tf.truncated_normal([1024, 1024], stddev=0.1),
                            name="W_fc3")
        b_fc3 = tf.Variable(tf.constant(0.1, shape=[1024]), name="b_fc3")
        h_fc3 = tf.nn.relu(tf.matmul(h_fc2, W_fc3) + b_fc3)

        # Add dropout
        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(h_fc3, self.dropout_keep_prob)

        # Final (unnormalized) scores and predictions
        with tf.name_scope("output"):
            self.W = tf.get_variable(
                "W",
                shape=[1024, num_classes],
                initializer=tf.contrib.layers.xavier_initializer())
            self.b = tf.Variable(tf.constant(0.1, shape=[num_classes]),
                                 name="b")
            l2_loss += tf.nn.l2_loss(self.W)
            l2_loss += tf.nn.l2_loss(self.b)
            self.scores = tf.nn.xw_plus_b(self.h_drop,
                                          self.W,
                                          self.b,
                                          name="scores")
            self.probabilities = tf.nn.softmax(self.scores)
            self.predictions = tf.argmax(self.scores, 1, name="predictions")

        # Calculate mean cross-entropy loss
        with tf.name_scope("loss"):
            print(self.scores.shape)
            losses = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.scores, labels=self.input_y)
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

        # Accuracy
        with tf.name_scope("accuracy"):
            print(self.predictions, self.input_y)
            correct_predictions = tf.equal(self.predictions,
                                           tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions,
                                                   "float"),
                                           name="accuracy")
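The loss above combines the mean softmax cross-entropy with an L2 penalty on the output-layer parameters. A minimal standalone sketch of the same combination (the sizes and the regularization strength here are illustrative assumptions, not taken from the project above):

import tensorflow as tf

features = tf.placeholder(tf.float32, [None, 1024], name="features")
labels = tf.placeholder(tf.float32, [None, 3], name="labels")        # one-hot targets
W = tf.Variable(tf.truncated_normal([1024, 3], stddev=0.1), name="W")
b = tf.Variable(tf.constant(0.1, shape=[3]), name="b")
l2_reg_lambda = 0.001                                                 # hypothetical value

scores = tf.nn.xw_plus_b(features, W, b, name="scores")
l2_loss = tf.nn.l2_loss(W) + tf.nn.l2_loss(b)
losses = tf.nn.softmax_cross_entropy_with_logits(logits=scores, labels=labels)
loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss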
コード例 #17
        with tf.name_scope('Inputs'):
            batch_ph = tf.placeholder(tf.int32, [None, SEQUENCE_LENGTH],
                                      name='batch_ph')
            target_ph = tf.placeholder(tf.float32, [None], name='target_ph')
            seq_len_ph = tf.placeholder(tf.int32, [None], name='seq_len_ph')

        # Embedding layer
        with tf.name_scope('Embedding_layer'):
            embeddings_var = tf.Variable(tf.random_uniform(
                [vocabulary_size, EMBEDDING_DIM], -1.0, 1.0),
                                         trainable=True)
            tf.summary.histogram('embeddings_var', embeddings_var)
            batch_embedded = tf.nn.embedding_lookup(embeddings_var, batch_ph)

        # (Bi-)RNN layer(-s)
        rnn_outputs, _ = bi_rnn(GRUCell(HIDDEN_SIZE),
                                GRUCell(HIDDEN_SIZE),
                                inputs=batch_embedded,
                                sequence_length=seq_len_ph,
                                dtype=tf.float32)
        tf.summary.histogram('RNN_outputs', rnn_outputs)
        rnn_outputs_cat = tf.concat(rnn_outputs, 2)
        # Attention layer
        with tf.name_scope('Attention_layer'):
            attention_output, alphas = attention(rnn_outputs,
                                                 ATTENTION_SIZE,
                                                 return_alphas=True)
            tf.summary.histogram('alphas', alphas)

        # Dropout
        drop = tf.nn.dropout(attention_output, KEEP_PROB)
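The attention function used above is imported from elsewhere in that project; roughly, it performs additive attention pooling over the RNN outputs. The following is an assumed sketch of what such a helper typically looks like, not the project's actual code:

import tensorflow as tf

def simple_attention(inputs, attention_size):
    # inputs: [batch, time, hidden]; a (fw, bw) tuple from bi_rnn is concatenated first.
    if isinstance(inputs, tuple):
        inputs = tf.concat(inputs, 2)
    hidden_size = inputs.shape[2].value
    w = tf.Variable(tf.random_normal([hidden_size, attention_size], stddev=0.1))
    b = tf.Variable(tf.random_normal([attention_size], stddev=0.1))
    u = tf.Variable(tf.random_normal([attention_size], stddev=0.1))
    v = tf.tanh(tf.tensordot(inputs, w, axes=1) + b)    # [batch, time, attention_size]
    vu = tf.tensordot(v, u, axes=1)                     # [batch, time]
    alphas = tf.nn.softmax(vu)                          # attention weights per time step
    output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)  # [batch, hidden]
    return output, alphas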
コード例 #18
    def __init__(self, config, name_scope, forward_only=False, num_samples=512, dtype=tf.float32):

        # self.scope_name = scope_name
        # with tf.variable_scope(self.scope_name):
        source_vocab_size = config.vocab_size
        target_vocab_size = config.vocab_size
        emb_dim = config.emb_dim

        self.buckets = config.buckets
        self.learning_rate = tf.Variable(float(config.learning_rate), trainable=False, dtype=dtype)
        self.learning_rate_decay_op = self.learning_rate.assign(self.learning_rate * config.learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)
        self.batch_size = config.batch_size
        self.num_layers = config.num_layers
        self.max_gradient_norm = config.max_gradient_norm
        self.mc_search = tf.placeholder(tf.bool, name="mc_search")
        self.forward_only = tf.placeholder(tf.bool, name="forward_only")
        self.up_reward = tf.placeholder(tf.bool, name="up_reward")
        self.reward_bias = tf.get_variable("reward_bias", [1], dtype=tf.float32)
        # If we use sampled softmax, we need an output projection.
        output_projection = None
        softmax_loss_function = None
        # Sampled softmax only makes sense if we sample less than vocabulary size.
        if num_samples > 0 and num_samples < target_vocab_size:
            w_t = tf.get_variable("proj_w", [target_vocab_size, emb_dim], dtype=dtype)
            w = tf.transpose(w_t)
            b = tf.get_variable("proj_b", [target_vocab_size], dtype=dtype)
            output_projection = (w, b)

            def sampled_loss(inputs, labels):
                labels = tf.reshape(labels, [-1, 1])
                # We need to compute the sampled_softmax_loss using 32bit floats to
                # avoid numerical instabilities.
                local_w_t = tf.cast(w_t, tf.float32)
                local_b = tf.cast(b, tf.float32)
                local_inputs = tf.cast(inputs, tf.float32)
                return tf.cast(
                    tf.nn.sampled_softmax_loss(weights=local_w_t, biases=local_b, inputs=local_inputs, labels=labels,
                                               num_sampled=num_samples, num_classes=target_vocab_size), dtype)

            softmax_loss_function = sampled_loss

        # Create the internal multi-layer cell for our RNN.
        single_cell = GRUCell(emb_dim)
        cell = single_cell
        if self.num_layers > 1:
            cell = MultiRNNCell([single_cell] * self.num_layers)

        # The seq2seq function: we use embedding for the input and attention.
        def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
            return rl_seq2seq.embedding_attention_seq2seq(
                encoder_inputs,
                decoder_inputs,
                cell,
                num_encoder_symbols=source_vocab_size,
                num_decoder_symbols=target_vocab_size,
                embedding_size=emb_dim,
                output_projection=output_projection,
                feed_previous=do_decode,
                mc_search=self.mc_search,
                dtype=dtype)

        # Feeds for inputs.
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []
        for i in xrange(self.buckets[-1][0]):  # Last bucket is the biggest one.
            self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i)))
        for i in xrange(self.buckets[-1][1] + 1):
            self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None], name="decoder{0}".format(i)))
            self.target_weights.append(tf.placeholder(dtype, shape=[None], name="weight{0}".format(i)))
        self.reward = [tf.placeholder(tf.float32, name="reward_%i" % i) for i in range(len(self.buckets))]

        # Our targets are decoder inputs shifted by one.
        targets = [self.decoder_inputs[i + 1] for i in xrange(len(self.decoder_inputs) - 1)]

        self.outputs, self.losses, self.encoder_state = rl_seq2seq.model_with_buckets(
            self.encoder_inputs, self.decoder_inputs, targets, self.target_weights,
            self.buckets, source_vocab_size, self.batch_size,
            lambda x, y: seq2seq_f(x, y, tf.where(self.forward_only, True, False)),
            output_projection=output_projection, softmax_loss_function=softmax_loss_function)

        for b in xrange(len(self.buckets)):
            self.outputs[b] = [
                tf.cond(
                    self.forward_only,
                    lambda: tf.matmul(output, output_projection[0]) + output_projection[1],
                    lambda: output
                )
                for output in self.outputs[b]
            ]

        if not forward_only:
            with tf.name_scope("gradient_descent"):
                self.gradient_norms = []
                self.updates = []
                self.aj_losses = []
                self.gen_params = [p for p in tf.trainable_variables() if name_scope in p.name]
                # opt = tf.train.GradientDescentOptimizer(self.learning_rate)
                opt = tf.train.AdamOptimizer()
                for b in xrange(len(self.buckets)):
                    R = tf.subtract(self.reward[b], self.reward_bias)
                    # self.reward[b] = self.reward[b] - reward_bias
                    adjusted_loss = tf.cond(self.up_reward,
                                            lambda: tf.subtract(self.losses[b], self.reward[b]),
                                            lambda: self.losses[b])

                    # adjusted_loss =  tf.cond(self.up_reward,
                    #                           lambda: tf.mul(self.losses[b], R),
                    #                           lambda: self.losses[b])
                    self.aj_losses.append(adjusted_loss)
                    gradients = tf.gradients(adjusted_loss, self.gen_params)
                    clipped_gradients, norm = tf.clip_by_global_norm(gradients, self.max_gradient_norm)
                    self.gradient_norms.append(norm)
                    self.updates.append(opt.apply_gradients(
                        zip(clipped_gradients, self.gen_params), global_step=self.global_step))

        self.gen_variables = [k for k in tf.global_variables() if name_scope in k.name]
        self.saver = tf.train.Saver(self.gen_variables)
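One caveat in the snippet above: MultiRNNCell([single_cell] * self.num_layers) stacks the same GRUCell object several times, which newer TF 1.x releases either reject or silently weight-share across layers. A commonly used variant (an assumed fix, not the project's code) builds one fresh cell per layer:

from tensorflow.contrib.rnn import GRUCell, MultiRNNCell

num_layers, emb_dim = 2, 128                           # hypothetical sizes
cell = MultiRNNCell([GRUCell(emb_dim) for _ in range(num_layers)])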
コード例 #19
ファイル: layers.py プロジェクト: dujifish/mrc-cbt
 def __init__(self, hidden_size, num_layers, name):
     self._cell = GRUCell(num_units=hidden_size)
     self._hidden_size = hidden_size
     self._num_layers = num_layers
     self.name = name
コード例 #20
    def __init__(self,
                 word_dict,
                 embedding_matrix,
                 d_len,
                 q_len,
                 sess,
                 embedding_dim,
                 hidden_size,
                 num_layers,
                 weight_path,
                 use_lstm=False):
        """
        Initialize the model.
        b ... batch_size
        t ... d_len
        f ... hidden_size*2
        i ... candidate_len 
        """
        self.weight_path = weight_path
        self.word_dict = word_dict
        self.vocab_size = len(embedding_matrix)
        self.d_len = d_len
        self.q_len = q_len
        self.sess = sess
        self.A_len = 10

        logging.info("Embedding matrix shape:%d x %d" %
                     (len(embedding_matrix), embedding_dim))

        self.rnn_cell = LSTMCell(
            num_units=hidden_size, ) if use_lstm else GRUCell(
                num_units=hidden_size)
        self.cell_name = "LSTM" if use_lstm else "GRU"

        # Declare the word-embedding matrix
        with tf.device("/cpu:0"):
            embedding = tf.Variable(initial_value=embedding_matrix,
                                    name="embedding_matrix_w",
                                    dtype="float32")
        # Model inputs and outputs
        self.q_input = tf.placeholder(dtype=tf.int32,
                                      shape=(None, self.q_len),
                                      name="q_input")
        self.d_input = tf.placeholder(dtype=tf.int32,
                                      shape=(None, self.d_len),
                                      name="d_input")
        self.context_mask_bt = tf.placeholder(dtype=tf.float32,
                                              shape=(None, self.d_len),
                                              name="context_mask_bt")
        self.candidates_bi = tf.placeholder(dtype=tf.int32,
                                            shape=(None, self.A_len),
                                            name="candidates_bi")
        self.y_true = tf.placeholder(shape=(None, self.A_len),
                                     dtype=tf.float32,
                                     name="y_true")

        # Length of each input sequence, one per sample, shape=(None)
        d_lens = tf.reduce_sum(tf.sign(tf.abs(self.d_input)), 1)
        q_lens = tf.reduce_sum(tf.sign(tf.abs(self.q_input)), 1)

        with tf.variable_scope(
                'q_encoder',
                initializer=tf.contrib.layers.xavier_initializer()):
            # Question encoder
            # output shape: (None, max_q_length, embedding_dim)
            q_embed = tf.nn.embedding_lookup(embedding, self.q_input)
            q_cell = MultiRNNCell(cells=[self.rnn_cell] * num_layers)
            outputs, last_states = tf.nn.bidirectional_dynamic_rnn(
                cell_bw=q_cell,
                cell_fw=q_cell,
                dtype="float32",
                sequence_length=q_lens,
                inputs=q_embed,
                swap_memory=True)
            # q_encoder output shape: (None, hidden_size * 2)
            q_encode = tf.concat([last_states[0][-1], last_states[1][-1]],
                                 axis=-1)
            logging.info("q_encode shape {}".format(q_encode.get_shape()))

        with tf.variable_scope(
                'd_encoder',
                initializer=tf.contrib.layers.xavier_initializer()):
            # Context-document encoder
            # output shape: (None, max_d_length, embedding_dim)
            d_embed = tf.nn.embedding_lookup(embedding, self.d_input)
            d_cell = MultiRNNCell(cells=[self.rnn_cell] * num_layers)
            outputs, last_states = tf.nn.bidirectional_dynamic_rnn(
                cell_bw=d_cell,
                cell_fw=d_cell,
                dtype="float32",
                sequence_length=d_lens,
                inputs=d_embed,
                swap_memory=True)
            # d_encoder output shape: (None, max_d_length, hidden_size * 2)
            d_encode = tf.concat(outputs, axis=-1)
            logging.info("d_encode shape {}".format(d_encode.get_shape()))

        def att_dot(x):
            """Attention dot-product."""
            d_btf, q_bf = x
            res = K.batch_dot(tf.expand_dims(q_bf, -1), d_btf, (1, 2))
            return tf.reshape(res, [-1, self.d_len])

        with tf.variable_scope('merge'):
            mem_attention_pre_soft_bt = att_dot([d_encode, q_encode])
            mem_attention_pre_soft_masked_bt = tf.multiply(
                mem_attention_pre_soft_bt,
                self.context_mask_bt,
                name="attention_mask")
            mem_attention_bt = tf.nn.softmax(
                logits=mem_attention_pre_soft_masked_bt,
                name="softmax_attention")

        # Attention sum (the attention-sum step)
        def sum_prob_of_word(word_ix, sentence_ixs, sentence_attention_probs):
            word_ixs_in_sentence = tf.where(tf.equal(sentence_ixs, word_ix))
            return tf.reduce_sum(
                tf.gather(sentence_attention_probs, word_ixs_in_sentence))

        # noinspection PyUnusedLocal
        def sum_probs_single_sentence(prev, cur):
            candidate_indices_i, sentence_ixs_t, sentence_attention_probs_t = cur
            result = tf.scan(fn=lambda previous, x: sum_prob_of_word(
                x, sentence_ixs_t, sentence_attention_probs_t),
                             elems=[candidate_indices_i],
                             initializer=tf.constant(0., dtype="float32"))
            return result

        def sum_probs_batch(candidate_indices_bi, sentence_ixs_bt,
                            sentence_attention_probs_bt):
            result = tf.scan(fn=sum_probs_single_sentence,
                             elems=[
                                 candidate_indices_bi, sentence_ixs_bt,
                                 sentence_attention_probs_bt
                             ],
                             initializer=tf.Variable([0] * self.A_len,
                                                     dtype="float32"))
            return result

        # Attention sum, output shape: (None, i), i = max_candidate_length = 10
        self.y_hat = sum_probs_batch(self.candidates_bi, self.d_input,
                                     mem_attention_bt)

        # Cross-entropy loss
        output = self.y_hat / tf.reduce_sum(
            self.y_hat,
            reduction_indices=len(self.y_hat.get_shape()) - 1,
            keep_dims=True)
        # manual computation of crossentropy
        epsilon = tf.convert_to_tensor(_EPSILON,
                                       output.dtype.base_dtype,
                                       name="epsilon")
        output = tf.clip_by_value(output, epsilon, 1. - epsilon)
        self.loss = tf.reduce_mean(
            -tf.reduce_sum(self.y_true * tf.log(output),
                           reduction_indices=len(output.get_shape()) - 1))

        # Compute accuracy
        self.correct_prediction = tf.reduce_sum(
            tf.sign(
                tf.cast(
                    tf.equal(tf.argmax(self.y_hat, 1),
                             tf.argmax(self.y_true, 1)), "float")))
        # Model checkpoint saver
        self.saver = tf.train.Saver()
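The attention-sum block above (sum_prob_of_word / sum_probs_batch) scores each candidate by summing the attention probabilities of every document position where that candidate occurs. A small NumPy sketch of the same idea with made-up values:

import numpy as np

d_input = np.array([5, 7, 5, 9])                   # document token ids
attention = np.array([0.1, 0.4, 0.2, 0.3])         # softmaxed attention over positions
candidates = np.array([5, 9])                      # candidate answer ids

y_hat = np.array([attention[d_input == c].sum() for c in candidates])
print(y_hat)                                       # [0.3 0.3]: id 5 gets 0.1 + 0.2, id 9 gets 0.3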
コード例 #21
  def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None, stop_token_targets=None, global_step=None):

    with tf.variable_scope('inference') as scope:
      is_training = linear_targets is not None
      batch_size = tf.shape(inputs)[0]
      hp = self._hparams

      # Embed_depth = 512
      embedding_table = tf.get_variable(
        'embedding', [len(symbols), hp.embed_depth], dtype=tf.float32,
        initializer=tf.truncated_normal_initializer(stddev=0.5))
      embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

      # Encoder module (prenet + CBHG networks)
      prenet_outputs = prenet(embedded_inputs, is_training, hp.prenet_depths)                       # prenet_depths = [256, 256]
      encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training, hp.encoder_depth)  # encoder_depth = 256

      # Location-sensitive attention (attention_depth = 128)
      attention_mechanism = LocationSensitiveAttention(hp.attention_depth, encoder_outputs)

      # Decoder RNN (two residual GRU layers, decoder_depth = 1024)
      multi_rnn_cell = MultiRNNCell([
          ResidualWrapper(GRUCell(hp.decoder_depth)),
          ResidualWrapper(GRUCell(hp.decoder_depth))
        ], state_is_tuple=True)

      # Frame projection layer (80*5)
      frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step)

      # Stop-token layer (includes the stop token, 5)
      stop_projection = StopProjection(is_training, shape=hp.outputs_per_step)

      # Decoder cell
      decoder_cell = TacotronDecoderWrapper(is_training, attention_mechanism, multi_rnn_cell, frame_projection, stop_projection)

      if is_training:  # training
        helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step, global_step)
      else:  # inference: predict using the stop token
        helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

      # Decoder initial state
      decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32)
      (decoder_outputs, stop_token_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
         CustomDecoder(decoder_cell, helper, decoder_init_state), maximum_iterations=hp.max_iters)  # 80*5

      # Reshape the mel array: from 80*5 to 80
      mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])
      stop_token_outputs = tf.reshape(stop_token_outputs, [batch_size, -1])

      # Post-processing network (postnet_depth = 512)
      post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training, hp.postnet_depth)
      linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # num_freq = 2049

      # Get the alignments from the final decoder state
      alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0])

      self.inputs = inputs
      self.input_lengths = input_lengths
      self.mel_outputs = mel_outputs
      self.linear_outputs = linear_outputs
      self.stop_token_outputs = stop_token_outputs
      self.alignments = alignments
      self.mel_targets = mel_targets
      self.linear_targets = linear_targets
      self.stop_token_targets = stop_token_targets
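Because outputs_per_step mel frames are predicted per decoder step, the [batch, steps, num_mels * r] decoder output is folded back to [batch, steps * r, num_mels] before the post-net. A shape-only NumPy sketch with assumed sizes:

import numpy as np

batch_size, decoder_steps, num_mels, r = 2, 3, 80, 5
decoder_outputs = np.zeros((batch_size, decoder_steps, num_mels * r))
mel_outputs = decoder_outputs.reshape(batch_size, -1, num_mels)
print(mel_outputs.shape)                           # (2, 15, 80)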
コード例 #22
 def define_sequence_model(self):
     seed = 12345
     np.random.seed(12345)
     layer_list = []
     with self.graph.as_default() as g:
         utt_length = tf.placeholder(tf.int32, shape=(None))
         g.add_to_collection(name="utt_length", value=utt_length)
         with tf.name_scope("input"):
             input_layer = tf.placeholder(dtype=tf.float32,
                                          shape=(None, None, self.n_in),
                                          name="input_layer")
             if self.dropout_rate != 0.0:
                 print "Using dropout to avoid overfitting and the dropout rate is", self.dropout_rate
                 is_training_drop = tf.placeholder(dtype=tf.bool,
                                                   shape=(),
                                                   name="is_training_drop")
                 input_layer_drop = dropout(input_layer,
                                            self.dropout_rate,
                                            is_training=is_training_drop)
                 layer_list.append(input_layer_drop)
                 g.add_to_collection(name="is_training_drop",
                                     value=is_training_drop)
             else:
                 layer_list.append(input_layer)
         g.add_to_collection("input_layer", layer_list[0])
         with tf.name_scope("hidden_layer"):
             basic_cell = []
             if "tanh" in self.hidden_layer_type:
                 is_training_batch = tf.placeholder(
                     dtype=tf.bool, shape=(), name="is_training_batch")
                 bn_params = {
                     "is_training": is_training_batch,
                     "decay": 0.99,
                     "updates_collections": None
                 }
                 g.add_to_collection("is_training_batch", is_training_batch)
             for i in xrange(len(self.hidden_layer_type)):
                 if self.dropout_rate != 0.0:
                     if self.hidden_layer_type[i] == "tanh":
                         new_layer = fully_connected(
                             layer_list[-1],
                             self.hidden_layer_size[i],
                             activation_fn=tf.nn.tanh,
                             normalizer_fn=batch_norm,
                             normalizer_params=bn_params)
                         new_layer_drop = dropout(
                             new_layer,
                             self.dropout_rate,
                             is_training=is_training_drop)
                         layer_list.append(new_layer_drop)
                     if self.hidden_layer_type[i] == "lstm":
                         basic_cell.append(
                             MyDropoutWrapper(BasicLSTMCell(
                                 num_units=self.hidden_layer_size[i]),
                                              self.dropout_rate,
                                              self.dropout_rate,
                                              is_training=is_training_drop))
                     if self.hidden_layer_type[i] == "gru":
                         basic_cell.append(
                             MyDropoutWrapper(GRUCell(
                                 num_units=self.hidden_layer_size[i]),
                                              self.dropout_rate,
                                              self.dropout_rate,
                                              is_training=is_training_drop))
                 else:
                     if self.hidden_layer_type[i] == "tanh":
                         new_layer = fully_connected(
                             layer_list[-1],
                             self.hidden_layer_size[i],
                             activation_fn=tf.nn.tanh,
                             normalizer_fn=batch_norm,
                             normalizer_params=bn_params)
                         layer_list.append(new_layer)
                     if self.hidden_layer_type[i] == "lstm":
                         basic_cell.append(
                             LayerNormBasicLSTMCell(
                                 num_units=self.hidden_layer_size[i]))
                     if self.hidden_layer_type[i] == "gru":
                         basic_cell.append(
                             LayerNormGRUCell(
                                 num_units=self.hidden_layer_size[i]))
             multi_cell = MultiRNNCell(basic_cell)
             rnn_outputs, rnn_states = tf.nn.dynamic_rnn(
                 multi_cell,
                 layer_list[-1],
                 dtype=tf.float32,
                 sequence_length=utt_length)
             layer_list.append(rnn_outputs)
         with tf.name_scope("output_layer"):
             if self.output_type == "linear":
                 output_layer = tf.layers.dense(rnn_outputs, self.n_out)
             #  stacked_rnn_outputs=tf.reshape(rnn_outputs,[-1,self.n_out])
             #  stacked_outputs=tf.layers.dense(stacked_rnn_outputs,self.n_out)
             #  output_layer=tf.reshape(stacked_outputs,[-1,utt_length,self.n_out])
             g.add_to_collection(name="output_layer", value=output_layer)
         with tf.name_scope("training_op"):
             if self.optimizer == "adam":
                 self.training_op = tf.train.AdamOptimizer()
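The snippet is cut off right after constructing the Adam optimizer, so training_op is assigned the optimizer object itself. A typical completion defines a loss and calls minimize; the following standalone sketch (assumed, not the project's code) shows the usual pattern:

import tensorflow as tf

inputs = tf.placeholder(tf.float32, [None, 4])
targets = tf.placeholder(tf.float32, [None, 1])
W = tf.Variable(tf.random_normal([4, 1]))
predictions = tf.matmul(inputs, W)
loss = tf.reduce_mean(tf.square(predictions - targets))
training_op = tf.train.AdamOptimizer().minimize(loss)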
コード例 #23
    def __init__(self,
                 num_items,
                 num_embed_units,
                 num_units,
                 num_layers,
                 vocab=None,
                 embed=None,
                 learning_rate=5e-4,
                 learning_rate_decay_factor=0.95,
                 max_gradient_norm=5.0,
                 use_lstm=True):

        self.epoch = tf.Variable(0, trainable=False, name='env/epoch')
        self.epoch_add_op = self.epoch.assign(self.epoch + 1)

        self.sessions_input = tf.placeholder(tf.int32, shape=(None, None))
        self.rec_lists = tf.placeholder(tf.int32, shape=(None, None, None))
        self.rec_mask = tf.placeholder(tf.float32, shape=(None, None, None))
        self.aims_idx = tf.placeholder(tf.int32, shape=(None, None))
        self.sessions_length = tf.placeholder(tf.int32, shape=(None))
        self.purchase = tf.placeholder(tf.int32, shape=(None, None))

        if embed is None:
            self.embed = tf.get_variable(
                'env/embed', [num_items, num_embed_units],
                tf.float32,
                initializer=tf.initializers.truncated_normal(0, 1))
        else:
            self.embed = tf.get_variable('env/embed',
                                         dtype=tf.float32,
                                         initializer=embed)

        batch_size, encoder_length, rec_length = tf.shape(
            self.sessions_input)[0], tf.shape(
                self.sessions_input)[1], tf.shape(self.rec_lists)[2]

        encoder_mask = tf.reshape(
            tf.cumsum(tf.one_hot(self.sessions_length - 2, encoder_length),
                      reverse=True,
                      axis=1), [-1, encoder_length])

        self.encoder_input = tf.nn.embedding_lookup(
            self.embed, self.sessions_input)  #batch*len*unit
        self.aims = tf.one_hot(self.aims_idx, rec_length)
        if use_lstm:
            cell = MultiRNNCell(
                [LSTMCell(num_units) for _ in range(num_layers)])
        else:
            cell = MultiRNNCell(
                [GRUCell(num_units) for _ in range(num_layers)])

        # Training
        with tf.variable_scope("env"):
            # [batch_size, length, num_units]
            encoder_output, _ = dynamic_rnn(cell,
                                            self.encoder_input,
                                            self.sessions_length,
                                            dtype=tf.float32,
                                            scope="encoder")

            # [batch_size, length, embed_units]
            preference = tf.layers.dense(encoder_output,
                                         num_embed_units,
                                         name="pref_output")
            # [batch_size, length, rec_length, embed_units]
            self.candidate = tf.reshape(
                tf.gather_nd(self.embed, tf.expand_dims(self.rec_lists, 3)),
                [batch_size, encoder_length, rec_length, num_embed_units])

            # [batch_size, length, rec_length]
            logits = tf.reduce_mean(
                tf.multiply(tf.expand_dims(preference, 2), self.candidate), 3)
            mul_prob = tf.nn.softmax(logits) * self.rec_mask

            # [batch_size, length, rec_length]
            self.norm_prob = mul_prob / (
                tf.expand_dims(tf.reduce_sum(mul_prob, 2), 2) + 1e-20)
            # [batch_size, length, metric_num]
            _, self.argmax_index = tf.nn.top_k(self.norm_prob,
                                               k=FLAGS.metric + 1)
            local_predict_loss = tf.reduce_sum(
                -self.aims * tf.log(self.norm_prob + 1e-20), 2) * encoder_mask
            self.predict_loss = tf.reduce_sum(
                local_predict_loss) / tf.reduce_sum(encoder_mask)

            # [batch_size, length, embed_units]
            aim_embed = tf.reduce_sum(
                tf.expand_dims(self.aims, 3) * self.candidate, 2)
            # [batch_size, length, 2]
            self.purchase_prob = tf.nn.softmax(
                tf.layers.dense(tf.multiply(
                    tf.layers.dense(tf.stop_gradient(encoder_output),
                                    num_units,
                                    name="purchase_layer"),
                    tf.layers.dense(tf.stop_gradient(aim_embed),
                                    num_units,
                                    name="purchase_aim")),
                                2,
                                name="purchase_projection"))
            local_purchase_loss = tf.reduce_sum(
                -tf.one_hot(self.purchase, 2) *
                tf.log(self.purchase_prob + 1e-20), 2) * encoder_mask * tf.pow(
                    tf.cast(self.purchase, tf.float32) + 1, 5.3)
            self.purchase_loss = tf.reduce_sum(
                local_purchase_loss) / tf.reduce_sum(encoder_mask)
            self.decoder_loss = self.predict_loss + self.purchase_loss

            self.score = tf.placeholder(tf.float32, (None, None))
            self.score_loss = tf.reduce_sum(
                self.score *
                (local_predict_loss +
                 local_purchase_loss)) / tf.reduce_sum(encoder_mask)

        # Inference
        with tf.variable_scope("env", reuse=True):
            # tf.get_variable_scope().reuse_variables()
            # [batch_size, length, embed_units]
            inf_preference = tf.expand_dims(
                tf.layers.dense(encoder_output[:, -1, :],
                                num_embed_units,
                                name="pref_output"), 1)
            # [batch_size, 1, rec_length, embed_units]
            self.inf_candidate = tf.reshape(
                tf.gather_nd(self.embed, tf.expand_dims(self.rec_lists, 3)),
                [batch_size, 1, rec_length, num_embed_units])

            # [batch_size, 1, rec_length]
            inf_logits = tf.reduce_mean(
                tf.multiply(tf.expand_dims(inf_preference, 2),
                            self.inf_candidate), 3)
            inf_mul_prob = tf.nn.softmax(inf_logits) * self.rec_mask

            self.inf_norm_prob = inf_mul_prob / (
                tf.expand_dims(tf.reduce_sum(inf_mul_prob, 2), 2) + 1e-20)
            # [batch_size, 1, metric_num]
            _, self.inf_argmax_index = tf.nn.top_k(self.inf_norm_prob,
                                                   k=FLAGS.metric)

            def gumbel_max(inp, alpha, beta):
                # assert len(tf.shape(inp)) == 2
                g = tf.random_uniform(tf.shape(inp), 0.0001, 0.9999)
                g = -tf.log(-tf.log(g))
                inp_g = tf.nn.softmax(
                    (tf.nn.log_softmax(inp / 1.0) + g * alpha) * beta)
                return inp_g

            # [batch_size, action_num]
            _, self.inf_random_index = tf.nn.top_k(gumbel_max(
                tf.log(self.inf_norm_prob + 1e-12), 1, 1),
                                                   k=FLAGS.action_num)

            inf_aim_embed = tf.reduce_sum(
                tf.cast(
                    tf.reshape(
                        tf.one_hot(self.inf_argmax_index[:, :, 0], rec_length),
                        [batch_size, 1, rec_length, 1]), tf.float32) *
                self.inf_candidate, 2)

            # [batch_size, 1, 2]
            self.inf_purchase_prob = tf.nn.softmax(
                tf.layers.dense(tf.multiply(
                    tf.layers.dense(tf.stop_gradient(encoder_output),
                                    num_units,
                                    name="purchase_layer"),
                    tf.layers.dense(tf.stop_gradient(inf_aim_embed),
                                    num_units,
                                    name="purchase_aim")),
                                2,
                                name="purchase_projection"))

        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False,
                                         dtype=tf.float32)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)

        self.global_step = tf.Variable(0, trainable=False)
        opt = tf.train.AdamOptimizer(self.learning_rate)
        self.params = tf.trainable_variables()

        # For pretraining
        gradients = tf.gradients(self.decoder_loss, self.params)
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
            gradients, max_gradient_norm)
        self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                          global_step=self.global_step)

        # For adversarial training
        score_gradients = tf.gradients(self.score_loss, self.params)
        score_clipped_gradients, self.score_gradient_norm = tf.clip_by_global_norm(
            score_gradients, max_gradient_norm)
        self.score_update = opt.apply_gradients(zip(score_clipped_gradients,
                                                    self.params),
                                                global_step=self.global_step)

        self.saver = tf.train.Saver(tf.global_variables(),
                                    write_version=tf.train.SaverDef.V2,
                                    max_to_keep=10,
                                    pad_step_number=True,
                                    keep_checkpoint_every_n_hours=1.0)
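The gumbel_max helper above adds Gumbel noise to the log-probabilities so that taking the top-k of the perturbed scores samples recommendations stochastically. A NumPy sketch of the underlying Gumbel-max trick with made-up probabilities:

import numpy as np

probs = np.array([0.1, 0.6, 0.3])
g = -np.log(-np.log(np.random.uniform(1e-4, 1.0 - 1e-4, size=probs.shape)))
sample = np.argmax(np.log(probs) + g)              # index drawn with probability probs[sample]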
コード例 #24
    def Tensor_Generate(self):
        #This branch is probably no longer needed because TF 1.x does not support 16-bit well.
        if pattern_Parameters.Pattern_Use_Bit == 16:
            float_Bit_Type = tf.float16
            int_Bit_Type = tf.int16
        elif pattern_Parameters.Pattern_Use_Bit == 32:
            float_Bit_Type = tf.float32
            int_Bit_Type = tf.int32
        else:
            assert False

        placeholder_Dict = self.pattern_Feeder.placeholder_Dict  #Placeholders are the input slots; all patterns are fed in through them.

        with tf.variable_scope('EARS') as scope:  #Variable name managing.
            batch_Size = tf.shape(placeholder_Dict["Acoustic"])[
                0]  #Getting a batch size of current pattern

            input_Activation = placeholder_Dict[
                "Acoustic"]  #input is acoustic pattern
            conv_Parameters = enumerate(
                zip(model_Parameters.Prenet_Conv.Channels,
                    model_Parameters.Prenet_Conv.Kernel_Sizes,
                    model_Parameters.Prenet_Conv.Strides)
            )  #Getting convolution parameters from hyper parameters

            if model_Parameters.Prenet_Conv.Use:  #The conv prenet is used only when Conv.Use = True
                for conv_Index, (
                        channel, kernel_Size,
                        stride) in conv_Parameters:  #Conv layer count for loop
                    with tf.variable_scope(
                            'Prenet_Conv_{}'.format(conv_Index)):
                        input_Activation = tf.layers.conv1d(  #Calculating convolution
                            inputs=input_Activation,
                            filters=channel,
                            kernel_size=kernel_Size,
                            strides=stride,
                            padding='same',
                            activation=tf.nn.relu)
                        input_Activation = tf.layers.batch_normalization(  #Calculating batch normalization for regularization
                            inputs=input_Activation,
                            training=placeholder_Dict["Is_Training"])
                        if model_Parameters.Prenet_Conv.Dropout_Rate is not None:
                            input_Activation = tf.layers.dropout(  #Dropout applied for regularization
                                input_Activation,
                                rate=model_Parameters.Prenet_Conv.Dropout_Rate,
                                training=placeholder_Dict["Is_Training"])

            #This model uses only the training helper (ground truth).
            helper = TrainingHelper(  #Helper decides RNN calculation rule at each time step
                inputs=placeholder_Dict["Acoustic"],
                sequence_length=placeholder_Dict["Length"])

            #RNN. The model can select one of four hidden-layer types.
            #The previous RNN state is kept so that states are not reset between batches.
            if model_Parameters.Hidden_Type in ["LSTM", 'ZoneoutLSTM']:
                if model_Parameters.Hidden_Type == "LSTM":
                    rnn_Cell = LSTMCell(
                        model_Parameters.Hidden_Size)  #Setting LSTM Cell
                elif model_Parameters.Hidden_Type == "ZoneoutLSTM":
                    rnn_Cell = ZoneoutLSTMCell(  #Setting ZoneoutLSTMCell
                        num_units=model_Parameters.Hidden_Size,
                        is_training=placeholder_Dict["Is_Training"],
                        cell_zoneout_rate=model_Parameters.Zoneout_Rate,
                        output_zoneout_rate=model_Parameters.Zoneout_Rate)
                previous_RNN_State = tf.Variable(  #Storage for RNN states. LSTM and ZoneoutLSTM need two states (c, h), initialized as zero vectors.
                    initial_value=LSTMStateTuple(
                        c=tf.zeros(shape=(model_Parameters.Batch_Size,
                                          model_Parameters.Hidden_Size)),
                        h=tf.zeros(shape=(model_Parameters.Batch_Size,
                                          model_Parameters.Hidden_Size))),
                    trainable=False,
                    dtype=float_Bit_Type)
                decoder_Initial_State = LSTMStateTuple(  #Setting the RNN states
                    c=previous_RNN_State[0][:batch_Size],
                    h=previous_RNN_State[1][:batch_Size])
            elif model_Parameters.Hidden_Type == "SCRN":
                rnn_Cell = SCRNCell(model_Parameters.Hidden_Size)
                previous_RNN_State = tf.Variable(  #Storage for RNN states. SCRN needs two states (s, h), initialized as zero vectors.
                    initial_value=SCRNStateTuple(
                        s=tf.zeros(shape=(model_Parameters.Batch_Size,
                                          model_Parameters.Hidden_Size)),
                        h=tf.zeros(shape=(model_Parameters.Batch_Size,
                                          model_Parameters.Hidden_Size))),
                    trainable=False,
                    dtype=float_Bit_Type)
                decoder_Initial_State = SCRNStateTuple(  #Setting the RNN states
                    s=previous_RNN_State[0][:batch_Size],
                    h=previous_RNN_State[1][:batch_Size])
            elif model_Parameters.Hidden_Type in ["GRU", "BPTT"]:
                if model_Parameters.Hidden_Type == "GRU":
                    rnn_Cell = GRUCell(model_Parameters.Hidden_Size)
                elif model_Parameters.Hidden_Type == "BPTT":
                    rnn_Cell = BasicRNNCell(model_Parameters.Hidden_Size)
                previous_RNN_State = tf.Variable(  #Storage for RNN states.
                    initial_value=tf.zeros(
                        shape=(model_Parameters.Batch_Size,
                               model_Parameters.Hidden_Size)),
                    trainable=False,
                    dtype=float_Bit_Type)
                decoder_Initial_State = previous_RNN_State[:
                                                           batch_Size]  #Setting the RNN states

            decoder = BasicDecoder(  #The decoder runs the RNN according to the helper's rule
                cell=rnn_Cell,
                helper=helper,
                initial_state=decoder_Initial_State)

            outputs, final_State, _ = dynamic_decode(  #Calculating hidden activation.
                decoder=decoder,
                output_time_major=False,
                impute_finished=True)

            hidden_Activation = outputs.rnn_output  #Getting hidden activation.

            #Semantic   (hidden_size -> semantic_size)
            semantic_Logits = tf.layers.dense(  #H->O calculation
                inputs=hidden_Activation,
                units=self.pattern_Feeder.semantic_Size,
                use_bias=True,
                name="semantic_Logits")

        #Back-prop.
        with tf.variable_scope('training_Loss') as scope:
            loss_Mask = tf.sequence_mask(
                placeholder_Dict["Length"], dtype=tf.float32
            )  #Zero-padded positions are masked out by pattern length so they cannot affect the weight update.

            loss_Calculation = tf.nn.sigmoid_cross_entropy_with_logits(  #Calculate the error between target and output
                labels=placeholder_Dict["Semantic"],  #Target
                logits=semantic_Logits  #Output
            )
            loss_Calculation = tf.reduce_mean(loss_Calculation, axis=-1)
            loss_Calculation *= loss_Mask  #Masking

            loss = tf.reduce_sum(loss_Calculation)

            if model_Parameters.Weight_Regularization.Use:  #Optional weight regularization.
                loss += model_Parameters.Weight_Regularization.Rate * tf.reduce_sum(
                    [  #Each weight gets a small L2 penalty that keeps it from growing too large.
                        tf.nn.l2_loss(variable)
                        for variable in tf.get_collection(
                            tf.GraphKeys.TRAINABLE_VARIABLES) if not any([
                                keyword.lower() in variable.name.lower()
                                for keyword in model_Parameters.
                                Weight_Regularization.Except_Keywords
                            ])
                    ])

            loss_Display = tf.reduce_sum(
                loss_Calculation, axis=0) / tf.math.count_nonzero(
                    loss_Calculation, axis=0, dtype=tf.float32
                )  #For display only; it does not affect training.

            global_Step = tf.Variable(
                0, name='global_Step', trainable=False
            )  #The global step counts trained batches (not epochs) and drives learning-rate decay.

            ##Noam decay of learning rate
            step = tf.cast(global_Step + 1, dtype=float_Bit_Type)
            warmup_Steps = 4000.0
            learning_Rate = model_Parameters.Learning_Rate * warmup_Steps**0.5 * tf.minimum(
                step * warmup_Steps**-1.5, step**-0.5)

            #Static(Temp)
            #learning_Rate = tf.cast(model_Parameters.Learning_Rate, float_Bit_Type)

            #Weight update. We use the ADAM optimizer
            optimizer = tf.train.AdamOptimizer(
                learning_Rate)  #Generating ADAM optimizer
            gradients, variables = zip(*optimizer.compute_gradients(loss))
            clipped_Gradients, global_Norm = tf.clip_by_global_norm(
                gradients, 1.0
            )  #Clip the gradients so an overly large value cannot blow up the weight update.
            optimize = optimizer.apply_gradients(
                zip(clipped_Gradients,
                    variables), global_step=global_Step)  #Weight update

            #To avoid resetting, the model saves the RNN states.
            if model_Parameters.Hidden_Type in ["LSTM", 'ZoneoutLSTM']:
                rnn_State_Assign = tf.assign(
                    ref=previous_RNN_State,
                    value=LSTMStateTuple(c=tf.concat([
                        final_State[0][:batch_Size],
                        previous_RNN_State[0][batch_Size:]
                    ],
                                                     axis=0),
                                         h=tf.concat([
                                             final_State[1][:batch_Size],
                                             previous_RNN_State[1][batch_Size:]
                                         ],
                                                     axis=0)))
            if model_Parameters.Hidden_Type == "SCRN":
                rnn_State_Assign = tf.assign(
                    ref=previous_RNN_State,
                    value=SCRNStateTuple(s=tf.concat([
                        final_State[0][:batch_Size],
                        previous_RNN_State[0][batch_Size:]
                    ],
                                                     axis=0),
                                         h=tf.concat([
                                             final_State[1][:batch_Size],
                                             previous_RNN_State[1][batch_Size:]
                                         ],
                                                     axis=0)))
            elif model_Parameters.Hidden_Type in ["GRU", "BPTT"]:
                rnn_State_Assign = tf.assign(
                    ref=previous_RNN_State,
                    value=tf.concat([
                        final_State[:batch_Size],
                        previous_RNN_State[batch_Size:]
                    ],
                                    axis=0))

        with tf.variable_scope('test') as scope:
            #At test time the previous hidden state can optionally be zeroed, so the saved state is backed up and then set to zero.
            if model_Parameters.Hidden_Type in ["LSTM", 'ZoneoutLSTM']:
                backup_RNN_State = tf.Variable(initial_value=LSTMStateTuple(
                    c=tf.zeros(shape=(model_Parameters.Batch_Size,
                                      model_Parameters.Hidden_Size)),
                    h=tf.zeros(shape=(model_Parameters.Batch_Size,
                                      model_Parameters.Hidden_Size))),
                                               trainable=False,
                                               dtype=float_Bit_Type)
            elif model_Parameters.Hidden_Type == "SCRN":
                backup_RNN_State = tf.Variable(initial_value=SCRNStateTuple(
                    s=tf.zeros(shape=(model_Parameters.Batch_Size,
                                      model_Parameters.Hidden_Size)),
                    h=tf.zeros(shape=(model_Parameters.Batch_Size,
                                      model_Parameters.Hidden_Size))),
                                               trainable=False,
                                               dtype=float_Bit_Type)
            elif model_Parameters.Hidden_Type in ["GRU", "BPTT"]:
                backup_RNN_State = tf.Variable(initial_value=tf.zeros(
                    shape=(model_Parameters.Batch_Size,
                           model_Parameters.Hidden_Size)),
                                               trainable=False,
                                               dtype=float_Bit_Type)

            backup_RNN_State_Assign = tf.assign(ref=backup_RNN_State,
                                                value=previous_RNN_State)
            with tf.control_dependencies([backup_RNN_State_Assign]):
                if model_Parameters.Hidden_Type in ["LSTM", 'ZoneoutLSTM']:
                    zero_RNN_State_Assign = tf.assign(
                        ref=previous_RNN_State,
                        value=LSTMStateTuple(
                            c=tf.zeros(shape=(model_Parameters.Batch_Size,
                                              model_Parameters.Hidden_Size),
                                       dtype=float_Bit_Type),
                            h=tf.zeros(shape=(model_Parameters.Batch_Size,
                                              model_Parameters.Hidden_Size),
                                       dtype=float_Bit_Type)))
                elif model_Parameters.Hidden_Type == "SCRN":
                    zero_RNN_State_Assign = tf.assign(
                        ref=previous_RNN_State,
                        value=SCRNStateTuple(
                            s=tf.zeros(shape=(model_Parameters.Batch_Size,
                                              model_Parameters.Hidden_Size),
                                       dtype=float_Bit_Type),
                            h=tf.zeros(shape=(model_Parameters.Batch_Size,
                                              model_Parameters.Hidden_Size),
                                       dtype=float_Bit_Type)))
                elif model_Parameters.Hidden_Type in ["GRU", "BPTT"]:
                    zero_RNN_State_Assign = tf.assign(
                        ref=previous_RNN_State,
                        value=tf.zeros(shape=(model_Parameters.Batch_Size,
                                              model_Parameters.Hidden_Size),
                                       dtype=float_Bit_Type))

            restore_RNN_State_Assign = tf.assign(ref=previous_RNN_State,
                                                 value=backup_RNN_State)

            semantic_Activation = tf.nn.sigmoid(semantic_Logits)

        self.training_Tensor_List = [
            global_Step, learning_Rate, loss_Display, optimize,
            rnn_State_Assign
        ]  #Tensors returned during training

        self.test_Mode_Turn_On_Tensor_List = [
            backup_RNN_State_Assign, zero_RNN_State_Assign
        ]  #Back up the hidden state and set all initial states to zero vectors.
        self.test_Mode_Turn_Off_Tensor_List = [restore_RNN_State_Assign
                                               ]  #Hidden state restore

        self.test_Tensor_List = [global_Step, semantic_Activation
                                 ]  #In test, we only need semantic activation

        self.hidden_Plot_Tensor_List = [
            tf.transpose(hidden_Activation, perm=[0, 2, 1])
        ]  #In hidden analysis, we only need hidden activation.

        self.tf_Session.run(
            tf.global_variables_initializer()
        )  #Initialize the weights. Until this runs, TensorFlow has not created any weight values.
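The "Noam decay" above warms the learning rate up linearly for warmup_Steps batches and then decays it proportionally to step**-0.5. A NumPy sketch of the same schedule with an assumed base learning rate:

import numpy as np

base_lr, warmup_steps = 1e-3, 4000.0               # base_lr is a hypothetical value
step = np.arange(1, 20001, dtype=np.float64)
lr = base_lr * warmup_steps ** 0.5 * np.minimum(step * warmup_steps ** -1.5,
                                                step ** -0.5)
# lr rises linearly until step 4000, then decays roughly as 1 / sqrt(step).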
コード例 #25
ファイル: gru_test1.py プロジェクト: yyx342779418/BookStudy
n_iterations = 10000
batch_size = 50

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.float32, [None, n_steps, n_outputs])

# At each time step we now have an output vector of size 100, but we actually need a single output value.
# The simplest solution is to wrap the cell in an OutputProjectionWrapper.
# cell = OutputProjectionWrapper(BasicRNNCell(num_units=n_neurous, activation=tf.nn.relu), output_size=n_outputs)

# A trick to speed things up: project the stacked outputs with a single fully connected layer instead
# cell = BasicRNNCell(num_units=n_neurous, activation=tf.nn.relu)
# multi_layer_cell = MultiRNNCell([cell] * n_layers)
layers = [
    GRUCell(num_units=n_neurous, activation=tf.nn.relu)
    for _ in range(n_layers)
]
multi_layer_cell = MultiRNNCell(layers)
rnn_outputs, states = tf.nn.dynamic_rnn(multi_layer_cell, X, dtype=tf.float32)
stacked_rnn_outputs = tf.reshape(rnn_outputs, [-1, n_neurous])
stacked_outputs = fully_connected(stacked_rnn_outputs,
                                  n_outputs,
                                  activation_fn=None)
outputs = tf.reshape(stacked_outputs, [-1, n_steps, n_outputs])

loss = tf.reduce_mean(tf.square(outputs - y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()
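The graph above ends with the initializer; a typical training loop for it would look like the sketch below, where next_batch() is an assumed helper that yields (X_batch, y_batch) arrays of shape [batch_size, n_steps, n_inputs] and [batch_size, n_steps, n_outputs]:

with tf.Session() as sess:
    init.run()
    for iteration in range(n_iterations):
        X_batch, y_batch = next_batch(batch_size, n_steps)   # assumed data helper
        sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        if iteration % 100 == 0:
            mse = loss.eval(feed_dict={X: X_batch, y: y_batch})
            print(iteration, "\tMSE:", mse)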
コード例 #26
    def initialize(
        self,
        inputs,
        input_lengths,
        num_speakers,
        speaker_id,
        mel_targets=None,
        linear_targets=None,
        loss_coeff=None,
        rnn_decoder_test_mode=False,
        is_randomly_initialized=False,
    ):
        is_training = linear_targets is not None
        self.is_randomly_initialized = is_randomly_initialized

        with tf.variable_scope('inference') as scope:
            hp = self._hparams
            batch_size = tf.shape(inputs)[0]

            # Embeddings
            char_embed_table = tf.get_variable(
                'embedding', [len(symbols), hp.embedding_size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            # [N, T_in, embedding_size]
            char_embedded_inputs = \
                    tf.nn.embedding_lookup(char_embed_table, inputs)

            self.num_speakers = num_speakers
            if self.num_speakers > 1:
                if hp.speaker_embedding_size != 1:
                    speaker_embed_table = tf.get_variable(
                        'speaker_embedding',
                        [self.num_speakers, hp.speaker_embedding_size],
                        dtype=tf.float32,
                        initializer=tf.truncated_normal_initializer(
                            stddev=0.5))
                    # [N, T_in, speaker_embedding_size]
                    speaker_embed = tf.nn.embedding_lookup(
                        speaker_embed_table, speaker_id)

                if hp.model_type == 'deepvoice':
                    if hp.speaker_embedding_size == 1:
                        before_highway = get_embed(speaker_id,
                                                   self.num_speakers,
                                                   hp.enc_prenet_sizes[-1],
                                                   "before_highway")
                        encoder_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers, hp.enc_rnn_size * 2,
                            "encoder_rnn_init_state")

                        attention_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers,
                            hp.attention_state_size,
                            "attention_rnn_init_state")
                        decoder_rnn_init_states = [get_embed(
                                speaker_id, self.num_speakers,
                                hp.dec_rnn_size, "decoder_rnn_init_states{}".format(idx + 1)) \
                                        for idx in range(hp.dec_layer_num)]
                    else:
                        deep_dense = lambda x, dim: \
                                tf.layers.dense(x, dim, activation=tf.nn.softsign)

                        before_highway = deep_dense(speaker_embed,
                                                    hp.enc_prenet_sizes[-1])
                        encoder_rnn_init_state = deep_dense(
                            speaker_embed, hp.enc_rnn_size * 2)

                        attention_rnn_init_state = deep_dense(
                            speaker_embed, hp.attention_state_size)
                        decoder_rnn_init_states = [
                            deep_dense(speaker_embed, hp.dec_rnn_size)
                            for _ in range(hp.dec_layer_num)
                        ]

                    speaker_embed = None  # deepvoice does not use speaker_embed directly
                elif hp.model_type == 'simple':
                    before_highway = None
                    encoder_rnn_init_state = None
                    attention_rnn_init_state = None
                    decoder_rnn_init_states = None
                else:
                    raise Exception(
                        " [!] Unknown multi-speaker model type: {}".format(
                            hp.model_type))
            else:
                speaker_embed = None
                before_highway = None
                encoder_rnn_init_state = None
                attention_rnn_init_state = None
                decoder_rnn_init_states = None

            ##############
            # Encoder
            ##############

            # [N, T_in, enc_prenet_sizes[-1]]
            prenet_outputs = prenet(char_embedded_inputs,
                                    is_training,
                                    hp.enc_prenet_sizes,
                                    hp.dropout_prob,
                                    scope='prenet')

            encoder_outputs = cbhg(
                prenet_outputs,
                input_lengths,
                is_training,
                hp.enc_bank_size,
                hp.enc_bank_channel_size,
                hp.enc_maxpool_width,
                hp.enc_highway_depth,
                hp.enc_rnn_size,
                hp.enc_proj_sizes,
                hp.enc_proj_width,
                scope="encoder_cbhg",
                before_highway=before_highway,
                encoder_rnn_init_state=encoder_rnn_init_state)

            ##############
            # Attention
            ##############

            # For manual control of attention
            self.is_manual_attention = tf.placeholder(
                tf.bool,
                shape=(),
                name='is_manual_attention',
            )
            self.manual_alignments = tf.placeholder(
                tf.float32,
                shape=[None, None, None],
                name="manual_alignments",
            )

            dec_prenet_outputs = DecoderPrenetWrapper(
                GRUCell(hp.attention_state_size), speaker_embed, is_training,
                hp.dec_prenet_sizes, hp.dropout_prob)

            if hp.attention_type == 'bah_mon':
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size, encoder_outputs)
            elif hp.attention_type == 'bah_norm':
                attention_mechanism = BahdanauAttention(hp.attention_size,
                                                        encoder_outputs,
                                                        normalize=True)
            elif hp.attention_type == 'luong_scaled':
                attention_mechanism = LuongAttention(hp.attention_size,
                                                     encoder_outputs,
                                                     scale=True)
            elif hp.attention_type == 'luong':
                attention_mechanism = LuongAttention(hp.attention_size,
                                                     encoder_outputs)
            elif hp.attention_type == 'bah':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size, encoder_outputs)
            elif hp.attention_type.startswith('ntm2'):
                shift_width = int(hp.attention_type.split('-')[-1])
                attention_mechanism = NTMAttention2(hp.attention_size,
                                                    encoder_outputs,
                                                    shift_width=shift_width)
            else:
                raise Exception(" [!] Unkown attention type: {}".format(
                    hp.attention_type))

            attention_cell = AttentionWrapper(
                dec_prenet_outputs,
                attention_mechanism,
                self.is_manual_attention,
                self.manual_alignments,
                initial_cell_state=attention_rnn_init_state,
                alignment_history=True,
                output_attention=False)

            # Concatenate attention context vector and RNN cell output into a 512D vector.
            # [N, T_in, attention_size+attention_state_size]
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell, embed_to_concat=speaker_embed)

            # Decoder (layers specified bottom to top):
            cells = [OutputProjectionWrapper(concat_cell, hp.dec_rnn_size)]
            for _ in range(hp.dec_layer_num):
                cells.append(ResidualWrapper(GRUCell(hp.dec_rnn_size)))

            # [N, T_in, 256]
            decoder_cell = MultiRNNCell(cells, state_is_tuple=True)

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.reduction_factor)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if hp.model_type == "deepvoice":
                # decoder_init_state[0] : AttentionWrapperState
                # = cell_state + attention + time + alignments + alignment_history
                # decoder_init_state[0][0] = attention_rnn_init_state (already applied)
                decoder_init_state = list(decoder_init_state)

                for idx, cell in enumerate(decoder_rnn_init_states):
                    shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                    shape2 = cell.get_shape().as_list()
                    if shape1 != shape2:
                        raise Exception(" [!] Shape {} and {} should be equal". \
                                format(shape1, shape2))
                    decoder_init_state[idx + 1] = cell

                decoder_init_state = tuple(decoder_init_state)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                            hp.reduction_factor,
                                            rnn_decoder_test_mode)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.reduction_factor)

            (decoder_outputs, _), final_decoder_state, _ = \
                    tf.contrib.seq2seq.dynamic_decode(
                            BasicDecoder(output_cell, helper, decoder_init_state),
                            maximum_iterations=hp.max_iters)

            # [N, T_out, M]
            mel_outputs = tf.reshape(decoder_outputs,
                                     [batch_size, -1, hp.num_mels])

            # Add post-processing CBHG:
            # [N, T_out, 256]
            #post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)
            post_outputs = cbhg(mel_outputs,
                                None,
                                is_training,
                                hp.post_bank_size,
                                hp.post_bank_channel_size,
                                hp.post_maxpool_width,
                                hp.post_highway_depth,
                                hp.post_rnn_size,
                                hp.post_proj_sizes,
                                hp.post_proj_width,
                                scope='post_cbhg')

            if speaker_embed is not None and hp.model_type == 'simple':
                expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
                tiled_speaker_embedding = tf.tile(
                    expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1])

                # [N, T_out, 256 + alpha]
                post_outputs = \
                        tf.concat([tiled_speaker_embedding, post_outputs], axis=-1)

            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.speaker_id = speaker_id
            self.input_lengths = input_lengths
            self.loss_coeff = loss_coeff
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.final_decoder_state = final_decoder_state

            log('=' * 40)
            log(' model_type: %s' % hp.model_type)
            log('=' * 40)

            log('Initialized Tacotron model. Dimensions: ')
            log('    embedding:                %d' %
                char_embedded_inputs.shape[-1])
            if speaker_embed is not None:
                log('    speaker embedding:        %d' %
                    speaker_embed.shape[-1])
            else:
                log('    speaker embedding:        None')
            log('    prenet out:               %d' % prenet_outputs.shape[-1])
            log('    encoder out:              %d' % encoder_outputs.shape[-1])
            log('    attention out:            %d' %
                attention_cell.output_size)
            log('    concat attn & out:        %d' % concat_cell.output_size)
            log('    decoder cell out:         %d' % decoder_cell.output_size)
            log('    decoder out (%d frames):  %d' %
                (hp.reduction_factor, decoder_outputs.shape[-1]))
            log('    decoder out (1 frame):    %d' % mel_outputs.shape[-1])
            log('    postnet out:              %d' % post_outputs.shape[-1])
            log('    linear out:               %d' % linear_outputs.shape[-1])
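
The constructor above stops at logging its dimensions; the actual loss is defined elsewhere in the original model. Purely as a hedged sketch (the L1 form and the use of loss_coeff as a per-example weight are assumptions, not the repository's code), a Tacotron-style loss over the tensors stored above might look like:

# Hedged sketch only: per-example L1 reconstruction losses, weighted by loss_coeff.
mel_l1 = tf.reduce_mean(tf.abs(self.mel_targets - self.mel_outputs), axis=[1, 2])           # [N]
linear_l1 = tf.reduce_mean(tf.abs(self.linear_targets - self.linear_outputs), axis=[1, 2])  # [N]
self.loss = tf.reduce_mean(self.loss_coeff * (mel_l1 + linear_l1))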
コード例 #27
import numpy as np
import tensorflow as tf
from tensorflow.contrib.rnn import BasicLSTMCell, GRUCell

tf.reset_default_graph()

x = np.random.randn(2, 4, 5)

print(x)

# x[1, 1:] = 0  # uncomment to zero out all but the first step of the second sample

print(x)

seq_lengths = [4, 4]

# Build an LSTM cell and a GRU cell separately and compare their output states
cell = BasicLSTMCell(num_units=3, state_is_tuple=True)
gru = GRUCell(3)

outputs, last_states = tf.nn.dynamic_rnn(cell,
                                         x,
                                         seq_lengths,
                                         dtype=tf.float64)

gruoutput, grulast_states = tf.nn.dynamic_rnn(gru,
                                              x,
                                              seq_lengths,
                                              dtype=tf.float64)

sess = tf.InteractiveSession()

sess.run(tf.global_variables_initializer())
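
The graph above is only built and initialized; the states are never actually fetched. A minimal sketch of the comparison step, reusing the names from the snippet (nothing below comes from the original listing):

# Run both RNNs on the same input and compare the returned final states.
# BasicLSTMCell (state_is_tuple=True) returns an LSTMStateTuple (c, h); GRUCell returns a single tensor.
lstm_out, lstm_state, gru_out, gru_state = sess.run(
    [outputs, last_states, gruoutput, grulast_states])
print('LSTM final state (c, h):', lstm_state)
print('GRU final state:', gru_state)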
コード例 #28
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   mel_lengths=None,
                   linear_targets=None):
        '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      mel_lengths: int32 Tensor with shape [N] giving the number of frames in each entry of
        mel_targets. Only needed when hp.use_vae is enabled.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
    '''
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(
                embedding_table, inputs)  # [N, T_in, embed_depth=256]

            # Encoder
            prenet_outputs = prenet(
                embedded_inputs, is_training,
                hp.prenet_depths)  # [N, T_in, prenet_depths[-1]=128]
            encoder_outputs = encoder_cbhg(
                prenet_outputs,
                input_lengths,
                is_training,  # [N, T_in, encoder_depth=256]
                hp.encoder_depth)

            if hp.use_vae:
                style_embeddings, mu, log_var = VAE(inputs=mel_targets,
                                                    input_lengths=mel_lengths,
                                                    filters=hp.filters,
                                                    kernel_size=(3, 3),
                                                    strides=(2, 2),
                                                    num_units=hp.vae_dim,
                                                    is_training=is_training,
                                                    scope='vae')

                self.mu = mu
                self.log_var = log_var
                style_embeddings = tf.layers.dense(style_embeddings,
                                                   hp.encoder_depth)
                style_embeddings = tf.expand_dims(style_embeddings, axis=1)
                style_embeddings = tf.tile(
                    style_embeddings,
                    [1, shape_list(encoder_outputs)[1], 1])  # [N, T_in, 256]
                encoder_outputs = encoder_outputs + style_embeddings

            # Attention
            attention_cell = AttentionWrapper(
                GRUCell(hp.attention_depth),
                BahdanauAttention(hp.attention_depth, encoder_outputs),
                alignment_history=True,
                output_attention=False)  # [N, T_in, attention_depth=256]

            # Apply prenet before concatenation in AttentionWrapper.
            attention_cell = DecoderPrenetWrapper(attention_cell, is_training,
                                                  hp.prenet_depths)

            # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell)  # [N, T_in, 2*attention_depth=512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell(
                [
                    OutputProjectionWrapper(concat_cell, hp.decoder_depth),
                    ResidualWrapper(GRUCell(hp.decoder_depth)),
                    ResidualWrapper(GRUCell(hp.decoder_depth))
                ],
                state_is_tuple=True)  # [N, T_in, decoder_depth=256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                            hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.outputs_per_step)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing CBHG:
            post_outputs = post_cbhg(
                mel_outputs,
                hp.num_mels,
                is_training,  # [N, T_out, postnet_depth=256]
                hp.postnet_depth)
            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.mel_lengths = mel_lengths
            self.linear_targets = linear_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:               %d' % embedded_inputs.shape[-1])
            log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames):  %d' %
                (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
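
The snippet stores self.mu and self.log_var but never uses them, so the KL term of the VAE objective is presumably added elsewhere. As a hedged sketch, the standard Gaussian KL divergence such a loss would include (how it is weighted or annealed against the reconstruction losses is not shown here):

# Hedged sketch: KL(q(z|x) || N(0, I)) for the VAE branch above.
kl_loss = -0.5 * tf.reduce_mean(
    tf.reduce_sum(1.0 + self.log_var - tf.square(self.mu) - tf.exp(self.log_var), axis=1))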
コード例 #29
num_samples = tf.shape(inputs)[0]  # useful for later

# Embedding weights
We = np.random.randn(V, embedding_dim).astype(np.float32)

# Output layer parameters
Wo = init_weight(hidden_layer_size, K).astype(np.float32)
bo = np.zeros(K).astype(np.float32)

# Create TensorFlow variables
tfWe = tf.Variable(We)
tfWo = tf.Variable(Wo)
tfbo = tf.Variable(bo)

# Building the RNN unit - Using the GRU RNN
rnn_unit = GRUCell(num_units=hidden_layer_size, activation=tf.nn.relu)

# Outputs from Embedding Layer
x = tf.nn.embedding_lookup(tfWe, inputs)
x = tf.unstack(x, sequence_length, 1)

# Outputs from the RNN layer
outputs, states = get_rnn_output(rnn_unit, x, dtype=tf.float32)
outputs = tf.transpose(outputs, (1, 0, 2))
outputs = tf.reshape(
    outputs, (sequence_length * num_samples, hidden_layer_size))  # NT x M

# Building the dense layer
logits = tf.matmul(outputs, tfWo) + tfbo  # NT x K
predictions = tf.argmax(logits, 1)
predict_op = tf.reshape(predictions, (num_samples, sequence_length))
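
The fragment ends after predict_op. A hedged sketch of how a loss and training op could be attached to the flattened logits (the labels placeholder and the Adam learning rate below are hypothetical, not part of the original):

# Hedged sketch: sequence-tagging loss on the flattened (NT x K) logits above.
labels = tf.placeholder(tf.int32, shape=(None, sequence_length))  # hypothetical targets
labels_flat = tf.reshape(labels, (-1,))                           # NT
cost = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels_flat, logits=logits))
train_op = tf.train.AdamOptimizer(1e-3).minimize(cost)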
コード例 #30
def build_policy_raw_rnn(hyper_parms, batch_size):

    rnn_inputs = hyper_parms['model_input']
    policy_rnn_cell_num = hyper_parms['policy_rnn_cell_num']
    policy_rnn_type = hyper_parms['policy_rnn_type']
    sequence_length = hyper_parms['sequence_len']
    assigned_seg_act = hyper_parms['assigned_seg_act']
    policy_rnn_layer_num = hyper_parms['policy_rnn_layer_num']
    reproduce_policy = hyper_parms['reproduce_policy']
    greedy_policy = hyper_parms['greedy_policy']

    cells = []
    for _ in range(policy_rnn_layer_num):
        if policy_rnn_type == 'gru':
            rnn_cell = GRUCell(policy_rnn_cell_num)
        elif policy_rnn_type == 'lstm':
            #rnn_cell = LayerNormBasicLSTMCell(policy_rnn_cell_num)
            rnn_cell = LSTMCell(policy_rnn_cell_num)
        else:
            raise ValueError('RNN type should be LSTM or GRU')
        cells.append(rnn_cell)
    cell = tf.contrib.rnn.MultiRNNCell(cells)
    cell = PolicyRNNCell(cell)

    inputs = transpose_batch_time(rnn_inputs)
    inputs_ta = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
    inputs_ta = inputs_ta.unstack(inputs)

    seg_act = transpose_batch_time(assigned_seg_act)
    seg_act_ta = tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True)
    seg_act_ta = seg_act_ta.unstack(seg_act)

    loop_state_ta = tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True)

    def loop_fn(time, cell_output, cell_state, loop_state):
        # check whether this is the initial condition (time == 0)
        if cell_output is None:  # time == 0
            next_cell_state = cell.zero_state(batch_size, tf.float32)
        else:
            next_cell_state = cell_state
        # check whether all sequences have finished
        elements_finished = (time >= tf.cast(sequence_length, tf.int32))
        finished = tf.reduce_all(elements_finished)

        # decide which action to take
        if cell_output is None:
            next_loop_state = loop_state_ta
        else:
            action = tf.cond(
                reproduce_policy,
                lambda: seg_act_ta.read(time),
                lambda: tf.multinomial(
                    tf.log(cell_output), 1, output_dtype=tf.int32))

            action = tf.cond(
                greedy_policy,
                lambda: tf.expand_dims(
                    tf.argmax(cell_output, axis=1, output_type=tf.int32), 1),
                lambda: action)

        next_input = tf.cond(
            finished, lambda: tf.zeros(
                [batch_size, rnn_inputs.get_shape()[-1]], dtype=tf.float32),
            lambda: inputs_ta.read(time))

        emit_output = cell_output  # == None for time == 0

        # write the sampled action into the loop state
        if cell_output is None:  # time == 0
            next_loop_state = loop_state_ta
        else:
            next_loop_state = loop_state.write(time - 1, action)

        return (elements_finished, next_input, next_cell_state, emit_output,
                next_loop_state)

    outputs_ta, _, loop_state_ta = tf.nn.raw_rnn(cell, loop_fn)
    outputs = convert_raw_rnn_ta_to_tensor(outputs_ta)
    sampled_actions = convert_raw_rnn_ta_to_tensor(loop_state_ta)

    return outputs, sampled_actions
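
A hedged usage sketch of the builder above; the hyper-parameter values and placeholders are illustrative only, and PolicyRNNCell, transpose_batch_time and convert_raw_rnn_ta_to_tensor must still come from the surrounding project:

# Hedged sketch with made-up shapes and hyper-parameters.
batch_size = 16
hyper_parms = {
    'model_input': tf.placeholder(tf.float32, [batch_size, None, 64]),
    'policy_rnn_cell_num': 128,
    'policy_rnn_type': 'gru',
    'sequence_len': tf.placeholder(tf.int32, [batch_size]),
    'assigned_seg_act': tf.placeholder(tf.int32, [batch_size, None]),
    'policy_rnn_layer_num': 2,
    'reproduce_policy': tf.placeholder(tf.bool, []),
    'greedy_policy': tf.placeholder(tf.bool, []),
}
policy_outputs, sampled_actions = build_policy_raw_rnn(hyper_parms, batch_size)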