Example 1
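These examples reference module-level imports, hyperparameters and helper modules that are not shown on this page. A minimal sketch of the assumed setup for Example 1 (every concrete value below is an illustrative guess, not taken from the original source) might look like this:

# Assumed module-level setup; all values are placeholders.
import datetime

import numpy
import tensorflow as tf
from sklearn.metrics import accuracy_score, f1_score, recall_score

import data_helpers            # project-specific batching helpers
import dependency_load_data    # project-specific data loading

SEED = 66478
EMBEDDING_SIZE = 300           # dimensionality of the word embeddings
max_document_length = 100      # padded sentence length
NUM_CHANNELS = 1               # the sentence matrix has a single channel
NUM_CLASSES = 19               # number of classes (guess)
BATCH_SIZE = 64
NUM_EPOCHS = 100
EVAL_FREQUENCY = 100           # evaluate on the held-out split every N batches
META_FREQUENCY = 100           # record run metadata every N batches
start_learning_rate = 1e-3
min_learning_rate = 1e-5
learning_rate_decay = 0.5
decay_delta = 0.005
steps_each_check = 500

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('summaries_dir', '/tmp/train_logs',
                           'directory for TensorBoard summaries')
# Examples 2-4 additionally rely on d_c, num_hidden, rnn_layer, NUM_STEPS
# and Test_Size, which would be defined in the same way.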
def train(argv=None):
    # load data
    print("Loading data ... ")
    x_train, y_train = dependency_load_data.load_train_data()
    x_test, y_test = dependency_load_data.load_test_data()

    # concatenate and shuffle
    x_sum = numpy.concatenate((x_train, x_test))
    y_sum = numpy.concatenate((y_train, y_test))
    numpy.random.seed(10)
    shuffle_indices = numpy.random.permutation(numpy.arange(len(y_sum)))
    x_shuffled = x_sum[shuffle_indices]
    y_shuffled = y_sum[shuffle_indices]

    # split into train and test sets
    x_train = x_shuffled[1000:]
    y_train = y_shuffled[1000:]
    x_test = x_shuffled[:1000]
    y_test = y_shuffled[:1000]

    print(x_train.shape)
    print(x_test.shape)

    # expand (batch_size,MAX_SENTENCE_LENGTH,EMBEDDING_SIZE) to (batch_size,MAX_SENTENCE_LENGTH,EMBEDDING_SIZE,1)
    x_train = numpy.expand_dims(x_train, -1)
    x_test = numpy.expand_dims(x_test, -1)

    filter_sizes = [2, 3, 4, 5, 6]
    filter_numbers = [300, 200, 150, 100, 100]

    # input: a batch of sentence embedding matrices, each with a single channel
    train_data_node = tf.placeholder(tf.float32,
                                     shape=(None, max_document_length,
                                            EMBEDDING_SIZE, NUM_CHANNELS))

    train_labels_node = tf.placeholder(tf.float32, shape=(None, NUM_CLASSES))

    dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

    # fully connected softmax layer
    fc1_weights = tf.Variable(
        tf.truncated_normal([sum(filter_numbers), NUM_CLASSES],
                            stddev=0.1,
                            seed=SEED,
                            dtype=tf.float32))

    fc1_biases = tf.Variable(
        tf.constant(0.01, shape=[NUM_CLASSES], dtype=tf.float32))

    # model
    def model(data):
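        # TextCNN: parallel convolutions with window sizes 2-6 over the
        # sentence matrix, 1-max pooling of each feature map, concatenation
        # of the pooled features, dropout, and a final linear projection
        # to the class scores.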
        pooled_outputs = []
        for idx, filter_size in enumerate(filter_sizes):
            conv = conv2d(data,
                          filter_numbers[idx],
                          filter_size,
                          EMBEDDING_SIZE,
                          name="kernel%d" % idx)
            # 1-max pooling leaves a tensor of shape [batch_size, 1, 1, num_filters]
            pool = tf.nn.max_pool(
                conv,
                ksize=[1, max_document_length - filter_size + 1, 1, 1],
                strides=[1, 1, 1, 1],
                padding='VALID')
            pooled_outputs.append(tf.squeeze(pool))

        if len(filter_sizes) > 1:
            cnn_output = tf.concat(1, pooled_outputs)
        else:
            cnn_output = pooled_outputs[0]

        # add dropout
        reshape = tf.nn.dropout(cnn_output, dropout_keep_prob)
        # fc1 layer
        fc1_output = tf.matmul(reshape, fc1_weights) + fc1_biases
        return fc1_output

    # Training computation
    logits = model(train_data_node)
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            tf.clip_by_value(logits, 1e-10, 1.0), train_labels_node))
    # L2 regularization for the fully connected parameters.
    regularizers = (tf.nn.l2_loss(fc1_weights) + tf.nn.l2_loss(fc1_biases))
    loss += 0.1 * regularizers

    tf.scalar_summary('loss', loss)

    # optimizer
    global_step = tf.Variable(0, name="global_step", trainable=False)
    learning_rate = tf.Variable(start_learning_rate, name="learning_rate")
    # learning_rate=tf.train.exponential_decay(start_learning_rate,global_step*BATCH_SIZE,train_size,0.9,staircase=True)
    tf.scalar_summary('lr', learning_rate)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    grads_and_vars = optimizer.compute_gradients(loss)
    train_op = optimizer.apply_gradients(grads_and_vars,
                                         global_step=global_step)

    # Evaluate model
    train_predict = tf.argmax(logits, 1)
    train_label = tf.argmax(train_labels_node, 1)
    # train accuracy
    train_correct_pred = tf.equal(train_predict, train_label)
    train_accuracy = tf.reduce_mean(tf.cast(train_correct_pred, tf.float32))
    tf.scalar_summary('acc', train_accuracy)
    merged = tf.merge_all_summaries()

    def compute_index(y_label, y_predict):
        # macro
        print("{}: acc {:g}, recall {:g}, f1 {:g} ".format(
            "macro", accuracy_score(y_label, y_predict),
            recall_score(y_label, y_predict, average='macro'),
            f1_score(y_label, y_predict, average='macro')))
        # micro
        print("{}: acc {:g}, recall {:g}, f1 {:g} ".format(
            "micro", accuracy_score(y_label, y_predict),
            recall_score(y_label, y_predict, average='micro'),
            f1_score(y_label, y_predict, average='micro')))

        # weighted
        print("{}: acc {:g}, recall {:g}, f1 {:g} ".format(
            "weighted", accuracy_score(y_label, y_predict),
            recall_score(y_label, y_predict, average='weighted'),
            f1_score(y_label, y_predict, average='weighted')))

    def dev_step(x_batch, y_batch, best_test_loss, sess):
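        # Evaluate the whole held-out split in a single forward pass and,
        # when a check is due, decay the learning rate if the validation
        # loss has not improved enough.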
        feed_dict = {
            train_data_node: x_batch,
            train_labels_node: y_batch,
            dropout_keep_prob: 1.0
        }
        # Run the graph and fetch some of the nodes.
        # the dev step does not run train_op, so no gradient update is applied
        summary, step, losses, lr, acc, y_label, y_predict = sess.run(
            [
                merged, global_step, loss, learning_rate, train_accuracy,
                train_label, train_predict
            ],
            feed_dict=feed_dict)
        test_writer.add_summary(summary, step)
        time_str = datetime.datetime.now().isoformat()
        print("{}: step {}, loss {:g}, lr {:g} ,acc {:g}".format(
            time_str, step, losses, lr, acc))
        # print("{}: step {}, loss {:g} ,acc {:g}".format(time_str, step, losses,acc))
        # compute index
        compute_index(y_label, y_predict)

        new_best_test_loss = best_test_loss
        # decide whether to decay the learning rate
        if (step % steps_each_check < 100) and (step > 100):
            loss_delta = (best_test_loss
                          if best_test_loss is not None else 0) - losses
            if best_test_loss is not None and loss_delta < decay_delta:
                print(
                    'validation loss did not improve enough, decay learning rate'
                )
                current_learning_rate = max(lr * learning_rate_decay, min_learning_rate)
                if current_learning_rate == min_learning_rate:
                    print('It is already the smallest learning rate.')
                sess.run(learning_rate.assign(current_learning_rate))
                print('new learning rate is: ', current_learning_rate)
            else:
                # update
                new_best_test_loss = losses

        return new_best_test_loss

    # run the training
    with tf.Session() as sess:
        train_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/train',
                                              sess.graph)
        test_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/test')
        tf.initialize_all_variables().run()
        print('Initialized!')
        # Generate batches
        batches = data_helpers.batch_iter(list(zip(x_train, y_train)),
                                          BATCH_SIZE, NUM_EPOCHS)
        # batch count
        batch_count = 0
        best_test_loss = None
        # Training loop. For each batch ...
        for batch in batches:
            batch_count += 1
            if batch_count % EVAL_FREQUENCY == 0:
                print("\nEvaluation:")
                best_test_loss = dev_step(x_test, y_test, best_test_loss, sess)
                print("")
            else:
                if batch_count % META_FREQUENCY == 99:
                    x_batch, y_batch = zip(*batch)
                    feed_dict = {
                        train_data_node: x_batch,
                        train_labels_node: y_batch,
                        dropout_keep_prob: 0.5
                    }
                    # Run the graph and fetch some of the nodes.
                    # option
                    run_options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()
                    _, summary, step, losses, acc = sess.run(
                        [train_op, merged, global_step, loss, train_accuracy],
                        feed_dict=feed_dict,
                        options=run_options,
                        run_metadata=run_metadata)
                    train_writer.add_run_metadata(run_metadata,
                                                  'step%03d' % step)
                    train_writer.add_summary(summary, step)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g},acc {:g}".format(
                        time_str, step, losses, acc))
                else:
                    x_batch, y_batch = zip(*batch)
                    feed_dict = {
                        train_data_node: x_batch,
                        train_labels_node: y_batch,
                        dropout_keep_prob: 0.5
                    }
                    # Run the graph and fetch some of the nodes.
                    _, summary, step, losses, acc = sess.run(
                        [train_op, merged, global_step, loss, train_accuracy],
                        feed_dict=feed_dict)
                    train_writer.add_summary(summary, step)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(
                        time_str, step, losses, acc))

        train_writer.close()
        test_writer.close()
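Example 1 (and Example 3 below) call a conv2d helper defined elsewhere in the project. A plausible reconstruction under the same old TensorFlow API, with all argument names assumed, is:

# Assumed reconstruction of the conv2d helper used above (not the original).
def conv2d(inputs, num_filters, filter_size, width, name="conv"):
    # Full-width text convolution: the filter spans `filter_size` rows and the
    # whole feature width, so the output has shape
    # [batch, rows - filter_size + 1, 1, num_filters].
    with tf.variable_scope(name):
        in_channels = inputs.get_shape()[-1].value
        kernel = tf.Variable(
            tf.truncated_normal([filter_size, width, in_channels, num_filters],
                                stddev=0.1, seed=SEED, dtype=tf.float32),
            name="weights")
        biases = tf.Variable(
            tf.constant(0.01, shape=[num_filters], dtype=tf.float32),
            name="biases")
        conv = tf.nn.conv2d(inputs, kernel,
                            strides=[1, 1, 1, 1], padding='VALID')
        return tf.nn.relu(tf.nn.bias_add(conv, biases))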
Example 2
def train(argv=None):
    # load data
    print("Loading data ... ")
    x_train, y_train = dependency_load_data.load_train_data()
    x_test, y_test = dependency_load_data.load_test_data()

    # concatenate and shuffle
    x_sum = numpy.concatenate((x_train, x_test))
    y_sum = numpy.concatenate((y_train, y_test))
    numpy.random.seed(10)
    shuffle_indices = numpy.random.permutation(numpy.arange(len(y_sum)))
    x_shuffled = x_sum[shuffle_indices]
    y_shuffled = y_sum[shuffle_indices]

    # split into train and test sets
    x_train = x_shuffled[1000:]
    y_train = y_shuffled[1000:]
    x_test = x_shuffled[:1000]
    y_test = y_shuffled[:1000]

    print(x_train.shape)
    print(x_test.shape)

    # input is sentence
    # [n,embed]
    train_data_node = tf.placeholder(tf.float32,
                                     shape=(max_document_length,
                                            EMBEDDING_SIZE))
    # [num_class]
    train_labels_node = tf.placeholder(tf.float32, shape=(NUM_CLASSES, ))

    dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

    # convolution weight
    wf_weights = tf.Variable(
        tf.truncated_normal([d_c, EMBEDDING_SIZE],
                            stddev=0.1,
                            seed=SEED,
                            dtype=tf.float32))

    wf_biases = tf.Variable(
        tf.constant(0.01, shape=[max_document_length], dtype=tf.float32))

    # attention matrix
    u_weights = tf.Variable(
        tf.truncated_normal([d_c, d_c],
                            stddev=0.1,
                            seed=SEED,
                            dtype=tf.float32))

    # class embeddings matrix
    classes_matrix = tf.Variable(
        tf.truncated_normal([d_c, NUM_CLASSES],
                            stddev=0.1,
                            seed=SEED,
                            dtype=tf.float32))

    # model
    # data = [max_document_length,EMBEDDING_SIZE]
    def model(data):
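        # Attention-pooling model: each word embedding is projected to a
        # d_c-dimensional feature (a window-1 convolution), a softmax over
        # positions of the feature/class-embedding correlations gives an
        # attention-pooling matrix, and the sentence vector is the
        # per-dimension maximum of the attention-pooled features.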
        # R = [d_c,n]
        R = tf.matmul(wf_weights, data, transpose_b=True)
        # convolution_output = [d_c,n]
        convolution_output = tf.tanh(tf.nn.bias_add(R, wf_biases))
        # attention
        G_part = tf.matmul(tf.transpose(convolution_output), u_weights)
        # correlation_matrix = [n,num_class]
        correlation_matrix = tf.matmul(G_part, classes_matrix)
        # apply softmax to get attention pooling matrix
        # attention_pool = [n,num_class]
        attention_pool = tf.nn.softmax(correlation_matrix, dim=0)
        # compute output
        # W = [d_c , num_class]
        W = tf.matmul(convolution_output, attention_pool)
        # output = [d_c]
        output = tf.reduce_max(W, reduction_indices=-1)
        return output

    # score all classes
    # w_o = [d_c]
    # classes_embeddings = [num_class,d_c]
    def score_classes(w_o, classes_embeddings):
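        # Score each class by half the squared L2 distance between the
        # normalized sentence vector w_o and the normalized class embedding;
        # a smaller score means a better match.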
        # classes_embeddings normalized
        normalized_classes_embeddings = tf.nn.l2_normalize(classes_embeddings,
                                                           dim=-1)
        all_class_embeddings = [
            tf.squeeze(one)
            for one in tf.split(0, NUM_CLASSES, normalized_classes_embeddings)
        ]
        scores = []
        normalized_w_o = tf.nn.l2_normalize(w_o, dim=-1)
        for class_embedding in all_class_embeddings:
            scores.append(tf.nn.l2_loss(normalized_w_o - class_embedding))
        # transform to tensor
        scores = tf.pack(scores)
        return scores

    # label = [num_class],int
    # scores = [num_class],float
    # the negative score is the lowest score excluding the true class's score
    def get_predict_neg_score(scores, label):
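        # ground_score: score of the true (one-hot) class.
        # chosen_score: the best (smallest) score among all other classes,
        # used as the negative class in the ranking loss below.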
        # dot product
        ground_index = tf.argmax(label, axis=0)
        ground_score = tf.reduce_sum(tf.mul(scores, tf.cast(label,
                                                            tf.float32)))
        # the negative class is the best-scoring (smallest) class other than the true one
        reversed_scores = tf.negative(scores)
        top_values, top_indices = tf.nn.top_k(reversed_scores, k=2)
        true_flag = tf.nn.in_top_k(tf.expand_dims(reversed_scores, 0),
                                   tf.expand_dims(ground_index, 0), 1)
        top_1_index = tf.cast(true_flag, tf.int32)
        chosen_score = tf.negative(
            tf.squeeze(tf.gather(top_values, top_1_index)))
        return ground_score, chosen_score

    def get_true_predict_indice(scores, label):
        true_indices = tf.argmax(label, axis=0)
        top_value, top_indices = tf.nn.top_k(tf.negative(scores), k=1)
        predict_indices = tf.squeeze(tf.pack(top_indices))
        return true_indices, predict_indices

    # Training computation
    w_o = model(train_data_node)
    scores = score_classes(w_o, tf.transpose(classes_matrix))
    true_score, neg_score = get_predict_neg_score(scores, train_labels_node)
    true_index, predict_index = get_true_predict_indice(
        scores, train_labels_node)
    # ranking loss with margin 1: push the true-class score at least 1 below
    # the best wrong-class score
    loss = true_score + 1 - neg_score
    # L2 regularization for the fully connected parameters.
    regularizers = tf.nn.l2_loss(wf_weights) + tf.nn.l2_loss(
        wf_biases) + tf.nn.l2_loss(u_weights) + tf.nn.l2_loss(classes_matrix)
    loss += 0.01 * regularizers

    tf.scalar_summary('loss', loss)

    # optimizer
    global_step = tf.Variable(0, name="global_step", trainable=False)
    learning_rate = tf.Variable(start_learning_rate, name="learning_rate")
    # learning_rate=tf.train.exponential_decay(start_learning_rate,global_step*BATCH_SIZE,train_size,0.9,staircase=True)
    tf.scalar_summary('lr', learning_rate)
    # optimizer = tf.train.AdamOptimizer(learning_rate)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    grads_and_vars = optimizer.compute_gradients(loss)
    train_op = optimizer.apply_gradients(grads_and_vars,
                                         global_step=global_step)

    # Evaluate model: 0 is wrong, 1 is right
    # train_is_correct = tf.cast(tf.equal(true_index,predict_index),tf.float32)

    merged = tf.merge_all_summaries()

    def compute_index(y_label, y_predict):
        # macro
        print("{}: acc {:g}, recall {:g}, f1 {:g} ".format(
            "macro", accuracy_score(y_label, y_predict),
            recall_score(y_label, y_predict, average='macro'),
            f1_score(y_label, y_predict, average='macro')))
        # micro
        print("{}: acc {:g}, recall {:g}, f1 {:g} ".format(
            "micro", accuracy_score(y_label, y_predict),
            recall_score(y_label, y_predict, average='micro'),
            f1_score(y_label, y_predict, average='micro')))

        # weighted
        print("{}: acc {:g}, recall {:g}, f1 {:g} ".format(
            "weighted", accuracy_score(y_label, y_predict),
            recall_score(y_label, y_predict, average='weighted'),
            f1_score(y_label, y_predict, average='weighted')))

    def dev_step(x_batch, y_batch, best_test_loss, sess):
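        # The Example 2 graph scores one sentence at a time (no batch
        # dimension), so the dev split is evaluated sample by sample and
        # the losses are averaged.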
        test_size = len(x_batch)
        true_label = []
        predict_label = []
        test_loss = []
        current_step = 0
        current_lr = 0
        for i in range(test_size):
            one_feed_dict = {
                train_data_node: x_batch[i],
                train_labels_node: y_batch[i],
                dropout_keep_prob: 1.0
            }
            # Run the graph and fetch some of the nodes.
            # the dev step does not run train_op, so no gradient update is applied
            test_step, lr, result_loss, result_true, result_predict = sess.run(
                [global_step, learning_rate, loss, true_index, predict_index],
                feed_dict=one_feed_dict)
            true_label.append(result_true)
            predict_label.append(result_predict)
            test_loss.append(result_loss)
            # test_writer.add_summary(test_summary, test_step)
            current_step = test_step
            current_lr = lr

        # compute average loss
        average_loss = numpy.mean(test_loss)
        test_time_str = datetime.datetime.now().isoformat()
        print("{}: step {}, loss {:g}, lr {:g} ".format(
            test_time_str, current_step, average_loss, current_lr))
        # compute index
        compute_index(true_label, predict_label)

        new_best_test_loss = best_test_loss
        # decide whether to decay the learning rate
        if (test_step % steps_each_check < 100) and (test_step > 100):
            loss_delta = (best_test_loss
                          if best_test_loss is not None else 0) - average_loss
            if best_test_loss is not None and loss_delta < decay_delta:
                print(
                    'validation loss did not improve enough, decay learning rate'
                )
                current_learning_rate = max(lr * learning_rate_decay, min_learning_rate)
                if current_learning_rate == min_learning_rate:
                    print('It is already the smallest learning rate.')
                sess.run(learning_rate.assign(current_learning_rate))
                print('new learning rate is: ', current_learning_rate)
            else:
                # update
                new_best_test_loss = average_loss

        return new_best_test_loss

    # run the training
    with tf.Session() as sess:
        # train_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/train',sess.graph)
        # test_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/test')
        tf.initialize_all_variables().run()
        print('Initialized!')
        # Generate batches
        batches = data_helpers.batch_iter(list(zip(x_train, y_train)),
                                          BATCH_SIZE, NUM_EPOCHS)
        # batch count
        batch_count = 0
        best_test_loss = None
        # Training loop. For each batch ...
        for batch in batches:
            batch_count += 1
            if batch_count % EVAL_FREQUENCY == 0:
                print("\nEvaluation:")
                best_test_loss = dev_step(x_test, y_test, best_test_loss, sess)
                print("")
            else:
                if batch_count % META_FREQUENCY == 99:
                    x_batch, y_batch = zip(*batch)
                    train_size = len(x_batch)
                    true_label = []
                    predict_label = []
                    train_loss = []
                    # Run the graph and fetch some of the nodes.
                    # option
                    run_options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()
                    current_step = 0
                    for i in range(train_size):
                        feed_dict = {
                            train_data_node: x_batch[i],
                            train_labels_node: y_batch[i],
                            dropout_keep_prob: 0.5
                        }
                        _, step, result_loss, result_true, result_predict = sess.run(
                            [
                                train_op, global_step, loss, true_index,
                                predict_index
                            ],
                            feed_dict=feed_dict,
                            options=run_options,
                            run_metadata=run_metadata)
                        true_label.append(result_true)
                        predict_label.append(result_predict)
                        train_loss.append(result_loss)
                        current_step = step
                        # train_writer.add_run_metadata(run_metadata, 'step%03d' % step)
                        # train_writer.add_summary(summary, step)

                    # compute average loss
                    average_loss = numpy.mean(train_loss)
                    acc = accuracy_score(true_label, predict_label)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g} acc {:g}".format(
                        time_str, current_step, average_loss, acc))
                else:
                    x_batch, y_batch = zip(*batch)
                    train_size = len(x_batch)
                    true_label = []
                    predict_label = []
                    train_loss = []
                    current_step = 0
                    for i in range(train_size):
                        feed_dict = {
                            train_data_node: x_batch[i],
                            train_labels_node: y_batch[i],
                            dropout_keep_prob: 0.5
                        }
                        _, step, result_loss, result_true, result_predict = sess.run(
                            [
                                train_op, global_step, loss, true_index,
                                predict_index
                            ],
                            feed_dict=feed_dict)
                        true_label.append(result_true)
                        predict_label.append(result_predict)
                        train_loss.append(result_loss)
                        # train_writer.add_summary(summary, step)
                        current_step = step
                    average_loss = numpy.mean(train_loss)
                    acc = accuracy_score(true_label, predict_label)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g} acc {:g}".format(
                        time_str, current_step, average_loss, acc))
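All four examples draw batches from data_helpers.batch_iter, which is not shown on this page. A sketch of a typical implementation of such a helper (an assumption, not the original code) follows:

# Assumed sketch of data_helpers.batch_iter (typical TextCNN-style helper).
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    # Yield successive mini-batches over `data` for `num_epochs` epochs,
    # reshuffling the examples at the start of every epoch.
    data = numpy.array(data, dtype=object)
    data_size = len(data)
    num_batches_per_epoch = int((data_size - 1) / batch_size) + 1
    for epoch in range(num_epochs):
        if shuffle:
            shuffle_indices = numpy.random.permutation(numpy.arange(data_size))
            epoch_data = data[shuffle_indices]
        else:
            epoch_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield epoch_data[start_index:end_index]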
Example 3
def train(argv=None):
    # load data
    print("Loading data ... ")
    x_train, y_train = dependency_load_data.load_train_data()
    x_test, y_test = dependency_load_data.load_test_data()

    # concatenate and shuffle
    x_sum = numpy.concatenate((x_train, x_test))
    y_sum = numpy.concatenate((y_train, y_test))
    numpy.random.seed(10)
    shuffle_indices = numpy.random.permutation(numpy.arange(len(y_sum)))
    x_shuffled = x_sum[shuffle_indices]
    y_shuffled = y_sum[shuffle_indices]

    # split into train and test sets
    # x=[N_Samples,max_document_length,EMBEDDING_SIZE]
    # y=[N_Samples,NUM_CLASSES]
    x_train = x_shuffled[Test_Size:]
    y_train = y_shuffled[Test_Size:]
    x_test = x_shuffled[:Test_Size]
    y_test = y_shuffled[:Test_Size]

    print(x_train.shape)
    print(x_test.shape)
    print("exception words : " +
          str(dependency_load_data.get_exception_number()))
    # 500
    steps_each_check = 500

    # input: a batch of sentence embedding matrices
    train_data_node = tf.placeholder(tf.float32,
                                     shape=(None, NUM_STEPS, EMBEDDING_SIZE))

    train_labels_node = tf.placeholder(tf.float32, shape=(None, NUM_CLASSES))

    dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
    is_training = tf.placeholder(tf.bool, name="is_training")

    # CNN
    filter_sizes = [2, 3, 4, 5, 6]
    filter_numbers = [300, 200, 150, 100, 100]

    # fully connected layers (fc1 hidden layer, fc2 softmax layer)
    fc1_weights = tf.Variable(
        tf.truncated_normal([sum(filter_numbers), 100],
                            stddev=0.1,
                            seed=SEED,
                            dtype=tf.float32))

    fc1_biases = tf.Variable(tf.constant(0.01, shape=[100], dtype=tf.float32))

    fc2_weights = tf.Variable(
        tf.truncated_normal([100, NUM_CLASSES],
                            stddev=0.1,
                            seed=SEED,
                            dtype=tf.float32))

    fc2_biases = tf.Variable(
        tf.constant(0.01, shape=[NUM_CLASSES], dtype=tf.float32))

    # model
    def model(x):
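        # Bi-directional LSTM over the word embeddings, a highway layer on
        # the concatenated forward/backward outputs, then a TextCNN
        # (parallel convolutions + 1-max pooling) and two fully connected
        # layers on top.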
        # Current data input shape: (batch_size, n_steps, n_input)
        x = tf.transpose(x, [1, 0, 2])
        # (n_steps*batch_size, n_input)
        x = tf.reshape(x, [-1, EMBEDDING_SIZE])
        #  get a list of 'n_steps' tensors of shape (batch_size, n_input)
        x = tf.split(0, NUM_STEPS, x)

        # Bi-directional LSTM
        with tf.variable_scope("fw_cell"):
            fw_cell = tf.nn.rnn_cell.LSTMCell(num_hidden,
                                              forget_bias=1.0,
                                              state_is_tuple=True)
            # fw_cell = tf.nn.rnn_cell.DropoutWrapper(fw_cell, output_keep_prob=dropout_keep_prob)
            if rnn_layer > 1:
                fw_cell = tf.nn.rnn_cell.MultiRNNCell([fw_cell] * rnn_layer)

        with tf.variable_scope("bw_cell"):
            bw_cell = tf.nn.rnn_cell.LSTMCell(num_hidden,
                                              forget_bias=1.0,
                                              state_is_tuple=True)
            # bw_cell = tf.nn.rnn_cell.DropoutWrapper(bw_cell, output_keep_prob=dropout_keep_prob)
            if rnn_layer > 1:
                bw_cell = tf.nn.rnn_cell.MultiRNNCell([bw_cell] * rnn_layer)

        # output = [batch_size,num_hidden*2]
        # outputs of Bi-directional LSTM to highway
        with tf.variable_scope("rnn_def"):
            outputs, fw_final_state, bw_final_state = tf.nn.bidirectional_rnn(
                fw_cell, bw_cell, x, dtype=tf.float32)

        # Highway
        # convert to [batch_size,num_steps,num_hidden*2]
        hw_input = tf.transpose(tf.pack(outputs, axis=0), [1, 0, 2])
        # convert to [batch_size x num_steps,num_hidden*2]
        hw_input = tf.reshape(hw_input, [-1, num_hidden * 2])
        size = hw_input.get_shape()[1]
        # size = num_hidden*2
        # tf.tanh
        # hw_output=[batch_size x num_steps,num_hidden*2]
        hw_output = highways(hw_input, size)

        # convert to [batch_size,num_steps,num_hidden*2]
        hw_output = tf.reshape(hw_output, [-1, NUM_STEPS, num_hidden * 2])
        # expand dim , cnn_input=[batch_size,num_steps,num_hidden*2,1]
        cnn_input = tf.expand_dims(hw_output, -1)

        # CNN
        pooled_outputs = []
        for idx, filter_size in enumerate(filter_sizes):
            conv = conv2d(cnn_input,
                          filter_numbers[idx],
                          filter_size,
                          num_hidden * 2,
                          name="kernel%d" % idx)
            # conv = batch_norm_conv2d(cnn_input,filter_numbers[idx], filter_size,idx,num_hidden*2,is_training,stddev=0.1, name="kernel%d" % idx)
            # 1-max pooling leaves a tensor of shape [batch_size, 1, 1, num_filters]
            pool = tf.nn.max_pool(
                conv,
                ksize=[1, max_document_length - filter_size + 1, 1, 1],
                strides=[1, 1, 1, 1],
                padding='VALID')
            pooled_outputs.append(tf.squeeze(pool))

        if len(filter_sizes) > 1:
            cnn_output = tf.concat(1, pooled_outputs)
        else:
            cnn_output = pooled_outputs[0]

        # add dropout
        cnn_output = tf.nn.dropout(cnn_output, dropout_keep_prob)
        # fc1 layer
        hidden = tf.matmul(cnn_output, fc1_weights)
        # add batch normalization
        # hidden = official_batch_norm_layer(tf.nn.bias_add(hidden,fc1_biases),100,is_training,False,scope="fc1_batch_norm")
        fc1_output = tf.sigmoid(tf.nn.bias_add(hidden, fc1_biases))
        # softmax linear layer; no activation function is applied here
        hidden = tf.matmul(fc1_output, fc2_weights)
        fc2_output = tf.nn.bias_add(hidden, fc2_biases)
        return fc2_output

    # Training computation
    # [batch_size,num_classes]
    logits = model(train_data_node)
    # add value clip to logits
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            tf.clip_by_value(logits, 1e-10, 1.0), train_labels_node))
    regularization = (tf.nn.l2_loss(fc1_weights) + tf.nn.l2_loss(fc1_biases) +
                      tf.nn.l2_loss(fc2_weights) + tf.nn.l2_loss(fc2_biases))
    loss += 0.01 * regularization

    tf.scalar_summary('loss', loss)

    # optimizer
    global_step = tf.Variable(0, name="global_step", trainable=False)
    # learning_rate=tf.train.exponential_decay(start_learning_rate,global_step,5000,0.5,staircase=True)
    learning_rate = tf.Variable(start_learning_rate, name="learning_rate")

    tf.scalar_summary('lr', learning_rate)

    # Adam optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate)
    # optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    grads_and_vars = optimizer.compute_gradients(loss)
    train_op = optimizer.apply_gradients(grads_and_vars,
                                         global_step=global_step)

    # Evaluate model
    train_predict = tf.argmax(logits, 1)
    train_label = tf.argmax(train_labels_node, 1)
    # train accuracy
    train_correct_pred = tf.equal(train_predict, train_label)
    train_accuracy = tf.reduce_mean(tf.cast(train_correct_pred, tf.float32))
    tf.scalar_summary('acc', train_accuracy)

    # all variables
    # for v in tf.all_variables():
    #    print(v.name)

    merged = tf.merge_all_summaries()

    def compute_index(y_label, y_predict):
        # macro
        print("{}: acc {:g}, recall {:g}, f1 {:g} ".format(
            "macro", accuracy_score(y_label, y_predict),
            recall_score(y_label, y_predict, average='macro'),
            f1_score(y_label, y_predict, average='macro')))
        # micro
        print("{}: acc {:g}, recall {:g}, f1 {:g} ".format(
            "micro", accuracy_score(y_label, y_predict),
            recall_score(y_label, y_predict, average='micro'),
            f1_score(y_label, y_predict, average='micro')))

        # weighted
        print("{}: acc {:g}, recall {:g}, f1 {:g} ".format(
            "weighted", accuracy_score(y_label, y_predict),
            recall_score(y_label, y_predict, average='weighted'),
            f1_score(y_label, y_predict, average='weighted')))

    def dev_step(x_batch, y_batch, best_test_loss, sess):
        feed_dict = {
            train_data_node: x_batch,
            train_labels_node: y_batch,
            dropout_keep_prob: 1.0,
            is_training: False
        }
        # Run the graph and fetch some of the nodes.
        # the dev step does not run train_op, so no gradient update is applied
        summary, step, losses, lr, acc, y_label, y_predict = sess.run(
            [
                merged, global_step, loss, learning_rate, train_accuracy,
                train_label, train_predict
            ],
            feed_dict=feed_dict)
        test_writer.add_summary(summary, step)
        time_str = datetime.datetime.now().isoformat()
        print("{}: step {}, loss {:g}, lr {:g} ,acc {:g}".format(
            time_str, step, losses, lr, acc))
        # print("{}: step {}, loss {:g} ,acc {:g}".format(time_str, step, losses,acc))
        # compute index
        compute_index(y_label, y_predict)

        new_best_test_loss = best_test_loss
        # decide whether to decay the learning rate
        if (step % steps_each_check < 100) and (step > 100):
            loss_delta = (best_test_loss
                          if best_test_loss is not None else 0) - losses
            if best_test_loss is not None and loss_delta < decay_delta:
                print(
                    'validation loss did not improve enough, decay learning rate'
                )
                current_learning_rate = max(lr * learning_rate_decay, min_learning_rate)
                if current_learning_rate == min_learning_rate:
                    print('It is already the smallest learning rate.')
                sess.run(learning_rate.assign(current_learning_rate))
                print('new learning rate is: ', current_learning_rate)
            else:
                # update
                new_best_test_loss = losses

        return new_best_test_loss

    # run the training
    with tf.Session() as sess:
        train_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/train',
                                              sess.graph)
        test_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/test')
        tf.initialize_all_variables().run()
        print('Initialized!')
        # Generate batches
        batches = data_helpers.batch_iter(list(zip(x_train, y_train)),
                                          BATCH_SIZE, NUM_EPOCHS)
        # batch count
        batch_count = 0
        best_test_loss = None
        # Training loop. For each batch ...
        for batch in batches:
            batch_count += 1
            if batch_count % EVAL_FREQUENCY == 0:
                print("\nEvaluation:")
                best_test_loss = dev_step(x_test, y_test, best_test_loss, sess)
                print("")
            else:
                if batch_count % META_FREQUENCY == 99:
                    x_batch, y_batch = zip(*batch)
                    feed_dict = {
                        train_data_node: x_batch,
                        train_labels_node: y_batch,
                        dropout_keep_prob: 0.5,
                        is_training: True
                    }
                    # Run the graph and fetch some of the nodes.
                    # option
                    run_options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()
                    _, summary, step, losses, acc = sess.run(
                        [train_op, merged, global_step, loss, train_accuracy],
                        feed_dict=feed_dict,
                        options=run_options,
                        run_metadata=run_metadata)
                    train_writer.add_run_metadata(run_metadata,
                                                  'step%03d' % step)
                    train_writer.add_summary(summary, step)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g},acc {:g}".format(
                        time_str, step, losses, acc))
                else:
                    x_batch, y_batch = zip(*batch)
                    feed_dict = {
                        train_data_node: x_batch,
                        train_labels_node: y_batch,
                        dropout_keep_prob: 0.5,
                        is_training: True
                    }
                    # Run the graph and fetch some of the nodes.
                    _, summary, step, losses, acc = sess.run(
                        [train_op, merged, global_step, loss, train_accuracy],
                        feed_dict=feed_dict)
                    train_writer.add_summary(summary, step)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(
                        time_str, step, losses, acc))

        train_writer.close()
        test_writer.close()
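Example 3 routes the Bi-LSTM outputs through a highways helper that is also not shown. A sketch of a standard highway layer with the signature used above (layer count, gate bias and activation are assumptions, and size is taken to be a plain int) might be:

# Assumed sketch of the highways helper (standard highway network layer).
def highways(x, size, num_layers=1, bias=-2.0, activation=tf.tanh):
    # y = t * g(W_H x + b_H) + (1 - t) * x, where t = sigmoid(W_T x + b_T)
    # is the transform gate; with a negative gate bias the layer initially
    # behaves close to the identity.
    output = x
    for layer in range(num_layers):
        with tf.variable_scope("highway_%d" % layer):
            W_H = tf.Variable(tf.truncated_normal([size, size], stddev=0.1,
                                                  seed=SEED, dtype=tf.float32))
            b_H = tf.Variable(tf.constant(0.01, shape=[size], dtype=tf.float32))
            W_T = tf.Variable(tf.truncated_normal([size, size], stddev=0.1,
                                                  seed=SEED, dtype=tf.float32))
            b_T = tf.Variable(tf.constant(bias, shape=[size], dtype=tf.float32))
            g = activation(tf.matmul(output, W_H) + b_H)
            t = tf.sigmoid(tf.matmul(output, W_T) + b_T)
            output = t * g + (1.0 - t) * output
    return output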
Example 4
def train(argv=None):

    # load data
    print("Loading data ... ")
    x_train, y_train = dependency_load_data.load_train_data()
    x_test, y_test = dependency_load_data.load_test_data()

    # concatenate and shuffle
    x_sum = numpy.concatenate((x_train, x_test))
    y_sum = numpy.concatenate((y_train, y_test))
    numpy.random.seed(10)
    shuffle_indices = numpy.random.permutation(numpy.arange(len(y_sum)))
    x_shuffled = x_sum[shuffle_indices]
    y_shuffled = y_sum[shuffle_indices]

    # split into train and test sets
    # x=[N_Samples,max_document_length,EMBEDDING_SIZE]
    # y=[N_Samples,NUM_CLASSES]
    x_train = x_shuffled[Test_Size:]
    y_train = y_shuffled[Test_Size:]
    x_test = x_shuffled[:Test_Size]
    y_test = y_shuffled[:Test_Size]

    print(x_train.shape)
    print(x_test.shape)

    # input: a batch of sentence embedding matrices
    train_data_node = tf.placeholder(tf.float32,
                                     shape=(None, NUM_STEPS, EMBEDDING_SIZE))

    train_labels_node = tf.placeholder(tf.float32, shape=(None, NUM_CLASSES))

    dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
    is_training = tf.placeholder(tf.bool, name="is_training")

    fc1_weights = tf.Variable(
        tf.random_normal([2 * num_hidden, 200])
        # tf.truncated_normal([num_hidden,NUM_CLASSES],stddev=0.1,seed=SEED,dtype=tf.float32)
    )

    fc1_biases = tf.Variable(tf.constant(0.01, shape=[200], dtype=tf.float32))

    fc2_weights = tf.Variable(
        tf.random_normal([200, NUM_CLASSES])
        # tf.truncated_normal([num_hidden,NUM_CLASSES],stddev=0.1,seed=SEED,dtype=tf.float32)
    )

    fc2_biases = tf.Variable(
        tf.constant(0.01, shape=[NUM_CLASSES], dtype=tf.float32))

    # model
    def model(x):
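        # Bi-directional LSTM over the word embeddings; the per-step outputs
        # are merged by an element-wise maximum over time, followed by a tanh
        # fully connected layer and a softmax-linear output layer.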
        # Current data input shape: (batch_size, n_steps, n_input)
        x = tf.transpose(x, [1, 0, 2])
        # (n_steps*batch_size, n_input)
        x = tf.reshape(x, [-1, EMBEDDING_SIZE])
        #  get a list of 'n_steps' tensors of shape (batch_size, n_input)
        x = tf.split(0, NUM_STEPS, x)

        # Bi-directional LSTM
        fw_cell = tf.nn.rnn_cell.LSTMCell(num_hidden,
                                          forget_bias=1.0,
                                          state_is_tuple=True)
        # add output projection
        # fw_cell = tf.nn.rnn_cell.OutputProjectionWrapper(fw_cell,output_projection_size)
        fw_cell = tf.nn.rnn_cell.DropoutWrapper(
            fw_cell, output_keep_prob=dropout_keep_prob)
        bw_cell = tf.nn.rnn_cell.LSTMCell(num_hidden,
                                          forget_bias=1.0,
                                          state_is_tuple=True)
        # add output projection
        # bw_cell = tf.nn.rnn_cell.OutputProjectionWrapper(bw_cell,output_projection_size)
        bw_cell = tf.nn.rnn_cell.DropoutWrapper(
            bw_cell, output_keep_prob=dropout_keep_prob)

        if rnn_layer > 1:
            fw_cell = tf.nn.rnn_cell.MultiRNNCell([fw_cell] * rnn_layer)
            bw_cell = tf.nn.rnn_cell.MultiRNNCell([bw_cell] * rnn_layer)

        outputs, fw_final_state, bw_final_state = tf.nn.bidirectional_rnn(
            fw_cell, bw_cell, x, dtype=tf.float32)

        # initial_state = lstm_cell.zero_state(batch_size,dtype=tf.float32)
        # handle  all output
        # output = [batch_size,num_hidden*2]

        # add all output
        # merge_ouput = tf.matmul(tf.add_n(outputs), fc1_weights) + fc1_biases

        # element-wise max over the outputs of all time steps
        dim_max = outputs[0]
        for output in outputs:
            dim_max = tf.maximum(dim_max, output)

        # fc1 layer
        hidden = tf.matmul(dim_max, fc1_weights) + fc1_biases
        # add batch normalization
        # hidden = official_batch_norm_layer(hidden,200,is_training,False,scope="fc1_batch_norm")
        fc1_output = tf.tanh(hidden)
        # fc2 layer
        merge_output = tf.matmul(fc1_output, fc2_weights) + fc2_biases
        # merge_output = [batch_size,num_classes]
        return merge_output

    # Training computation
    # [batch_size,num_classes]
    logits = model(train_data_node)
    # add value clip to logits
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            tf.clip_by_value(logits, 1e-10, 1.0), train_labels_node))
    # L2 regularization for the fully connected parameters.
    regularizers = (tf.nn.l2_loss(fc1_weights) + tf.nn.l2_loss(fc1_biases) +
                    tf.nn.l2_loss(fc2_weights) + tf.nn.l2_loss(fc2_biases))

    loss += 0.05 * regularizers

    tf.scalar_summary('loss', loss)

    # optimizer
    global_step = tf.Variable(0, name="global_step", trainable=False)
    learning_rate = tf.Variable(start_learning_rate, name="learning_rate")

    tf.scalar_summary('lr', learning_rate)

    # Adam optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate)
    # optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    grads_and_vars = optimizer.compute_gradients(loss)
    train_op = optimizer.apply_gradients(grads_and_vars,
                                         global_step=global_step)

    # Evaluate model
    train_predict = tf.argmax(logits, 1)
    train_label = tf.argmax(train_labels_node, 1)
    # train accuracy
    train_correct_pred = tf.equal(train_predict, train_label)
    train_accuracy = tf.reduce_mean(tf.cast(train_correct_pred, tf.float32))
    tf.scalar_summary('acc', train_accuracy)
    merged = tf.merge_all_summaries()

    def compute_index(y_label, y_predict):
        # macro
        print("{}: acc {:g}, recall {:g}, f1 {:g} ".format(
            "macro", accuracy_score(y_label, y_predict),
            recall_score(y_label, y_predict, average='macro'),
            f1_score(y_label, y_predict, average='macro')))
        # micro
        print("{}: acc {:g}, recall {:g}, f1 {:g} ".format(
            "micro", accuracy_score(y_label, y_predict),
            recall_score(y_label, y_predict, average='micro'),
            f1_score(y_label, y_predict, average='micro')))

        # weighted
        print("{}: acc {:g}, recall {:g}, f1 {:g} ".format(
            "weighted", accuracy_score(y_label, y_predict),
            recall_score(y_label, y_predict, average='weighted'),
            f1_score(y_label, y_predict, average='weighted')))

    def dev_step(x_batch, y_batch, best_test_loss, sess):
        feed_dict = {
            train_data_node: x_batch,
            train_labels_node: y_batch,
            dropout_keep_prob: 1.0,
            is_training: False
        }
        # Run the graph and fetch some of the nodes.
        # the dev step does not run train_op, so no gradient update is applied
        summary, step, losses, lr, acc, y_label, y_predict = sess.run(
            [
                merged, global_step, loss, learning_rate, train_accuracy,
                train_label, train_predict
            ],
            feed_dict=feed_dict)
        test_writer.add_summary(summary, step)
        time_str = datetime.datetime.now().isoformat()
        print("{}: step {}, loss {:g}, lr {:g} ,acc {:g}".format(
            time_str, step, losses, lr, acc))
        # print("{}: step {}, loss {:g} ,acc {:g}".format(time_str, step, losses,acc))
        # compute index
        compute_index(y_label, y_predict)

        new_best_test_loss = best_test_loss
        # decide whether to decay the learning rate
        if (step % steps_each_check < 100) and (step > 100):
            loss_delta = (best_test_loss
                          if best_test_loss is not None else 0) - losses
            if best_test_loss is not None and loss_delta < decay_delta:
                print(
                    'validation loss did not improve enough, decay learning rate'
                )
                current_learning_rate = max(lr * learning_rate_decay, min_learning_rate)
                if current_learning_rate == min_learning_rate:
                    print('It is already the smallest learning rate.')
                sess.run(learning_rate.assign(current_learning_rate))
                print('new learning rate is: ', current_learning_rate)
            else:
                # update
                new_best_test_loss = losses

        return new_best_test_loss

    # run the training
    with tf.Session() as sess:
        train_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/train',
                                              sess.graph)
        test_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/test')
        tf.initialize_all_variables().run()
        print('Initialized!')
        # Generate batches
        batches = data_helpers.batch_iter(list(zip(x_train, y_train)),
                                          BATCH_SIZE, NUM_EPOCHS)
        # batch count
        batch_count = 0
        best_test_loss = None
        # Training loop. For each batch ...
        for batch in batches:
            batch_count += 1
            if batch_count % EVAL_FREQUENCY == 0:
                print("\nEvaluation:")
                best_test_loss = dev_step(x_test, y_test, best_test_loss, sess)
                print("")
            else:
                if batch_count % META_FREQUENCY == 99:
                    x_batch, y_batch = zip(*batch)
                    feed_dict = {
                        train_data_node: x_batch,
                        train_labels_node: y_batch,
                        dropout_keep_prob: 0.4,
                        is_training: True
                    }
                    # Run the graph and fetch some of the nodes.
                    # option
                    run_options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()
                    _, summary, step, losses, acc = sess.run(
                        [train_op, merged, global_step, loss, train_accuracy],
                        feed_dict=feed_dict,
                        options=run_options,
                        run_metadata=run_metadata)
                    train_writer.add_run_metadata(run_metadata,
                                                  'step%03d' % step)
                    train_writer.add_summary(summary, step)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g},acc {:g}".format(
                        time_str, step, losses, acc))
                else:
                    x_batch, y_batch = zip(*batch)
                    feed_dict = {
                        train_data_node: x_batch,
                        train_labels_node: y_batch,
                        dropout_keep_prob: 0.4,
                        is_training: True
                    }
                    # Run the graph and fetch some of the nodes.
                    _, summary, step, losses, acc = sess.run(
                        [train_op, merged, global_step, loss, train_accuracy],
                        feed_dict=feed_dict)
                    train_writer.add_summary(summary, step)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(
                        time_str, step, losses, acc))

        train_writer.close()
        test_writer.close()