def test_harnn():
    """Test HARNN model."""

    # Load data
    logger.info("✔︎ Loading data...")
    logger.info("Recommended padding Sequence length is: {0}".format(
        FLAGS.pad_seq_len))

    logger.info("✔︎ Test data processing...")
    test_data = dh.load_data_and_labels(FLAGS.test_data_file,
                                        FLAGS.num_classes_list,
                                        FLAGS.total_classes,
                                        FLAGS.embedding_dim,
                                        data_aug_flag=False)

    logger.info("✔︎ Test data padding...")
    x_test, y_test, y_test_tuple = dh.pad_data(test_data, FLAGS.pad_seq_len)
    y_test_labels = test_data.labels

    # Load harnn model
    BEST_OR_LATEST = input("☛ Load Best or Latest Model?(B/L): ")

    while not (BEST_OR_LATEST.isalpha()
               and BEST_OR_LATEST.upper() in ['B', 'L']):
        BEST_OR_LATEST = input(
            "✘ The format of your input is illegal, please re-input: ")
    if BEST_OR_LATEST.upper() == 'B':
        logger.info("✔︎ Loading best model...")
        checkpoint_file = cm.get_best_checkpoint(FLAGS.best_checkpoint_dir,
                                                 select_maximum_value=True)
    else:
        logger.info("✔︎ Loading latest model...")
        checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    logger.info(checkpoint_file)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        session_conf.gpu_options.allow_growth = FLAGS.gpu_options_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph(
                "{0}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            input_y_first = graph.get_operation_by_name(
                "input_y_first").outputs[0]
            input_y_second = graph.get_operation_by_name(
                "input_y_second").outputs[0]
            input_y_third = graph.get_operation_by_name(
                "input_y_third").outputs[0]
            input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "dropout_keep_prob").outputs[0]
            beta = graph.get_operation_by_name("beta").outputs[0]
            is_training = graph.get_operation_by_name("is_training").outputs[0]

            # Tensors we want to evaluate
            first_attention = graph.get_operation_by_name(
                "first-attention/attention").outputs[0]
            first_visual = graph.get_operation_by_name(
                "first-output/visual").outputs[0]
            second_visual = graph.get_operation_by_name(
                "second-output/visual").outputs[0]
            third_visual = graph.get_operation_by_name(
                "third-output/visual").outputs[0]
            scores = graph.get_operation_by_name("output/scores").outputs[0]
            loss = graph.get_operation_by_name("loss/loss").outputs[0]

            # Separate the names of multiple output nodes with '|'
            output_node_names = "first-output/scores|second-output/scores|third-output/scores|output/scores"

            # Save the .pb model file
            output_graph_def = tf.graph_util.convert_variables_to_constants(
                sess, sess.graph_def, output_node_names.split("|"))
            tf.train.write_graph(output_graph_def,
                                 "graph",
                                 "graph-harnn-{0}.pb".format(MODEL),
                                 as_text=False)

            # Generate batches for one epoch
            batches = dh.batch_iter(list(
                zip(x_test, y_test, y_test_tuple, y_test_labels)),
                                    FLAGS.batch_size,
                                    1,
                                    shuffle=False)

            test_counter, test_loss = 0, 0.0

            # Collection
            true_labels = []
            predicted_labels = []
            predicted_scores = []

            # Collect for calculating metrics
            true_onehot_labels = []
            predicted_onehot_scores = []
            predicted_onehot_labels_ts = []
            predicted_onehot_labels_tk = [[] for _ in range(FLAGS.top_num)]

            for batch_test in batches:
                x_batch_test, y_batch_test, y_batch_test_tuple, y_batch_test_labels = zip(
                    *batch_test)

                y_batch_test_first = [i[0] for i in y_batch_test_tuple]
                y_batch_test_second = [j[1] for j in y_batch_test_tuple]
                y_batch_test_third = [k[2] for k in y_batch_test_tuple]

                feed_dict = {
                    input_x: x_batch_test,
                    input_y_first: y_batch_test_first,
                    input_y_second: y_batch_test_second,
                    input_y_third: y_batch_test_third,
                    input_y: y_batch_test,
                    dropout_keep_prob: 1.0,
                    beta: FLAGS.beta,
                    is_training: False
                }
                batch_first_attention, batch_first_visual, batch_second_visual, batch_third_visual, batch_scores, cur_loss = \
                    sess.run([first_attention, first_visual, second_visual, third_visual, scores, loss], feed_dict)

                # Prepare for calculating metrics
                for onehot_labels in y_batch_test:
                    true_onehot_labels.append(onehot_labels)

                for onehot_scores in batch_scores:
                    predicted_onehot_scores.append(onehot_scores)

                # Get the predicted labels by threshold
                batch_predicted_labels_ts, batch_predicted_scores_ts = \
                    dh.get_label_threshold(scores=batch_scores, threshold=FLAGS.threshold)

                # Add results to collection
                for labels in y_batch_test_labels:
                    true_labels.append(labels)
                for labels in batch_predicted_labels_ts:
                    predicted_labels.append(labels)
                for values in batch_predicted_scores_ts:
                    predicted_scores.append(values)

                # Get one-hot prediction by threshold
                batch_predicted_onehot_labels_ts = \
                    dh.get_onehot_label_threshold(scores=batch_scores, threshold=FLAGS.threshold)

                for onehot_labels in batch_predicted_onehot_labels_ts:
                    predicted_onehot_labels_ts.append(onehot_labels)

                # Get one-hot prediction by topK
                for i in range(FLAGS.top_num):
                    batch_predicted_onehot_labels_tk = dh.get_onehot_label_topk(
                        scores=batch_scores, top_num=i + 1)

                    for onehot_labels in batch_predicted_onehot_labels_tk:
                        predicted_onehot_labels_tk[i].append(onehot_labels)

                test_loss = test_loss + cur_loss
                test_counter = test_counter + 1

            # Calculate Precision & Recall & F1
            test_pre_ts = precision_score(
                y_true=np.array(true_onehot_labels),
                y_pred=np.array(predicted_onehot_labels_ts),
                average='micro')

            test_rec_ts = recall_score(
                y_true=np.array(true_onehot_labels),
                y_pred=np.array(predicted_onehot_labels_ts),
                average='micro')

            test_F_ts = f1_score(y_true=np.array(true_onehot_labels),
                                 y_pred=np.array(predicted_onehot_labels_ts),
                                 average='micro')

            # Calculate the average AUC
            test_auc = roc_auc_score(y_true=np.array(true_onehot_labels),
                                     y_score=np.array(predicted_onehot_scores),
                                     average='micro')

            # Calculate the average PR
            test_prc = average_precision_score(
                y_true=np.array(true_onehot_labels),
                y_score=np.array(predicted_onehot_scores),
                average="micro")

            test_loss = float(test_loss / test_counter)

            logger.info(
                "☛ All Test Dataset: Loss {0:g} | AUC {1:g} | AUPRC {2:g}".
                format(test_loss, test_auc, test_prc))
            # Predict by threshold
            logger.info(
                "☛ Predict by threshold: Precision {0:g}, Recall {1:g}, F1 {2:g}"
                .format(test_pre_ts, test_rec_ts, test_F_ts))

            # Save the prediction result
            if not os.path.exists(SAVE_DIR):
                os.makedirs(SAVE_DIR)
            dh.create_prediction_file(output_file=SAVE_DIR +
                                      "/predictions.json",
                                      data_id=test_data.patent_id,
                                      all_labels=true_labels,
                                      all_predict_labels=predicted_labels,
                                      all_predict_scores=predicted_scores)

    logger.info("✔︎ Done.")
def visualize():
    """Visualize HARNN model."""

    # Load word2vec model
    word2idx, embedding_matrix = dh.load_word2vec_matrix(args.word2vec_file)

    # Load data
    logger.info("Loading data...")
    logger.info("Data processing...")
    test_data = dh.load_data_and_labels(args, args.test_file, word2idx)

    # Load harnn model
    OPTION = dh._option(pattern=1)
    if OPTION == 'B':
        logger.info("Loading best model...")
        checkpoint_file = cm.get_best_checkpoint(BEST_CPT_DIR,
                                                 select_maximum_value=True)
    else:
        logger.info("Loading latest model...")
        checkpoint_file = tf.train.latest_checkpoint(CPT_DIR)
    logger.info(checkpoint_file)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=args.allow_soft_placement,
            log_device_placement=args.log_device_placement)
        session_conf.gpu_options.allow_growth = args.gpu_options_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph(
                "{0}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            input_y_first = graph.get_operation_by_name(
                "input_y_first").outputs[0]
            input_y_second = graph.get_operation_by_name(
                "input_y_second").outputs[0]
            input_y_third = graph.get_operation_by_name(
                "input_y_third").outputs[0]
            input_y_fourth = graph.get_operation_by_name(
                "input_y_fourth").outputs[0]
            input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "dropout_keep_prob").outputs[0]
            alpha = graph.get_operation_by_name("alpha").outputs[0]
            is_training = graph.get_operation_by_name("is_training").outputs[0]

            # Tensors we want to evaluate
            first_visual = graph.get_operation_by_name(
                "first-output/visual").outputs[0]
            second_visual = graph.get_operation_by_name(
                "second-output/visual").outputs[0]
            third_visual = graph.get_operation_by_name(
                "third-output/visual").outputs[0]
            fourth_visual = graph.get_operation_by_name(
                "fourth-output/visual").outputs[0]

            # Separate the names of multiple output nodes with '|'
            output_node_names = "first-output/visual|second-output/visual|third-output/visual|fourth-output/visual|output/scores"

            # Save the .pb model file
            output_graph_def = tf.graph_util.convert_variables_to_constants(
                sess, sess.graph_def, output_node_names.split("|"))
            tf.train.write_graph(output_graph_def,
                                 "graph",
                                 "graph-harnn-{0}.pb".format(MODEL),
                                 as_text=False)

            # Generate batches for one epoch
            batches = dh.batch_iter(list(create_input_data(test_data)),
                                    args.batch_size,
                                    1,
                                    shuffle=False)

            for batch_id, batch_test in enumerate(batches):
                x, x_content, sec, subsec, group, subgroup, y_onehot = zip(
                    *batch_test)

                feed_dict = {
                    input_x: x,
                    input_y_first: sec,
                    input_y_second: subsec,
                    input_y_third: group,
                    input_y_fourth: subgroup,
                    input_y: y_onehot,
                    dropout_keep_prob: 1.0,
                    alpha: args.alpha,
                    is_training: False
                }
                batch_first_visual, batch_second_visual, batch_third_visual, batch_fourth_visual = \
                    sess.run([first_visual, second_visual, third_visual, fourth_visual], feed_dict)

                batch_visual = [
                    batch_first_visual, batch_second_visual,
                    batch_third_visual, batch_fourth_visual
                ]

                seq_len = len(x_content[0])
                pad_len = len(batch_first_visual[0])
                length = min(seq_len, pad_len)
                visual_list = []

                for visual in batch_visual:
                    visual_list.append(
                        normalization(visual[0].tolist(), length))

                create_visual_file(batch_id, x_content, visual_list, seq_len)
    logger.info("Done.")
Example #3
def test_cnn():
    """Test CNN model."""

    # Load data
    logger.info("✔︎ Loading data...")
    logger.info("Recommended padding Sequence length is: {0}".format(FLAGS.pad_seq_len))

    logger.info("✔︎ Test data processing...")
    test_data = dh.load_data_and_labels(FLAGS.test_data_file, FLAGS.embedding_dim)

    logger.info("✔︎ Test data padding...")
    x_test_front, x_test_behind, y_test = dh.pad_data(test_data, FLAGS.pad_seq_len)
    y_test_labels = test_data.labels

    # Load cnn model
    BEST_OR_LATEST = input("☛ Load Best or Latest Model?(B/L): ")

    while not (BEST_OR_LATEST.isalpha() and BEST_OR_LATEST.upper() in ['B', 'L']):
        BEST_OR_LATEST = input("✘ The format of your input is illegal, please re-input: ")
    if BEST_OR_LATEST.upper() == 'B':
        logger.info("✔︎ Loading best model...")
        checkpoint_file = cm.get_best_checkpoint(FLAGS.best_checkpoint_dir, select_maximum_value=True)
    else:
        logger.info("✔︎ Loading latest model...")
        checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    logger.info(checkpoint_file)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        session_conf.gpu_options.allow_growth = FLAGS.gpu_options_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph("{0}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x_front = graph.get_operation_by_name("input_x_front").outputs[0]
            input_x_behind = graph.get_operation_by_name("input_x_behind").outputs[0]
            input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
            is_training = graph.get_operation_by_name("is_training").outputs[0]

            # Tensors we want to evaluate
            predictions = graph.get_operation_by_name("output/predictions").outputs[0]
            topKPreds = graph.get_operation_by_name("output/topKPreds").outputs[0]
            accuracy = graph.get_operation_by_name("accuracy/accuracy").outputs[0]
            loss = graph.get_operation_by_name("loss/loss").outputs[0]

            # Separate the names of multiple output nodes with '|'
            output_node_names = "output/logits|output/predictions|output/softmax_scores|output/topKPreds"

            # Save the .pb model file
            output_graph_def = tf.graph_util.convert_variables_to_constants(sess, sess.graph_def,
                                                                            output_node_names.split("|"))
            tf.train.write_graph(output_graph_def, "graph", "graph-cnn-{0}.pb".format(MODEL), as_text=False)

            # Generate batches for one epoch
            batches = dh.batch_iter(list(zip(x_test_front, x_test_behind, y_test, y_test_labels)),
                                    FLAGS.batch_size, 1, shuffle=False)

            # Collect the predictions here
            all_labels = []
            all_predicted_labels = []
            all_predicted_values = []

            for index, x_test_batch in enumerate(batches):
                x_batch_front, x_batch_behind, y_batch, y_batch_labels = zip(*x_test_batch)
                feed_dict = {
                    input_x_front: x_batch_front,
                    input_x_behind: x_batch_behind,
                    input_y: y_batch,
                    dropout_keep_prob: 1.0,
                    is_training: False
                }

                all_labels = np.append(all_labels, y_batch_labels)

                batch_predicted_labels = sess.run(predictions, feed_dict)
                all_predicted_labels = np.concatenate([all_predicted_labels, batch_predicted_labels])

                batch_predicted_values = sess.run(topKPreds, feed_dict)
                all_predicted_values = np.append(all_predicted_values, batch_predicted_values)

                batch_loss = sess.run(loss, feed_dict)
                batch_acc = sess.run(accuracy, feed_dict)
                logger.info("✔︎ Test batch {0}: loss {1:g}, accuracy {2:g}.".format((index+1), batch_loss, batch_acc))

            # Save the prediction result
            if not os.path.exists(SAVE_DIR):
                os.makedirs(SAVE_DIR)
            dh.create_prediction_file(output_file=SAVE_DIR + "/predictions.json", front_data_id=test_data.front_testid,
                                      behind_data_id=test_data.behind_testid, all_labels=all_labels,
                                      all_predict_labels=all_predicted_labels, all_predict_values=all_predicted_values)

    logger.info("✔︎ Done.")
def test_abcnn():
    """Test ABCNN model."""
    # Print parameters used for the model
    dh.tab_printer(args, logger)

    # Load word2vec model
    word2idx, embedding_matrix = dh.load_word2vec_matrix(args.word2vec_file)

    # Load data
    logger.info("Loading data...")
    logger.info("Data processing...")
    test_data = dh.load_data_and_labels(args, args.test_file, word2idx)

    # Load abcnn model
    OPTION = dh._option(pattern=1)
    if OPTION == 'B':
        logger.info("Loading best model...")
        checkpoint_file = cm.get_best_checkpoint(BEST_CPT_DIR,
                                                 select_maximum_value=True)
    else:
        logger.info("Loading latest model...")
        checkpoint_file = tf.train.latest_checkpoint(CPT_DIR)
    logger.info(checkpoint_file)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=args.allow_soft_placement,
            log_device_placement=args.log_device_placement)
        session_conf.gpu_options.allow_growth = args.gpu_options_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph(
                "{0}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x_front = graph.get_operation_by_name(
                "input_x_front").outputs[0]
            input_x_behind = graph.get_operation_by_name(
                "input_x_behind").outputs[0]
            input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "dropout_keep_prob").outputs[0]
            is_training = graph.get_operation_by_name("is_training").outputs[0]

            # Tensors we want to evaluate
            scores = graph.get_operation_by_name("output/topKPreds").outputs[0]
            predictions = graph.get_operation_by_name(
                "output/topKPreds").outputs[1]
            loss = graph.get_operation_by_name("loss/loss").outputs[0]

            # Separate the names of multiple output nodes with '|'
            output_node_names = "output/topKPreds"

            # Save the .pb model file
            output_graph_def = tf.graph_util.convert_variables_to_constants(
                sess, sess.graph_def, output_node_names.split("|"))
            tf.train.write_graph(output_graph_def,
                                 "graph",
                                 "graph-abcnn-{0}.pb".format(MODEL),
                                 as_text=False)

            # Generate batches for one epoch
            batches_test = dh.batch_iter(list(create_input_data(test_data)),
                                         args.batch_size,
                                         1,
                                         shuffle=False)

            # Collect the predictions here
            test_counter, test_loss = 0, 0.0
            true_labels = []
            predicted_labels = []
            predicted_scores = []

            for batch_test in batches_test:
                x_f, x_b, y_onehot = zip(*batch_test)
                feed_dict = {
                    input_x_front: x_f,
                    input_x_behind: x_b,
                    input_y: y_onehot,
                    dropout_keep_prob: 1.0,
                    is_training: False
                }

                batch_predicted_scores, batch_predicted_labels, batch_loss \
                    = sess.run([scores, predictions, loss], feed_dict)

                for i in y_onehot:
                    true_labels.append(np.argmax(i))
                for j in batch_predicted_scores:
                    predicted_scores.append(j[0])
                for k in batch_predicted_labels:
                    predicted_labels.append(k[0])

                test_loss = test_loss + batch_loss
                test_counter = test_counter + 1

            test_loss = float(test_loss / test_counter)

            # Calculate Precision & Recall & F1
            test_acc = accuracy_score(y_true=np.array(true_labels),
                                      y_pred=np.array(predicted_labels))
            test_pre = precision_score(y_true=np.array(true_labels),
                                       y_pred=np.array(predicted_labels),
                                       average='micro')
            test_rec = recall_score(y_true=np.array(true_labels),
                                    y_pred=np.array(predicted_labels),
                                    average='micro')
            test_F1 = f1_score(y_true=np.array(true_labels),
                               y_pred=np.array(predicted_labels),
                               average='micro')

            # Calculate the average AUC
            test_auc = roc_auc_score(y_true=np.array(true_labels),
                                     y_score=np.array(predicted_scores),
                                     average='micro')

            logger.info(
                "All Test Dataset: Loss {0:g} | Acc {1:g} | Precision {2:g} | "
                "Recall {3:g} | F1 {4:g} | AUC {5:g}".format(
                    test_loss, test_acc, test_pre, test_rec, test_F1,
                    test_auc))

            # Save the prediction result
            if not os.path.exists(SAVE_DIR):
                os.makedirs(SAVE_DIR)
            dh.create_prediction_file(output_file=SAVE_DIR +
                                      "/predictions.json",
                                      front_data_id=test_data['f_id'],
                                      behind_data_id=test_data['b_id'],
                                      true_labels=true_labels,
                                      predict_labels=predicted_labels,
                                      predict_scores=predicted_scores)

    logger.info("All Done.")
def train_rcnn():
    """Training RCNN model."""

    # Load sentences, labels, and training parameters
    logger.info("✔︎ Loading data...")

    logger.info("✔︎ Training data processing...")
    train_data = dh.load_data_and_labels(FLAGS.training_data_file,
                                         FLAGS.num_classes,
                                         FLAGS.embedding_dim,
                                         data_aug_flag=False)

    logger.info("✔︎ Validation data processing...")
    val_data = dh.load_data_and_labels(FLAGS.validation_data_file,
                                       FLAGS.num_classes,
                                       FLAGS.embedding_dim,
                                       data_aug_flag=False)

    logger.info("Recommended padding Sequence length is: {0}".format(
        FLAGS.pad_seq_len))

    logger.info("✔︎ Training data padding...")
    x_train, y_train = dh.pad_data(train_data, FLAGS.pad_seq_len)

    logger.info("✔︎ Validation data padding...")
    x_val, y_val = dh.pad_data(val_data, FLAGS.pad_seq_len)

    # Build vocabulary
    VOCAB_SIZE, pretrained_word2vec_matrix = dh.load_word2vec_matrix(
        FLAGS.embedding_dim)

    # Build a graph and rcnn object
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        session_conf.gpu_options.allow_growth = FLAGS.gpu_options_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            rcnn = TextRCNN(sequence_length=FLAGS.pad_seq_len,
                            num_classes=FLAGS.num_classes,
                            vocab_size=VOCAB_SIZE,
                            lstm_hidden_size=FLAGS.lstm_hidden_size,
                            fc_hidden_size=FLAGS.fc_hidden_size,
                            embedding_size=FLAGS.embedding_dim,
                            embedding_type=FLAGS.embedding_type,
                            filter_sizes=list(
                                map(int, FLAGS.filter_sizes.split(','))),
                            num_filters=FLAGS.num_filters,
                            l2_reg_lambda=FLAGS.l2_reg_lambda,
                            pretrained_embedding=pretrained_word2vec_matrix)

            # Define training procedure
            with tf.control_dependencies(
                    tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
                learning_rate = tf.train.exponential_decay(
                    learning_rate=FLAGS.learning_rate,
                    global_step=rcnn.global_step,
                    decay_steps=FLAGS.decay_steps,
                    decay_rate=FLAGS.decay_rate,
                    staircase=True)
                optimizer = tf.train.AdamOptimizer(learning_rate)
                grads, variables = zip(*optimizer.compute_gradients(rcnn.loss))
                grads, _ = tf.clip_by_global_norm(grads,
                                                  clip_norm=FLAGS.norm_ratio)
                train_op = optimizer.apply_gradients(
                    zip(grads, variables),
                    global_step=rcnn.global_step,
                    name="train_op")

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in zip(grads, variables):
                if g is not None:
                    grad_hist_summary = tf.summary.histogram(
                        "{0}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar(
                        "{0}/grad/sparsity".format(v.name),
                        tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            if FLAGS.train_or_restore == 'R':
                MODEL = input(
                    "☛ Please input the checkpoints model you want to restore, "
                    "it should be like(1490175368): "
                )  # The model you want to restore

                while not (MODEL.isdigit() and len(MODEL) == 10):
                    MODEL = input(
                        "✘ The format of your input is illegal, please re-input: "
                    )
                logger.info(
                    "✔︎ The format of your input is legal, now loading to next step..."
                )
                out_dir = os.path.abspath(
                    os.path.join(os.path.curdir, "runs", MODEL))
                logger.info("✔︎ Writing to {0}\n".format(out_dir))
            else:
                timestamp = str(int(time.time()))
                out_dir = os.path.abspath(
                    os.path.join(os.path.curdir, "runs", timestamp))
                logger.info("✔︎ Writing to {0}\n".format(out_dir))

            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            best_checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "bestcheckpoints"))

            # Summaries for loss
            loss_summary = tf.summary.scalar("loss", rcnn.loss)

            # Train summaries
            train_summary_op = tf.summary.merge(
                [loss_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Validation summaries
            validation_summary_op = tf.summary.merge([loss_summary])
            validation_summary_dir = os.path.join(out_dir, "summaries",
                                                  "validation")
            validation_summary_writer = tf.summary.FileWriter(
                validation_summary_dir, sess.graph)

            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=FLAGS.num_checkpoints)
            best_saver = cm.BestCheckpointSaver(save_dir=best_checkpoint_dir,
                                                num_to_keep=3,
                                                maximize=True)

            if FLAGS.train_or_restore == 'R':
                # Load rcnn model
                logger.info("✔︎ Loading model...")
                checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
                logger.info(checkpoint_file)

                # Load the saved meta graph and restore variables
                saver = tf.train.import_meta_graph(
                    "{0}.meta".format(checkpoint_file))
                saver.restore(sess, checkpoint_file)
            else:
                if not os.path.exists(checkpoint_dir):
                    os.makedirs(checkpoint_dir)
                sess.run(tf.global_variables_initializer())
                sess.run(tf.local_variables_initializer())

                # Embedding visualization config
                config = projector.ProjectorConfig()
                embedding_conf = config.embeddings.add()
                embedding_conf.tensor_name = "embedding"
                embedding_conf.metadata_path = FLAGS.metadata_file

                projector.visualize_embeddings(train_summary_writer, config)
                projector.visualize_embeddings(validation_summary_writer,
                                               config)

                # Save the embedding visualization
                saver.save(
                    sess, os.path.join(out_dir, "embedding", "embedding.ckpt"))

            current_step = sess.run(rcnn.global_step)

            def train_step(x_batch, y_batch):
                """A single training step"""
                feed_dict = {
                    rcnn.input_x: x_batch,
                    rcnn.input_y: y_batch,
                    rcnn.dropout_keep_prob: FLAGS.dropout_keep_prob,
                    rcnn.is_training: True
                }
                _, step, summaries, loss = sess.run(
                    [train_op, rcnn.global_step, train_summary_op, rcnn.loss],
                    feed_dict)
                logger.info("step {0}: loss {1:g}".format(step, loss))
                train_summary_writer.add_summary(summaries, step)

            def validation_step(x_val, y_val, writer=None):
                """Evaluates model on a validation set"""
                batches_validation = dh.batch_iter(list(zip(x_val, y_val)),
                                                   FLAGS.batch_size, 1)

                # Predict classes by threshold or topk ('ts': threshold; 'tk': topk)
                eval_counter, eval_loss = 0, 0.0

                eval_pre_tk = [0.0] * FLAGS.top_num
                eval_rec_tk = [0.0] * FLAGS.top_num
                eval_F_tk = [0.0] * FLAGS.top_num

                true_onehot_labels = []
                predicted_onehot_scores = []
                predicted_onehot_labels_ts = []
                predicted_onehot_labels_tk = [[] for _ in range(FLAGS.top_num)]

                for batch_validation in batches_validation:
                    x_batch_val, y_batch_val = zip(*batch_validation)
                    feed_dict = {
                        rcnn.input_x: x_batch_val,
                        rcnn.input_y: y_batch_val,
                        rcnn.dropout_keep_prob: 1.0,
                        rcnn.is_training: False
                    }
                    step, summaries, scores, cur_loss = sess.run([
                        rcnn.global_step, validation_summary_op, rcnn.scores,
                        rcnn.loss
                    ], feed_dict)

                    # Prepare for calculating metrics
                    for i in y_batch_val:
                        true_onehot_labels.append(i)
                    for j in scores:
                        predicted_onehot_scores.append(j)

                    # Predict by threshold
                    batch_predicted_onehot_labels_ts = \
                        dh.get_onehot_label_threshold(scores=scores, threshold=FLAGS.threshold)

                    for k in batch_predicted_onehot_labels_ts:
                        predicted_onehot_labels_ts.append(k)

                    # Predict by topK
                    for top_num in range(FLAGS.top_num):
                        batch_predicted_onehot_labels_tk = dh.get_onehot_label_topk(
                            scores=scores, top_num=top_num + 1)

                        for i in batch_predicted_onehot_labels_tk:
                            predicted_onehot_labels_tk[top_num].append(i)

                    eval_loss = eval_loss + cur_loss
                    eval_counter = eval_counter + 1

                    if writer:
                        writer.add_summary(summaries, step)

                eval_loss = float(eval_loss / eval_counter)

                # Calculate Precision & Recall & F1 (threshold & topK)
                eval_pre_ts = precision_score(
                    y_true=np.array(true_onehot_labels),
                    y_pred=np.array(predicted_onehot_labels_ts),
                    average='micro')
                eval_rec_ts = recall_score(
                    y_true=np.array(true_onehot_labels),
                    y_pred=np.array(predicted_onehot_labels_ts),
                    average='micro')
                eval_F_ts = f1_score(
                    y_true=np.array(true_onehot_labels),
                    y_pred=np.array(predicted_onehot_labels_ts),
                    average='micro')

                for top_num in range(FLAGS.top_num):
                    eval_pre_tk[top_num] = precision_score(
                        y_true=np.array(true_onehot_labels),
                        y_pred=np.array(predicted_onehot_labels_tk[top_num]),
                        average='micro')
                    eval_rec_tk[top_num] = recall_score(
                        y_true=np.array(true_onehot_labels),
                        y_pred=np.array(predicted_onehot_labels_tk[top_num]),
                        average='micro')
                    eval_F_tk[top_num] = f1_score(
                        y_true=np.array(true_onehot_labels),
                        y_pred=np.array(predicted_onehot_labels_tk[top_num]),
                        average='micro')

                # Calculate the average AUC
                eval_auc = roc_auc_score(
                    y_true=np.array(true_onehot_labels),
                    y_score=np.array(predicted_onehot_scores),
                    average='micro')
                # Calculate the average PR
                eval_prc = average_precision_score(
                    y_true=np.array(true_onehot_labels),
                    y_score=np.array(predicted_onehot_scores),
                    average='micro')

                return eval_loss, eval_auc, eval_prc, eval_rec_ts, eval_pre_ts, eval_F_ts, \
                       eval_rec_tk, eval_pre_tk, eval_F_tk

            # Generate batches
            batches_train = dh.batch_iter(list(zip(x_train, y_train)),
                                          FLAGS.batch_size, FLAGS.num_epochs)

            num_batches_per_epoch = int(
                (len(x_train) - 1) / FLAGS.batch_size) + 1

            # Training loop. For each batch...
            for batch_train in batches_train:
                x_batch_train, y_batch_train = zip(*batch_train)
                train_step(x_batch_train, y_batch_train)
                current_step = tf.train.global_step(sess, rcnn.global_step)

                if current_step % FLAGS.evaluate_every == 0:
                    logger.info("\nEvaluation:")
                    eval_loss, eval_auc, eval_prc, \
                    eval_rec_ts, eval_pre_ts, eval_F_ts, eval_rec_tk, eval_pre_tk, eval_F_tk = \
                        validation_step(x_val, y_val, writer=validation_summary_writer)

                    logger.info(
                        "All Validation set: Loss {0:g} | AUC {1:g} | AUPRC {2:g}"
                        .format(eval_loss, eval_auc, eval_prc))

                    # Predict by threshold
                    logger.info(
                        "☛ Predict by threshold: Precision {0:g}, Recall {1:g}, F {2:g}"
                        .format(eval_pre_ts, eval_rec_ts, eval_F_ts))

                    # Predict by topK
                    logger.info("☛ Predict by topK:")
                    for top_num in range(FLAGS.top_num):
                        logger.info(
                            "Top{0}: Precision {1:g}, Recall {2:g}, F {3:g}".
                            format(top_num + 1, eval_pre_tk[top_num],
                                   eval_rec_tk[top_num], eval_F_tk[top_num]))
                    best_saver.handle(eval_prc, sess, current_step)
                if current_step % FLAGS.checkpoint_every == 0:
                    checkpoint_prefix = os.path.join(checkpoint_dir, "model")
                    path = saver.save(sess,
                                      checkpoint_prefix,
                                      global_step=current_step)
                    logger.info(
                        "✔︎ Saved model checkpoint to {0}\n".format(path))
                if current_step % num_batches_per_epoch == 0:
                    current_epoch = current_step // num_batches_per_epoch
                    logger.info(
                        "✔︎ Epoch {0} has finished!".format(current_epoch))

    logger.info("✔︎ Done.")
Example #6
def test_harnn():
    """Test HARNN model."""
    # Print parameters used for the model
    dh.tab_printer(args, logger)

    # Load data
    logger.info("Loading data...")
    logger.info("Data processing...")
    test_data = dh.load_data_and_labels(args.test_file,
                                        args.num_classes_list,
                                        args.total_classes,
                                        args.word2vec_file,
                                        data_aug_flag=False)

    logger.info("Data padding...")
    x_test, y_test, y_test_tuple = dh.pad_data(test_data, args.pad_seq_len)
    y_test_labels = test_data.labels

    # Load harnn model
    OPTION = dh._option(pattern=1)
    if OPTION == 'B':
        logger.info("Loading best model...")
        checkpoint_file = cm.get_best_checkpoint(BEST_CPT_DIR,
                                                 select_maximum_value=True)
    else:
        logger.info("Loading latest model...")
        checkpoint_file = tf.train.latest_checkpoint(CPT_DIR)
    logger.info(checkpoint_file)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=args.allow_soft_placement,
            log_device_placement=args.log_device_placement)
        session_conf.gpu_options.allow_growth = args.gpu_options_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph(
                "{0}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            input_y_first = graph.get_operation_by_name(
                "input_y_first").outputs[0]
            input_y_second = graph.get_operation_by_name(
                "input_y_second").outputs[0]
            input_y_third = graph.get_operation_by_name(
                "input_y_third").outputs[0]
            input_y_fourth = graph.get_operation_by_name(
                "input_y_fourth").outputs[0]
            input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "dropout_keep_prob").outputs[0]
            alpha = graph.get_operation_by_name("alpha").outputs[0]
            is_training = graph.get_operation_by_name("is_training").outputs[0]

            # Tensors we want to evaluate
            first_scores = graph.get_operation_by_name(
                "first-output/scores").outputs[0]
            second_scores = graph.get_operation_by_name(
                "second-output/scores").outputs[0]
            third_scores = graph.get_operation_by_name(
                "third-output/scores").outputs[0]
            fourth_scores = graph.get_operation_by_name(
                "fourth-output/scores").outputs[0]
            scores = graph.get_operation_by_name("output/scores").outputs[0]
            loss = graph.get_operation_by_name("loss/loss").outputs[0]

            # Separate the names of multiple output nodes with '|'
            output_node_names = "first-output/scores|second-output/scores|third-output/scores|fourth-output/scores|output/scores"

            # Save the .pb model file
            output_graph_def = tf.graph_util.convert_variables_to_constants(
                sess, sess.graph_def, output_node_names.split("|"))
            tf.train.write_graph(output_graph_def,
                                 "graph",
                                 "graph-harnn-{0}.pb".format(MODEL),
                                 as_text=False)

            # Generate batches for one epoch
            batches = dh.batch_iter(list(
                zip(x_test, y_test, y_test_tuple, y_test_labels)),
                                    args.batch_size,
                                    1,
                                    shuffle=False)

            test_counter, test_loss = 0, 0.0

            # Collect the predictions here
            true_labels = []
            predicted_labels = []
            predicted_scores = []

            # Collect for calculating metrics
            true_onehot_labels = []
            predicted_onehot_scores = []
            predicted_onehot_labels_ts = []
            predicted_onehot_labels_tk = [[] for _ in range(args.topK)]

            true_onehot_first_labels = []
            true_onehot_second_labels = []
            true_onehot_third_labels = []
            true_onehot_fourth_labels = []
            predicted_onehot_scores_first = []
            predicted_onehot_scores_second = []
            predicted_onehot_scores_third = []
            predicted_onehot_scores_fourth = []
            predicted_onehot_labels_first = []
            predicted_onehot_labels_second = []
            predicted_onehot_labels_third = []
            predicted_onehot_labels_fourth = []

            for batch_test in batches:
                x_batch_test, y_batch_test, y_batch_test_tuple, y_batch_test_labels = zip(
                    *batch_test)

                y_batch_test_first = [i[0] for i in y_batch_test_tuple]
                y_batch_test_second = [j[1] for j in y_batch_test_tuple]
                y_batch_test_third = [k[2] for k in y_batch_test_tuple]
                y_batch_test_fourth = [t[3] for t in y_batch_test_tuple]

                feed_dict = {
                    input_x: x_batch_test,
                    input_y_first: y_batch_test_first,
                    input_y_second: y_batch_test_second,
                    input_y_third: y_batch_test_third,
                    input_y_fourth: y_batch_test_fourth,
                    input_y: y_batch_test,
                    dropout_keep_prob: 1.0,
                    alpha: args.alpha,
                    is_training: False
                }
                batch_first_scores, batch_second_scores, batch_third_scores, batch_fourth_scores, batch_scores, cur_loss = \
                    sess.run([first_scores, second_scores, third_scores, fourth_scores, scores, loss], feed_dict)

                # Prepare for calculating metrics
                for onehot_labels in y_batch_test:
                    true_onehot_labels.append(onehot_labels)
                for onehot_labels in y_batch_test_first:
                    true_onehot_first_labels.append(onehot_labels)
                for onehot_labels in y_batch_test_second:
                    true_onehot_second_labels.append(onehot_labels)
                for onehot_labels in y_batch_test_third:
                    true_onehot_third_labels.append(onehot_labels)
                for onehot_labels in y_batch_test_fourth:
                    true_onehot_fourth_labels.append(onehot_labels)

                for onehot_scores in batch_scores:
                    predicted_onehot_scores.append(onehot_scores)
                for onehot_scores in batch_first_scores:
                    predicted_onehot_scores_first.append(onehot_scores)
                for onehot_scores in batch_second_scores:
                    predicted_onehot_scores_second.append(onehot_scores)
                for onehot_scores in batch_third_scores:
                    predicted_onehot_scores_third.append(onehot_scores)
                for onehot_scores in batch_fourth_scores:
                    predicted_onehot_scores_fourth.append(onehot_scores)

                # Get the predicted labels by threshold
                batch_predicted_labels_ts, batch_predicted_scores_ts = \
                    dh.get_label_threshold(scores=batch_scores, threshold=args.threshold)

                # Add results to collection
                for labels in y_batch_test_labels:
                    true_labels.append(labels)
                for labels in batch_predicted_labels_ts:
                    predicted_labels.append(labels)
                for values in batch_predicted_scores_ts:
                    predicted_scores.append(values)

                # Get one-hot prediction by threshold
                batch_predicted_onehot_labels_ts = \
                    dh.get_onehot_label_threshold(scores=batch_scores, threshold=args.threshold)
                batch_predicted_onehot_labels_first = \
                    dh.get_onehot_label_threshold(scores=batch_first_scores, threshold=args.threshold)
                batch_predicted_onehot_labels_second = \
                    dh.get_onehot_label_threshold(scores=batch_second_scores, threshold=args.threshold)
                batch_predicted_onehot_labels_third = \
                    dh.get_onehot_label_threshold(scores=batch_third_scores, threshold=args.threshold)
                batch_predicted_onehot_labels_fourth = \
                    dh.get_onehot_label_threshold(scores=batch_fourth_scores, threshold=args.threshold)

                for onehot_labels in batch_predicted_onehot_labels_ts:
                    predicted_onehot_labels_ts.append(onehot_labels)
                for onehot_labels in batch_predicted_onehot_labels_first:
                    predicted_onehot_labels_first.append(onehot_labels)
                for onehot_labels in batch_predicted_onehot_labels_second:
                    predicted_onehot_labels_second.append(onehot_labels)
                for onehot_labels in batch_predicted_onehot_labels_third:
                    predicted_onehot_labels_third.append(onehot_labels)
                for onehot_labels in batch_predicted_onehot_labels_fourth:
                    predicted_onehot_labels_fourth.append(onehot_labels)

                # Get one-hot prediction by topK
                for i in range(args.topK):
                    batch_predicted_onehot_labels_tk = dh.get_onehot_label_topk(
                        scores=batch_scores, top_num=i + 1)

                    for onehot_labels in batch_predicted_onehot_labels_tk:
                        predicted_onehot_labels_tk[i].append(onehot_labels)

                test_loss = test_loss + cur_loss
                test_counter = test_counter + 1

            # Calculate Precision & Recall & F1
            test_pre_ts = precision_score(
                y_true=np.array(true_onehot_labels),
                y_pred=np.array(predicted_onehot_labels_ts),
                average='micro')

            test_pre_first = precision_score(
                y_true=np.array(true_onehot_first_labels),
                y_pred=np.array(predicted_onehot_labels_first),
                average='micro')
            test_pre_second = precision_score(
                y_true=np.array(true_onehot_second_labels),
                y_pred=np.array(predicted_onehot_labels_second),
                average='micro')
            test_pre_third = precision_score(
                y_true=np.array(true_onehot_third_labels),
                y_pred=np.array(predicted_onehot_labels_third),
                average='micro')
            test_pre_fourth = precision_score(
                y_true=np.array(true_onehot_fourth_labels),
                y_pred=np.array(predicted_onehot_labels_fourth),
                average='micro')

            test_rec_ts = recall_score(
                y_true=np.array(true_onehot_labels),
                y_pred=np.array(predicted_onehot_labels_ts),
                average='micro')

            test_rec_first = recall_score(
                y_true=np.array(true_onehot_first_labels),
                y_pred=np.array(predicted_onehot_labels_first),
                average='micro')
            test_rec_second = recall_score(
                y_true=np.array(true_onehot_second_labels),
                y_pred=np.array(predicted_onehot_labels_second),
                average='micro')
            test_rec_third = recall_score(
                y_true=np.array(true_onehot_third_labels),
                y_pred=np.array(predicted_onehot_labels_third),
                average='micro')
            test_rec_fourth = recall_score(
                y_true=np.array(true_onehot_fourth_labels),
                y_pred=np.array(predicted_onehot_labels_fourth),
                average='micro')

            test_F1_ts = f1_score(y_true=np.array(true_onehot_labels),
                                  y_pred=np.array(predicted_onehot_labels_ts),
                                  average='micro')

            test_F1_first = f1_score(
                y_true=np.array(true_onehot_first_labels),
                y_pred=np.array(predicted_onehot_labels_first),
                average='micro')
            test_F1_second = f1_score(
                y_true=np.array(true_onehot_second_labels),
                y_pred=np.array(predicted_onehot_labels_second),
                average='micro')
            test_F1_third = f1_score(
                y_true=np.array(true_onehot_third_labels),
                y_pred=np.array(predicted_onehot_labels_third),
                average='micro')
            test_F1_fourth = f1_score(
                y_true=np.array(true_onehot_fourth_labels),
                y_pred=np.array(predicted_onehot_labels_fourth),
                average='micro')

            # Calculate the average AUC
            test_auc = roc_auc_score(y_true=np.array(true_onehot_labels),
                                     y_score=np.array(predicted_onehot_scores),
                                     average='micro')

            # Calculate the average PR
            test_prc = average_precision_score(
                y_true=np.array(true_onehot_labels),
                y_score=np.array(predicted_onehot_scores),
                average="micro")
            test_prc_first = average_precision_score(
                y_true=np.array(true_onehot_first_labels),
                y_score=np.array(predicted_onehot_scores_first),
                average="micro")
            test_prc_second = average_precision_score(
                y_true=np.array(true_onehot_second_labels),
                y_score=np.array(predicted_onehot_scores_second),
                average="micro")
            test_prc_third = average_precision_score(
                y_true=np.array(true_onehot_third_labels),
                y_score=np.array(predicted_onehot_scores_third),
                average="micro")
            test_prc_fourth = average_precision_score(
                y_true=np.array(true_onehot_fourth_labels),
                y_score=np.array(predicted_onehot_scores_fourth),
                average="micro")

            test_loss = float(test_loss / test_counter)

            logger.info(
                "All Test Dataset: Loss {0:g} | AUC {1:g} | AUPRC {2:g}".
                format(test_loss, test_auc, test_prc))
            # Predict by threshold
            logger.info(
                "Predict by threshold: Precision {0:g}, Recall {1:g}, F1 {2:g}"
                .format(test_pre_ts, test_rec_ts, test_F1_ts))

            logger.info(
                "Predict by threshold in Level-1: Precision {0:g}, Recall {1:g}, F1 {2:g}, AUPRC {3:g}"
                .format(test_pre_first, test_rec_first, test_F1_first,
                        test_prc_first))
            logger.info(
                "Predict by threshold in Level-2: Precision {0:g}, Recall {1:g}, F1 {2:g}, AUPRC {3:g}"
                .format(test_pre_second, test_rec_second, test_F1_second,
                        test_prc_second))
            logger.info(
                "Predict by threshold in Level-3: Precision {0:g}, Recall {1:g}, F1 {2:g}, AUPRC {3:g}"
                .format(test_pre_third, test_rec_third, test_F1_third,
                        test_prc_third))
            logger.info(
                "Predict by threshold in Level-4: Precision {0:g}, Recall {1:g}, F1 {2:g}, AUPRC {3:g}"
                .format(test_pre_fourth, test_rec_fourth, test_F1_fourth,
                        test_prc_fourth))

            # Save the prediction result
            if not os.path.exists(SAVE_DIR):
                os.makedirs(SAVE_DIR)
            dh.create_prediction_file(output_file=SAVE_DIR +
                                      "/predictions.json",
                                      data_id=test_data.patent_id,
                                      all_labels=true_labels,
                                      all_predict_labels=predicted_labels,
                                      all_predict_scores=predicted_scores)

    logger.info("All Done.")
Example #7
        def dev_step(x_batch, y_batch, writer=None):
            """Evaluates the model on a dev set (header reconstructed; the snippet began mid-function)."""
            feed_dict = {
                cnn.input_x: x_batch,
                cnn.input_y: y_batch,
                cnn.dropout_keep_prob: 1.0
            }
            step, summaries, loss, accuracy, precision, recall = sess.run([
                global_step, dev_summary_op, cnn.loss, cnn.accuracy,
                cnn.precision, cnn.recall
            ], feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}, prec {:g}, recl {:g}".
                  format(time_str, step, loss, accuracy, precision, recall))
            if writer:
                writer.add_summary(summaries, step)

        # Generate batches
        batches = data_helpers.batch_iter(list(zip(x_train, y_train)),
                                          FLAGS.batch_size, FLAGS.num_epochs)

        # Training loop. For each batch...
        for batch in batches:
            x_batch, y_batch = zip(*batch)
            train_step(x_batch, y_batch)
            current_step = tf.train.global_step(sess, global_step)
            if current_step % FLAGS.evaluate_every == 0:
                print("\nEvaluation:")
                dev_step(x_dev, y_dev)  #, writer=dev_summary_writer)
                print("")
            if current_step % FLAGS.checkpoint_every == 0:
                path = saver.save(sess,
                                  checkpoint_prefix,
                                  global_step=current_step)
                print("Saved model checkpoint to {}\n".format(path))
Example #8
def test_hmidp():
    """Test HMIDP model."""

    # Load data
    logger.info("✔︎ Loading data...")
    logger.info("Recommended padding Sequence length is: {0}".format(
        FLAGS.pad_seq_len))

    logger.info("✔︎ Test data processing...")
    test_data = dh.load_data_and_labels(FLAGS.test_data_file,
                                        FLAGS.embedding_dim,
                                        data_aug_flag=False)

    logger.info("✔︎ Test data padding...")
    x_test_content, x_test_question, x_test_option, y_test = dh.pad_data(
        test_data, FLAGS.pad_seq_len)

    # Load hmidp model
    BEST_OR_LATEST = input("☛ Load Best or Latest Model?(B/L): ")

    while not (BEST_OR_LATEST.isalpha()
               and BEST_OR_LATEST.upper() in ['B', 'L']):
        BEST_OR_LATEST = input(
            "✘ The format of your input is illegal, please re-input: ")
    if BEST_OR_LATEST.upper() == 'B':
        logger.info("✔︎ Loading best model...")
        checkpoint_file = cm.get_best_checkpoint(FLAGS.best_checkpoint_dir,
                                                 select_maximum_value=True)
    else:
        logger.info("✔︎ Loading latest model...")
        checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    logger.info(checkpoint_file)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        session_conf.gpu_options.allow_growth = FLAGS.gpu_options_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph(
                "{0}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x_content = graph.get_operation_by_name(
                "input_x_content").outputs[0]
            input_x_question = graph.get_operation_by_name(
                "input_x_question").outputs[0]
            input_x_option = graph.get_operation_by_name(
                "input_x_option").outputs[0]
            input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "dropout_keep_prob").outputs[0]
            is_training = graph.get_operation_by_name("is_training").outputs[0]

            # Tensors we want to evaluate
            scores = graph.get_operation_by_name("output/scores").outputs[0]
            loss = graph.get_operation_by_name("loss/loss").outputs[0]

            # Split the output nodes name by '|' if you have several output nodes
            output_node_names = "output/scores"

            # Save the .pb model file
            output_graph_def = tf.graph_util.convert_variables_to_constants(
                sess, sess.graph_def, output_node_names.split("|"))
            tf.train.write_graph(output_graph_def,
                                 "graph",
                                 "graph-hmidp-{0}.pb".format(MODEL),
                                 as_text=False)
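            # convert_variables_to_constants "freezes" the graph: every
            # variable reachable from the listed output nodes is baked into
            # the GraphDef as a constant, so the .pb file can be used for
            # inference without restoring a checkpoint.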

            # Generate batches for one epoch
            batches = dh.batch_iter(list(
                zip(x_test_content, x_test_question, x_test_option, y_test)),
                                    FLAGS.batch_size,
                                    1,
                                    shuffle=False)

            test_counter, test_loss = 0, 0.0

            # Collect the predictions here
            true_labels = []
            predicted_scores = []

            for batch_test in batches:
                x_batch_content, x_batch_question, x_batch_option, y_batch = zip(
                    *batch_test)
                feed_dict = {
                    input_x_content: x_batch_content,
                    input_x_question: x_batch_question,
                    input_x_option: x_batch_option,
                    input_y: y_batch,
                    dropout_keep_prob: 1.0,
                    is_training: False
                }
                batch_scores, cur_loss = sess.run([scores, loss], feed_dict)

                # Prepare for calculating metrics
                for i in y_batch:
                    true_labels.append(i)
                for j in batch_scores:
                    predicted_scores.append(j)

                test_loss = test_loss + cur_loss
                test_counter = test_counter + 1

            # Calculate PCC & DOA
            pcc, doa = dh.evaluation(true_labels, predicted_scores)
            # Calculate RMSE
            rmse = mean_squared_error(true_labels, predicted_scores)**0.5
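            # RMSE is the square root of sklearn's mean_squared_error; on
            # scikit-learn >= 0.22 the same value is available directly via
            # mean_squared_error(..., squared=False).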

            test_loss = float(test_loss / test_counter)

            logger.info(
                "☛ All Test Dataset: Loss {0:g} | PCC {1:g} | DOA {2:g} | RMSE {3:g}"
                .format(test_loss, pcc, doa, rmse))

            # Save the prediction result
            if not os.path.exists(SAVE_DIR):
                os.makedirs(SAVE_DIR)
            dh.create_prediction_file(output_file=SAVE_DIR +
                                      "/predictions.json",
                                      all_id=test_data.id,
                                      all_labels=true_labels,
                                      all_predict_scores=predicted_scores)

    logger.info("✔︎ Done.")
Example #9
def train():
    print("  >> Loading preprocessing information...", "\n")
    parameters, data_info = load_preprocessing()

    print("  >> Loading Train Data...", "\n")
    train_data = data_info.train_data

    train_loss_history = []
    train_acc_history = []

    session_conf = tf.ConfigProto()
    session_conf.gpu_options.allow_growth = True
    with tf.Session(config=session_conf) as sess:
        Model = create_model(sess, parameters, data_info)

        if Model.global_epoch_step.eval() + 1 > parameters['n_epoch']:
            print("  >> Current Epoch: {}, Max Epoch: {}".format(
                Model.global_epoch_step.eval(), parameters['n_epoch']))
            print("  >> End of Training....")
            exit(-1)

        for epoch_idx in range(parameters['n_epoch']):
            try:
                batches = data_helpers.batch_iter(parameters, train_data)
                for minibatch in batches:
                    #print (minibatch)
                    input_indices, target_indices = data_helpers.get_minibatch(
                        dataset=train_data, minibatch_seq=minibatch)

                    feed_dict = {
                        Model.X: input_indices,
                        Model.Y: target_indices
                    }

                    # Train the model on the minibatch
                    _, global_step, minibatch_loss, minibatch_accuracy = sess.run(
                        [Model.train_op, Model.global_step, Model.loss, Model.accuracy],
                        feed_dict)

                    #   Check Training Process
                    if (global_step + 1) % parameters['evaluation_every'] == 0:

                        # Save the training results every "evaluation_every" steps!!!
                        train_loss_history.append(minibatch_loss)
                        train_acc_history.append(minibatch_accuracy)

                        print("  >> Global_Step # {} at {}-epoch".format(
                            global_step, Model.global_epoch_step.eval()))
                        print("        - Train Loss    : {:,.2f}".format(
                            minibatch_loss))
                        print("        - Train Accuracy: {:,.2f}".format(
                            minibatch_accuracy))
                        print("")

                Model.global_epoch_step_op.eval()  # Increment global_epoch_step

            except KeyboardInterrupt:
                print("     - Training Process Terminated....")
                exit(-1)

        print("  >> Save the last model...")
        saver = tf.train.Saver()
        checkpoint_path = os.path.join(parameters['save_dir'], 'MLP.ckpt')
        saver.save(sess, checkpoint_path, global_step=global_step)

    print("  >> End of Training...")
    print("")
    print("")
Example #10
graph = tf.Graph()  # assumed: the graph was created earlier in the original script
with graph.as_default():
    session_conf = tf.ConfigProto(allow_soft_placement=True,
                                  log_device_placement=False)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        checkpoint_file = '/home/ameex/ML/projects/text-classification/runs/1532540562/checkpoints/model-700'
        saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
        saver.restore(sess, checkpoint_file)

        input_x = graph.get_operation_by_name("input_x").outputs[0]
        dropout_keep_prob = graph.get_operation_by_name(
            "dropout_keep_prob").outputs[0]
        scores = graph.get_operation_by_name("output/scores").outputs[0]
        predictions = graph.get_operation_by_name(
            "output/predictions").outputs[0]
        batches = data_helpers.batch_iter(list(x_test), 37, 1, shuffle=False)
        all_predictions = []
        all_probabilities = None
        batch_predictions_scores = sess.run([predictions, scores], {
            input_x: x_test,
            dropout_keep_prob: 1.0
        })
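        # Note: the scores/predictions above are computed on the full x_test
        # in a single sess.run call; the batch iterator below is only
        # consumed for printing the batches.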
        all_predictions = np.concatenate(
            [all_predictions, batch_predictions_scores[0]])
        probabilities = softmax(batch_predictions_scores[1])
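        # softmax() is assumed to be a module-level helper (e.g. a NumPy
        # implementation or scipy.special.softmax) that converts the raw
        # scores into probabilities; `probabilities` is not used further.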
        print(all_predictions)
        print(all_predictions.shape)
        print(max(all_predictions))

        for index, x_test_batch in enumerate(batches):
            print(x_test_batch)
Example #11
def main(_):
    """
    Entry point: build and run the whole training pipeline.
    :param _:
    :return:
    """
    # -1. Check/create the checkpoint directory used for model persistence
    check_directory(FLAGS.checkpoint_dir)
    checkpoint_save_path = os.path.join(FLAGS.checkpoint_dir, "model.ckpt")
    # 0. Validate the data: the training data files must exist
    if not (os.path.isfile(FLAGS.positive_data_file)
            and os.path.isfile(FLAGS.negative_data_file)):
        raise Exception("给定的训练数据必须是文件路径的形成!!!")

    # 1. Load the vocabulary conversion model
    vocab_model_path = FLAGS.vocab_model_path
    if not tf.gfile.Exists(vocab_model_path):
        raise Exception("词汇转换模型必须存在,请检查磁盘路径:{}".format(vocab_model_path))
    vocab_model = VocabularyProcessorUtil.load_model(
        save_path=vocab_model_path)

    # 2. Build the TensorFlow graph and ops
    with tf.Graph().as_default():
        # I. Build the execution graph
        # 1. Build the network (forward pass)
        tf.logging.info("Building the forward network structure....")
        # Determine which model to use and its network name
        model_name = FLAGS.model.lower()
        model_class = load_model_class(model_name)
        network_name = (model_name.upper()
                        if FLAGS.network_name is None else FLAGS.network_name)
        # Build the embedding-lookup parameters
        with_word2vec = False
        embedding_table = None
        if FLAGS.with_word2vec:
            if os.path.exists(FLAGS.word2vec_model_path):
                tf.logging.info("加载Word2Vec训练好的词向量转换模型!!!")
                embedding_table, _ = VocabularyProcessorUtil.load_word2vec_embedding(
                    save_path=FLAGS.word2vec_model_path)
                with_word2vec = True
            else:
                tf.logging.warn(
                    "Cannot load the Word2Vec embedding matrix because the file does not exist; please check: {}".format(
                        FLAGS.word2vec_model_path))
        # Build the TextCNN convolution parameters (number of filters, region sizes)
        num_filters = list(map(int, FLAGS.num_filters.split(",")))
        num_filters = num_filters if len(num_filters) > 1 else num_filters[0]
        region_sizes = list(map(int, FLAGS.region_sizes.split(",")))
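        # e.g. FLAGS.num_filters = "128,128" -> [128, 128], while a single
        # value such as "128" collapses to the plain int 128.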
        # Build the model
        model = model_class(
            with_word2vec=with_word2vec,  # whether to use Word2Vec
            vocab_size=len(vocab_model.vocabulary_),  # vocabulary size
            embedding_dimensions=FLAGS.embedding_dimensions,  # embedding lookup dimensionality
            embedding_table=embedding_table,  # initial embedding matrix from Word2Vec
            train_embedding_table=FLAGS.train_embedding_table,  # whether the embedding table is trainable
            num_class=FLAGS.num_class,  # number of classes
            network_name=network_name,  # network name
            weight_decay=FLAGS.l2_weight_decay,  # L2 regularization coefficient
            optimizer_type=FLAGS.optimizer_type,  # optimizer type
            optimizer_parameters_func=network_utils.build_optimizer_parameters_func(flags=FLAGS),
            saver_parameters={'max_to_keep': FLAGS.max_to_keep},
            num_units=FLAGS.num_units,  # number of RNN units
            layers=FLAGS.layers,  # number of RNN layers
            num_filters=num_filters,  # number of TextCNN convolution filters
            region_sizes=region_sizes,  # word-window sizes for TextCNN feature extraction
            attention_dimension_size=FLAGS.attention_dimension_size,  # self-attention output dimension
            attention_layers=FLAGS.attention_layers,  # number of Transformer encoder attention layers
            attention_headers=FLAGS.attention_headers,  # number of attention heads
        )

        # 2. Build the loss
        tf.logging.info("Building the loss function....")
        total_loss = model.losses()
        # 3. Build the optimizer and the training op
        tf.logging.info("Building the training op.....")
        _, train_op = model.optimizer(loss=total_loss)
        # 4. Build the evaluation metrics
        tf.logging.info("Building the evaluation metrics.....")
        metrics = model.metrics()
        # 5. Summary ops for visualization
        tf.logging.info("Building the visualization/summary ops.....")
        summary_op = tf.summary.merge_all()
        train_summary_dir = os.path.join(FLAGS.summary_dir, "train")
        eval_summary_dir = os.path.join(FLAGS.summary_dir, "eval")
        check_directory(train_summary_dir)
        check_directory(eval_summary_dir)
        train_summary_writer = tf.summary.FileWriter(
            logdir=train_summary_dir, graph=tf.get_default_graph())
        eval_summary_writer = tf.summary.FileWriter(
            logdir=eval_summary_dir, graph=tf.get_default_graph())

        # II. Run the execution graph
        session_config = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        with tf.Session(config=session_config) as sess:
            # 1. Restore model parameters (initialize all variables first, then restore from the checkpoint)
            tf.logging.info("Initializing model parameters......")
            sess.run(tf.global_variables_initializer())  # random initialization
            model.restore(checkpoint_dir=FLAGS.checkpoint_dir,
                          session=sess)  # restore parameters

            # 2. Load the data
            tf.logging.info("Loading and transforming the text data......")
            texts, labels = data_helpers.load_data_and_labels(
                positive_data_file=FLAGS.positive_data_file,
                negative_data_file=FLAGS.negative_data_file)
            # 2a. Convert the texts to id sequences (truncate/pad, keeping only 512 positions)
            texts = np.asarray(list(vocab_model.transform(texts)))
            # 2b. Split the data into training and validation sets
            x_train, x_eval, y_train, y_eval = train_test_split(
                texts,
                labels,
                test_size=FLAGS.dev_sample_percentage,
                random_state=28)
            tf.logging.info("训练数据格式:{}---{}".format(np.shape(x_train),
                                                    np.shape(y_train)))
            tf.logging.info("验证数据格式:{}---{}".format(np.shape(x_eval),
                                                    np.shape(y_eval)))
            # 2c. Wrap the data into a batch iterator
            batches = data_helpers.batch_iter(
                data=list(zip(x_train, y_train)),
                batch_size=FLAGS.batch_size,  # number of samples per batch
                num_epochs=FLAGS.max_epochs  # total number of epochs to iterate
            )

            def train_step(_x, _y, writer):
                feed_dict = {
                    model.inputs: _x,
                    model.targets: _y,
                    model.dropout_keep_prob: FLAGS.dropout_keep_prob
                }
                step, _, _accuracy, _loss, _summary = sess.run(
                    [
                        model.global_step, train_op, metrics.accuracy,
                        total_loss, summary_op
                    ],
                    feed_dict=feed_dict)
                time_str = datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(
                    time_str, step, _loss, _accuracy))
                writer.add_summary(_summary, step)
                return step

            def dev_step(_x, _y, writer=None):
                feed_dict = {model.inputs: _x, model.targets: _y}
                step, _accuracy, _loss, _summary = sess.run(
                    [
                        model.global_step, metrics.accuracy, total_loss,
                        summary_op
                    ],
                    feed_dict=feed_dict)
                time_str = datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(
                    time_str, step, _loss, _accuracy))
                if writer is not None:
                    writer.add_summary(_summary,
                                       global_step=dev_summary_global_step)

            # 3. Iterate over the batches: train the model and checkpoint it periodically
            for batch in batches:
                # a. Unpack x and y from the batch
                x_batch, y_batch = zip(*batch)
                # b. Train on the batch
                current_step = train_step(x_batch,
                                          y_batch,
                                          writer=train_summary_writer)
                # c. Evaluate on the validation data at a fixed training interval
                if current_step % FLAGS.evaluate_per_batch == 0:
                    print("\nEvaluation:")
                    dev_summary_global_step = current_step
                    dev_batches = data_helpers.batch_iter(
                        data=list(zip(x_eval, y_eval)),
                        batch_size=FLAGS.batch_size * 10,  # number of samples per batch
                        num_epochs=1  # a single pass over the validation data
                    )
                    for dev_batch in dev_batches:
                        dev_x_batch, dev_y_batch = zip(*dev_batch)
                        dev_step(dev_x_batch,
                                 dev_y_batch,
                                 writer=eval_summary_writer)
                        dev_summary_global_step += 1
                # d. Checkpoint the model at a fixed training interval
                if current_step % FLAGS.checkpoint_per_batch == 0:
                    model.save(session=sess, save_path=checkpoint_save_path)

            # Save the model one last time at the end of training
            model.save(session=sess, save_path=checkpoint_save_path)
Example #12
def test_fasttext():
    """Test FASTTEXT model."""

    # Load data
    logger.info("✔ Loading data...")
    logger.info('Recommended padding Sequence length is: {0}'.format(
        FLAGS.pad_seq_len))

    logger.info('✔︎ Test data processing...')
    test_data = dh.load_data_and_labels(FLAGS.test_data_file,
                                        FLAGS.num_classes, FLAGS.embedding_dim)

    logger.info('✔︎ Test data padding...')
    x_test, y_test = dh.pad_data(test_data, FLAGS.pad_seq_len)
    y_test_bind = test_data.labels_bind

    # Build vocabulary
    VOCAB_SIZE = dh.load_vocab_size(FLAGS.embedding_dim)
    pretrained_word2vec_matrix = dh.load_word2vec_matrix(
        VOCAB_SIZE, FLAGS.embedding_dim)

    # Load fasttext model
    logger.info("✔ Loading model...")
    checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    logger.info(checkpoint_file)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        session_conf.gpu_options.allow_growth = FLAGS.gpu_options_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph(
                "{0}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]

            # input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "dropout_keep_prob").outputs[0]

            # pre-trained_word2vec
            pretrained_embedding = graph.get_operation_by_name(
                "embedding/embedding").outputs[0]

            # Tensors we want to evaluate
            logits = graph.get_operation_by_name("output/logits").outputs[0]

            # Generate batches for one epoch
            batches = dh.batch_iter(list(zip(x_test, y_test, y_test_bind)),
                                    FLAGS.batch_size,
                                    1,
                                    shuffle=False)

            # Collect the predictions here
            all_predictions = []
            eval_loss, eval_rec, eval_acc, eval_counter = 0.0, 0.0, 0.0, 0
            for batch_test in batches:
                x_batch_test, y_batch_test, y_batch_test_bind = zip(
                    *batch_test)
                feed_dict = {input_x: x_batch_test, dropout_keep_prob: 1.0}
                batch_logits = sess.run(logits, feed_dict)

                if FLAGS.use_classbind_or_not == 'Y':
                    predicted_labels = dh.get_label_using_logits_and_classbind(
                        batch_logits,
                        y_batch_test_bind,
                        top_number=FLAGS.top_num)
                if FLAGS.use_classbind_or_not == 'N':
                    predicted_labels = dh.get_label_using_logits(
                        batch_logits, top_number=FLAGS.top_num)

                all_predictions = np.append(all_predictions, predicted_labels)
                cur_rec, cur_acc = 0.0, 0.0
                for index, predicted_label in enumerate(predicted_labels):
                    rec_inc, acc_inc = dh.cal_rec_and_acc(
                        predicted_label, y_batch_test[index])
                    cur_rec, cur_acc = cur_rec + rec_inc, cur_acc + acc_inc

                cur_rec = cur_rec / len(y_batch_test)
                cur_acc = cur_acc / len(y_batch_test)

                eval_rec, eval_acc, eval_counter = eval_rec + cur_rec, eval_acc + cur_acc, eval_counter + 1
                logger.info(
                    "✔︎ validation batch {0} finished.".format(eval_counter))

            eval_rec = float(eval_rec / eval_counter)
            eval_acc = float(eval_acc / eval_counter)
            logger.info("☛ Recall {0:g}, Accuracy {1:g}".format(
                eval_rec, eval_acc))
            np.savetxt(SAVE_FILE, list(zip(all_predictions)), fmt='%s')

    logger.info("✔ Done.")
Example #13
def test_cnn():
    """Test CNN model."""

    # Load data
    logger.info("✔ Loading data...")
    logger.info('Recommended padding Sequence length is: {0}'.format(
        FLAGS.pad_seq_len))

    logger.info('✔︎ Test data processing...')
    test_data = dh.load_data_and_labels(FLAGS.test_data_file,
                                        FLAGS.embedding_dim)

    logger.info('✔︎ Test data padding...')
    x_test_front, x_test_behind, y_test = dh.pad_data(test_data,
                                                      FLAGS.pad_seq_len)

    # Build vocabulary
    VOCAB_SIZE = dh.load_vocab_size(FLAGS.embedding_dim)
    pretrained_word2vec_matrix = dh.load_word2vec_matrix(
        VOCAB_SIZE, FLAGS.embedding_dim)

    # Load cnn model
    logger.info("✔ Loading model...")
    checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    logger.info(checkpoint_file)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        session_conf.gpu_options.allow_growth = FLAGS.gpu_options_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph(
                "{0}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x_front = graph.get_operation_by_name(
                "input_x_front").outputs[0]
            input_x_behind = graph.get_operation_by_name(
                "input_x_behind").outputs[0]
            input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "dropout_keep_prob").outputs[0]
            is_training = graph.get_operation_by_name("is_training").outputs[0]

            # pre-trained word2vec
            pretrained_embedding = graph.get_operation_by_name(
                "embedding/embedding").outputs[0]

            # Tensors we want to evaluate
            scores = graph.get_operation_by_name("output/scores").outputs
            predictions = graph.get_operation_by_name(
                "output/predictions").outputs[0]
            softmax_scores = graph.get_operation_by_name(
                "output/SoftMax_scores").outputs[0]
            topKPreds = graph.get_operation_by_name(
                "output/topKPreds").outputs[0]
            accuracy = graph.get_operation_by_name(
                "accuracy/accuracy").outputs[0]
            loss = graph.get_operation_by_name("loss/loss").outputs[0]

            # Split the output nodes name by '|' if you have several output nodes
            output_node_names = 'output/scores|output/predictions|output/SoftMax_scores|output/topKPreds'

            # Save the .pb model file
            output_graph_def = tf.graph_util.convert_variables_to_constants(
                sess, sess.graph_def, output_node_names.split("|"))
            tf.train.write_graph(output_graph_def,
                                 'graph',
                                 'graph-cnn-{0}.pb'.format(MODEL_LOG),
                                 as_text=False)

            # Generate batches for one epoch
            batches = dh.batch_iter(list(
                zip(x_test_front, x_test_behind, y_test)),
                                    FLAGS.batch_size,
                                    1,
                                    shuffle=False)

            # Collect the predictions here
            all_scores = []
            all_softmax_scores = []
            all_predictions = []
            all_topKPreds = []

            for index, x_test_batch in enumerate(batches):
                x_batch_front, x_batch_behind, y_batch = zip(*x_test_batch)
                feed_dict = {
                    input_x_front: x_batch_front,
                    input_x_behind: x_batch_behind,
                    input_y: y_batch,
                    dropout_keep_prob: 1.0,
                    is_training: False
                }
                batch_scores = sess.run(scores, feed_dict)
                all_scores = np.append(all_scores, batch_scores)

                batch_softmax_scores = sess.run(softmax_scores, feed_dict)
                all_softmax_scores = np.append(all_softmax_scores,
                                               batch_softmax_scores)

                batch_predictions = sess.run(predictions, feed_dict)
                all_predictions = np.concatenate(
                    [all_predictions, batch_predictions])

                batch_topKPreds = sess.run(topKPreds, feed_dict)
                all_topKPreds = np.append(all_topKPreds, batch_topKPreds)

                batch_loss = sess.run(loss, feed_dict)
                batch_acc = sess.run(accuracy, feed_dict)

                logger.info(
                    "✔︎ Test batch {0}: loss {1:g}, accuracy {2:g}.".format(
                        (index + 1), batch_loss, batch_acc))

            if not os.path.exists(SAVE_DIR):
                os.makedirs(SAVE_DIR)
            np.savetxt(SAVE_DIR + '/result_sub_' + SUBSET + '.txt',
                       list(zip(all_predictions, all_topKPreds)),
                       fmt='%s')

    logger.info("✔ Done.")
Example #14
def visualize():
    """Visualize HARNN model."""

    # Load data
    logger.info("✔︎ Loading data...")
    logger.info("Recommended padding Sequence length is: {0}".format(FLAGS.pad_seq_len))

    logger.info("✔︎ Test data processing...")
    test_data = dh.load_data_and_labels(FLAGS.test_data_file, FLAGS.num_classes_list, FLAGS.total_classes,
                                        FLAGS.embedding_dim, data_aug_flag=False)

    logger.info("✔︎ Test data padding...")
    x_test, y_test, y_test_tuple = dh.pad_data(test_data, FLAGS.pad_seq_len)
    x_test_content, y_test_labels = test_data.abstract_content, test_data.labels

    # Load harnn model
    BEST_OR_LATEST = input("☛ Load Best or Latest Model?(B/L): ")

    while not (BEST_OR_LATEST.isalpha() and BEST_OR_LATEST.upper() in ['B', 'L']):
        BEST_OR_LATEST = input("✘ The format of your input is illegal, please re-input: ")
    if BEST_OR_LATEST.upper() == 'B':
        logger.info("✔︎ Loading best model...")
        checkpoint_file = cm.get_best_checkpoint(FLAGS.best_checkpoint_dir, select_maximum_value=True)
    else:
        logger.info("✔︎ Loading latest model...")
        checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    logger.info(checkpoint_file)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        session_conf.gpu_options.allow_growth = FLAGS.gpu_options_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph("{0}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            input_y_first = graph.get_operation_by_name("input_y_first").outputs[0]
            input_y_second = graph.get_operation_by_name("input_y_second").outputs[0]
            input_y_third = graph.get_operation_by_name("input_y_third").outputs[0]
            input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
            beta = graph.get_operation_by_name("beta").outputs[0]
            is_training = graph.get_operation_by_name("is_training").outputs[0]

            # Tensors we want to evaluate
            first_visual = graph.get_operation_by_name("first-output/visual").outputs[0]
            second_visual = graph.get_operation_by_name("second-output/visual").outputs[0]
            third_visual = graph.get_operation_by_name("third-output/visual").outputs[0]
            scores = graph.get_operation_by_name("output/scores").outputs[0]

            # Split the output nodes name by '|' if you have several output nodes
            output_node_names = "first-output/visual|second-output/visual|third-output/visual|output/scores"

            # Save the .pb model file
            output_graph_def = tf.graph_util.convert_variables_to_constants(sess, sess.graph_def,
                                                                            output_node_names.split("|"))
            tf.train.write_graph(output_graph_def, "graph", "graph-harnn-{0}.pb".format(MODEL), as_text=False)

            # Generate batches for one epoch
            batches = dh.batch_iter(list(zip(x_test, y_test, y_test_tuple, x_test_content, y_test_labels)),
                                    FLAGS.batch_size, 1, shuffle=False)

            for batch_test in batches:
                x_batch_test, y_batch_test, y_batch_test_tuple, \
                x_batch_test_content, y_batch_test_labels = zip(*batch_test)

                y_batch_test_first = [i[0] for i in y_batch_test_tuple]
                y_batch_test_second = [j[1] for j in y_batch_test_tuple]
                y_batch_test_third = [k[2] for k in y_batch_test_tuple]

                feed_dict = {
                    input_x: x_batch_test,
                    input_y_first: y_batch_test_first,
                    input_y_second: y_batch_test_second,
                    input_y_third: y_batch_test_third,
                    input_y: y_batch_test,
                    dropout_keep_prob: 1.0,
                    beta: FLAGS.beta,
                    is_training: False
                }
                batch_first_visual, batch_second_visual, batch_third_visual, batch_scores = \
                    sess.run([first_visual, second_visual, third_visual, scores], feed_dict)

                seq_len = len(x_batch_test_content[0])
                pad_len = len(batch_first_visual[0])

                length = min(seq_len, pad_len)

                # print(seq_len, pad_len, length)
                final_first_visual = normalization(batch_first_visual[0].tolist(), length)
                final_second_visual = normalization(batch_second_visual[0].tolist(), length)
                final_third_visual = normalization(batch_third_visual[0].tolist(), length)
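                # normalization() is assumed to be a local helper that rescales
                # the first `length` attention weights into [0, 1] for the
                # HTML heat-map rendering below.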

                visual_list = [final_first_visual, final_second_visual, final_third_visual]
                print(visual_list)

                with open('attention.html', 'w') as f:
                    f.write('<html style="margin:0;padding:0;"><body style="margin:0;padding:0;">\n')
                    f.write('<div style="margin:25px;">\n')
                    for k in range(len(visual_list)):
                        f.write('<p style="margin:10px;">\n')
                        # iterate only over positions that have both a word and an attention weight
                        for i in range(length):
                            alpha = "{:.2f}".format(visual_list[k][i])
                            word = x_batch_test_content[0][i]
                            f.write('\t<span style="margin-left:3px;background-color:rgba(255,0,0,{0})">{1}</span>\n'
                                    .format(alpha, word))
                        f.write('</p>\n')
                    f.write('</div>\n')
                    f.write('</body></html>')

    logger.info("✔︎ Done.")
Example #15
def validate_cnn():
    """Validate CNN model."""

    # Load data
    logger.info("t Loading data...")
    logger.info("Recommended padding Sequence length is: {0}".format(FLAGS.pad_seq_len))

    logger.info("t Test data processing...")
    test_data = dh.load_data_and_labels(FLAGS.test_data_file, FLAGS.num_classes,
                                        FLAGS.embedding_dim, data_aug_flag=False)

    logger.info("t Test data padding...")
    x_test, y_test = dh.pad_data(test_data, FLAGS.pad_seq_len)
    y_test_labels = test_data.labels

    # Load cnn model
    BEST_OR_LATEST = input("☛ Load Best or Latest Model?(B/L): ")

    while not (BEST_OR_LATEST.isalpha() and BEST_OR_LATEST.upper() in ['B', 'L']):
        BEST_OR_LATEST = input("✘ The format of your input is illegal, please re-input: ")
    if BEST_OR_LATEST.upper() == 'B':
        logger.info("t Loading best model...")
        checkpoint_file = cm.get_best_checkpoint(FLAGS.best_checkpoint_dir, select_maximum_value=True)
    else:
        logger.info("t Loading latest model...")
        checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    logger.info(checkpoint_file)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        session_conf.gpu_options.allow_growth = FLAGS.gpu_options_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph("{0}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
            is_training = graph.get_operation_by_name("is_training").outputs[0]

            # Tensors we want to evaluate
            scores = graph.get_operation_by_name("output/scores").outputs[0]
            loss = graph.get_operation_by_name("loss/loss").outputs[0]

            # Split the output nodes name by '|' if you have several output nodes
            output_node_names = "output/scores"

            # Save the .pb model file
            output_graph_def = tf.graph_util.convert_variables_to_constants(sess, sess.graph_def,
                                                                            output_node_names.split("|"))
            tf.train.write_graph(output_graph_def, "graph", "graph-cnn-{0}.pb".format(MODEL), as_text=False)

            # Generate batches for one epoch
            batches = dh.batch_iter(list(zip(x_test, y_test, y_test_labels)), FLAGS.batch_size, 1, shuffle=False)

            test_counter, test_loss = 0, 0.0

            test_pre_tk = [0.0] * FLAGS.top_num
            test_rec_tk = [0.0] * FLAGS.top_num
            test_F_tk = [0.0] * FLAGS.top_num

            # Collect the predictions here
            true_labels = []
            predicted_labels = []
            predicted_scores = []

            # Collect for calculating metrics
            true_onehot_labels = []
            predicted_onehot_scores = []
            predicted_onehot_labels_ts = []
            predicted_onehot_labels_tk = [[] for _ in range(FLAGS.top_num)]

            for batch_test in batches:
                x_batch_test, y_batch_test, y_batch_test_labels = zip(*batch_test)
                feed_dict = {
                    input_x: x_batch_test,
                    input_y: y_batch_test,
                    dropout_keep_prob: 1.0,
                    is_training: False
                }
                batch_scores, cur_loss = sess.run([scores, loss], feed_dict)

                # Prepare for calculating metrics
                for i in y_batch_test:
                    true_onehot_labels.append(i)
                for j in batch_scores:
                    predicted_onehot_scores.append(j)

                # Get the predicted labels by threshold
                batch_predicted_labels_ts, batch_predicted_scores_ts = \
                    dh.get_label_threshold(scores=batch_scores, threshold=FLAGS.threshold)
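                # get_label_threshold keeps every label whose score reaches
                # FLAGS.threshold, independently of the other labels
                # (roughly (scores >= threshold) on the score matrix).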

                # Add results to collection
                for i in y_batch_test_labels:
                    true_labels.append(i)
                for j in batch_predicted_labels_ts:
                    predicted_labels.append(j)
                for k in batch_predicted_scores_ts:
                    predicted_scores.append(k)

                # Get onehot predictions by threshold
                batch_predicted_onehot_labels_ts = \
                    dh.get_onehot_label_threshold(scores=batch_scores, threshold=FLAGS.threshold)
                for i in batch_predicted_onehot_labels_ts:
                    predicted_onehot_labels_ts.append(i)

                # Get onehot predictions by topK
                for top_num in range(FLAGS.top_num):
                    batch_predicted_onehot_labels_tk = dh.get_onehot_label_topk(scores=batch_scores, top_num=top_num+1)

                    for i in batch_predicted_onehot_labels_tk:
                        predicted_onehot_labels_tk[top_num].append(i)

                test_loss = test_loss + cur_loss
                test_counter = test_counter + 1

            # Calculate Precision & Recall & F1 (threshold & topK)
            test_pre_ts = precision_score(y_true=np.array(true_onehot_labels),
                                          y_pred=np.array(predicted_onehot_labels_ts), average='micro')
            test_rec_ts = recall_score(y_true=np.array(true_onehot_labels),
                                       y_pred=np.array(predicted_onehot_labels_ts), average='micro')
            test_F_ts = f1_score(y_true=np.array(true_onehot_labels),
                                 y_pred=np.array(predicted_onehot_labels_ts), average='micro')
            test_hmloss = hamming_loss(y_true=np.array(true_onehot_labels),
                                       y_pred=np.array(predicted_onehot_labels_ts))
            test_rocauc = roc_auc_score(y_true=np.array(true_onehot_labels),
                                        y_score=np.array(predicted_onehot_scores))
            test_subsetacc = accuracy_score(y_true=np.array(true_onehot_labels),
                                            y_pred=np.array(predicted_onehot_labels_ts))
            test_labelrankap = label_ranking_average_precision_score(y_true=np.array(true_onehot_labels),
                                                                     y_score=np.array(predicted_onehot_scores))

            for top_num in range(FLAGS.top_num):
                test_pre_tk[top_num] = precision_score(y_true=np.array(true_onehot_labels),
                                                       y_pred=np.array(predicted_onehot_labels_tk[top_num]),
                                                       average='micro')
                test_rec_tk[top_num] = recall_score(y_true=np.array(true_onehot_labels),
                                                    y_pred=np.array(predicted_onehot_labels_tk[top_num]),
                                                    average='micro')
                test_F_tk[top_num] = f1_score(y_true=np.array(true_onehot_labels),
                                              y_pred=np.array(predicted_onehot_labels_tk[top_num]),
                                              average='micro')

            # Calculate the average AUC
            test_auc = roc_auc_score(y_true=np.array(true_onehot_labels),
                                     y_score=np.array(predicted_onehot_scores), average='micro')

            # Calculate the average PR
            test_prc = average_precision_score(y_true=np.array(true_onehot_labels),
                                               y_score=np.array(predicted_onehot_scores), average="micro")
            test_loss = float(test_loss / test_counter)

            logger.info("All Test Dataset: Loss {0:g} | AUC {1:g} | AUPRC {2:g}"
                        .format(test_loss, test_auc, test_prc))

            # Predict by threshold
            logger.info("Predict by threshold: Precision {0:g}, Recall {1:g}, F1 {2:g}"
                        .format(test_pre_ts, test_rec_ts, test_F_ts))
            logger.info("predict by threshold: hamming_loss {0:g}, roc_auc {1:g}, subset_acc {2:g}, label_rank_av_pr {3:g}"
                        .format(test_hmloss, test_rocauc, test_subsetacc, test_labelrankap))

            # Predict by topK
            logger.info("Predict by topK:")
            for top_num in range(FLAGS.top_num):
                logger.info("Top{0}: Precision {1:g}, Recall {2:g}, F {3:g}"
                            .format(top_num + 1, test_pre_tk[top_num], test_rec_tk[top_num], test_F_tk[top_num]))

            # Save the prediction result
            if not os.path.exists(SAVE_DIR):
                os.makedirs(SAVE_DIR)
            dh.create_prediction_file(output_file=SAVE_DIR + "/predictions.json", data_id=test_data.testid,
                                      all_labels=true_labels, all_predict_labels=predicted_labels,
                                      all_predict_scores=predicted_scores)

    logger.info("Done.")
Example #16
            def validation_step(x_validation, y_validation, writer=None):
                """Evaluates model on a validation set"""
                batches_validation = dh.batch_iter(
                    list(zip(x_validation, y_validation)), FLAGS.batch_size,
                    FLAGS.num_epochs)

                # Predict classes by threshold or topk ('ts': threshold; 'tk': topk)
                eval_counter, eval_loss, eval_rec_ts, eval_acc_ts, eval_F_ts = 0, 0.0, 0.0, 0.0, 0.0
                eval_rec_tk = [0.0] * FLAGS.top_num
                eval_acc_tk = [0.0] * FLAGS.top_num
                eval_F_tk = [0.0] * FLAGS.top_num

                for batch_validation in batches_validation:
                    x_batch_validation, y_batch_validation = zip(
                        *batch_validation)
                    feed_dict = {
                        ann.input_x: x_batch_validation,
                        ann.input_y: y_batch_validation,
                        ann.dropout_keep_prob: 1.0,
                        ann.is_training: False
                    }
                    step, summaries, scores, cur_loss = sess.run([
                        ann.global_step, validation_summary_op, ann.scores,
                        ann.loss
                    ], feed_dict)

                    # Predict by threshold
                    predicted_labels_threshold, predicted_values_threshold = \
                        dh.get_label_using_scores_by_threshold(scores=scores, threshold=FLAGS.threshold)

                    cur_rec_ts, cur_acc_ts, cur_F_ts = 0.0, 0.0, 0.0

                    for index, predicted_label_threshold in enumerate(
                            predicted_labels_threshold):
                        rec_inc_ts, acc_inc_ts, F_inc_ts = dh.cal_metric(
                            predicted_label_threshold,
                            y_batch_validation[index])
                        cur_rec_ts, cur_acc_ts, cur_F_ts = cur_rec_ts + rec_inc_ts, \
                                                           cur_acc_ts + acc_inc_ts, \
                                                           cur_F_ts + F_inc_ts

                    cur_rec_ts = cur_rec_ts / len(y_batch_validation)
                    cur_acc_ts = cur_acc_ts / len(y_batch_validation)
                    cur_F_ts = cur_F_ts / len(y_batch_validation)

                    eval_rec_ts, eval_acc_ts, eval_F_ts = eval_rec_ts + cur_rec_ts, \
                                                          eval_acc_ts + cur_acc_ts, \
                                                          eval_F_ts + cur_F_ts

                    # Predict by topK
                    topK_predicted_labels = []
                    for top_num in range(FLAGS.top_num):
                        predicted_labels_topk, predicted_values_topk = \
                            dh.get_label_using_scores_by_topk(scores=scores, top_num=top_num+1)
                        topK_predicted_labels.append(predicted_labels_topk)

                    cur_rec_tk = [0.0] * FLAGS.top_num
                    cur_acc_tk = [0.0] * FLAGS.top_num
                    cur_F_tk = [0.0] * FLAGS.top_num

                    for top_num, predicted_labels_topK in enumerate(
                            topK_predicted_labels):
                        for index, predicted_label_topK in enumerate(
                                predicted_labels_topK):
                            rec_inc_tk, acc_inc_tk, F_inc_tk = dh.cal_metric(
                                predicted_label_topK,
                                y_batch_validation[index])
                            cur_rec_tk[top_num], cur_acc_tk[top_num], cur_F_tk[top_num] = \
                                cur_rec_tk[top_num] + rec_inc_tk, \
                                cur_acc_tk[top_num] + acc_inc_tk, \
                                cur_F_tk[top_num] + F_inc_tk

                        cur_rec_tk[top_num] = cur_rec_tk[top_num] / len(
                            y_batch_validation)
                        cur_acc_tk[top_num] = cur_acc_tk[top_num] / len(
                            y_batch_validation)
                        cur_F_tk[top_num] = cur_F_tk[top_num] / len(
                            y_batch_validation)

                        eval_rec_tk[top_num], eval_acc_tk[top_num], eval_F_tk[top_num] = \
                            eval_rec_tk[top_num] + cur_rec_tk[top_num], \
                            eval_acc_tk[top_num] + cur_acc_tk[top_num], \
                            eval_F_tk[top_num] + cur_F_tk[top_num]

                    eval_loss = eval_loss + cur_loss
                    eval_counter = eval_counter + 1

                    logger.info("✔︎ validation batch {0}: loss {1:g}".format(
                        eval_counter, cur_loss))
                    logger.info(
                        "☛ Predict by threshold: recall {0:g}, accuracy {1:g}, F {2:g}"
                        .format(cur_rec_ts, cur_acc_ts, cur_F_ts))

                    logger.info("☛ Predict by topK:")
                    for top_num in range(FLAGS.top_num):
                        logger.info(
                            "Top{0}: recall {1:g}, accuracy {2:g}, F {3:g}".
                            format(top_num + 1, cur_rec_tk[top_num],
                                   cur_acc_tk[top_num], cur_F_tk[top_num]))

                    if writer:
                        writer.add_summary(summaries, step)

                eval_loss = float(eval_loss / eval_counter)
                eval_rec_ts = float(eval_rec_ts / eval_counter)
                eval_acc_ts = float(eval_acc_ts / eval_counter)
                eval_F_ts = float(eval_F_ts / eval_counter)

                for top_num in range(FLAGS.top_num):
                    eval_rec_tk[top_num] = float(eval_rec_tk[top_num] /
                                                 eval_counter)
                    eval_acc_tk[top_num] = float(eval_acc_tk[top_num] /
                                                 eval_counter)
                    eval_F_tk[top_num] = float(eval_F_tk[top_num] /
                                               eval_counter)

                return eval_loss, eval_rec_ts, eval_acc_ts, eval_F_ts, eval_rec_tk, eval_acc_tk, eval_F_tk
Example #17
    def test_model():
        model.eval()
        item_embedding = model.encode.weight
        dr_hidden = model.init_hidden(Config().batch_size)

        hitratio_numer = 0
        hitratio_denom = 0
        ndcg = 0.0

        for i, x in enumerate(
                dh.batch_iter(train_data,
                              Config().batch_size,
                              Config().seq_len,
                              shuffle=False)):
            uids, baskets, lens = x
            dynamic_user, _ = model(baskets, lens, dr_hidden)
            for uid, l, du in zip(uids, lens, dynamic_user):
                scores = []
                du_latest = du[l - 1].unsqueeze(0)
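                # du holds one hidden state per time step; du[l - 1] is the
                # user's most recent dynamic representation, unsqueezed to
                # shape (1, hidden) for the matrix products below.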

                # calculating <u,p> score for all test items <u,p> pair
                positives = test_data[test_data['userID'] ==
                                      uid].baskets.values[0]  # list dim 1
                p_length = len(positives)
                positives = torch.LongTensor(positives)

                # Deal with positive samples
                scores_pos = list(
                    torch.mm(du_latest,
                             item_embedding[positives].t()).data.numpy()[0])
                for s in scores_pos:
                    scores.append(s)

                # Deal with negative samples
                negatives = random.sample(list(neg_samples[uid]),
                                          Config().neg_num)
                negatives = torch.LongTensor(negatives)
                scores_neg = list(
                    torch.mm(du_latest,
                             item_embedding[negatives].t()).data.numpy()[0])
                for s in scores_neg:
                    scores.append(s)

                # Calculate hit-ratio
                index_k = []
                for k in range(Config().top_k):
                    index = scores.index(max(scores))
                    index_k.append(index)
                    scores[index] = -9999
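                # Top-k selection by repeatedly taking the arg-max and masking
                # it out with -9999; equivalent to picking the k highest-scoring
                # item indices.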
                hitratio_numer += len(
                    (set(np.arange(0, p_length)) & set(index_k)))
                hitratio_denom += p_length

                # Calculate NDCG
                u_dcg = 0
                u_idcg = 0
                for k in range(Config().top_k):
                    if index_k[k] < p_length:  # indices below p_length are positive samples
                        u_dcg += 1 / math.log(k + 1 + 1, 2)
                    u_idcg += 1 / math.log(k + 1 + 1, 2)
                ndcg += u_dcg / u_idcg
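                # Each hit at 0-based position k contributes 1/log2(k + 2)
                # (i.e. 1/log2(rank + 1)); u_idcg assumes all top_k positions
                # are hits, so u_dcg / u_idcg is the per-user normalized DCG.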

        hit_ratio = hitratio_numer / hitratio_denom
        ndcg = ndcg / len(train_data)
        logger.info(
            '[Test]| Epochs {:3d} | Hit ratio {:02.4f} | NDCG {:05.4f} |'.
            format(epoch, hit_ratio, ndcg))
        return hit_ratio, ndcg
Example #18
def test_ann():
    """Test ANN model."""
    # Print parameters used for the model
    dh.tab_printer(args, logger)

    # Load word2vec model
    word2idx, embedding_matrix = dh.load_word2vec_matrix(args.word2vec_file)

    # Load data
    logger.info("Loading data...")
    logger.info("Data processing...")
    test_data = dh.load_data_and_labels(args, args.test_file, word2idx)

    # Load ann model
    OPTION = dh._option(pattern=1)
    if OPTION == 'B':
        logger.info("Loading best model...")
        checkpoint_file = cm.get_best_checkpoint(BEST_CPT_DIR, select_maximum_value=True)
    else:
        logger.info("Loading latest model...")
        checkpoint_file = tf.train.latest_checkpoint(CPT_DIR)
    logger.info(checkpoint_file)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=args.allow_soft_placement,
            log_device_placement=args.log_device_placement)
        session_conf.gpu_options.allow_growth = args.gpu_options_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph("{0}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
            is_training = graph.get_operation_by_name("is_training").outputs[0]

            # Tensors we want to evaluate
            scores = graph.get_operation_by_name("output/scores").outputs[0]
            loss = graph.get_operation_by_name("loss/loss").outputs[0]

            # Split the output nodes name by '|' if you have several output nodes
            output_node_names = "output/scores"

            # Save the .pb model file
            output_graph_def = tf.graph_util.convert_variables_to_constants(sess, sess.graph_def,
                                                                            output_node_names.split("|"))
            tf.train.write_graph(output_graph_def, "graph", "graph-ann-{0}.pb".format(MODEL), as_text=False)

            # Generate batches for one epoch
            batches = dh.batch_iter(list(create_input_data(test_data)), args.batch_size, 1, shuffle=False)

            # Initialize the loss counter and top-K metric accumulators
            test_counter, test_loss = 0, 0.0
            test_pre_tk = [0.0] * args.topK
            test_rec_tk = [0.0] * args.topK
            test_F1_tk = [0.0] * args.topK

            # Collect the predictions here
            true_labels = []
            predicted_labels = []
            predicted_scores = []

            # Collect for calculating metrics
            true_onehot_labels = []
            predicted_onehot_scores = []
            predicted_onehot_labels_ts = []
            predicted_onehot_labels_tk = [[] for _ in range(args.topK)]

            for batch_test in batches:
                x, y_onehot, y = zip(*batch_test)
                feed_dict = {
                    input_x: x,
                    input_y: y_onehot,
                    dropout_keep_prob: 1.0,
                    is_training: False
                }

                batch_scores, cur_loss = sess.run([scores, loss], feed_dict)

                # Prepare for calculating metrics
                for i in y_onehot:
                    true_onehot_labels.append(i)
                for j in batch_scores:
                    predicted_onehot_scores.append(j)

                # Get the predicted labels by threshold
                batch_predicted_labels_ts, batch_predicted_scores_ts = \
                    dh.get_label_threshold(scores=batch_scores, threshold=args.threshold)

                # Add results to collection
                for i in y:
                    true_labels.append(i)
                for j in batch_predicted_labels_ts:
                    predicted_labels.append(j)
                for k in batch_predicted_scores_ts:
                    predicted_scores.append(k)

                # Get onehot predictions by threshold
                batch_predicted_onehot_labels_ts = \
                    dh.get_onehot_label_threshold(scores=batch_scores, threshold=args.threshold)
                for i in batch_predicted_onehot_labels_ts:
                    predicted_onehot_labels_ts.append(i)

                # Get onehot predictions by topK
                for top_num in range(args.topK):
                    batch_predicted_onehot_labels_tk = dh.get_onehot_label_topk(scores=batch_scores, top_num=top_num+1)

                    for i in batch_predicted_onehot_labels_tk:
                        predicted_onehot_labels_tk[top_num].append(i)

                test_loss = test_loss + cur_loss
                test_counter = test_counter + 1

            # Calculate Precision & Recall & F1
            test_pre_ts = precision_score(y_true=np.array(true_onehot_labels),
                                          y_pred=np.array(predicted_onehot_labels_ts), average='micro')
            test_rec_ts = recall_score(y_true=np.array(true_onehot_labels),
                                       y_pred=np.array(predicted_onehot_labels_ts), average='micro')
            test_F1_ts = f1_score(y_true=np.array(true_onehot_labels),
                                  y_pred=np.array(predicted_onehot_labels_ts), average='micro')

            for top_num in range(args.topK):
                test_pre_tk[top_num] = precision_score(y_true=np.array(true_onehot_labels),
                                                       y_pred=np.array(predicted_onehot_labels_tk[top_num]),
                                                       average='micro')
                test_rec_tk[top_num] = recall_score(y_true=np.array(true_onehot_labels),
                                                    y_pred=np.array(predicted_onehot_labels_tk[top_num]),
                                                    average='micro')
                test_F1_tk[top_num] = f1_score(y_true=np.array(true_onehot_labels),
                                               y_pred=np.array(predicted_onehot_labels_tk[top_num]),
                                               average='micro')

            # Calculate the average AUC
            test_auc = roc_auc_score(y_true=np.array(true_onehot_labels),
                                     y_score=np.array(predicted_onehot_scores), average='micro')

            # Calculate the average PR
            test_prc = average_precision_score(y_true=np.array(true_onehot_labels),
                                               y_score=np.array(predicted_onehot_scores), average="micro")
            test_loss = float(test_loss / test_counter)

            logger.info("All Test Dataset: Loss {0:g} | AUC {1:g} | AUPRC {2:g}"
                        .format(test_loss, test_auc, test_prc))

            # Predict by threshold
            logger.info("Predict by threshold: Precision {0:g}, Recall {1:g}, F1 {2:g}"
                        .format(test_pre_ts, test_rec_ts, test_F1_ts))

            # Predict by topK
            logger.info("Predict by topK:")
            for top_num in range(args.topK):
                logger.info("Top{0}: Precision {1:g}, Recall {2:g}, F1 {3:g}"
                            .format(top_num + 1, test_pre_tk[top_num], test_rec_tk[top_num], test_F1_tk[top_num]))

            # Save the prediction result
            if not os.path.exists(SAVE_DIR):
                os.makedirs(SAVE_DIR)
            dh.create_prediction_file(output_file=SAVE_DIR + "/predictions.json", data_id=test_data['id'],
                                      true_labels=true_labels, predict_labels=predicted_labels,
                                      predict_scores=predicted_scores)

    logger.info("All Done.")
Example #19
def test(saved_file):
    """Test a saved basket-recommendation model."""
    # Load data
    logger.info("✔︎ Loading data...")

    logger.info("✔︎ Training data processing...")
    train_data = dh.load_data(Config().TRAININGSET_DIR)

    logger.info("✔︎ Test data processing...")
    test_data = dh.load_data(Config().TESTSET_DIR)

    logger.info("✔︎ Load negative sample...")
    # with open(Config().NEG_SAMPLES, 'rb') as handle:
    #     neg_samples = pickle.load(handle)
    neg_samples = {}

    item_list = list(range(336))  # candidate item ids; the dataset has 336 items

    # Load model
    MODEL_DIR = dh.load_model_file(saved_file)

    dr_model = torch.load(MODEL_DIR)

    dr_model.eval()

    item_embedding = dr_model.encode.weight
    hidden = dr_model.init_hidden(Config().batch_size)

    hitratio_numer = 0
    hitratio_denom = 0
    hitratio_numer_10 = 0
    hitratio_numer_5 = 0
    ndcg = 0.0
    ndcg_denom = 0
    hitratio_list_5 = []
    hitratio_list_10 = []
    ndcg_list = []

    for i, x in enumerate(
            tqdm(
                dh.batch_iter(train_data,
                              Config().batch_size,
                              Config().seq_len_test,
                              shuffle=False))):
        uids, baskets, lens = x
        dynamic_user, _ = dr_model(baskets, lens, hidden)
        for uid, l, du in zip(uids, lens, dynamic_user):
            scores = []
            du_latest = du[l - 1].unsqueeze(0)

            # Deal with positive samples
            positives = test_data[test_data['userID'] ==
                                  uid].baskets.values[0][:-1]  # 1-D list of item ids
            p_length = len(positives)
            positives = torch.LongTensor(positives)
            print("positives:   ", positives)

            # Calculate the <u, p> score for every positive test item
            scores_pos = list(
                torch.mm(du_latest,
                         item_embedding[positives].t()).data.cpu().numpy()[0])
            for s in scores_pos:
                scores.append(s)
            print("score_pos:   ", score_pos)

            # Deal with negative samples
            neg_item_list = list(set(item_list).difference(set(positives)))
            negatives = random.sample(neg_item_list, Config().neg_num)
            negatives = torch.LongTensor(negatives)
            scores_neg = list(
                torch.mm(du_latest,
                         item_embedding[negatives].t()).data.cpu().numpy()[0])
            for s in scores_neg:
                scores.append(s)

            print("scores:   ", scores)

            # Calculate hit-ratio
            index_k = []
            for k in range(Config().top_k):
                index = scores.index(max(scores))
                index_k.append(index)
                scores[index] = -9999
            print("index_k:   ", index_k)
            hr_5_numer = len((set(np.arange(0, p_length)) & set(index_k[0:5])))
            hr_10_numer = len((set(np.arange(0, p_length)) & set(index_k)))
            hitratio_numer_10 += hr_10_numer  # np.arange() produces an arithmetic sequence
            hitratio_numer_5 += hr_5_numer
            hitratio_denom += p_length
            hitratio_list_5.append(hr_5_numer / p_length)
            hitratio_list_10.append(hr_10_numer / p_length)
            # print("hitratio_list_5:   ", hitratio_list_5)
            # print("hitratio_list_10:   ", hitratio_list_10)
            # hitratio_numer += len((set(np.arange(0, p_length)) & set(index_k)))
            # hitratio_denom += p_length

            # Calculate NDCG
            u_dcg = 0
            u_idcg = 0
            for k in range(Config().top_k):
                if index_k[k] < p_length:  # indices below p_length are positive samples
                    u_dcg += 1 / math.log(k + 1 + 1, 2)
                u_idcg += 1 / math.log(k + 1 + 1, 2)
            ndcg += u_dcg / u_idcg
            ndcg_denom += 1
            ndcg_list.append(u_dcg / u_idcg)
            # print("ndcg_list:   ", ndcg_list)

    hit_ratio_5 = hitratio_numer_5 / hitratio_denom
    hit_ratio_10 = hitratio_numer_10 / hitratio_denom
    ndcg = ndcg / ndcg_denom
    print('Hit ratio@5: {0} | Hit ratio@10: {1}'.format(
        hit_ratio_5, hit_ratio_10))
    print('NDCG[{0}]: {1}'.format(Config().top_k, ndcg))
    return hitratio_list_5, hitratio_list_10, ndcg_list
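The evaluation above detaches tensors via `.data`; in current PyTorch idiom the whole pass would run under `torch.no_grad()`. A minimal sketch of the per-user scoring step under that convention (`score_items` is a hypothetical helper):

import torch

def score_items(du_latest, item_embedding, item_ids):
    """Inner products between the user's latest dynamic representation
    (shape 1 x d) and the selected item embeddings (shape k x d)."""
    with torch.no_grad():
        items = torch.LongTensor(item_ids)
        return torch.mm(du_latest, item_embedding[items].t()).cpu().numpy()[0]
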
Example #20
def train_mann():
    """Training MANN model."""

    # Load sentences, labels, and training parameters
    logger.info('✔︎ Loading data...')

    logger.info('✔︎ Training data processing...')
    train_data = dh.load_data_and_labels(FLAGS.training_data_file,
                                         FLAGS.num_classes,
                                         FLAGS.embedding_dim)

    logger.info('✔︎ Validation data processing...')
    validation_data = \
        dh.load_data_and_labels(FLAGS.validation_data_file, FLAGS.num_classes, FLAGS.embedding_dim)

    logger.info('Recommended padding Sequence length is: {0}'.format(
        FLAGS.pad_seq_len))

    logger.info('✔︎ Training data padding...')
    x_train, y_train = dh.pad_data(train_data, FLAGS.pad_seq_len)

    logger.info('✔︎ Validation data padding...')
    x_validation, y_validation = dh.pad_data(validation_data,
                                             FLAGS.pad_seq_len)

    # Build vocabulary
    VOCAB_SIZE = dh.load_vocab_size(FLAGS.embedding_dim)
    pretrained_word2vec_matrix = dh.load_word2vec_matrix(
        VOCAB_SIZE, FLAGS.embedding_dim)

    # Build a graph and mann object
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        session_conf.gpu_options.allow_growth = FLAGS.gpu_options_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            mann = TextMANN(sequence_length=FLAGS.pad_seq_len,
                            num_classes=FLAGS.num_classes,
                            batch_size=FLAGS.batch_size,
                            vocab_size=VOCAB_SIZE,
                            lstm_hidden_size=FLAGS.lstm_hidden_size,
                            fc_hidden_size=FLAGS.fc_hidden_size,
                            embedding_size=FLAGS.embedding_dim,
                            embedding_type=FLAGS.embedding_type,
                            l2_reg_lambda=FLAGS.l2_reg_lambda,
                            pretrained_embedding=pretrained_word2vec_matrix)

            # Define training procedure
            with tf.control_dependencies(
                    tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
                learning_rate = tf.train.exponential_decay(
                    learning_rate=FLAGS.learning_rate,
                    global_step=mann.global_step,
                    decay_steps=FLAGS.decay_steps,
                    decay_rate=FLAGS.decay_rate,
                    staircase=True)
                optimizer = tf.train.AdamOptimizer(learning_rate)
                grads, vars = zip(*optimizer.compute_gradients(mann.loss))
                grads, _ = tf.clip_by_global_norm(grads,
                                                  clip_norm=FLAGS.norm_ratio)
                train_op = optimizer.apply_gradients(
                    zip(grads, vars),
                    global_step=mann.global_step,
                    name="train_op")

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in zip(grads, vars):
                if g is not None:
                    grad_hist_summary = tf.summary.histogram(
                        "{0}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar(
                        "{0}/grad/sparsity".format(v.name),
                        tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            if FLAGS.train_or_restore == 'R':
                MODEL = input(
                    "☛ Please input the checkpoints model you want to restore, "
                    "it should be like(1490175368): "
                )  # The model you want to restore

                while not (MODEL.isdigit() and len(MODEL) == 10):
                    MODEL = input(
                        '✘ The format of your input is illegal, please re-input: '
                    )
                logger.info(
                    '✔︎ The format of your input is legal, now loading to next step...'
                )

                checkpoint_dir = 'runs/' + MODEL + '/checkpoints/'

                out_dir = os.path.abspath(
                    os.path.join(os.path.curdir, "runs", MODEL))
                logger.info("✔︎ Writing to {0}\n".format(out_dir))
            else:
                timestamp = str(int(time.time()))
                out_dir = os.path.abspath(
                    os.path.join(os.path.curdir, "runs", timestamp))
                logger.info("✔︎ Writing to {0}\n".format(out_dir))

            # Summaries for loss
            loss_summary = tf.summary.scalar("loss", mann.loss)

            # Train summaries
            train_summary_op = tf.summary.merge(
                [loss_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Validation summaries
            validation_summary_op = tf.summary.merge([loss_summary])
            validation_summary_dir = os.path.join(out_dir, "summaries",
                                                  "validation")
            validation_summary_writer = tf.summary.FileWriter(
                validation_summary_dir, sess.graph)

            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=FLAGS.num_checkpoints)

            if FLAGS.train_or_restore == 'R':
                # Load mann model
                logger.info("✔ Loading model...")
                checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
                logger.info(checkpoint_file)

                # Load the saved meta graph and restore variables
                saver = tf.train.import_meta_graph(
                    "{0}.meta".format(checkpoint_file))
                saver.restore(sess, checkpoint_file)
            else:
                checkpoint_dir = os.path.abspath(
                    os.path.join(out_dir, "checkpoints"))
                if not os.path.exists(checkpoint_dir):
                    os.makedirs(checkpoint_dir)
                sess.run(tf.global_variables_initializer())
                sess.run(tf.local_variables_initializer())

                # Embedding visualization config
                config = projector.ProjectorConfig()
                embedding_conf = config.embeddings.add()
                embedding_conf.tensor_name = 'embedding'
                embedding_conf.metadata_path = FLAGS.metadata_file

                projector.visualize_embeddings(train_summary_writer, config)
                projector.visualize_embeddings(validation_summary_writer,
                                               config)

                # Save the embedding visualization
                saver.save(
                    sess, os.path.join(out_dir, 'embedding', 'embedding.ckpt'))

            current_step = sess.run(mann.global_step)

            def train_step(x_batch, y_batch):
                """A single training step"""
                feed_dict = {
                    mann.input_x: x_batch,
                    mann.input_y: y_batch,
                    mann.dropout_keep_prob: FLAGS.dropout_keep_prob,
                    mann.is_training: True
                }
                _, step, summaries, loss = sess.run(
                    [train_op, mann.global_step, train_summary_op, mann.loss],
                    feed_dict)
                logger.info("step {0}: loss {1:g}".format(step, loss))
                train_summary_writer.add_summary(summaries, step)

            def validation_step(x_validation, y_validation, writer=None):
                """Evaluates model on a validation set"""
                batches_validation = dh.batch_iter(
                    list(zip(x_validation, y_validation)), FLAGS.batch_size, 1)

                # Predict classes by threshold or topk ('ts': threshold; 'tk': topk)
                eval_counter, eval_loss, eval_rec_ts, eval_pre_ts, eval_F_ts = 0, 0.0, 0.0, 0.0, 0.0
                eval_rec_tk = [0.0] * FLAGS.top_num
                eval_pre_tk = [0.0] * FLAGS.top_num
                eval_F_tk = [0.0] * FLAGS.top_num

                for batch_validation in batches_validation:
                    x_batch_validation, y_batch_validation = zip(
                        *batch_validation)
                    feed_dict = {
                        mann.input_x: x_batch_validation,
                        mann.input_y: y_batch_validation,
                        mann.dropout_keep_prob: 1.0,
                        mann.is_training: False
                    }
                    step, summaries, scores, cur_loss = sess.run([
                        mann.global_step, validation_summary_op, mann.scores,
                        mann.loss
                    ], feed_dict)

                    # Predict by threshold
                    predicted_labels_threshold, predicted_values_threshold = \
                        dh.get_label_using_scores_by_threshold(scores=scores, threshold=FLAGS.threshold)

                    cur_rec_ts, cur_pre_ts, cur_F_ts = 0.0, 0.0, 0.0

                    for index, predicted_label_threshold in enumerate(
                            predicted_labels_threshold):
                        rec_inc_ts, pre_inc_ts = dh.cal_metric(
                            predicted_label_threshold,
                            y_batch_validation[index])
                        cur_rec_ts, cur_pre_ts = cur_rec_ts + rec_inc_ts, cur_pre_ts + pre_inc_ts

                    cur_rec_ts = cur_rec_ts / len(y_batch_validation)
                    cur_pre_ts = cur_pre_ts / len(y_batch_validation)

                    cur_F_ts = dh.cal_F(cur_rec_ts, cur_pre_ts)

                    eval_rec_ts, eval_pre_ts = eval_rec_ts + cur_rec_ts, eval_pre_ts + cur_pre_ts

                    # Predict by topK
                    topK_predicted_labels = []
                    for top_num in range(FLAGS.top_num):
                        predicted_labels_topk, predicted_values_topk = \
                            dh.get_label_using_scores_by_topk(scores=scores, top_num=top_num+1)
                        topK_predicted_labels.append(predicted_labels_topk)

                    cur_rec_tk = [0.0] * FLAGS.top_num
                    cur_pre_tk = [0.0] * FLAGS.top_num
                    cur_F_tk = [0.0] * FLAGS.top_num

                    for top_num, predicted_labels_topK in enumerate(
                            topK_predicted_labels):
                        for index, predicted_label_topK in enumerate(
                                predicted_labels_topK):
                            rec_inc_tk, pre_inc_tk = dh.cal_metric(
                                predicted_label_topK,
                                y_batch_validation[index])
                            cur_rec_tk[top_num], cur_pre_tk[top_num] = \
                                cur_rec_tk[top_num] + rec_inc_tk, cur_pre_tk[top_num] + pre_inc_tk

                        cur_rec_tk[top_num] = cur_rec_tk[top_num] / len(
                            y_batch_validation)
                        cur_pre_tk[top_num] = cur_pre_tk[top_num] / len(
                            y_batch_validation)

                        cur_F_tk[top_num] = dh.cal_F(cur_rec_tk[top_num],
                                                     cur_pre_tk[top_num])

                        eval_rec_tk[top_num], eval_pre_tk[top_num] = \
                            eval_rec_tk[top_num] + cur_rec_tk[top_num], eval_pre_tk[top_num] + cur_pre_tk[top_num]

                    eval_loss = eval_loss + cur_loss
                    eval_counter = eval_counter + 1

                    logger.info("✔︎ validation batch {0}: loss {1:g}".format(
                        eval_counter, cur_loss))
                    logger.info(
                        "︎☛ Predict by threshold: recall {0:g}, precision {1:g}, F {2:g}"
                        .format(cur_rec_ts, cur_pre_ts, cur_F_ts))

                    logger.info("︎☛ Predict by topK:")
                    for top_num in range(FLAGS.top_num):
                        logger.info(
                            "Top{0}: recall {1:g}, precision {2:g}, F {3:g}".
                            format(top_num + 1, cur_rec_tk[top_num],
                                   cur_pre_tk[top_num], cur_F_tk[top_num]))

                    if writer:
                        writer.add_summary(summaries, step)

                eval_loss = float(eval_loss / eval_counter)
                eval_rec_ts = float(eval_rec_ts / eval_counter)
                eval_pre_ts = float(eval_pre_ts / eval_counter)
                eval_F_ts = dh.cal_F(eval_rec_ts, eval_pre_ts)

                for top_num in range(FLAGS.top_num):
                    eval_rec_tk[top_num] = float(eval_rec_tk[top_num] /
                                                 eval_counter)
                    eval_pre_tk[top_num] = float(eval_pre_tk[top_num] /
                                                 eval_counter)
                    eval_F_tk[top_num] = dh.cal_F(eval_rec_tk[top_num],
                                                  eval_pre_tk[top_num])

                return eval_loss, eval_rec_ts, eval_pre_ts, eval_F_ts, eval_rec_tk, eval_pre_tk, eval_F_tk

            # Generate batches
            batches_train = dh.batch_iter(list(zip(x_train, y_train)),
                                          FLAGS.batch_size, FLAGS.num_epochs)

            num_batches_per_epoch = int(
                (len(x_train) - 1) / FLAGS.batch_size) + 1

            # Training loop. For each batch...
            for batch_train in batches_train:
                x_batch_train, y_batch_train = zip(*batch_train)
                train_step(x_batch_train, y_batch_train)
                current_step = tf.train.global_step(sess, mann.global_step)

                if current_step % FLAGS.evaluate_every == 0:
                    logger.info("\nEvaluation:")
                    eval_loss, eval_rec_ts, eval_pre_ts, eval_F_ts, eval_rec_tk, eval_pre_tk, eval_F_tk = \
                        validation_step(x_validation, y_validation, writer=validation_summary_writer)

                    logger.info(
                        "All Validation set: Loss {0:g}".format(eval_loss))

                    # Predict by threshold
                    logger.info(
                        "︎☛ Predict by threshold: Recall {0:g}, Precision {1:g}, F {2:g}"
                        .format(eval_rec_ts, eval_pre_ts, eval_F_ts))

                    # Predict by topK
                    logger.info("︎☛ Predict by topK:")
                    for top_num in range(FLAGS.top_num):
                        logger.info(
                            "Top{0}: Recall {1:g}, Precision {2:g}, F {3:g}".
                            format(top_num + 1, eval_rec_tk[top_num],
                                   eval_pre_tk[top_num], eval_F_tk[top_num]))
                if current_step % FLAGS.checkpoint_every == 0:
                    checkpoint_prefix = os.path.join(checkpoint_dir, "model")
                    path = saver.save(sess,
                                      checkpoint_prefix,
                                      global_step=current_step)
                    logger.info(
                        "✔︎ Saved model checkpoint to {0}\n".format(path))
                if current_step % num_batches_per_epoch == 0:
                    current_epoch = current_step // num_batches_per_epoch
                    logger.info(
                        "✔︎ Epoch {0} has finished!".format(current_epoch))

    logger.info("✔︎ Done.")
def train_hmidp():
    """Training hmdip model."""

    # Load sentences, labels, and training parameters
    logger.info("✔︎ Loading data...")

    logger.info("✔︎ Training data processing...")
    train_data = dh.load_data_and_labels(FLAGS.training_data_file,
                                         FLAGS.embedding_dim,
                                         data_aug_flag=False)

    logger.info("✔︎ Validation data processing...")
    val_data = dh.load_data_and_labels(FLAGS.validation_data_file,
                                       FLAGS.embedding_dim,
                                       data_aug_flag=False)

    logger.info("✔︎ Training data padding...")
    x_train_content, x_train_question, x_train_option, y_train = dh.pad_data(
        train_data, FLAGS.pad_seq_len)

    logger.info("✔︎ Validation data padding...")
    x_val_content, x_val_question, x_val_option, y_val = dh.pad_data(
        val_data, FLAGS.pad_seq_len)

    # Build vocabulary
    VOCAB_SIZE, pretrained_word2vec_matrix = dh.load_word2vec_matrix(
        FLAGS.embedding_dim)

    # Build a graph and hmidp object
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        session_conf.gpu_options.allow_growth = FLAGS.gpu_options_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            hmidp = TextHMIDP(
                sequence_length=list(map(int, FLAGS.pad_seq_len.split(','))),
                vocab_size=VOCAB_SIZE,
                fc_hidden_size=FLAGS.fc_hidden_size,
                lstm_hidden_size=FLAGS.lstm_hidden_size,
                embedding_size=FLAGS.embedding_dim,
                embedding_type=FLAGS.embedding_type,
                filter_sizes=list(map(int, FLAGS.filter_sizes.split(','))),
                num_filters=list(map(int, FLAGS.num_filters.split(','))),
                pooling_size=FLAGS.pooling_size,
                l2_reg_lambda=FLAGS.l2_reg_lambda,
                pretrained_embedding=pretrained_word2vec_matrix)

            # Define training procedure
            with tf.control_dependencies(
                    tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
                learning_rate = tf.train.exponential_decay(
                    learning_rate=FLAGS.learning_rate,
                    global_step=hmidp.global_step,
                    decay_steps=FLAGS.decay_steps,
                    decay_rate=FLAGS.decay_rate,
                    staircase=True)
                optimizer = tf.train.AdamOptimizer(learning_rate)
                grads, vars = zip(*optimizer.compute_gradients(hmidp.loss))
                grads, _ = tf.clip_by_global_norm(grads,
                                                  clip_norm=FLAGS.norm_ratio)
                train_op = optimizer.apply_gradients(
                    zip(grads, vars),
                    global_step=hmidp.global_step,
                    name="train_op")

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in zip(grads, vars):
                if g is not None:
                    grad_hist_summary = tf.summary.histogram(
                        "{0}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar(
                        "{0}/grad/sparsity".format(v.name),
                        tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            if FLAGS.train_or_restore == 'R':
                MODEL = input(
                    "☛ Please input the checkpoints model you want to restore, "
                    "it should be like(1490175368): "
                )  # The model you want to restore

                while not (MODEL.isdigit() and len(MODEL) == 10):
                    MODEL = input(
                        "✘ The format of your input is illegal, please re-input: "
                    )
                logger.info(
                    "✔︎ The format of your input is legal, now loading to next step..."
                )
                out_dir = os.path.abspath(
                    os.path.join(os.path.curdir, "runs", MODEL))
                logger.info("✔︎ Writing to {0}\n".format(out_dir))
            else:
                timestamp = str(int(time.time()))
                out_dir = os.path.abspath(
                    os.path.join(os.path.curdir, "runs", timestamp))
                logger.info("✔︎ Writing to {0}\n".format(out_dir))

            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            best_checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "bestcheckpoints"))

            # Summaries for loss
            loss_summary = tf.summary.scalar("loss", hmidp.loss)

            # Train summaries
            train_summary_op = tf.summary.merge(
                [loss_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Validation summaries
            validation_summary_op = tf.summary.merge([loss_summary])
            validation_summary_dir = os.path.join(out_dir, "summaries",
                                                  "validation")
            validation_summary_writer = tf.summary.FileWriter(
                validation_summary_dir, sess.graph)

            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=FLAGS.num_checkpoints)
            best_saver = cm.BestCheckpointSaver(save_dir=best_checkpoint_dir,
                                                num_to_keep=3,
                                                maximize=False)

            if FLAGS.train_or_restore == 'R':
                # Load hmidp model
                logger.info("✔︎ Loading model...")
                checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
                logger.info(checkpoint_file)

                # Load the saved meta graph and restore variables
                saver = tf.train.import_meta_graph(
                    "{0}.meta".format(checkpoint_file))
                saver.restore(sess, checkpoint_file)
            else:
                if not os.path.exists(checkpoint_dir):
                    os.makedirs(checkpoint_dir)
                sess.run(tf.global_variables_initializer())
                sess.run(tf.local_variables_initializer())

                # Embedding visualization config
                config = projector.ProjectorConfig()
                embedding_conf = config.embeddings.add()
                embedding_conf.tensor_name = "embedding"
                embedding_conf.metadata_path = FLAGS.metadata_file

                projector.visualize_embeddings(train_summary_writer, config)
                projector.visualize_embeddings(validation_summary_writer,
                                               config)

                # Save the embedding visualization
                saver.save(
                    sess, os.path.join(out_dir, "embedding", "embedding.ckpt"))

            current_step = sess.run(hmidp.global_step)

            def train_step(x_batch_content, x_batch_question, x_batch_option,
                           y_batch):
                """A single training step"""
                feed_dict = {
                    hmidp.input_x_content: x_batch_content,
                    hmidp.input_x_question: x_batch_question,
                    hmidp.input_x_option: x_batch_option,
                    hmidp.input_y: y_batch,
                    hmidp.dropout_keep_prob: FLAGS.dropout_keep_prob,
                    hmidp.is_training: True
                }
                _, step, summaries, loss = sess.run([
                    train_op, hmidp.global_step, train_summary_op, hmidp.loss
                ], feed_dict)
                logger.info("step {0}: loss {1:g}".format(step, loss))
                train_summary_writer.add_summary(summaries, step)

            def validation_step(x_val_content,
                                x_val_question,
                                x_val_option,
                                y_val,
                                writer=None):
                """Evaluates model on a validation set"""
                batches_validation = dh.batch_iter(
                    list(
                        zip(x_val_content, x_val_question, x_val_option,
                            y_val)), FLAGS.batch_size, 1)

                eval_counter, eval_loss = 0, 0.0

                true_labels = []
                predicted_scores = []

                for batch_validation in batches_validation:
                    x_batch_content, x_batch_question, x_batch_option, y_batch = zip(
                        *batch_validation)
                    feed_dict = {
                        hmidp.input_x_content: x_batch_content,
                        hmidp.input_x_question: x_batch_question,
                        hmidp.input_x_option: x_batch_option,
                        hmidp.input_y: y_batch,
                        hmidp.dropout_keep_prob: 1.0,
                        hmidp.is_training: False
                    }
                    step, summaries, scores, cur_loss = sess.run([
                        hmidp.global_step, validation_summary_op, hmidp.scores,
                        hmidp.loss
                    ], feed_dict)

                    # Prepare for calculating metrics
                    for i in y_batch:
                        true_labels.append(i)
                    for j in scores:
                        predicted_scores.append(j)

                    eval_loss = eval_loss + cur_loss
                    eval_counter = eval_counter + 1

                    if writer:
                        writer.add_summary(summaries, step)

                eval_loss = float(eval_loss / eval_counter)

                # Calculate PCC & DOA
                pcc, doa = dh.evaluation(true_labels, predicted_scores)
                # Calculate RMSE
                rmse = mean_squared_error(true_labels, predicted_scores)**0.5

                return eval_loss, pcc, doa, rmse

            # Generate batches
            batches_train = dh.batch_iter(
                list(
                    zip(x_train_content, x_train_question, x_train_option,
                        y_train)), FLAGS.batch_size, FLAGS.num_epochs)

            num_batches_per_epoch = int(
                (len(y_train) - 1) / FLAGS.batch_size) + 1

            # Training loop. For each batch...
            for batch_train in batches_train:
                x_batch_train_content, x_batch_train_question, x_batch_train_option, y_batch_train = zip(
                    *batch_train)
                train_step(x_batch_train_content, x_batch_train_question,
                           x_batch_train_option, y_batch_train)
                current_step = tf.train.global_step(sess, hmidp.global_step)

                if current_step % FLAGS.evaluate_every == 0:
                    logger.info("\nEvaluation:")
                    eval_loss, pcc, doa, rmse = validation_step(
                        x_val_content,
                        x_val_question,
                        x_val_option,
                        y_val,
                        writer=validation_summary_writer)
                    logger.info(
                        "All Validation set: Loss {0:g} | PCC {1:g} | DOA {2:g} | RMSE {3:g}"
                        .format(eval_loss, pcc, doa, rmse))
                    best_saver.handle(rmse, sess, current_step)
                if current_step % FLAGS.checkpoint_every == 0:
                    checkpoint_prefix = os.path.join(checkpoint_dir, "model")
                    path = saver.save(sess,
                                      checkpoint_prefix,
                                      global_step=current_step)
                    logger.info(
                        "✔︎ Saved model checkpoint to {0}\n".format(path))
                if current_step % num_batches_per_epoch == 0:
                    current_epoch = current_step // num_batches_per_epoch
                    logger.info(
                        "✔︎ Epoch {0} has finished!".format(current_epoch))

    logger.info("✔︎ Done.")
Example #22
def test_cnn():
    """Test CNN model."""

    # Load data
    logger.info("✔︎ Loading data...")
    logger.info("Recommended padding Sequence length is: {0}".format(FLAGS.pad_seq_len))

    logger.info("✔︎ Test data processing...")
    test_data = dh.load_data_and_labels(FLAGS.test_data_file, FLAGS.num_classes,
                                        FLAGS.embedding_dim, data_aug_flag=False)

    logger.info("✔︎ Test data padding...")
    x_test, y_test = dh.pad_data(test_data, FLAGS.pad_seq_len)
    y_test_labels = test_data.labels

    # Load cnn model
    BEST_OR_LATEST = input("☛ Load Best or Latest Model?(B/L): ")

    while not (BEST_OR_LATEST.isalpha() and BEST_OR_LATEST.upper() in ['B', 'L']):
        BEST_OR_LATEST = input("✘ The format of your input is illegal, please re-input: ")
    if BEST_OR_LATEST.upper() == 'B':
        logger.info("✔︎ Loading best model...")
        checkpoint_file = cm.get_best_checkpoint(FLAGS.best_checkpoint_dir, select_maximum_value=True)
    else:
        logger.info("✔︎ Loading latest model...")
        checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    logger.info(checkpoint_file)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        session_conf.gpu_options.allow_growth = FLAGS.gpu_options_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph("{0}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
            is_training = graph.get_operation_by_name("is_training").outputs[0]

            # Tensors we want to evaluate
            scores = graph.get_operation_by_name("output/scores").outputs[0]
            loss = graph.get_operation_by_name("loss/loss").outputs[0]

            # Split the output nodes name by '|' if you have several output nodes
            output_node_names = "output/logits|output/scores"

            # Save the .pb model file
            output_graph_def = tf.graph_util.convert_variables_to_constants(sess, sess.graph_def,
                                                                            output_node_names.split("|"))
            tf.train.write_graph(output_graph_def, "graph", "graph-cnn-{0}.pb".format(MODEL), as_text=False)

            # Generate batches for one epoch
            batches = dh.batch_iter(list(zip(x_test, y_test, y_test_labels)), FLAGS.batch_size, 1, shuffle=False)

            # Collect the predictions here
            all_labels = []
            all_predicted_labels = []
            all_predicted_values = []

            # Calculate the metric
            test_counter, test_loss, test_rec, test_pre, test_F = 0, 0.0, 0.0, 0.0, 0.0

            for batch_test in batches:
                x_batch_test, y_batch_test, y_batch_test_labels = zip(*batch_test)
                feed_dict = {
                    input_x: x_batch_test,
                    input_y: y_batch_test,
                    dropout_keep_prob: 1.0,
                    is_training: False
                }
                batch_scores, cur_loss = sess.run([scores, loss], feed_dict)

                # Predict by threshold
                predicted_labels_threshold, predicted_values_threshold = \
                    dh.get_label_using_scores_by_threshold(scores=batch_scores, threshold=FLAGS.threshold)

                cur_rec, cur_pre, cur_F = 0.0, 0.0, 0.0

                for index, predicted_label_threshold in enumerate(predicted_labels_threshold):
                    rec_inc, pre_inc = dh.cal_metric(predicted_label_threshold, y_batch_test[index])
                    cur_rec, cur_pre = cur_rec + rec_inc, cur_pre + pre_inc

                cur_rec = cur_rec / len(y_batch_test)
                cur_pre = cur_pre / len(y_batch_test)

                test_rec, test_pre = test_rec + cur_rec, test_pre + cur_pre

                # Add results to collection
                for item in y_batch_test_labels:
                    all_labels.append(item)
                for item in predicted_labels_threshold:
                    all_predicted_labels.append(item)
                for item in predicted_values_threshold:
                    all_predicted_values.append(item)

                test_loss = test_loss + cur_loss
                test_counter = test_counter + 1

            test_loss = float(test_loss / test_counter)
            test_rec = float(test_rec / test_counter)
            test_pre = float(test_pre / test_counter)
            test_F = dh.cal_F(test_rec, test_pre)

            logger.info("☛ All Test Dataset: Loss {0:g}".format(test_loss))

            # Predict by threshold
            logger.info("☛ Predict by threshold: Recall {0:g}, Precision {1:g}, F {2:g}"
                        .format(test_rec, test_pre, test_F))
            # Save the prediction result
            if not os.path.exists(SAVE_DIR):
                os.makedirs(SAVE_DIR)
            dh.create_prediction_file(output_file=SAVE_DIR + "/predictions.json", data_id=test_data.testid,
                                      all_labels=all_labels, all_predict_labels=all_predicted_labels,
                                      all_predict_values=all_predicted_values)

    logger.info("✔︎ Done.")
Example #23
def test_mann():
    """Test MANN model."""

    # Load data
    logger.info("✔ Loading data...")
    logger.info('Recommended padding Sequence length is: {0}'.format(FLAGS.pad_seq_len))

    logger.info('✔︎ Test data processing...')
    test_data = dh.load_data_and_labels(FLAGS.test_data_file, FLAGS.num_classes, FLAGS.embedding_dim)

    logger.info('✔︎ Test data padding...')
    x_test, y_test = dh.pad_data(test_data, FLAGS.pad_seq_len)

    # Load mann model
    logger.info("✔ Loading model...")
    checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    logger.info(checkpoint_file)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        session_conf.gpu_options.allow_growth = FLAGS.gpu_options_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph("{0}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
            is_training = graph.get_operation_by_name("is_training").outputs[0]

            # Tensors we want to evaluate
            scores = graph.get_operation_by_name("output/scores").outputs[0]
            loss = graph.get_operation_by_name("loss/loss").outputs[0]

            # Split the output nodes name by '|' if you have several output nodes
            output_node_names = 'output/logits|output/scores'

            # Save the .pb model file
            output_graph_def = tf.graph_util.convert_variables_to_constants(sess, sess.graph_def,
                                                                            output_node_names.split("|"))
            tf.train.write_graph(output_graph_def, 'graph', 'graph-mann-{0}.pb'.format(MODEL), as_text=False)

            # Generate batches for one epoch
            batches = dh.batch_iter(list(zip(x_test, y_test)), FLAGS.batch_size, 1, shuffle=False)

            # Collect the predictions here
            all_predicted_label_ts = []
            all_predicted_values_ts = []

            all_predicted_label_tk = []
            all_predicted_values_tk = []

            # Calculate the metric
            test_counter, test_loss, test_rec_ts, test_acc_ts, test_F_ts = 0, 0.0, 0.0, 0.0, 0.0
            test_rec_tk = [0.0] * FLAGS.top_num
            test_acc_tk = [0.0] * FLAGS.top_num
            test_F_tk = [0.0] * FLAGS.top_num

            for batch_test in batches:
                x_batch_test, y_batch_test = zip(*batch_test)
                feed_dict = {
                    input_x: x_batch_test,
                    input_y: y_batch_test,
                    dropout_keep_prob: 1.0,
                    is_training: False
                }
                batch_scores, cur_loss = sess.run([scores, loss], feed_dict)

                # Predict by threshold
                predicted_labels_threshold, predicted_values_threshold = \
                    dh.get_label_using_scores_by_threshold(scores=batch_scores, threshold=FLAGS.threshold)

                cur_rec_ts, cur_acc_ts, cur_F_ts = 0.0, 0.0, 0.0

                for index, predicted_label_threshold in enumerate(predicted_labels_threshold):
                    rec_inc_ts, acc_inc_ts, F_inc_ts = dh.cal_metric(predicted_label_threshold,
                                                                     y_batch_test[index])
                    cur_rec_ts, cur_acc_ts, cur_F_ts = cur_rec_ts + rec_inc_ts, \
                                                       cur_acc_ts + acc_inc_ts, \
                                                       cur_F_ts + F_inc_ts

                cur_rec_ts = cur_rec_ts / len(y_batch_test)
                cur_acc_ts = cur_acc_ts / len(y_batch_test)
                cur_F_ts = cur_F_ts / len(y_batch_test)

                test_rec_ts, test_acc_ts, test_F_ts = test_rec_ts + cur_rec_ts, \
                                                      test_acc_ts + cur_acc_ts, \
                                                      test_F_ts + cur_F_ts

                # Add results to collection
                for item in predicted_labels_threshold:
                    all_predicted_label_ts.append(item)
                for item in predicted_values_threshold:
                    all_predicted_values_ts.append(item)

                # Predict by topK
                topK_predicted_labels = []
                for top_num in range(FLAGS.top_num):
                    predicted_labels_topk, predicted_values_topk = \
                        dh.get_label_using_scores_by_topk(batch_scores, top_num=top_num + 1)
                    topK_predicted_labels.append(predicted_labels_topk)

                cur_rec_tk = [0.0] * FLAGS.top_num
                cur_acc_tk = [0.0] * FLAGS.top_num
                cur_F_tk = [0.0] * FLAGS.top_num

                for top_num, predicted_labels_topK in enumerate(topK_predicted_labels):
                    for index, predicted_label_topK in enumerate(predicted_labels_topK):
                        rec_inc_tk, acc_inc_tk, F_inc_tk = dh.cal_metric(predicted_label_topK,
                                                                         y_batch_test[index])
                        cur_rec_tk[top_num], cur_acc_tk[top_num], cur_F_tk[top_num] = \
                            cur_rec_tk[top_num] + rec_inc_tk, \
                            cur_acc_tk[top_num] + acc_inc_tk, \
                            cur_F_tk[top_num] + F_inc_tk

                    cur_rec_tk[top_num] = cur_rec_tk[top_num] / len(y_batch_test)
                    cur_acc_tk[top_num] = cur_acc_tk[top_num] / len(y_batch_test)
                    cur_F_tk[top_num] = cur_F_tk[top_num] / len(y_batch_test)

                    test_rec_tk[top_num], test_acc_tk[top_num], test_F_tk[top_num] = \
                        test_rec_tk[top_num] + cur_rec_tk[top_num], \
                        test_acc_tk[top_num] + cur_acc_tk[top_num], \
                        test_F_tk[top_num] + cur_F_tk[top_num]

                test_loss = test_loss + cur_loss
                test_counter = test_counter + 1

            test_loss = float(test_loss / test_counter)
            test_rec_ts = float(test_rec_ts / test_counter)
            test_acc_ts = float(test_acc_ts / test_counter)
            test_F_ts = float(test_F_ts / test_counter)

            for top_num in range(FLAGS.top_num):
                test_rec_tk[top_num] = float(test_rec_tk[top_num] / test_counter)
                test_acc_tk[top_num] = float(test_acc_tk[top_num] / test_counter)
                test_F_tk[top_num] = float(test_F_tk[top_num] / test_counter)

            logger.info("☛ All Test Dataset: Loss {0:g}".format(test_loss))

            # Predict by threshold
            logger.info("︎☛ Predict by threshold: Recall {0:g}, accuracy {1:g}, F {2:g}"
                        .format(test_rec_ts, test_acc_ts, test_F_ts))

            # Predict by topK
            logger.info("︎☛ Predict by topK:")
            for top_num in range(FLAGS.top_num):
                logger.info("Top{0}: recall {1:g}, accuracy {2:g}, F {3:g}"
                            .format(top_num + 1, test_rec_tk[top_num], test_acc_tk[top_num], test_F_tk[top_num]))

            # Save the prediction result
            if not os.path.exists(SAVE_DIR):
                os.makedirs(SAVE_DIR)
            dh.create_prediction_file(output_file=SAVE_DIR + '/predictions.json', data_id=test_data.testid,
                                      all_predict_labels_ts=all_predicted_label_ts,
                                      all_predict_values_ts=all_predicted_values_ts)

    logger.info("✔ Done.")
Example #24
            def validation_step(x_batch_front,
                                x_batch_behind,
                                y_batch,
                                writer=None):
                """Evaluates model on a validation set"""
                batches_validation = dh.batch_iter(
                    list(zip(x_batch_front, x_batch_behind, y_batch)),
                    args.batch_size, 1)
                eval_counter, eval_loss = 0, 0.0
                true_labels = []
                predicted_scores = []
                predicted_labels = []

                for batch_validation in batches_validation:
                    x_batch_val_front, x_batch_val_behind, y_batch_val = zip(
                        *batch_validation)
                    feed_dict = {
                        ann.input_x_front: x_batch_val_front,
                        ann.input_x_behind: x_batch_val_behind,
                        ann.input_y: y_batch_val,
                        ann.dropout_keep_prob: 1.0,
                        ann.is_training: False
                    }
                    step, summaries, scores, predictions, cur_loss = sess.run([
                        ann.global_step, validation_summary_op, ann.topKPreds,
                        ann.predictions, ann.loss
                    ], feed_dict)

                    # Prepare for calculating metrics
                    for i in y_batch_val:
                        true_labels.append(np.argmax(i))
                    for j in scores[0]:
                        predicted_scores.append(j[0])
                    for k in predictions:
                        predicted_labels.append(k)

                    eval_loss = eval_loss + cur_loss
                    eval_counter = eval_counter + 1

                    if writer:
                        writer.add_summary(summaries, step)

                eval_loss = float(eval_loss / eval_counter)

                # Calculate Precision & Recall & F1
                eval_acc = accuracy_score(y_true=np.array(true_labels),
                                          y_pred=np.array(predicted_labels))
                eval_pre = precision_score(y_true=np.array(true_labels),
                                           y_pred=np.array(predicted_labels),
                                           average='micro')
                eval_rec = recall_score(y_true=np.array(true_labels),
                                        y_pred=np.array(predicted_labels),
                                        average='micro')
                eval_F1 = f1_score(y_true=np.array(true_labels),
                                   y_pred=np.array(predicted_labels),
                                   average='micro')

                # Calculate the average AUC
                eval_auc = roc_auc_score(y_true=np.array(true_labels),
                                         y_score=np.array(predicted_scores),
                                         average='micro')

                return eval_loss, eval_acc, eval_pre, eval_rec, eval_F1, eval_auc

            def validation_step(x_val, y_val, writer=None):
                """Evaluates model on a validation set"""
                batches_validation = dh.batch_iter(list(zip(x_val, y_val)),
                                                   FLAGS.batch_size, 1)

                # Predict classes by threshold or topk ('ts': threshold; 'tk': topk)
                eval_counter, eval_loss = 0, 0.0

                eval_pre_tk = [0.0] * FLAGS.top_num
                eval_rec_tk = [0.0] * FLAGS.top_num
                eval_F_tk = [0.0] * FLAGS.top_num

                true_onehot_labels = []
                predicted_onehot_scores = []
                predicted_onehot_labels_ts = []
                predicted_onehot_labels_tk = [[] for _ in range(FLAGS.top_num)]

                for batch_validation in batches_validation:
                    x_batch_val, y_batch_val = zip(*batch_validation)
                    feed_dict = {
                        rcnn.input_x: x_batch_val,
                        rcnn.input_y: y_batch_val,
                        rcnn.dropout_keep_prob: 1.0,
                        rcnn.is_training: False
                    }
                    step, summaries, scores, cur_loss = sess.run([
                        rcnn.global_step, validation_summary_op, rcnn.scores,
                        rcnn.loss
                    ], feed_dict)

                    # Prepare for calculating metrics
                    for i in y_batch_val:
                        true_onehot_labels.append(i)
                    for j in scores:
                        predicted_onehot_scores.append(j)

                    # Predict by threshold
                    batch_predicted_onehot_labels_ts = \
                        dh.get_onehot_label_threshold(scores=scores, threshold=FLAGS.threshold)

                    for k in batch_predicted_onehot_labels_ts:
                        predicted_onehot_labels_ts.append(k)

                    # Predict by topK
                    for top_num in range(FLAGS.top_num):
                        batch_predicted_onehot_labels_tk = dh.get_onehot_label_topk(
                            scores=scores, top_num=top_num + 1)

                        for i in batch_predicted_onehot_labels_tk:
                            predicted_onehot_labels_tk[top_num].append(i)

                    eval_loss = eval_loss + cur_loss
                    eval_counter = eval_counter + 1

                    if writer:
                        writer.add_summary(summaries, step)

                eval_loss = float(eval_loss / eval_counter)

                # Calculate Precision & Recall & F1 (threshold & topK)
                eval_pre_ts = precision_score(
                    y_true=np.array(true_onehot_labels),
                    y_pred=np.array(predicted_onehot_labels_ts),
                    average='micro')
                eval_rec_ts = recall_score(
                    y_true=np.array(true_onehot_labels),
                    y_pred=np.array(predicted_onehot_labels_ts),
                    average='micro')
                eval_F_ts = f1_score(
                    y_true=np.array(true_onehot_labels),
                    y_pred=np.array(predicted_onehot_labels_ts),
                    average='micro')

                for top_num in range(FLAGS.top_num):
                    eval_pre_tk[top_num] = precision_score(
                        y_true=np.array(true_onehot_labels),
                        y_pred=np.array(predicted_onehot_labels_tk[top_num]),
                        average='micro')
                    eval_rec_tk[top_num] = recall_score(
                        y_true=np.array(true_onehot_labels),
                        y_pred=np.array(predicted_onehot_labels_tk[top_num]),
                        average='micro')
                    eval_F_tk[top_num] = f1_score(
                        y_true=np.array(true_onehot_labels),
                        y_pred=np.array(predicted_onehot_labels_tk[top_num]),
                        average='micro')

                # Calculate the average AUC
                eval_auc = roc_auc_score(
                    y_true=np.array(true_onehot_labels),
                    y_score=np.array(predicted_onehot_scores),
                    average='micro')
                # Calculate the average PR
                eval_prc = average_precision_score(
                    y_true=np.array(true_onehot_labels),
                    y_score=np.array(predicted_onehot_scores),
                    average='micro')

                return eval_loss, eval_auc, eval_prc, eval_rec_ts, eval_pre_ts, eval_F_ts, \
                       eval_rec_tk, eval_pre_tk, eval_F_tk
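
# Every loop in this file draws mini-batches from dh.batch_iter(data,
# batch_size, num_epochs). Its real implementation lives in the project's
# data_helpers module and is not shown here; the sketch below is only an
# assumption written against the call sites above (a list of tuples in,
# batches of tuples out, optionally reshuffled each epoch):
import random

def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """Yield successive mini-batches over `data` for `num_epochs` epochs (sketch)."""
    data = list(data)
    num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1
    for _ in range(num_epochs):
        epoch_data = data[:]
        if shuffle:
            # Reshuffle at every epoch boundary so batches differ across epochs.
            random.shuffle(epoch_data)
        for batch_num in range(num_batches_per_epoch):
            start = batch_num * batch_size
            yield epoch_data[start:start + batch_size]
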
def train_ann():
    """Training ANN model."""
    # Print parameters used for the model
    dh.tab_printer(args, logger)

    # Load word2vec model
    word2idx, embedding_matrix = dh.load_word2vec_matrix(args.word2vec_file)

    # Load sentences, labels, and training parameters
    logger.info("Loading data...")
    logger.info("Data processing...")
    train_data = dh.load_data_and_labels(args, args.train_file, word2idx)
    val_data = dh.load_data_and_labels(args, args.validation_file, word2idx)

    # Build a graph and ann object
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=args.allow_soft_placement,
            log_device_placement=args.log_device_placement)
        session_conf.gpu_options.allow_growth = args.gpu_options_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            ann = TextANN(sequence_length=args.pad_seq_len,
                          vocab_size=len(word2idx),
                          embedding_type=args.embedding_type,
                          embedding_size=args.embedding_dim,
                          fc_hidden_size=args.fc_dim,
                          num_classes=args.num_classes,
                          l2_reg_lambda=args.l2_lambda,
                          pretrained_embedding=embedding_matrix)

            # Define training procedure
            with tf.control_dependencies(
                    tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
                learning_rate = tf.train.exponential_decay(
                    learning_rate=args.learning_rate,
                    global_step=ann.global_step,
                    decay_steps=args.decay_steps,
                    decay_rate=args.decay_rate,
                    staircase=True)
                optimizer = tf.train.AdamOptimizer(learning_rate)
                grads, vars = zip(*optimizer.compute_gradients(ann.loss))
                grads, _ = tf.clip_by_global_norm(grads,
                                                  clip_norm=args.norm_ratio)
                train_op = optimizer.apply_gradients(
                    zip(grads, vars),
                    global_step=ann.global_step,
                    name="train_op")

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in zip(grads, vars):
                if g is not None:
                    grad_hist_summary = tf.summary.histogram(
                        "{0}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar(
                        "{0}/grad/sparsity".format(v.name),
                        tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            out_dir = dh.get_out_dir(OPTION, logger)
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            best_checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "bestcheckpoints"))

            # Summaries for loss
            loss_summary = tf.summary.scalar("loss", ann.loss)

            # Train summaries
            train_summary_op = tf.summary.merge(
                [loss_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Validation summaries
            validation_summary_op = tf.summary.merge([loss_summary])
            validation_summary_dir = os.path.join(out_dir, "summaries",
                                                  "validation")
            validation_summary_writer = tf.summary.FileWriter(
                validation_summary_dir, sess.graph)

            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=args.num_checkpoints)
            best_saver = cm.BestCheckpointSaver(save_dir=best_checkpoint_dir,
                                                num_to_keep=3,
                                                maximize=True)

            if OPTION == 'R':
                # Load ann model
                logger.info("Loading model...")
                checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
                logger.info(checkpoint_file)

                # Load the saved meta graph and restore variables
                saver = tf.train.import_meta_graph(
                    "{0}.meta".format(checkpoint_file))
                saver.restore(sess, checkpoint_file)
            if OPTION == 'T':
                if not os.path.exists(checkpoint_dir):
                    os.makedirs(checkpoint_dir)
                sess.run(tf.global_variables_initializer())
                sess.run(tf.local_variables_initializer())

                # Embedding visualization config
                config = projector.ProjectorConfig()
                embedding_conf = config.embeddings.add()
                embedding_conf.tensor_name = "embedding"
                embedding_conf.metadata_path = args.metadata_file

                projector.visualize_embeddings(train_summary_writer, config)
                projector.visualize_embeddings(validation_summary_writer,
                                               config)

                # Save the embedding visualization
                saver.save(
                    sess, os.path.join(out_dir, "embedding", "embedding.ckpt"))

            current_step = sess.run(ann.global_step)

            def train_step(batch_data):
                """A single training step"""
                x, y_onehot = zip(*batch_data)

                feed_dict = {
                    ann.input_x: x,
                    ann.input_y: y_onehot,
                    ann.dropout_keep_prob: args.dropout_rate,
                    ann.is_training: True
                }
                _, step, summaries, loss = sess.run(
                    [train_op, ann.global_step, train_summary_op, ann.loss],
                    feed_dict)
                logger.info("step {0}: loss {1:g}".format(step, loss))
                train_summary_writer.add_summary(summaries, step)

            def validation_step(val_loader, writer=None):
                """Evaluates model on a validation set"""
                batches_validation = dh.batch_iter(
                    list(create_input_data(val_loader)), args.batch_size, 1)

                # Predict classes by threshold or topk ('ts': threshold; 'tk': topk)
                eval_counter, eval_loss = 0, 0.0
                eval_pre_tk = [0.0] * args.topK
                eval_rec_tk = [0.0] * args.topK
                eval_F1_tk = [0.0] * args.topK

                true_onehot_labels = []
                predicted_onehot_scores = []
                predicted_onehot_labels_ts = []
                predicted_onehot_labels_tk = [[] for _ in range(args.topK)]

                for batch_validation in batches_validation:
                    x, y_onehot = zip(*batch_validation)
                    feed_dict = {
                        ann.input_x: x,
                        ann.input_y: y_onehot,
                        ann.dropout_keep_prob: 1.0,
                        ann.is_training: False
                    }
                    step, summaries, scores, cur_loss = sess.run([
                        ann.global_step, validation_summary_op, ann.scores,
                        ann.loss
                    ], feed_dict)

                    # Prepare for calculating metrics
                    for i in y_onehot:
                        true_onehot_labels.append(i)
                    for j in scores:
                        predicted_onehot_scores.append(j)

                    # Predict by threshold
                    batch_predicted_onehot_labels_ts = \
                        dh.get_onehot_label_threshold(scores=scores, threshold=args.threshold)
                    for k in batch_predicted_onehot_labels_ts:
                        predicted_onehot_labels_ts.append(k)

                    # Predict by topK
                    for top_num in range(args.topK):
                        batch_predicted_onehot_labels_tk = dh.get_onehot_label_topk(
                            scores=scores, top_num=top_num + 1)
                        for i in batch_predicted_onehot_labels_tk:
                            predicted_onehot_labels_tk[top_num].append(i)

                    eval_loss = eval_loss + cur_loss
                    eval_counter = eval_counter + 1

                    if writer:
                        writer.add_summary(summaries, step)

                eval_loss = float(eval_loss / eval_counter)

                # Calculate Precision & Recall & F1
                eval_pre_ts = precision_score(
                    y_true=np.array(true_onehot_labels),
                    y_pred=np.array(predicted_onehot_labels_ts),
                    average='micro')
                eval_rec_ts = recall_score(
                    y_true=np.array(true_onehot_labels),
                    y_pred=np.array(predicted_onehot_labels_ts),
                    average='micro')
                eval_F1_ts = f1_score(
                    y_true=np.array(true_onehot_labels),
                    y_pred=np.array(predicted_onehot_labels_ts),
                    average='micro')

                for top_num in range(args.topK):
                    eval_pre_tk[top_num] = precision_score(
                        y_true=np.array(true_onehot_labels),
                        y_pred=np.array(predicted_onehot_labels_tk[top_num]),
                        average='micro')
                    eval_rec_tk[top_num] = recall_score(
                        y_true=np.array(true_onehot_labels),
                        y_pred=np.array(predicted_onehot_labels_tk[top_num]),
                        average='micro')
                    eval_F1_tk[top_num] = f1_score(
                        y_true=np.array(true_onehot_labels),
                        y_pred=np.array(predicted_onehot_labels_tk[top_num]),
                        average='micro')

                # Calculate the average AUC
                eval_auc = roc_auc_score(
                    y_true=np.array(true_onehot_labels),
                    y_score=np.array(predicted_onehot_scores),
                    average='micro')
                # Calculate the average PR
                eval_prc = average_precision_score(
                    y_true=np.array(true_onehot_labels),
                    y_score=np.array(predicted_onehot_scores),
                    average='micro')

                return eval_loss, eval_auc, eval_prc, eval_pre_ts, eval_rec_ts, eval_F1_ts, \
                       eval_pre_tk, eval_rec_tk, eval_F1_tk

            # Generate batches
            batches_train = dh.batch_iter(list(create_input_data(train_data)),
                                          args.batch_size, args.epochs)
            num_batches_per_epoch = int(
                (len(train_data['pad_seqs']) - 1) / args.batch_size) + 1

            # Training loop. For each batch...
            for batch_train in batches_train:
                train_step(batch_train)
                current_step = tf.train.global_step(sess, ann.global_step)

                if current_step % args.evaluate_steps == 0:
                    logger.info("\nEvaluation:")
                    eval_loss, eval_auc, eval_prc, \
                    eval_pre_ts, eval_rec_ts, eval_F1_ts, eval_pre_tk, eval_rec_tk, eval_F1_tk = \
                        validation_step(val_data, writer=validation_summary_writer)
                    logger.info(
                        "All Validation set: Loss {0:g} | AUC {1:g} | AUPRC {2:g}"
                        .format(eval_loss, eval_auc, eval_prc))
                    # Predict by threshold
                    logger.info(
                        "Predict by threshold: Precision {0:g}, Recall {1:g}, F1 {2:g}"
                        .format(eval_pre_ts, eval_rec_ts, eval_F1_ts))
                    # Predict by topK
                    logger.info("Predict by topK:")
                    for top_num in range(args.topK):
                        logger.info(
                            "Top{0}: Precision {1:g}, Recall {2:g}, F1 {3:g}".
                            format(top_num + 1, eval_pre_tk[top_num],
                                   eval_rec_tk[top_num], eval_F1_tk[top_num]))
                    best_saver.handle(eval_prc, sess, current_step)
                if current_step % args.checkpoint_steps == 0:
                    checkpoint_prefix = os.path.join(checkpoint_dir, "model")
                    path = saver.save(sess,
                                      checkpoint_prefix,
                                      global_step=current_step)
                    logger.info("Saved model checkpoint to {0}\n".format(path))
                if current_step % num_batches_per_epoch == 0:
                    current_epoch = current_step // num_batches_per_epoch
                    logger.info(
                        "Epoch {0} has finished!".format(current_epoch))

    logger.info("All Done.")
def train_abcnn():
    """Training ABCNN model."""
    # Print parameters used for the model
    dh.tab_printer(args, logger)

    # Load sentences, labels, and training parameters
    logger.info("Loading data...")
    logger.info("Data processing...")
    train_data = dh.load_data_and_labels(args.train_file, args.word2vec_file)
    validation_data = dh.load_data_and_labels(args.validation_file, args.word2vec_file)

    logger.info("Data padding...")
    x_train_front, x_train_behind, y_train = dh.pad_data(train_data, args.pad_seq_len)
    x_validation_front, x_validation_behind, y_validation = dh.pad_data(validation_data, args.pad_seq_len)

    # Build vocabulary
    VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix = dh.load_word2vec_matrix(args.word2vec_file)

    # Build a graph and abcnn object
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=args.allow_soft_placement,
            log_device_placement=args.log_device_placement)
        session_conf.gpu_options.allow_growth = args.gpu_options_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            abcnn = TextABCNN(
                sequence_length=args.pad_seq_len,
                vocab_size=VOCAB_SIZE,
                embedding_type=args.embedding_type,
                embedding_size=EMBEDDING_SIZE,
                filter_sizes=args.filter_sizes,
                num_filters=args.num_filters,
                fc_hidden_size=args.fc_dim,
                num_classes=y_train.shape[1],
                l2_reg_lambda=args.l2_lambda,
                pretrained_embedding=pretrained_word2vec_matrix)

            # Define training procedure
            with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
                learning_rate = tf.train.exponential_decay(learning_rate=args.learning_rate,
                                                           global_step=abcnn.global_step, decay_steps=args.decay_steps,
                                                           decay_rate=args.decay_rate, staircase=True)
                optimizer = tf.train.AdamOptimizer(learning_rate)
                grads, vars = zip(*optimizer.compute_gradients(abcnn.loss))
                grads, _ = tf.clip_by_global_norm(grads, clip_norm=args.norm_ratio)
                train_op = optimizer.apply_gradients(zip(grads, vars), global_step=abcnn.global_step, name="train_op")

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in zip(grads, vars):
                if g is not None:
                    grad_hist_summary = tf.summary.histogram("{0}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar("{0}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            out_dir = dh.get_out_dir(OPTION, logger)
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            best_checkpoint_dir = os.path.abspath(os.path.join(out_dir, "bestcheckpoints"))

            # Summaries for loss
            loss_summary = tf.summary.scalar("loss", abcnn.loss)

            # Train summaries
            train_summary_op = tf.summary.merge([loss_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

            # Validation summaries
            validation_summary_op = tf.summary.merge([loss_summary])
            validation_summary_dir = os.path.join(out_dir, "summaries", "validation")
            validation_summary_writer = tf.summary.FileWriter(validation_summary_dir, sess.graph)

            saver = tf.train.Saver(tf.global_variables(), max_to_keep=args.num_checkpoints)
            best_saver = cm.BestCheckpointSaver(save_dir=best_checkpoint_dir, num_to_keep=3, maximize=True)

            if OPTION == 'R':
                # Load abcnn model
                logger.info("Loading model...")
                checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
                logger.info(checkpoint_file)

                # Load the saved meta graph and restore variables
                saver = tf.train.import_meta_graph("{0}.meta".format(checkpoint_file))
                saver.restore(sess, checkpoint_file)
            if OPTION == 'T':
                if not os.path.exists(checkpoint_dir):
                    os.makedirs(checkpoint_dir)
                sess.run(tf.global_variables_initializer())
                sess.run(tf.local_variables_initializer())

                # Embedding visualization config
                config = projector.ProjectorConfig()
                embedding_conf = config.embeddings.add()
                embedding_conf.tensor_name = "embedding"
                embedding_conf.metadata_path = args.metadata_file

                projector.visualize_embeddings(train_summary_writer, config)
                projector.visualize_embeddings(validation_summary_writer, config)

                # Save the embedding visualization
                saver.save(sess, os.path.join(out_dir, "embedding", "embedding.ckpt"))

            current_step = sess.run(abcnn.global_step)

            def train_step(x_batch_front, x_batch_behind, y_batch):
                """A single training step"""
                feed_dict = {
                    abcnn.input_x_front: x_batch_front,
                    abcnn.input_x_behind: x_batch_behind,
                    abcnn.input_y: y_batch,
                    abcnn.dropout_keep_prob: args.dropout_rate,
                    abcnn.is_training: True
                }
                _, step, summaries, loss = sess.run(
                    [train_op, abcnn.global_step, train_summary_op, abcnn.loss], feed_dict)
                logger.info("step {0}: loss {1:g}".format(step, loss))
                train_summary_writer.add_summary(summaries, step)

            def validation_step(x_batch_front, x_batch_behind, y_batch, writer=None):
                """Evaluates model on a validation set"""
                batches_validation = dh.batch_iter(list(zip(x_batch_front, x_batch_behind, y_batch)),
                                                   args.batch_size, 1)
                eval_counter, eval_loss = 0, 0.0
                true_labels = []
                predicted_scores = []
                predicted_labels = []

                for batch_validation in batches_validation:
                    x_batch_val_front, x_batch_val_behind, y_batch_val = zip(*batch_validation)
                    feed_dict = {
                        abcnn.input_x_front: x_batch_val_front,
                        abcnn.input_x_behind: x_batch_val_behind,
                        abcnn.input_y: y_batch_val,
                        abcnn.dropout_keep_prob: 1.0,
                        abcnn.is_training: False
                    }
                    step, summaries, scores, predictions, cur_loss = sess.run(
                        [abcnn.global_step, validation_summary_op,
                         abcnn.topKPreds, abcnn.predictions, abcnn.loss], feed_dict)

                    # Prepare for calculating metrics
                    for i in y_batch_val:
                        true_labels.append(np.argmax(i))
                    for j in scores[0]:
                        predicted_scores.append(j[0])
                    for k in predictions:
                        predicted_labels.append(k)

                    eval_loss = eval_loss + cur_loss
                    eval_counter = eval_counter + 1

                    if writer:
                        writer.add_summary(summaries, step)

                eval_loss = float(eval_loss / eval_counter)

                # Calculate Precision & Recall & F1
                eval_acc = accuracy_score(y_true=np.array(true_labels), y_pred=np.array(predicted_labels))
                eval_pre = precision_score(y_true=np.array(true_labels),
                                           y_pred=np.array(predicted_labels), average='micro')
                eval_rec = recall_score(y_true=np.array(true_labels),
                                        y_pred=np.array(predicted_labels), average='micro')
                eval_F1 = f1_score(y_true=np.array(true_labels),
                                   y_pred=np.array(predicted_labels), average='micro')

                # Calculate the average AUC
                eval_auc = roc_auc_score(y_true=np.array(true_labels),
                                         y_score=np.array(predicted_scores), average='micro')

                return eval_loss, eval_acc, eval_pre, eval_rec, eval_F1, eval_auc

            # Generate batches
            batches_train = dh.batch_iter(
                list(zip(x_train_front, x_train_behind, y_train)), args.batch_size, args.epochs)

            num_batches_per_epoch = int((len(x_train_front) - 1) / args.batch_size) + 1

            # Training loop. For each batch...
            for batch_train in batches_train:
                x_batch_front, x_batch_behind, y_batch = zip(*batch_train)
                train_step(x_batch_front, x_batch_behind, y_batch)
                current_step = tf.train.global_step(sess, abcnn.global_step)

                if current_step % args.evaluate_steps == 0:
                    logger.info("\nEvaluation:")
                    eval_loss, eval_acc, eval_pre, eval_rec, eval_F1, eval_auc = \
                        validation_step(x_validation_front, x_validation_behind,
                                        y_validation, writer=validation_summary_writer)
                    logger.info("All Validation set: Loss {0:g} | Acc {1:g} | Precision {2:g} | "
                                "Recall {3:g} | F1 {4:g} | AUC {5:g}"
                                .format(eval_loss, eval_acc, eval_pre, eval_rec, eval_F1, eval_auc))
                    best_saver.handle(eval_acc, sess, current_step)
                if current_step % args.checkpoint_steps == 0:
                    checkpoint_prefix = os.path.join(checkpoint_dir, "model")
                    path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                    logger.info("Saved model checkpoint to {0}\n".format(path))
                if current_step % num_batches_per_epoch == 0:
                    current_epoch = current_step // num_batches_per_epoch
                    logger.info("Epoch {0} has finished!".format(current_epoch))

    logger.info("All Done.")
Example #28
            def validation_step(x_validation, y_validation, writer=None):
                """Evaluates model on a validation set"""
                batches_validation = dh.batch_iter(
                    list(zip(x_validation, y_validation)), FLAGS.batch_size, 1)

                # Predict classes by threshold or topk ('ts': threshold; 'tk': topk)
                eval_counter, eval_loss, eval_rec_ts, eval_pre_ts, eval_F_ts = 0, 0.0, 0.0, 0.0, 0.0
                eval_rec_tk = [0.0] * FLAGS.top_num
                eval_pre_tk = [0.0] * FLAGS.top_num
                eval_F_tk = [0.0] * FLAGS.top_num

                for batch_validation in batches_validation:
                    x_batch_validation, y_batch_validation = zip(
                        *batch_validation)
                    feed_dict = {
                        rcnn.input_x: x_batch_validation,
                        rcnn.input_y: y_batch_validation,
                        rcnn.dropout_keep_prob: 1.0,
                        rcnn.is_training: False
                    }
                    step, summaries, scores, cur_loss = sess.run([
                        rcnn.global_step, validation_summary_op, rcnn.scores,
                        rcnn.loss
                    ], feed_dict)

                    # Predict by threshold
                    predicted_labels_threshold, predicted_values_threshold = \
                        dh.get_label_using_scores_by_threshold(scores=scores, threshold=FLAGS.threshold)

                    cur_rec_ts, cur_pre_ts, cur_F_ts = 0.0, 0.0, 0.0

                    for index, predicted_label_threshold in enumerate(
                            predicted_labels_threshold):
                        rec_inc_ts, pre_inc_ts = dh.cal_metric(
                            predicted_label_threshold,
                            y_batch_validation[index])
                        cur_rec_ts, cur_pre_ts = cur_rec_ts + rec_inc_ts, cur_pre_ts + pre_inc_ts

                    cur_rec_ts = cur_rec_ts / len(y_batch_validation)
                    cur_pre_ts = cur_pre_ts / len(y_batch_validation)

                    cur_F_ts = dh.cal_F(cur_rec_ts, cur_pre_ts)

                    eval_rec_ts, eval_pre_ts = eval_rec_ts + cur_rec_ts, eval_pre_ts + cur_pre_ts

                    # Predict by topK
                    topK_predicted_labels = []
                    for top_num in range(FLAGS.top_num):
                        predicted_labels_topk, predicted_values_topk = \
                            dh.get_label_using_scores_by_topk(scores=scores, top_num=top_num+1)
                        topK_predicted_labels.append(predicted_labels_topk)

                    cur_rec_tk = [0.0] * FLAGS.top_num
                    cur_pre_tk = [0.0] * FLAGS.top_num
                    cur_F_tk = [0.0] * FLAGS.top_num

                    for top_num, predicted_labels_topK in enumerate(
                            topK_predicted_labels):
                        for index, predicted_label_topK in enumerate(
                                predicted_labels_topK):
                            rec_inc_tk, pre_inc_tk = dh.cal_metric(
                                predicted_label_topK,
                                y_batch_validation[index])
                            cur_rec_tk[top_num], cur_pre_tk[top_num] = \
                                cur_rec_tk[top_num] + rec_inc_tk, cur_pre_tk[top_num] + pre_inc_tk

                        cur_rec_tk[top_num] = cur_rec_tk[top_num] / len(
                            y_batch_validation)
                        cur_pre_tk[top_num] = cur_pre_tk[top_num] / len(
                            y_batch_validation)

                        cur_F_tk[top_num] = dh.cal_F(cur_rec_tk[top_num],
                                                     cur_pre_tk[top_num])

                        eval_rec_tk[top_num], eval_pre_tk[top_num] = \
                            eval_rec_tk[top_num] + cur_rec_tk[top_num], eval_pre_tk[top_num] + cur_pre_tk[top_num]

                    eval_loss = eval_loss + cur_loss
                    eval_counter = eval_counter + 1

                    if writer:
                        writer.add_summary(summaries, step)

                eval_loss = float(eval_loss / eval_counter)
                eval_rec_ts = float(eval_rec_ts / eval_counter)
                eval_pre_ts = float(eval_pre_ts / eval_counter)
                eval_F_ts = dh.cal_F(eval_rec_ts, eval_pre_ts)

                for top_num in range(FLAGS.top_num):
                    eval_rec_tk[top_num] = float(eval_rec_tk[top_num] /
                                                 eval_counter)
                    eval_pre_tk[top_num] = float(eval_pre_tk[top_num] /
                                                 eval_counter)
                    eval_F_tk[top_num] = dh.cal_F(eval_rec_tk[top_num],
                                                  eval_pre_tk[top_num])

                return eval_loss, eval_rec_ts, eval_pre_ts, eval_F_ts, eval_rec_tk, eval_pre_tk, eval_F_tk
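
# The validation_step above accumulates per-sample recall/precision through
# dh.cal_metric and turns them into F1 with dh.cal_F. Both helpers come from
# the project's data_helpers module; a minimal sketch consistent with the call
# sites (predicted labels as a list of class indices, true labels as a
# multi-hot vector) might be:
def cal_metric(predicted_labels, true_onehot):
    """Per-sample recall and precision for a multi-label prediction (sketch)."""
    true_labels = [i for i, v in enumerate(true_onehot) if int(v) == 1]
    hits = len(set(predicted_labels) & set(true_labels))
    rec = hits / len(true_labels) if true_labels else 0.0
    pre = hits / len(predicted_labels) if predicted_labels else 0.0
    return rec, pre

def cal_F(rec, pre):
    """Harmonic mean of recall and precision (F1); 0.0 when both are 0 (sketch)."""
    return 2.0 * rec * pre / (rec + pre) if (rec + pre) > 0 else 0.0
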
Example #29
def train_ann():
    """Training ANN model."""

    # Load sentences, labels, and training parameters
    logger.info('✔︎ Loading data...')

    logger.info('✔︎ Training data processing...')
    train_data = dh.load_data_and_labels(FLAGS.training_data_file,
                                         FLAGS.embedding_dim)

    logger.info('✔︎ Validation data processing...')
    validation_data = dh.load_data_and_labels(FLAGS.validation_data_file,
                                              FLAGS.embedding_dim)

    logger.info('Recommended padding Sequence length is: {0}'.format(
        FLAGS.pad_seq_len))

    logger.info('✔︎ Training data padding...')
    x_train_front, x_train_behind, y_train = dh.pad_data(
        train_data, FLAGS.pad_seq_len)

    logger.info('✔︎ Validation data padding...')
    x_validation_front, x_validation_behind, y_validation = dh.pad_data(
        validation_data, FLAGS.pad_seq_len)

    # Build vocabulary
    VOCAB_SIZE = dh.load_vocab_size(FLAGS.embedding_dim)
    pretrained_word2vec_matrix = dh.load_word2vec_matrix(
        VOCAB_SIZE, FLAGS.embedding_dim)

    # Build a graph and ann object
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        session_conf.gpu_options.allow_growth = FLAGS.gpu_options_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            ann = TextANN(sequence_length=FLAGS.pad_seq_len,
                          num_classes=y_train.shape[1],
                          vocab_size=VOCAB_SIZE,
                          fc_hidden_size=FLAGS.fc_hidden_size,
                          embedding_size=FLAGS.embedding_dim,
                          embedding_type=FLAGS.embedding_type,
                          l2_reg_lambda=FLAGS.l2_reg_lambda,
                          pretrained_embedding=pretrained_word2vec_matrix)

            # Define training procedure
            with tf.control_dependencies(
                    tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
                learning_rate = tf.train.exponential_decay(
                    learning_rate=FLAGS.learning_rate,
                    global_step=ann.global_step,
                    decay_steps=FLAGS.decay_steps,
                    decay_rate=FLAGS.decay_rate,
                    staircase=True)
                optimizer = tf.train.AdamOptimizer(learning_rate)
                grads, vars = zip(*optimizer.compute_gradients(ann.loss))
                grads, _ = tf.clip_by_global_norm(grads,
                                                  clip_norm=FLAGS.norm_ratio)
                train_op = optimizer.apply_gradients(
                    zip(grads, vars),
                    global_step=ann.global_step,
                    name="train_op")

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in zip(grads, vars):
                if g is not None:
                    grad_hist_summary = tf.summary.histogram(
                        "{0}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar(
                        "{0}/grad/sparsity".format(v.name),
                        tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            if FLAGS.train_or_restore == 'R':
                MODEL = input(
                    "☛ Please input the checkpoints model you want to restore, "
                    "it should look like 1490175368: "
                )  # The model you want to restore

                while not (MODEL.isdigit() and len(MODEL) == 10):
                    MODEL = input(
                        '✘ The format of your input is illegal, please re-input: '
                    )
                logger.info(
                    '✔︎ The format of your input is legal, now loading to next step...'
                )

                checkpoint_dir = 'runs/' + MODEL + '/checkpoints/'

                out_dir = os.path.abspath(
                    os.path.join(os.path.curdir, "runs", MODEL))
                logger.info("✔︎ Writing to {0}\n".format(out_dir))
            else:
                timestamp = str(int(time.time()))
                out_dir = os.path.abspath(
                    os.path.join(os.path.curdir, "runs", timestamp))
                logger.info("✔︎ Writing to {0}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", ann.loss)
            acc_summary = tf.summary.scalar("accuracy", ann.accuracy)

            # Train summaries
            train_summary_op = tf.summary.merge(
                [loss_summary, acc_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Validation summaries
            validation_summary_op = tf.summary.merge(
                [loss_summary, acc_summary])
            validation_summary_dir = os.path.join(out_dir, "summaries",
                                                  "validation")
            validation_summary_writer = tf.summary.FileWriter(
                validation_summary_dir, sess.graph)

            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=FLAGS.num_checkpoints)

            if FLAGS.train_or_restore == 'R':
                # Load ann model
                logger.info("✔ Loading model...")
                checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
                logger.info(checkpoint_file)

                # Load the saved meta graph and restore variables
                saver = tf.train.import_meta_graph(
                    "{0}.meta".format(checkpoint_file))
                saver.restore(sess, checkpoint_file)
            else:
                checkpoint_dir = os.path.abspath(
                    os.path.join(out_dir, "checkpoints"))
                if not os.path.exists(checkpoint_dir):
                    os.makedirs(checkpoint_dir)
                sess.run(tf.global_variables_initializer())
                sess.run(tf.local_variables_initializer())

                # Embedding visualization config
                config = projector.ProjectorConfig()
                embedding_conf = config.embeddings.add()
                embedding_conf.tensor_name = 'embedding'
                embedding_conf.metadata_path = FLAGS.metadata_file

                projector.visualize_embeddings(train_summary_writer, config)
                projector.visualize_embeddings(validation_summary_writer,
                                               config)

                # Save the embedding visualization
                saver.save(
                    sess, os.path.join(out_dir, 'embedding', 'embedding.ckpt'))

            current_step = sess.run(ann.global_step)

            def train_step(x_batch_front, x_batch_behind, y_batch):
                """A single training step"""
                feed_dict = {
                    ann.input_x_front: x_batch_front,
                    ann.input_x_behind: x_batch_behind,
                    ann.input_y: y_batch,
                    ann.dropout_keep_prob: FLAGS.dropout_keep_prob,
                    ann.is_training: True
                }
                _, step, summaries, loss, accuracy = sess.run([
                    train_op, ann.global_step, train_summary_op, ann.loss,
                    ann.accuracy
                ], feed_dict)
                logger.info("step {0}: loss {1:g}, acc {2:g}".format(
                    step, loss, accuracy))
                train_summary_writer.add_summary(summaries, step)

            def validation_step(x_batch_front,
                                x_batch_behind,
                                y_batch,
                                writer=None):
                """Evaluates model on a validation set"""
                feed_dict = {
                    ann.input_x_front: x_batch_front,
                    ann.input_x_behind: x_batch_behind,
                    ann.input_y: y_batch,
                    ann.dropout_keep_prob: 1.0,
                    ann.is_training: False
                }
                step, summaries, loss, accuracy, recall, precision, f1, auc = sess.run(
                    [
                        ann.global_step, validation_summary_op, ann.loss,
                        ann.accuracy, ann.recall, ann.precision, ann.F1,
                        ann.AUC
                    ], feed_dict)
                logger.info(
                    "step {0}: loss {1:g}, acc {2:g}, recall {3:g}, precision {4:g}, f1 {5:g}, AUC {6}"
                    .format(step, loss, accuracy, recall, precision, f1, auc))
                if writer:
                    writer.add_summary(summaries, step)

            # Generate batches
            batches = dh.batch_iter(
                list(zip(x_train_front, x_train_behind, y_train)),
                FLAGS.batch_size, FLAGS.num_epochs)

            num_batches_per_epoch = int(
                (len(x_train_front) - 1) / FLAGS.batch_size) + 1

            # Training loop. For each batch...
            for batch in batches:
                x_batch_front, x_batch_behind, y_batch = zip(*batch)
                train_step(x_batch_front, x_batch_behind, y_batch)
                current_step = tf.train.global_step(sess, ann.global_step)

                if current_step % FLAGS.evaluate_every == 0:
                    logger.info("\nEvaluation:")
                    validation_step(x_validation_front,
                                    x_validation_behind,
                                    y_validation,
                                    writer=validation_summary_writer)
                if current_step % FLAGS.checkpoint_every == 0:
                    checkpoint_prefix = os.path.join(checkpoint_dir, "model")
                    path = saver.save(sess,
                                      checkpoint_prefix,
                                      global_step=current_step)
                    logger.info(
                        "✔︎ Saved model checkpoint to {0}\n".format(path))
                if current_step % num_batches_per_epoch == 0:
                    current_epoch = current_step // num_batches_per_epoch
                    logger.info(
                        "✔︎ Epoch {0} has finished!".format(current_epoch))

    logger.info("✔︎ Done.")
Example #30
def visualize():
    """Visualize HARNN model."""

    # Load data
    logger.info("Loading data...")
    logger.info("Data processing...")
    test_data = dh.load_data_and_labels(args.test_file,
                                        args.num_classes_list,
                                        args.total_classes,
                                        args.word2vec_file,
                                        data_aug_flag=False)

    logger.info("Data padding...")
    x_test, y_test, y_test_tuple = dh.pad_data(test_data, args.pad_seq_len)
    x_test_content, y_test_labels = test_data.abstract_content, test_data.labels

    # Load harnn model
    OPTION = dh._option(pattern=1)
    if OPTION == 'B':
        logger.info("Loading best model...")
        checkpoint_file = cm.get_best_checkpoint(BEST_CPT_DIR,
                                                 select_maximum_value=True)
    else:
        logger.info("Loading latest model...")
        checkpoint_file = tf.train.latest_checkpoint(CPT_DIR)
    logger.info(checkpoint_file)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=args.allow_soft_placement,
            log_device_placement=args.log_device_placement)
        session_conf.gpu_options.allow_growth = args.gpu_options_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph(
                "{0}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            input_y_first = graph.get_operation_by_name(
                "input_y_first").outputs[0]
            input_y_second = graph.get_operation_by_name(
                "input_y_second").outputs[0]
            input_y_third = graph.get_operation_by_name(
                "input_y_third").outputs[0]
            input_y_fourth = graph.get_operation_by_name(
                "input_y_fourth").outputs[0]
            input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "dropout_keep_prob").outputs[0]
            alpha = graph.get_operation_by_name("alpha").outputs[0]
            is_training = graph.get_operation_by_name("is_training").outputs[0]

            # Tensors we want to evaluate
            first_visual = graph.get_operation_by_name(
                "first-output/visual").outputs[0]
            second_visual = graph.get_operation_by_name(
                "second-output/visual").outputs[0]
            third_visual = graph.get_operation_by_name(
                "third-output/visual").outputs[0]
            fourth_visual = graph.get_operation_by_name(
                "fourth-output/visual").outputs[0]
            scores = graph.get_operation_by_name("output/scores").outputs[0]

            # Split the output nodes name by '|' if you have several output nodes
            output_node_names = "first-output/visual|second-output/visual|third-output/visual|fourth-output/visual|output/scores"

            # Save the .pb model file
            output_graph_def = tf.graph_util.convert_variables_to_constants(
                sess, sess.graph_def, output_node_names.split("|"))
            tf.train.write_graph(output_graph_def,
                                 "graph",
                                 "graph-harnn-{0}.pb".format(MODEL),
                                 as_text=False)

            # Generate batches for one epoch
            batches = dh.batch_iter(list(
                zip(x_test, y_test, y_test_tuple, x_test_content,
                    y_test_labels)),
                                    args.batch_size,
                                    1,
                                    shuffle=False)

            for batch_test in batches:
                x_batch_test, y_batch_test, y_batch_test_tuple, \
                x_batch_test_content, y_batch_test_labels = zip(*batch_test)

                y_batch_test_first = [i[0] for i in y_batch_test_tuple]
                y_batch_test_second = [j[1] for j in y_batch_test_tuple]
                y_batch_test_third = [k[2] for k in y_batch_test_tuple]
                y_batch_test_fourth = [t[3] for t in y_batch_test_tuple]

                feed_dict = {
                    input_x: x_batch_test,
                    input_y_first: y_batch_test_first,
                    input_y_second: y_batch_test_second,
                    input_y_third: y_batch_test_third,
                    input_y_fourth: y_batch_test_fourth,
                    input_y: y_batch_test,
                    dropout_keep_prob: 1.0,
                    alpha: args.alpha,
                    is_training: False
                }
                batch_first_visual, batch_second_visual, batch_third_visual, batch_fourth_visual = \
                    sess.run([first_visual, second_visual, third_visual, fourth_visual], feed_dict)

                seq_len = len(x_batch_test_content[0])
                pad_len = len(batch_first_visual[0])
                # The attention outputs are padded, so clip to the shorter of
                # the real sequence length and the padded length.
                length = min(seq_len, pad_len)
                final_first_visual = normalization(
                    batch_first_visual[0].tolist(), length)
                final_second_visual = normalization(
                    batch_second_visual[0].tolist(), length)
                final_third_visual = normalization(
                    batch_third_visual[0].tolist(), length)
                final_fourth_visual = normalization(
                    batch_fourth_visual[0].tolist(), length)

                visual_list = [
                    final_first_visual, final_second_visual,
                    final_third_visual, final_fourth_visual
                ]
                print(visual_list)

                # Render one heat-map paragraph per hierarchy level; the rgba
                # alpha channel encodes the normalized attention weight of
                # each word.
                with open('attention.html', 'w') as f:
                    f.write(
                        '<html style="margin:0;padding:0;"><body style="margin:0;padding:0;">\n'
                    )
                    f.write('<div style="margin:25px;">\n')
                    for k in range(len(visual_list)):
                        f.write('<p style="margin:10px;">\n')
                        # Iterate up to `length` (not seq_len) so indexing the
                        # normalized visual lists stays in bounds when the
                        # sequence is longer than the padded attention output.
                        for i in range(length):
                            alpha = "{:.2f}".format(visual_list[k][i])
                            word = x_batch_test_content[0][i]
                            f.write(
                                '\t<span style="margin-left:3px;background-color:rgba(255,0,0,{0})">{1}</span>\n'
                                .format(alpha, word))
                        f.write('</p>\n')
                    f.write('</div>\n')
                    f.write('</body></html>')

    logger.info("Done.")