Example no. 1
0
def prepare():
    """Build and save the vocabulary, then split the merged data set.

    Fits a ``VocabularyProcessor`` on both question columns of
    ``args.merged_files``, saves it to ``<save_dir>/vocab`` and calls
    ``split_data`` to produce the pickle files at ``args.pkl_files``.
    """
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    print('Vocab processing ...')
    q1, q2, y = get_q2q_label(args.merged_files)
    start_time = time.time()
    vocab_processor = tc.learn.preprocessing.VocabularyProcessor(
        max_document_length=args.max_q_len,
        min_frequency=5,  # drop tokens seen fewer than 5 times
        tokenizer_fn=chinese_tokenizer)

    # fit_transform is run only for its side effect of building the
    # vocabulary: the padded id arrays were previously materialized with
    # np.array(list(...)) and deleted immediately, so just drain the
    # generators without allocating the arrays.
    for _ in vocab_processor.fit_transform(q1):
        pass
    for _ in vocab_processor.fit_transform(q2):
        pass

    del q1, q2, y  # release the raw text before the split step

    print('Vocab size: {}'.format(len(vocab_processor.vocabulary_)))
    vocab_processor.save(os.path.join(args.save_dir, "vocab"))

    # Split without masking (pass mask=True to split_data to enable it).
    split_data(args.merged_files, os.path.join(args.save_dir, "vocab"),
               args.pkl_files)

    time_dif = get_time_dif(start_time)
    print('Vocab processing time usage:', time_dif)
Example no. 2
0
def predict():
    """Restore the trained MVLSTM model and evaluate it on the test set.

    Prints test loss/accuracy, a classification report and a confusion
    matrix, and writes per-example prediction probabilities to
    ``<save_dir>/predict_prob_csv``.
    """
    print('Loading test data ...')
    start_time = time.time()
    q1_test, q2_test, y_test = get_q2q_label(args.test_data_files)

    vocab_path = os.path.join(args.save_dir, 'vocab')
    vocab_processor = tc.learn.preprocessing.VocabularyProcessor.restore(
        vocab_path)

    # MVLSTM model init
    model = MVLSTM(sequence_length=args.max_q_len,
                   num_classes=args.num_classes,
                   embedding_dim=args.embedding_dim,
                   vocab_size=len(vocab_processor.vocabulary_),
                   max_length=args.max_q_len,
                   hidden_dim=args.hidden_size,
                   learning_rate=args.learning_rate)

    # transform (not fit_transform): map test text onto the ids the model
    # was trained with.
    q1_pad = np.array(list(vocab_processor.transform(q1_test)))
    q2_pad = np.array(list(vocab_processor.transform(q2_test)))

    # Context manager guarantees the session (and its GPU/host memory) is
    # released even if evaluation raises.
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        # NOTE(review): `save_path` is a module-level name defined elsewhere
        # in this file; confirm it points at the intended checkpoint.
        saver.restore(session, save_path=save_path)

        print('Testing ...')
        loss_test, acc_test = evaluate(q1_pad,
                                       q2_pad,
                                       y_test,
                                       session,
                                       model=model)
        print('Test loss:{0:>6.2}, Test acc:{1:7.2%}'.format(
            loss_test, acc_test))

        test_batches = batch_iter_per_epoch(q1_pad, q2_pad, y_test,
                                            shuffle=False)
        pred_batches = []  # per-batch predicted labels
        prob_batches = []  # per-batch class probabilities
        for q1_test_batch, q2_test_batch, y_test_batch in test_batches:
            batch_predictions, batch_predict_probs = session.run(
                [model.y_pred, model.probs],
                feed_dict={
                    model.input_q1: q1_test_batch,
                    model.input_q2: q2_test_batch,
                    model.dropout_keep_prob: 1.0  # no dropout at inference
                })
            pred_batches.append(batch_predictions)
            prob_batches.append(batch_predict_probs)

    # Concatenate once at the end: growing an array with np.concatenate
    # inside the loop is quadratic in the number of batches (and needed the
    # old `count` flag to seed the probability array). astype(float) keeps
    # the dtype the original produced by concatenating with a Python list.
    all_predictions = (np.concatenate(pred_batches).astype(float)
                       if pred_batches else np.array([]))
    all_predict_prob = (np.concatenate(prob_batches)
                        if prob_batches else np.array([]))

    y_test = [float(temp) for temp in y_test]
    # Evaluation indices
    print('Precision, Recall, F1-Score ...')
    print(
        metrics.classification_report(y_test,
                                      all_predictions,
                                      target_names=['not match', 'match']))

    # Confusion Matrix
    print('Confusion Matrix ...')
    print(metrics.confusion_matrix(y_test, all_predictions))

    # Write probability to csv; newline='' stops the csv module from
    # emitting blank rows on Windows.
    out_dir = os.path.join(args.save_dir, 'predict_prob_csv')
    print('Saving evaluation to {0}'.format(out_dir))
    with open(out_dir, 'w', newline='') as f:
        csv.writer(f).writerows(all_predict_prob)

    time_dif = get_time_dif(start_time)
    print('Time usage:', time_dif)