Example no. 1
0
def test_idf_build():
    questions_train, vocabulary, idf = load_questions_from_file("train", q_limit['train'])
    questions_validate, vocabulary, idf = load_questions_from_file("validate", q_limit['validate'])
    questions_test, vocabulary, idf = load_questions_from_file("test", q_limit['test'])
    X_train, y_train = load_data(train_data, globals.nn_features_file, globals.nn_labels_file)
    #X_validate, y_validate = load_data(validate_data, globals.nn_features_file, globals.nn_labels_file)
    #X_test, y_test = load_data(test_data, globals.nn_features_file, globals.nn_labels_file)

    y_train_flatten = list(itertools.chain(*y_train))

    build_idf([questions_train, questions_validate, questions_test])
    samples = decompose_questions(questions_train)

    print("Samples len: %d, y_train_flatten len: %d" % (len(samples), len(y_train_flatten)))

    #samples = samples[:3]

    #pprint(samples[:40])

    print("%s" % "wo".ljust(10, " ") + " " + "wo_idf".ljust(30, " ") + " " + "q_len".ljust(10, " ")
          + " " + "wo_answers".ljust(30, " ") + " " + "idf_avg".ljust(30, " ")
          + " " + "idf_norm".ljust(30, " ") + " " + "label")
    # The enumerate index replaces the redundant hand-maintained counter.
    for idx, sample in enumerate(samples):
        print("%s, %s" % (str(sample[0]).ljust(10, " ") + " " + str(sample[1]).ljust(30, " ")
                          + " " + str(sample[2]).ljust(10, " ") + " " + str(sample[3]).ljust(30, " ")
                          + " " + str(sample[4]).ljust(30, " ") + " " + str(sample[5]).ljust(30, " ")
                          + " ", y_train_flatten[idx]))
Example no. 2
0
def test_idf_build():
    questions_train, vocabulary, idf = load_questions_from_file(
        "train", q_limit['train'])
    questions_validate, vocabulary, idf = load_questions_from_file(
        "validate", q_limit['validate'])
    questions_test, vocabulary, idf = load_questions_from_file(
        "test", q_limit['test'])
    X_train, y_train = load_data(train_data, globals.nn_features_file,
                                 globals.nn_labels_file)
    #X_validate, y_validate = load_data(validate_data, globals.nn_features_file, globals.nn_labels_file)
    #X_test, y_test = load_data(test_data, globals.nn_features_file, globals.nn_labels_file)

    y_train_flatten = list(itertools.chain(*y_train))

    build_idf([questions_train, questions_validate, questions_test])
    samples = decompose_questions(questions_train)

    print("Samples len: %d, y_train_flatten len: %d" %
          (len(samples), len(y_train_flatten)))

    #samples = samples[:3]

    #pprint(samples[:40])

    print("%s" % "wo".ljust(10, " ") + " " + "wo_idf".ljust(30, " ") + " " +
          "q_len".ljust(10, " ") + " " + "wo_answers".ljust(30, " ") + " " +
          "idf_avg".ljust(30, " ") + " " + "idf_norm".ljust(30, " ") + " " +
          "label")
    # The enumerate index replaces the redundant hand-maintained counter.
    for idx, sample in enumerate(samples):
        print("%s, %s" %
              (str(sample[0]).ljust(10, " ") + " " + str(sample[1]).ljust(
                  30, " ") + " " + str(sample[2]).ljust(10, " ") + " " +
               str(sample[3]).ljust(30, " ") + " " +
               str(sample[4]).ljust(30, " ") + " " +
               str(sample[5]).ljust(30, " ") + " ", y_train_flatten[idx]))
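Both variants of test_idf_build above depend on build_idf and decompose_questions from the surrounding repository, which are not shown here. For orientation, a minimal sketch of a standard smoothed IDF over tokenized questions follows; the tokens attribute and the exact log weighting are assumptions for illustration, not the repository's actual implementation.

import math
from collections import Counter

def build_idf_sketch(question_sets):
    # Illustrative only: smoothed IDF over every question in every split.
    doc_freq = Counter()
    n_docs = 0
    for questions in question_sets:
        for question in questions:
            n_docs += 1
            # Count each token at most once per question (document frequency).
            doc_freq.update(set(question.tokens))
    # Smoothed IDF: rare tokens get large weights, ubiquitous ones small.
    return {tok: math.log(n_docs / (1.0 + df)) for tok, df in doc_freq.items()}

This would be called as build_idf_sketch([questions_train, questions_validate, questions_test]), mirroring the build_idf call above.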
Example no. 3
0
import logging
from generate_input_file import load_questions_from_file
from collections import Counter

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    questions = {}
    questions['train'], voc, idf = load_questions_from_file('train', -1)
    questions['validate'], voc, idf = load_questions_from_file('validate', -1)
    questions['test'], voc, idf = load_questions_from_file('test', -1)
    logging.info("Questions loaded.")

    logging.info("Working on train...")

    n_answers = Counter()
    q_with_answer = 0
    q_with_answers = 0
    for split in ['train', 'validate', 'test']:
        for question in questions[split]:
            if len(question.correct_answer) > 1:
                q_with_answers += 1
            elif len(question.correct_answer) == 1:
                q_with_answer += 1

            n_answers[len(question.answers)] += 1

    print('distribution of number of answers', n_answers)
    print("For all, 1 answer: %d, many answers: %d" % (q_with_answer, q_with_answers))
Example no. 4
0
import logging
from generate_input_file import load_questions_from_file
from collections import Counter

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    questions = {}
    questions['train'], voc, idf = load_questions_from_file('train', -1)
    questions['validate'], voc, idf = load_questions_from_file('validate', -1)
    questions['test'], voc, idf = load_questions_from_file('test', -1)
    logging.info("Questions loaded.")

    logging.info("Working on train...")

    n_answers = Counter()
    q_with_answer = 0
    q_with_answers = 0
    for split in ['train', 'validate', 'test']:
        for question in questions[split]:
            if len(question.correct_answer) > 1:
                q_with_answers += 1
            elif len(question.correct_answer) == 1:
                q_with_answer += 1

            n_answers[len(question.answers)] += 1

    print('distribution of number of answers', n_answers)
    print("For all, 1 answer: %d, many answers: %d" %
          (q_with_answer, q_with_answers))
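The counting logic in Examples no. 3 and no. 4 reads only two attributes of each question object. A self-contained toy run of the same Counter-based distribution, with a hypothetical namedtuple standing in for the repository's question class:

from collections import Counter, namedtuple

# Hypothetical stand-in: only the two attributes the loop reads are modeled.
Question = namedtuple("Question", ["answers", "correct_answer"])

questions = [
    Question(answers=["a", "b", "c", "d"], correct_answer=["b"]),
    Question(answers=["a", "b"], correct_answer=["a", "b"]),
]

n_answers = Counter(len(q.answers) for q in questions)
print(n_answers)  # Counter({4: 1, 2: 1})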
Example no. 5
0
def train_and_test(X_train, y_train, X_validate, y_validate, X_test, y_test):
    model = cnn.get_cnn("regular")

    y_train_flatted = list(itertools.chain(*y_train))
    nb_batch = len(X_train) // batch_size + 1  # integer division so range() below gets an int

    best_f1 = 0.0
    best_f1_index = 0

    # Load questions and build idf for them
    questions_train, v, idf = load_questions_from_file("train",
                                                       q_limit['train'])
    questions_validate, v, idf = load_questions_from_file(
        "validate", q_limit['validate'])
    questions_test, v, idf = load_questions_from_file("test", q_limit['test'])

    build_idf([questions_train, questions_validate, questions_test])

    # Create samples for loaded questions
    samples_train = decompose_questions(questions_train)
    samples_validate = decompose_questions(questions_validate)
    samples_test = decompose_questions(questions_test)

    print("len of X_train: %d and samples_train: %d" %
          (len(X_train), len(samples_train)))

    for e in range(nb_epoch):
        print("Epoch %d" % e)
        progress_bar = generic_utils.Progbar(X_train.shape[0])

        # For training NN, shuffle the data
        X_train_shuffled, y_train_shuffled = shuffle_set(
            X_train, y_train_flatted)

        # Train for number of batches
        for i in range(nb_batch):
            train_loss, train_accuracy = model.train_on_batch(
                X_train_shuffled[i * batch_size:(i + 1) * batch_size],
                y_train_shuffled[i * batch_size:(i + 1) * batch_size],
                accuracy=True)
            progress_bar.add(batch_size,
                             values=[("train loss", train_loss),
                                     ("train accuracy:", train_accuracy)])

        # Check the score on the validation data
        #results_val = test_model(model, X_validate, y_validate)
        #best_threshold = find_threshold(y_validate, results_val["y_predicted_scores"], results_val["y_predicted_scores"])
        #precision_val, recall_val, f1_val = evaluate_with_threshold(y_validate, results_val["y_predicted_scores"],
        #                                                            results_val["y_predicted_scores"],
        #                                                            best_threshold)

        # Check the score on the test data
        #results_test = test_model(model, X_test, y_test)
        #precision_test, recall_test, f1_test = evaluate_with_threshold(y_test, results_test["y_predicted_scores"],
        #                                                               results_test["y_predicted_scores"],
        #                                                               best_threshold)

        #nn_string = "NN tests:\n" + "Threshold".ljust(40, ".") + " %.4f" + "\nOver validation set\n" \
        #            + "validation loss, validation acc".ljust(40, ".") + " %.4f %.4f\n" \
        #            + "precision, recall, f1".ljust(40, ".") + " %.4f %.4f %.4f\n" \
        #            + "Over test set\n" \
        #            + "test loss, test acc".ljust(40, ".") + " %.4f %.4f\n" \
        #            + "precision, recall, f1".ljust(40, ".") + " %.4f %.4f %.4f\n" \

        #globals.logger.info(nn_string % (best_threshold, results_val['test_loss'], results_val['test_accuracy'],
        #                                 precision_val, recall_val, f1_val,
        #                                 results_test['test_loss'], results_test['test_accuracy'],
        #                                 precision_test, recall_test, f1_test))

        # Now evaluate with logistic regression
        # Get predictions from NN
        predictions_train = model.predict(X_train)
        predictions_validate = model.predict(X_validate)
        predictions_test = model.predict(X_test)

        # Evaluate on logistic regression
        precision, recall, f1 = validate_on_lr(samples_train, samples_validate,
                                               samples_test, predictions_train,
                                               predictions_validate,
                                               predictions_test, y_train,
                                               y_validate, y_test)

        lr_string = "LR tests:\n" + "Over test set\n" \
                    + "precision, recall, f1".ljust(40, ".") + " %.4f %.4f %.4f\n"

        globals.logger.info(lr_string % (precision, recall, f1))

        if f1 >= best_f1:
            best_f1 = f1
            best_f1_index = e

    globals.logger.info(
        "Training done, best f1 on logistic regression is: %.4f for epoch nr: %d"
        % (best_f1, best_f1_index))
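shuffle_set is not part of this excerpt. A minimal NumPy sketch of what it presumably does, one joint random permutation applied to features and labels so their alignment survives the shuffle:

import numpy as np

def shuffle_set(X, y):
    # Sketch: shuffle X (array) and y (list) with the same permutation.
    perm = np.random.permutation(len(X))
    return X[perm], [y[i] for i in perm]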
Example no. 6
0
def train_and_test(X_train, y_train, X_validate, y_validate, X_test, y_test):
    model = cnn.get_cnn("regular")

    y_train_flatted = list(itertools.chain(*y_train))
    nb_batch = len(X_train)//batch_size + 1  # integer division so range() below gets an int

    best_f1 = 0.0
    best_f1_index = 0

    # Load questions and build idf for them
    questions_train, v, idf = load_questions_from_file("train", q_limit['train'])
    questions_validate, v, idf = load_questions_from_file("validate", q_limit['validate'])
    questions_test, v, idf = load_questions_from_file("test", q_limit['test'])

    build_idf([questions_train, questions_validate, questions_test])

    # Create samples for loaded questions
    samples_train = decompose_questions(questions_train)
    samples_validate = decompose_questions(questions_validate)
    samples_test = decompose_questions(questions_test)

    print("len of X_train: %d and samples_train: %d" % (len(X_train), len(samples_train)))

    for e in range(nb_epoch):
        print("Epoch %d" % e)
        progress_bar = generic_utils.Progbar(X_train.shape[0])

        # For training NN, shuffle the data
        X_train_shuffled, y_train_shuffled = shuffle_set(X_train, y_train_flatted)

        # Train for number of batches
        for i in range(nb_batch):
            train_loss, train_accuracy = model.train_on_batch(X_train_shuffled[i*batch_size:(i+1)*batch_size],
                                                              y_train_shuffled[i*batch_size:(i+1)*batch_size],
                                                              accuracy=True)
            progress_bar.add(batch_size, values=[("train loss", train_loss), ("train accuracy", train_accuracy)])

        # Check the score on the validation data
        #results_val = test_model(model, X_validate, y_validate)
        #best_threshold = find_threshold(y_validate, results_val["y_predicted_scores"], results_val["y_predicted_scores"])
        #precision_val, recall_val, f1_val = evaluate_with_threshold(y_validate, results_val["y_predicted_scores"],
        #                                                            results_val["y_predicted_scores"],
        #                                                            best_threshold)

        # Check the score on the test data
        #results_test = test_model(model, X_test, y_test)
        #precision_test, recall_test, f1_test = evaluate_with_threshold(y_test, results_test["y_predicted_scores"],
        #                                                               results_test["y_predicted_scores"],
        #                                                               best_threshold)

        #nn_string = "NN tests:\n" + "Threshold".ljust(40, ".") + " %.4f" + "\nOver validation set\n" \
        #            + "validation loss, validation acc".ljust(40, ".") + " %.4f %.4f\n" \
        #            + "precision, recall, f1".ljust(40, ".") + " %.4f %.4f %.4f\n" \
        #            + "Over test set\n" \
        #            + "test loss, test acc".ljust(40, ".") + " %.4f %.4f\n" \
        #            + "precision, recall, f1".ljust(40, ".") + " %.4f %.4f %.4f\n" \

        #globals.logger.info(nn_string % (best_threshold, results_val['test_loss'], results_val['test_accuracy'],
        #                                 precision_val, recall_val, f1_val,
        #                                 results_test['test_loss'], results_test['test_accuracy'],
        #                                 precision_test, recall_test, f1_test))

        # Now evaluate with logistic regression
        # Get predictions from NN
        predictions_train = model.predict(X_train)
        predictions_validate = model.predict(X_validate)
        predictions_test = model.predict(X_test)

        # Evaluate on logistic regression
        precision, recall, f1 = validate_on_lr(samples_train, samples_validate, samples_test,
                                               predictions_train, predictions_validate, predictions_test,
                                               y_train, y_validate, y_test)

        lr_string = "LR tests:\n" + "Over test set\n" \
                    + "precision, recall, f1".ljust(40, ".") + " %.4f %.4f %.4f\n"

        globals.logger.info(lr_string % (precision, recall, f1))

        if f1 >= best_f1:
            best_f1 = f1
            best_f1_index = e

    globals.logger.info("Training done, best f1 on logistic regression is: %.4f for epoch nr: %d" %
                   (best_f1, best_f1_index))
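validate_on_lr is likewise defined elsewhere; judging from its arguments, it scores answers with a logistic regression fitted on the hand-crafted samples plus the CNN predictions. A scikit-learn sketch of that idea, assuming the sample tuples are numeric feature vectors and the labels are flat 0/1 lists (the function name and feature layout are assumptions):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support

def lr_rerank_sketch(samples_train, preds_train, labels_train,
                     samples_test, preds_test, labels_test):
    # Append the CNN score(s) to each hand-crafted feature vector.
    X_tr = np.hstack([np.asarray(samples_train, dtype=float),
                      np.asarray(preds_train).reshape(len(samples_train), -1)])
    X_te = np.hstack([np.asarray(samples_test, dtype=float),
                      np.asarray(preds_test).reshape(len(samples_test), -1)])
    # Fit on train, report precision/recall/F1 on test.
    clf = LogisticRegression().fit(X_tr, labels_train)
    p, r, f1, _ = precision_recall_fscore_support(
        labels_test, clf.predict(X_te), average="binary")
    return p, r, f1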