import itertools

from keras.utils import generic_utils

# The remaining helpers used below (load_questions_from_file, load_data,
# build_idf, decompose_questions, shuffle_set, validate_on_lr, cnn, globals,
# q_limit, train_data, batch_size, nb_epoch) are assumed to be defined in the
# surrounding module.


def test_idf_build():
    # Load all three splits; only the questions are used here, since the
    # idf is rebuilt over all splits by build_idf below.
    questions_train, vocabulary, idf = load_questions_from_file(
        "train", q_limit['train'])
    questions_validate, vocabulary, idf = load_questions_from_file(
        "validate", q_limit['validate'])
    questions_test, vocabulary, idf = load_questions_from_file(
        "test", q_limit['test'])

    X_train, y_train = load_data(train_data, globals.nn_features_file,
                                 globals.nn_labels_file)
    #X_validate, y_validate = load_data(validate_data, globals.nn_features_file, globals.nn_labels_file)
    #X_test, y_test = load_data(test_data, globals.nn_features_file, globals.nn_labels_file)

    # Flatten the per-question label lists into one label per sample.
    y_train_flatten = list(itertools.chain(*y_train))

    # Build the idf over all splits, then turn the training questions into
    # per-answer feature rows.
    build_idf([questions_train, questions_validate, questions_test])
    samples = decompose_questions(questions_train)

    print("Samples len: %d, y_train_flatten len: %d"
          % (len(samples), len(y_train_flatten)))
    #samples = samples[:3]
    #pprint(samples[:40])

    # Print each feature row next to its label, one aligned column per
    # feature.
    header = ("wo".ljust(10) + " " + "wo_idf".ljust(30) + " "
              + "q_len".ljust(10) + " " + "wo_answers".ljust(30) + " "
              + "idf_avg".ljust(30) + " " + "idf_norm".ljust(30) + " label")
    print(header)
    for e, sample in enumerate(samples):
        row = (str(sample[0]).ljust(10) + " " + str(sample[1]).ljust(30)
               + " " + str(sample[2]).ljust(10) + " "
               + str(sample[3]).ljust(30) + " " + str(sample[4]).ljust(30)
               + " " + str(sample[5]).ljust(30) + " ")
        print("%s, %s" % (row, y_train_flatten[e]))
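
# build_idf and decompose_questions are defined elsewhere in this repo. As a
# rough reference, a minimal sketch of an idf builder over the three question
# splits might look like the function below. The smoothed log(N / (1 + df))
# formula, the build_idf_sketch name, and the assumption that each question
# exposes its tokens as an iterable are illustrative guesses, not this
# project's actual implementation.
import math
from collections import Counter


def build_idf_sketch(question_sets):
    # Treat every question across all splits as one document.
    documents = [q for questions in question_sets for q in questions]
    n_docs = len(documents)
    # df: in how many documents each token occurs at least once.
    df = Counter(token for doc in documents for token in set(doc))
    # Smoothed idf: rare tokens get high weight, ubiquitous ones near zero.
    return {token: math.log(n_docs / (1.0 + count))
            for token, count in df.items()}
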
def train_and_test(X_train, y_train, X_validate, y_validate, X_test, y_test):
    model = cnn.get_cnn("regular")
    y_train_flatted = list(itertools.chain(*y_train))
    # Integer division: nb_batch is fed to range() below, and the +1 picks up
    # the final partial batch.
    nb_batch = len(X_train) // batch_size + 1
    best_f1 = 0.0
    best_f1_index = 0

    # Load questions and build idf for them
    questions_train, v, idf = load_questions_from_file(
        "train", q_limit['train'])
    questions_validate, v, idf = load_questions_from_file(
        "validate", q_limit['validate'])
    questions_test, v, idf = load_questions_from_file("test", q_limit['test'])
    build_idf([questions_train, questions_validate, questions_test])

    # Create samples for the loaded questions
    samples_train = decompose_questions(questions_train)
    samples_validate = decompose_questions(questions_validate)
    samples_test = decompose_questions(questions_test)

    print("len of X_train: %d and samples_train: %d"
          % (len(X_train), len(samples_train)))

    for e in range(nb_epoch):
        print("Epoch %d" % e)
        progress_bar = generic_utils.Progbar(X_train.shape[0])

        # For training the NN, shuffle the data
        X_train_shuffled, y_train_shuffled = shuffle_set(X_train,
                                                         y_train_flatted)

        # Train for the number of batches
        for i in range(nb_batch):
            train_loss, train_accuracy = model.train_on_batch(
                X_train_shuffled[i * batch_size:(i + 1) * batch_size],
                y_train_shuffled[i * batch_size:(i + 1) * batch_size],
                accuracy=True)
            progress_bar.add(batch_size,
                             values=[("train loss", train_loss),
                                     ("train accuracy", train_accuracy)])

        # Check the score on the validation data
        #results_val = test_model(model, X_validate, y_validate)
        #best_threshold = find_threshold(y_validate, results_val["y_predicted_scores"], results_val["y_predicted_scores"])
        #precision_val, recall_val, f1_val = evaluate_with_threshold(y_validate, results_val["y_predicted_scores"],
        #                                                            results_val["y_predicted_scores"],
        #                                                            best_threshold)

        # Check the score on the test data
        #results_test = test_model(model, X_test, y_test)
        #precision_test, recall_test, f1_test = evaluate_with_threshold(y_test, results_test["y_predicted_scores"],
        #                                                               results_test["y_predicted_scores"],
        #                                                               best_threshold)

        #nn_string = "NN tests:\n" + "Threshold".ljust(40, ".") + " %.4f" + "\nOver validation set\n" \
        #            + "validation loss, validation acc".ljust(40, ".") + " %.4f %.4f\n" \
        #            + "precision, recall, f1".ljust(40, ".") + " %.4f %.4f %.4f\n" \
        #            + "Over test set\n" \
        #            + "test loss, test acc".ljust(40, ".") + " %.4f %.4f\n" \
        #            + "precision, recall, f1".ljust(40, ".") + " %.4f %.4f %.4f\n"
        #globals.logger.info(nn_string % (best_threshold, results_val['test_loss'], results_val['test_accuracy'],
        #                                 precision_val, recall_val, f1_val,
        #                                 results_test['test_loss'], results_test['test_accuracy'],
        #                                 precision_test, recall_test, f1_test))

        # Now evaluate with logistic regression:
        # get predictions from the NN ...
        predictions_train = model.predict(X_train)
        predictions_validate = model.predict(X_validate)
        predictions_test = model.predict(X_test)

        # ... and feed them, together with the idf samples, to the LR step.
        precision, recall, f1 = validate_on_lr(
            samples_train, samples_validate, samples_test,
            predictions_train, predictions_validate, predictions_test,
            y_train, y_validate, y_test)

        lr_string = ("LR tests:\nOver test set\n"
                     + "precision, recall, f1".ljust(40, ".")
                     + " %.4f %.4f %.4f\n")
        globals.logger.info(lr_string % (precision, recall, f1))

        if f1 >= best_f1:
            best_f1 = f1
            best_f1_index = e

    globals.logger.info(
        "Training done, best f1 on logistic regression is: %.4f for epoch nr: %d"
        % (best_f1, best_f1_index))
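
# shuffle_set is used above but not defined in this file. Below is a minimal
# sketch of the unison shuffle it presumably performs: one permutation applied
# to both features and labels, so every (sample, label) pair stays matched.
# The shuffle_set_sketch name and the exact return types are assumptions
# based on the call site.
import numpy as np


def shuffle_set_sketch(X, y):
    permutation = np.random.permutation(len(X))
    # X is a numpy array at the call site, so fancy indexing reorders it;
    # y is a plain list, so it is rebuilt in the permuted order.
    return X[permutation], [y[i] for i in permutation]
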
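
# validate_on_lr is likewise defined elsewhere. It is sketched below under the
# assumption that it appends the NN score to each hand-crafted idf sample,
# fits a logistic regression on the training rows, and reports
# precision/recall/f1 on the test rows; the function name, the feature
# stacking, and the binary averaging are all illustrative guesses.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support


def validate_on_lr_sketch(samples_train, samples_test,
                          predictions_train, predictions_test,
                          y_train_flat, y_test_flat):
    # Stack the NN prediction onto each sample's numeric features.
    X_tr = np.hstack([np.asarray(samples_train, dtype=float),
                      np.asarray(predictions_train).reshape(-1, 1)])
    X_te = np.hstack([np.asarray(samples_test, dtype=float),
                      np.asarray(predictions_test).reshape(-1, 1)])
    clf = LogisticRegression().fit(X_tr, y_train_flat)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test_flat, clf.predict(X_te), average='binary')
    return precision, recall, f1
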