def evaluate(filename="data/classes/test_retokenized.txt"): lines, tags = read_valid_lines(filename) correct = 0.0 for line, tag in tqdm(zip(lines, tags)): w_i, c_i = get_word_and_char_indices(line) pred = predict(w_i, c_i) if pred == t2i[tag]: correct += 1.0 log.pr_green("accuracy of the model on test set = %.4f [No spell checks]" % (correct / len(lines))) return
def get_qualitative_examples():
    lines, tags = read_valid_lines("data/classes/test_retokenized.txt")
    c = list(zip(lines, tags))
    random.shuffle(c)
    lines, tags = zip(*c)
    lines = lines[:200]
    tags = tags[:200]
    for line, tag in tqdm(zip(lines, tags)):
        w_i, c_i = get_word_and_char_indices(line)
        # if the model prediction is already incorrect, skip to the next example
        model_prediction = predict(w_i, c_i)
        if t2i[tag] != model_prediction:
            # already incorrect, not interesting...
            continue
        gen_attacks = attacks.all_one_attack(line)
        for idx, adversary in gen_attacks:
            # adversary = checker.correct_string(adversary)
            w_i, c_i = get_word_and_char_indices(adversary)
            adv_pred = predict(w_i, c_i)
            if adv_pred == t2i[tag]:
                # this example doesn't break the model...
                continue
            corrected_string = checker.correct_string(adversary)
            w_i, c_i = get_word_and_char_indices(corrected_string)
            post_pred = predict(w_i, c_i)
            if post_pred != t2i[tag]:
                # after correction the tag isn't correct...
                continue
            log.pr(" -------------- ")
            log.pr("Original line = %s" % line)
            log.pr("Original label = %s" % tag)
            log.pr_red("Adversary = %s" % adversary)
            log.pr_green("Correction = %s" % corrected_string)
            log.pr(" -------------- ")
    return None
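# A minimal sketch of the attack generator consumed above, assuming
# attacks.all_one_attack(line) yields (word_index, attacked_line) pairs, one
# per single-character substitution; the real implementation lives in the
# attacks module and may differ.
import string

def all_one_attack_sketch(line):
    """Yield (word_idx, line_with_one_char_swapped) for every possible swap."""
    words = line.split()
    for w_idx, word in enumerate(words):
        for c_idx in range(len(word)):
            for ch in string.ascii_lowercase:
                if ch == word[c_idx]:
                    continue
                attacked = word[:c_idx] + ch + word[c_idx + 1:]
                yield w_idx, " ".join(words[:w_idx] + [attacked] + words[w_idx + 1:])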
def evaluate(filename="data/classes/test.txt"): if(singles): lines, tags = read_valid_lines_single(filename) #[premise,hypothesis], tags correct = 0.0 for line, tag in tqdm(zip(lines, tags)): w_i, c_i = get_word_and_char_indices(line) pred = predict(w_i, c_i) if pred == tag: correct += 1.0 log.pr_green("accuracy of the model on test set = %.4f [No spell checks]" % (correct / len(lines))) return else: lines, tags = read_valid_lines_single(filename) correct = 0.0 for line in tqdm(lines): w_i, c_i = get_word_and_char_indices(line) pred = predict(w_i, c_i) if pred == line[2]: correct += 1.0 #[prem, hyp, tag] log.pr_green("accuracy of the model on test set = %.4f [No spell checks]" % (correct / len(lines))) return
      | Train Attn Mass: {train_attn_mass:0.2f} | Train PPL: {math.exp(train_loss):7.3f}')

print(f'\t Val. Loss: {valid_loss:.3f} | Val Acc: {val_acc:0.2f} \
      | Val. Attn Mass: {val_attn_mass:0.2f} | Val. PPL: {math.exp(valid_loss):7.3f}')

# load the best model and print stats:
model.load_state_dict(torch.load('data/models/model_' + TASK + SUFFIX + \
    '_seed=' + str(SEED) + '_coeff=' + str(COEFF) + '_num-train=' + str(NUM_TRAIN) + '.pt'))
test_loss, test_acc, test_attn_mass = evaluate(model, test_batches, criterion)

print(f'\t Test Loss: {test_loss:.3f} | Test Acc: {test_acc:0.2f} \
      | Test Attn Mass: {test_attn_mass:0.2f} | Test PPL: {math.exp(test_loss):7.3f}')

log.pr_green(f"Final Test Accuracy ..........\t{test_acc:0.2f}")
log.pr_green(f"Final Test Attention Mass ....\t{test_attn_mass:0.2f}")
log.pr_green(f"Convergence time in seconds ..\t{convergence_time:0.2f}")
log.pr_green(f"Sample efficiency in epochs ..\t{epochs_taken_to_converge}")

src_lang.save_vocab("data/vocab/" + TASK + SUFFIX + '_seed=' + str(SEED) \
    + '_coeff=' + str(COEFF) + '_num-train=' + str(NUM_TRAIN) + ".src.vocab")
trg_lang.save_vocab("data/vocab/" + TASK + SUFFIX + '_seed=' + str(SEED) \
    + '_coeff=' + str(COEFF) + '_num-train=' + str(NUM_TRAIN) + ".trg.vocab")

if TASK in ['en-hi', 'en-de']:
    # generate the output to compute BLEU scores as well...
    log.pr_green("generating the output translations from the model")
    test_batches_single = list(
        get_batches(test_sents[0], test_sents[1], test_sents[2], 1))
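# The run-specific name suffix above is rebuilt inline for the checkpoint path
# and both vocab paths. A small helper like the one below (hypothetical, not
# part of the original script) is one way to keep the three consistent:
def run_suffix(task, suffix, seed, coeff, num_train):
    return f"{task}{suffix}_seed={seed}_coeff={coeff}_num-train={num_train}"

# e.g.: torch.load('data/models/model_' + run_suffix(TASK, SUFFIX, SEED, COEFF, NUM_TRAIN) + '.pt')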