Example #1
def evaluate(filename="data/classes/test_retokenized.txt"):
    lines, tags = read_valid_lines(filename)
    correct = 0.0
    for line, tag in tqdm(zip(lines, tags), total=len(lines)):
        w_i, c_i = get_word_and_char_indices(line)
        pred = predict(w_i, c_i)
        if pred == t2i[tag]:
            correct += 1.0
    log.pr_green("accuracy of the model on test set = %.4f [No spell checks]" %
                 (correct / len(lines)))
    return
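
For context, read_valid_lines, get_word_and_char_indices, predict, and t2i are defined elsewhere in this repository. A minimal sketch of the data-reading helper and the tag-to-index map as this example appears to assume them (illustrative guesses, not the project's actual code):

# Hypothetical sketch of the helpers the example above assumes; the real
# implementations live elsewhere in the repository.
def read_valid_lines(filename):
    # Assumed format: one "<tag>\t<sentence>" pair per line.
    lines, tags = [], []
    with open(filename, encoding="utf-8") as f:
        for row in f:
            tag, text = row.rstrip("\n").split("\t", 1)
            lines.append(text)
            tags.append(tag)
    return lines, tags

# t2i is assumed to map tag strings to integer class indices, e.g.:
t2i = {"0": 0, "1": 1}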
Example #2
def get_qualitative_examples():
    # Find test sentences where the model is correct, a one-character attack
    # flips the prediction, and the spell checker restores the correct label.
    lines, tags = read_valid_lines("data/classes/test_retokenized.txt")
    c = list(zip(lines, tags))
    random.shuffle(c)
    lines, tags = zip(*c)

    # sample 200 shuffled examples
    lines = lines[:200]
    tags = tags[:200]

    for line, tag in tqdm(zip(lines, tags), total=len(lines)):

        w_i, c_i = get_word_and_char_indices(line)

        # check if model prediction is incorrect, if yes, find next example...
        model_prediction = predict(w_i, c_i)
        if t2i[tag] != model_prediction:
            # already incorrect, not interesting...
            continue

        gen_attacks = attacks.all_one_attack(line)

        for idx, adversary in gen_attacks:
            #adversary = checker.correct_string(adversary)
            w_i, c_i = get_word_and_char_indices(adversary)

            adv_pred = predict(w_i, c_i)

            if adv_pred == t2i[tag]:
                # this example doesn't break the model...
                continue

            corrected_string = checker.correct_string(adversary)
            w_i, c_i = get_word_and_char_indices(corrected_string)

            post_pred = predict(w_i, c_i)

            if post_pred != t2i[tag]:
                # the spell check didn't recover the correct label, skip...
                continue

            log.pr(" -------------- ")
            log.pr("Original line = %s" % (line))
            log.pr("Original label = %s" % (tag))
            log.pr_red("Adversary = %s" % (adversary))
            log.pr_green("Correction = %s" % (corrected_string))
            log.pr(" -------------- ")

    return None
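
The attacks.all_one_attack generator is assumed to yield (index, perturbed_line) pairs, one single-character perturbation per candidate position. A hedged sketch of one such attack, using adjacent-character swaps; the repository's real attack set may well be richer:

# Illustrative guess at a one-character attack generator in the spirit of
# attacks.all_one_attack: swap each adjacent character pair within a word.
def all_one_attack(line):
    for i in range(len(line) - 1):
        if line[i] == " " or line[i + 1] == " ":
            continue  # only perturb inside words, never across boundaries
        yield i, line[:i] + line[i + 1] + line[i] + line[i + 2:]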
Example #3
def evaluate(filename="data/classes/test.txt"):
    if singles:
        lines, tags = read_valid_lines_single(filename)  # [premise, hypothesis], tags
        correct = 0.0
        for line, tag in tqdm(zip(lines, tags), total=len(lines)):
            w_i, c_i = get_word_and_char_indices(line)
            pred = predict(w_i, c_i)
            if pred == tag:
                correct += 1.0
        log.pr_green("accuracy of the model on test set = %.4f [No spell checks]" % (correct / len(lines)))
        return
    else:
        lines, _ = read_valid_lines_single(filename)
        correct = 0.0
        for line in tqdm(lines):
            w_i, c_i = get_word_and_char_indices(line)
            pred = predict(w_i, c_i)
            if pred == line[2]:  # each line is a [premise, hypothesis, tag] triple
                correct += 1.0
        log.pr_green("accuracy of the model on test set = %.4f [No spell checks]" % (correct / len(lines)))
        return
    # per-epoch training stats
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc:0.2f} \
        | Train Attn Mass: {train_attn_mass:0.2f} | Train PPL: {math.exp(train_loss):7.3f}'
          )
    print(f'\t Val. Loss: {valid_loss:.3f} |   Val Acc: {val_acc:0.2f} \
        |  Val. Attn Mass: {val_attn_mass:0.2f} |  Val. PPL: {math.exp(valid_loss):7.3f}'
          )

# load the best model and print stats:
model.load_state_dict(torch.load('data/models/model_' + TASK + SUFFIX + \
    '_seed=' + str(SEED) + '_coeff=' + str(COEFF) + '_num-train=' + str(NUM_TRAIN) + '.pt'))

test_loss, test_acc, test_attn_mass = evaluate(model, test_batches, criterion)
print(f'\t Test Loss: {test_loss:.3f} |  Test Acc: {test_acc:0.2f} \
        |  Test Attn Mass: {test_attn_mass:0.2f} |  Test PPL: {math.exp(test_loss):7.3f}'
      )

log.pr_green(f"Final Test Accuracy ..........\t{test_acc:0.2f}")
log.pr_green(f"Final Test Attention Mass ....\t{test_attn_mass:0.2f}")
log.pr_green(f"Convergence time in seconds ..\t{convergence_time:0.2f}")
log.pr_green(f"Sample efficiency in epochs ..\t{epochs_taken_to_converge}")


src_lang.save_vocab("data/vocab/" + TASK + SUFFIX + '_seed=' + str(SEED) \
     + '_coeff=' + str(COEFF) +  '_num-train=' + str(NUM_TRAIN) + ".src.vocab")
trg_lang.save_vocab("data/vocab/" + TASK + SUFFIX + '_seed=' + str(SEED) \
     + '_coeff=' + str(COEFF) +  '_num-train=' + str(NUM_TRAIN) + ".trg.vocab")

if TASK in ['en-hi', 'en-de']:
    # generate the output to compute BLEU scores as well...
    log.pr_green("generating the output translations from the model")
    test_batches_single = list(
        get_batches(test_sents[0], test_sents[1], test_sents[2], 1))
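
The listing ends before the actual generation loop over test_batches_single. A generic greedy-decoding sketch of what such a loop might do per single-sentence batch; model.encode, model.decode_step, and the sos/eos ids are assumptions for illustration, not this repository's real interface:

# Hypothetical greedy decoder for producing translations one sentence at a
# time (batch size 1, matching test_batches_single above). All interfaces
# here are assumed.
def greedy_decode(model, src_ids, sos_id, eos_id, max_len=100):
    memory = model.encode(src_ids)               # assumed encoder call
    hyp = [sos_id]
    for _ in range(max_len):
        logits = model.decode_step(hyp, memory)  # assumed one-step decoder
        next_id = int(logits[-1].argmax())
        if next_id == eos_id:
            break
        hyp.append(next_id)
    return hyp[1:]  # drop the start-of-sentence token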