Example #1
    def train(self, output_file, iterations):
        # construct the initial unweighted typo model
        self.fst_mtm = fst_wrapper.get_fst_mtm(self.old_train, self.new_train,
                                               False)
        for _ in range(iterations):  # iterate
            self.num_lines = len(self.old_train)  # for the status bar
            l = 0
            self.status_bar(l)  # track progress with a status bar
            # train on parallel text
            for old_line, new_line in zip(self.old_train, self.new_train):
                # construct the fst models for modern and old lines
                fst_mm = fst_wrapper.get_fst_mw(new_line)
                fst_me = fst_wrapper.get_fst_mw(old_line)
                # compose the models and find the shortest path
                _fst = fst.compose(fst.compose(fst_mm, self.fst_mtm), fst_me)
                viterbi.viterbi_path(fst=_fst, get_counts=True)
                # reweight the typo model with the new counts, then renormalize
                for t, count in viterbi.counts.items():
                    self.fst_mtm.reweight_transition(t, count)
                self.fst_mtm.normalize_cond(0.01)
                l += 1
                self.status_bar(l)
            print()  # add a line after the status bar
            self.predict(output_file)
            # print the overall character error rate on the test set
            print('SCORE: ', end='')
            print(cer.cer(zip(self.new_test, list(open(output_file)))))
Example #2
    def test_all(self, output_file):
        # test all lines of the test file
        l = 0
        self.status_bar(l)
        with open(output_file, 'w') as f:
            for line in self.old_test:
                f.write(self.predict(line, False))
                l += 1
                self.status_bar(l)
        print()
        print(cer.cer(zip(self.new_test, list(open(output_file)))))
Example #3
def calculate_cer(words, label):
    # map the label tensor's class indices back to their characters
    label = label.cpu().numpy()
    label = [idx_to_class[l] for l in label]

    # accumulate the character error rate over all prediction/label pairs
    err = 0
    st = ""
    for pred, real in zip(words, label):
        err += cer(pred, real)
        st += pred + " vs " + real + "\n"

    # log each prediction next to its reference for later inspection
    with open('check.txt', 'a+') as f:
        f.write(st)

    return err
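The `cer` helper these snippets rely on is never shown on this page. A common definition of character error rate is the character-level edit (Levenshtein) distance between hypothesis and reference, divided by the reference length; below is a minimal sketch of that definition matching the `cer(pred, real)` call above (the name and argument order are assumptions, not the actual helper imported by these examples):

def cer(pred, real):
    # character error rate = Levenshtein distance / length of the reference
    prev = list(range(len(real) + 1))          # distances from "" to real[:j]
    for i, p in enumerate(pred, start=1):
        curr = [i]                             # distance from pred[:i] to ""
        for j, r in enumerate(real, start=1):
            curr.append(min(prev[j] + 1,               # delete p
                            curr[j - 1] + 1,           # insert r
                            prev[j - 1] + (p != r)))   # substitute / match
        prev = curr
    return prev[-1] / max(len(real), 1)

Examples #1, #2, and #7 instead pass whole collections of (reference, hypothesis) pairs, so their `cer` helpers presumably aggregate this per-string quantity over a corpus.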
Example #4
def calc_cer(guess, labels, word_lengths, class_to_idx):
    batch_size = guess.shape[0]
    idx_to_class = list(class_to_idx.keys())

    # rebuild each label word from its class indices
    label_words = [
        ''.join(idx_to_class[labels[i, c]] for c in range(word_lengths[i]))
        for i in range(batch_size)
    ]

    # character error rate of every decoded guess against its label word
    cers = [
        cer(guess_word, label_words[i]) for i, guess_word in enumerate(
            generate_guess_strings(guess, class_to_idx))
    ]

    # mean CER over the batch
    m = torch.mean(torch.Tensor(cers))
    return m
Example #5
def validate(model, epoch):
    global validateLoss, labelMaxSize, device, validationAcc
    model.eval()
    count = 0
    validation_loss = 0.0
    validation_acc = 0.0
    with torch.no_grad():
        for batch_idx, (data, labels, _) in enumerate(valid_loader):
            data = data.to(device)
            labels = labels.to(device)

            output = model(data)
            pred = pred_word_from_seq(output.permute(1, 0, 2))
            # TODO: remove
            if epoch % 10 == 9:
                print(pred)
            target_classes = label_idx_to_classes(labels)
            accuracy = CER.cer(pred, target_classes)
            validation_acc += accuracy
            # again, use the CTC loss for validation

            # target: (N, S) where N is the batch size and S is the max label length
            # target_length: (N,), the length of each label in the target
            # index_label_to_seq_table already moves both tensors to the device
            target, target_length = index_label_to_seq_table(labels)
            # input_lengths: (N,), as described in the torch documentation
            N = output.size()[1]
            T = output.size()[0]
            input_lengths = torch.full(size=(N, ),
                                       fill_value=T,
                                       dtype=torch.long).to(device)

            # compute the CTC loss
            loss = model.ctc_loss(output, target, input_lengths, target_length)

            validation_loss += loss.data.cpu().numpy()
            count += 1
        validation_loss /= count
        print('validation loss: ' + str(validation_loss))
        print("cer rate: " + str(validation_acc / count))

        validateLoss.append(validation_loss)
        validationAcc.append(float(validation_acc) / float(count) * 100)
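For reference, the shape conventions described in the comments above match `torch.nn.CTCLoss`: log-probabilities of shape (T, N, C), targets of shape (N, S), plus per-sample input and target lengths of shape (N,). A minimal self-contained sketch with made-up sizes (this is not the model, loader, or helper functions used in the snippet):

import torch
import torch.nn as nn

T, N, C, S = 50, 4, 28, 10        # time steps, batch size, classes (incl. blank), max label length
ctc = nn.CTCLoss(blank=0)

log_probs = torch.randn(T, N, C).log_softmax(2)             # (T, N, C) log-probabilities
targets = torch.randint(1, C, (N, S), dtype=torch.long)     # (N, S), labels never use the blank index
input_lengths = torch.full((N,), T, dtype=torch.long)       # every sequence spans all T frames
target_lengths = torch.randint(1, S + 1, (N,), dtype=torch.long)  # true length of each label

loss = ctc(log_probs, targets, input_lengths, target_lengths)
print(loss.item())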
Example #6
sub = 0
f = open("cer-all.csv", 'w')
f.write("name, mode, blk, cer\n")
for name in names:
    for mode in modes:
        tot_sentences = 0
        tot_cer = 0
        path = "/Users/clarencewang/Desktop/sensel-typing/Debug/data/" + mode + "/" + name + "/"
        clear_dict = get_clear(path)
        result = read_result(path)
        answer = get_answer(path)

        # calculate character level error rate
        for r, a in zip(result, answer):
            tot_cer += cer(answer[a], result[r])
            tot_sentences += 1
            if tot_sentences % 8 == 0:
                f.write(name + "," + mode + "," + str(tot_sentences / 8) +
                        "," + str(tot_cer / 8) + "\n")
                tot_cer = 0
                #tot_sentences = 0
            # if(cer(answer[a], result[r]) > 0.1):
            #     print(answer[a], result[r], cer(answer[a], result[r]))

        # calculate word level error rate
        i = 0
        tot_words = 0
        error_num = 0
        for r, a in zip(result, answer):
            # if(len(result[r]) < 3):
Example #7
                ],
                                               stderr=f).decode('utf-8')


if __name__ == "__main__":
    testDir = '../test/'
    allGameTrainedResults = []
    allGameUntrainedResults = []
    for game in ['bof', 'ffiv', 'ffvi']:
        print("getting metrics for the game", game, "...")
        for trained in [True, False]:
            for root, dirs, files in os.walk(testDir + game + '/'):
                cers = []
                for img_path in files:
                    ocr_info = get_ocr_info(root + img_path, trained)
                    ocr_string = getOCRString(ocr_info).replace('\n', '')
                    # format string by removing all whitespace and file type
                    img_path = ''.join(img_path.split()).replace('.png', '')
                    cers.append((img_path, ocr_string))
                    if trained:
                        allGameTrainedResults.append((img_path, ocr_string))
                    else:
                        allGameUntrainedResults.append((img_path, ocr_string))

                break  # don't go more than one directory deep
            print("trained:", trained, "cer:", cer(cers))
        print()

    print("Metrics for all games combined...")
    print("trained: True cer:", cer(allGameTrainedResults))
    print("trained: False cer:", cer(allGameUntrainedResults))