def train(self, output_file, iterations):
    """Train the typo model on the parallel training text, scoring after each pass.

    The model is first built unweighted from ``self.old_train`` and
    ``self.new_train``.  Every pass then walks the line pairs, composes the
    per-line models with the typo model, runs ``viterbi.viterbi_path`` with
    ``get_counts=True`` and feeds ``viterbi.counts`` back into the typo model.
    After each pass ``self.predict(output_file)`` is called and the result of
    ``cer.cer`` over ``self.new_test`` paired with the lines of
    ``output_file`` is printed.

    Args:
        output_file: path passed to ``self.predict`` and then read back for
            scoring.
        iterations: number of passes over the training pairs.

    NOTE(review): the original formatting of this method was collapsed; the
    loop nesting below (in particular that ``normalize_cond`` runs once per
    training pair) is reconstructed from statement order -- please confirm.
    """
    # construct the initial unweighted typo model
    self.fst_mtm = fst_wrapper.get_fst_mtm(self.old_train, self.new_train, False)

    for _ in range(iterations):
        # Set again on every pass for the status bar; presumably the
        # prediction step changes it -- TODO confirm before hoisting.
        self.num_lines = len(self.old_train)
        done = 0
        self.status_bar(done)  # track progress with a status bar

        # train on parallel text
        for old_line, new_line in zip(self.old_train, self.new_train):
            # construct the fst models for modern and old lines
            fst_mm = fst_wrapper.get_fst_mw(new_line)
            fst_me = fst_wrapper.get_fst_mw(old_line)

            # compose the models and find the shortest path
            _fst = fst.compose(fst.compose(fst_mm, self.fst_mtm), fst_me)
            viterbi.viterbi_path(fst=_fst, get_counts=True)

            # reweight the typo model with the new counts, then normalize
            for t, count in viterbi.counts.items():
                self.fst_mtm.reweight_transition(t, count)
            self.fst_mtm.normalize_cond(.01)

            done += 1
            self.status_bar(done)
        print()  # add a line after the status bar

        self.predict(output_file)

        # print the overall score
        print('SCORE: ', end='')
        # Read the predictions inside a context manager so the handle is
        # closed on every pass instead of being left to the garbage collector.
        with open(output_file) as predictions:
            print(cer.cer(zip(self.new_test, list(predictions))))
def test_all(self, output_file):
    """Predict every line of ``self.old_test``, write the results and print the score.

    Each prediction is written to ``output_file`` exactly as returned by
    ``self.predict(line, False)`` (no separator is added here, so the
    prediction presumably carries its own line ending -- TODO confirm).
    Afterwards the file is read back line by line, paired with
    ``self.new_test`` and handed to ``cer.cer``; the result is printed.

    Args:
        output_file: path of the file to (over)write with the predictions.
    """
    # test all lines of test file
    done = 0
    self.status_bar(done)
    with open(output_file, 'w') as f:
        for line in self.old_test:
            f.write(self.predict(line, False))
            done += 1
            self.status_bar(done)
    print()  # finish the status bar line

    # Re-open for reading under a context manager so the handle is closed
    # deterministically rather than leaked.
    with open(output_file) as predictions:
        print(cer.cer(zip(self.new_test, list(predictions))))
def calculate_cer(words, label):
    """Sum the CER of each predicted word against its label and log the pairs.

    Args:
        words: iterable of predicted strings.
        label: tensor of class indices; it is moved to the CPU, converted to
            a NumPy array and mapped through the module-level
            ``idx_to_class``.

    Returns:
        The sum of ``cer(pred, real)`` over all (prediction, label) pairs.

    Side effects:
        Appends one ``"<pred> vs <real>"`` line per pair to ``check.txt``.
    """
    targets = [idx_to_class[idx] for idx in label.cpu().numpy()]

    total_err = 0
    report = []
    for predicted, expected in zip(words, targets):
        total_err += cer(predicted, expected)
        report.append(predicted + " vs " + expected + "\n")

    with open('check.txt', 'a+') as log:
        log.write("".join(report))
    return total_err
def calc_cer(guess, labels, word_lengths, class_to_idx):
    """Return the mean CER between guessed words and label words as a tensor.

    Args:
        guess: batch of model outputs; ``guess.shape[0]`` is used as the
            batch size and the whole object is passed to
            ``generate_guess_strings``.
        labels: indexable as ``labels[i, c]`` giving a class index.
        word_lengths: ``word_lengths[i]`` is the number of label characters
            to read for sample ``i``.
        class_to_idx: mapping whose key order is used as the index-to-class
            table (assumes key position equals class index -- TODO confirm).

    Returns:
        ``torch.mean`` of the per-word CER values.
    """
    batch_size = guess.shape[0]
    idx_to_class = list(class_to_idx.keys())

    # Rebuild every ground-truth word from its first word_lengths[row] indices.
    label_words = []
    for row in range(batch_size):
        chars = [idx_to_class[labels[row, col]] for col in range(word_lengths[row])]
        label_words.append(''.join(chars))

    # Score each decoded guess against the label word at the same position.
    cers = []
    for row, guess_word in enumerate(generate_guess_strings(guess, class_to_idx)):
        cers.append(cer(guess_word, label_words[row]))

    return torch.mean(torch.Tensor(cers))
def validate(model, epoch):
    """Evaluate ``model`` on ``valid_loader`` and record mean CTC loss and CER.

    For every batch the model output is decoded with ``pred_word_from_seq``
    and scored with ``CER.cer`` against ``label_idx_to_classes(labels)``, and
    the CTC loss is computed through ``model.ctc_loss``.  The per-batch means
    are printed, the mean loss is appended to the global ``validateLoss`` and
    the mean CER times 100 is appended to the global ``validationAcc``.

    Args:
        model: network exposing ``eval()``, ``__call__`` and ``ctc_loss``.
        epoch: current epoch number; decoded predictions are printed when
            ``epoch % 10 == 9``.
    """
    global validateLoss, labelMaxSize, device, validationAcc

    model.eval()
    batches = 0
    loss_sum = 0.0
    cer_sum = 0.0

    with torch.no_grad():
        for data, labels, _ in valid_loader:
            data = data.to(device)
            labels = labels.to(device)
            output = model(data)

            pred = pred_word_from_seq(output.permute(1, 0, 2))
            # TODO: remove
            if epoch % 10 == 9:
                print(pred)

            cer_sum += CER.cer(pred, label_idx_to_classes(labels))

            # CTC loss inputs: the target table and the length of every label
            # in it (the helper already moves them to the device).
            target, target_length = index_label_to_seq_table(labels)

            # Every sample is given the full output length as its input
            # length, as described in the torch documentation.
            seq_len, batch_size = output.size()[0], output.size()[1]
            input_lengths = torch.full(size=(batch_size, ), fill_value=seq_len, dtype=torch.long).to(device)

            batch_loss = model.ctc_loss(output, target, input_lengths, target_length)
            loss_sum += batch_loss.data.cpu().numpy()
            batches += 1

    loss_sum /= batches
    print('validation loss: ' + str(loss_sum))
    print("cer rate: " + str(cer_sum / batches))
    validateLoss.append(loss_sum)
    validationAcc.append(float(cer_sum) / float(batches) * 100)
sub = 0 f = open("cer-all.csv", 'w') f.write("name, mode, blk, cer\n") for name in names: for mode in modes: tot_sentences = 0 tot_cer = 0 path = "/Users/clarencewang/Desktop/sensel-typing/Debug/data/" + mode + "/" + name + "/" clear_dict = get_clear(path) result = read_result(path) answer = get_answer(path) # calculate character level error rate for r, a in zip(result, answer): tot_cer += cer(answer[a], result[r]) tot_sentences += 1 if (tot_sentences % 8 == 0): f.write(name + "," + mode + "," + str(tot_sentences / 8) + "," + str(tot_cer / 8) + "\n") tot_cer = 0 #tot_sentences = 0 # if(cer(answer[a], result[r]) > 0.1): # print(answer[a], result[r], cer(answer[a], result[r])) # calculate word level error rate i = 0 tot_words = 0 error_num = 0 for r, a in zip(result, answer): # if(len(result[r]) < 3):
], stderr=f).decode('utf-8') if __name__ == "__main__": testDir = '../test/' allGameTrainedResults = [] allGameUntrainedResults = [] for game in ['bof', 'ffiv', 'ffvi']: print("getting metrics for the game", game, "...") for trained in [True, False]: for root, dirs, files in os.walk(testDir + game + '/'): cers = [] for img_path in files: ocr_info = get_ocr_info(root + img_path, trained) ocr_string = getOCRString(ocr_info).replace('\n', '') # format string by removing all whitespace and file type img_path = ''.join(img_path.split()).replace('.png', '') cers.append((img_path, ocr_string)) if trained: allGameTrainedResults.append((img_path, ocr_string)) else: allGameUntrainedResults.append((img_path, ocr_string)) break # don't go more than one directory deep print("trained:", trained, "cer:", cer(cers)) print() print("Metrics for all games combined...") print("trained: True cer:", cer(allGameTrainedResults)) print("trained: False cer:", cer(allGameUntrainedResults))