def test_readlines(self, tessfile_list):
    for f in tessfile_list:
        lines = []
        with open(f, 'r') as tess:
            for line in tess.readlines():
                lines.append(line)

        # Ensure that readlines works with a buffer
        t = TessFile(f)
        for i, line in enumerate(t.readlines()):
            assert line == lines[i]

        # Ensure that the buffer resets on second call
        reset = False
        for i, line in enumerate(t.readlines()):
            assert line == lines[i]
            reset = True
        assert reset

        # Ensure that readlines works with initial read
        t = TessFile(f, buffer=False)
        for i, line in enumerate(t.readlines()):
            assert line == lines[i]

        # Ensure that the iterator resets on second call
        reset = False
        for i, line in enumerate(t.readlines()):
            assert line == lines[i]
            reset = True
        assert reset
def test_normalize(self, greek_files, greek_tokens):
    """Normalized Greek tokens should match the reference token forms."""
    grc = self.__test_class__()
    for fname, file_tokens in zip(greek_files, greek_tokens):
        ref_tokens = [t for t in file_tokens if t['FORM'] != '']
        t = TessFile(fname)

        token_idx = 0
        for i, line in enumerate(t.readlines(include_tag=False)):
            tokens = [tok for tok in grc.normalize(line)]
            # Keep only tokens containing at least one word character.
            tokens = [
                tok for tok in tokens
                if re.search(
                    '[' + grc.word_characters + ']+', tok,
                    flags=re.UNICODE)]

            offset = token_idx + len(tokens)
            # Materialize the comparison as a list so it can be consumed
            # more than once (a bare map() is exhausted by the first all(),
            # which would make the later assert pass vacuously).
            correct = [
                tok == ref['FORM']
                for tok, ref in zip(tokens, ref_tokens[token_idx:offset])]

            if not all(correct):
                print(fname, i, line)
                print(ref_tokens[token_idx:offset])
                for j in range(len(tokens)):
                    if tokens[j] != ref_tokens[token_idx + j]['FORM']:
                        print('{}->{}'.format(
                            tokens[j], ref_tokens[token_idx + j]['FORM']))

            assert all(correct)
            token_idx = offset
def test_unitize(self, units):
    """Unitizer should reproduce the reference line and phrase units."""
    for unit in units:
        u = Unitizer()
        metadata = unit['metadata']
        tess = TessFile(metadata.path, metadata=metadata)
        tokens = unit['tokens']
        lines = unit['lines']
        phrases = unit['phrases']

        if metadata.language == 'greek':
            tokenizer = GreekTokenizer()
        elif metadata.language == 'latin':
            tokenizer = LatinTokenizer()
        tokenizer.clear()

        for i, line in enumerate(tess.readlines(include_tag=False)):
            stop = (i == len(tess) - 1)
            u.unitize(line, metadata, tokenizer=tokenizer, stop=stop)

        print(metadata.path)

        assert len(u.lines) == len(lines)
        for i in range(len(lines)):
            line_tokens = [
                tokenizer.tokens[j].form for j in u.lines[i].tokens
                if re.search(r'[\w\d]', tokenizer.tokens[j].display,
                             flags=re.UNICODE)
                and tokenizer.tokens[j].form]
            correct_tokens = [
                tokens[j]['FORM'] for j in lines[i]['TOKEN_ID']
                if 'FORM' in tokens[j] and tokens[j]['FORM']]

            if line_tokens != correct_tokens:
                print('Line {}'.format(i))
                print(line_tokens)
                print(correct_tokens)
            assert line_tokens == correct_tokens

        print(u.phrases[-1].tokens)

        assert len(u.phrases) == len(phrases)
        for i in range(len(u.phrases)):
            phrase_tokens = [
                tokenizer.tokens[j].form for j in u.phrases[i].tokens
                if re.search(r'[\w\d]', tokenizer.tokens[j].display,
                             flags=re.UNICODE)
                and tokenizer.tokens[j].form]
            correct_tokens = [
                tokens[j]['FORM'] for j in phrases[i]['TOKEN_ID']
                if 'FORM' in tokens[j] and tokens[j]['FORM']]

            if phrase_tokens != correct_tokens:
                print('Phrase {}'.format(i))
                # Dump the previous phrase to help locate boundary errors,
                # without overwriting the values under test.
                prev_phrase_tokens = [
                    tokenizer.tokens[j].form for j in u.phrases[i - 1].tokens
                    if re.search(r'[\w]', tokenizer.tokens[j].display,
                                 flags=re.UNICODE)
                    and tokenizer.tokens[j].form]
                prev_correct_tokens = [
                    tokens[j]['FORM'] for j in phrases[i - 1]['TOKEN_ID']
                    if 'FORM' in tokens[j]]
                print(prev_phrase_tokens)
                print(prev_correct_tokens)
            assert phrase_tokens == correct_tokens

        u.clear()
        tokenizer.clear()