def test_unitize(self, units):
    for unit in units:
        u = Unitizer()
        metadata = unit['metadata']
        tess = TessFile(metadata.path, metadata=metadata)
        tokens = unit['tokens']
        lines = unit['lines']
        phrases = unit['phrases']

        if metadata.language == 'greek':
            tokenizer = GreekTokenizer()
        elif metadata.language == 'latin':
            tokenizer = LatinTokenizer()
        else:
            # Guard against an unbound tokenizer for unexpected languages.
            raise ValueError(
                'Unsupported language: {}'.format(metadata.language))
        tokenizer.clear()

        # Feed the text to the unitizer line by line, flagging the final
        # line so that any open units are finalized.
        for i, line in enumerate(tess.readlines(include_tag=False)):
            stop = (i == len(tess) - 1)
            u.unitize(line, metadata, tokenizer=tokenizer, stop=stop)

        # Identifies the file under test in pytest's captured output.
        print(metadata.path)

        def observed_forms(unit_obj):
            # Forms of the tokens in a computed unit, skipping tokens
            # whose display form has no word character and empty forms.
            return [tokenizer.tokens[j].form for j in unit_obj.tokens
                    if re.search(r'[\w\d]', tokenizer.tokens[j].display,
                                 flags=re.UNICODE)
                    and tokenizer.tokens[j].form]

        def expected_forms(unit_dict):
            # Forms of the tokens in a reference unit, skipping entries
            # with a missing or empty 'FORM'.
            return [tokens[j]['FORM'] for j in unit_dict['TOKEN_ID']
                    if 'FORM' in tokens[j] and tokens[j]['FORM']]

        assert len(u.lines) == len(lines)
        for i in range(len(lines)):
            line_tokens = observed_forms(u.lines[i])
            correct_tokens = expected_forms(lines[i])
            if line_tokens != correct_tokens:
                print('Line {}'.format(i))
                print(line_tokens)
                print(correct_tokens)
            assert line_tokens == correct_tokens

        assert len(u.phrases) == len(phrases)
        for i in range(len(u.phrases)):
            phrase_tokens = observed_forms(u.phrases[i])
            correct_tokens = expected_forms(phrases[i])
            if phrase_tokens != correct_tokens:
                # Print the mismatched phrase itself; the original debug
                # block recomputed tokens for phrase i - 1 and then
                # asserted on those, masking the actual failure.
                print('Phrase {}'.format(i))
                print(phrase_tokens)
                print(correct_tokens)
            assert phrase_tokens == correct_tokens

        u.clear()
        tokenizer.clear()
def test_clear(self):
    u = Unitizer()
    vals = list(range(0, 100))

    def assert_empty(unitizer):
        # After clear(), both attributes must exist and be empty lists.
        assert hasattr(unitizer, 'lines')
        assert unitizer.lines == []
        assert hasattr(unitizer, 'phrases')
        assert unitizer.phrases == []

    # clear() resets a populated lines attribute.
    u.lines.extend(vals)
    u.clear()
    assert_empty(u)

    # clear() resets both attributes when populated, and keeps working
    # on repeated calls.
    for _ in range(2):
        u.lines.extend(vals)
        u.phrases.extend(vals)
        u.clear()
        assert_empty(u)

    # clear() replaces non-list values of any type with empty lists,
    # whether one attribute or both have been clobbered.
    for i in [None, 'a', 1, 1.0, True, False, b'a', r'a']:
        u.lines = i
        u.clear()
        assert_empty(u)

        u.phrases = i
        u.clear()
        assert_empty(u)

        u.lines = i
        u.phrases = i
        u.clear()
        assert_empty(u)