def test_unitize_elision_file(unit_connection, tessfiles_greek_path):
    # The elision test file should be unitized into exactly one line.
    tokenizer = GreekTokenizer(unit_connection)
    t = Text(path=str(tessfiles_greek_path.joinpath('test.elision.tess')),
             language='greek')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 1

def test_normalize(token_connection, greek_tessfiles, greek_tokens):
    grc = GreekTokenizer(token_connection)

    for i, tessfile in enumerate(greek_tessfiles):
        correct_tokens = [t for t in greek_tokens[i] if t['form']]

        tokens, tags = grc.normalize(tessfile.read())
        tokens = [t for t in tokens if re.search(r'[\w]+', t)]

        # Materialize the comparison as a list so the diagnostic loop below
        # does not exhaust the iterator before the assertion runs.
        correct = [t == c['form'] for t, c in zip(tokens, correct_tokens)]
        for j, c in enumerate(correct):
            if not c:
                print(j, tokens[j], correct_tokens[j])
                break
        assert all(correct)

        # Each returned tag should match the tag at the start of the
        # corresponding line of the source file.
        for k, line in enumerate(tessfile.readlines()):
            correct_tag = line[:line.find('>') + 1]
            assert tags[k] == correct_tag

def unitizer_inputs(unit_tessfiles, unit_connection):
    inputs = []
    tokenizer_selector = {
        'latin': LatinTokenizer(unit_connection),
        'greek': GreekTokenizer(unit_connection)
    }
    for t in unit_tessfiles:
        tessfile = TessFile(t.path, metadata=t)
        tokens, tags, features = tokenizer_selector[t.language].tokenize(
            tessfile.read(), text=t)
        features.sort(key=lambda x: x.index)
        inputs.append((tokens, tags, features))
    yield inputs

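# Minimal sketch (not part of the original suite) of how the unitizer_inputs
# fixture above might be consumed: run the Unitizer over each pre-tokenized
# text and check that both unit types are produced. The
# unitize(tokens, tags, tokens[0].text) call mirrors the usage in
# test_unitize_elision_file; the test name and the non-empty assertions are
# assumptions for illustration only.
def test_unitize_produces_units(unitizer_inputs):
    for tokens, tags, features in unitizer_inputs:
        unitizer = Unitizer()
        lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
        # Every text is expected to yield at least one line and one phrase.
        assert len(lines) > 0
        assert len(phrases) > 0
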
def test_tokenize(token_connection, greek_tessfiles, greek_tokens):
    grc = GreekTokenizer(token_connection)

    for i, tessfile in enumerate(greek_tessfiles):
        print(tessfile.metadata.title)
        tokens, tags, features = grc.tokenize(tessfile.read(),
                                              text=tessfile.metadata)
        tokens = [t for t in tokens if re.search(r'[\w]+', t.display)]

        for j, token in enumerate(tokens):
            # Display text and normalized form must match the reference data.
            assert token.display == greek_tokens[i][j]['display']
            assert token.features['form'].token == greek_tokens[i][j]['form']
            # Every lemma attached to the token must appear in the reference
            # lemma list.
            assert all([
                any(map(lambda x: lemma.token == x,
                        greek_tokens[i][j]['lemmata']))
                for lemma in token.features['lemmata']
            ])

def greek_word_frequencies(greek_files):
    freqs = []
    grc = GreekTokenizer()

    for fname in greek_files:
        freq = {}
        fname = os.path.splitext(fname)[0] + '.freq_score_word'
        with open(fname, 'r') as f:
            for line in f.readlines():
                if '#' not in line:
                    word, n = re.split(r'[^\w' + grc.diacriticals + ']+',
                                       line, flags=re.UNICODE)[:-1]
                    freq[word] = int(n)
        freqs.append(freq)

    return freqs

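# Illustrative sketch (an assumption, not from the original module) of the
# .freq_score_word format that greek_word_frequencies parses: '#' lines are
# skipped, and each remaining line holds a word and its count separated by
# characters outside the word/diacritical class. The sample line, the tab
# separator, and the empty diacriticals stand-in are assumptions chosen so
# the example is self-contained; the underscore prefix keeps pytest from
# collecting it.
def _freq_score_word_format_example():
    diacriticals = ''  # stand-in for grc.diacriticals
    line = 'λόγος\t42\n'
    word, n = re.split(r'[^\w' + diacriticals + ']+', line,
                       flags=re.UNICODE)[:-1]
    assert (word, int(n)) == ('λόγος', 42)
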
def test_unitize(self, units):
    # Unitize each test file line by line, then compare the token forms of
    # the resulting line and phrase units against the reference data.
    for unit in units:
        u = Unitizer()
        metadata = unit['metadata']

        tess = TessFile(metadata.path, metadata=metadata)
        tokens = unit['tokens']
        lines = unit['lines']
        phrases = unit['phrases']

        if metadata.language == 'greek':
            tokenizer = GreekTokenizer()
        elif metadata.language == 'latin':
            tokenizer = LatinTokenizer()
        tokenizer.clear()

        for i, line in enumerate(tess.readlines(include_tag=False)):
            stop = (i == len(tess) - 1)
            u.unitize(line, metadata, tokenizer=tokenizer, stop=stop)

        print(metadata.path)

        assert len(u.lines) == len(lines)
        for i in range(len(lines)):
            line_tokens = [
                tokenizer.tokens[j].form for j in u.lines[i].tokens
                if re.search(r'[\w\d]', tokenizer.tokens[j].display,
                             flags=re.UNICODE)
                and tokenizer.tokens[j].form]
            correct_tokens = [
                tokens[j]['FORM'] for j in lines[i]['TOKEN_ID']
                if 'FORM' in tokens[j] and tokens[j]['FORM']]

            if line_tokens != correct_tokens:
                print('Line {}'.format(i))
                print(line_tokens)
                print(correct_tokens)
            assert line_tokens == correct_tokens

        print(u.phrases[-1].tokens)

        assert len(u.phrases) == len(phrases)
        for i in range(len(u.phrases)):
            phrase_tokens = [
                tokenizer.tokens[j].form for j in u.phrases[i].tokens
                if re.search(r'[\w\d]', tokenizer.tokens[j].display,
                             flags=re.UNICODE)
                and tokenizer.tokens[j].form]
            correct_tokens = [
                tokens[j]['FORM'] for j in phrases[i]['TOKEN_ID']
                if 'FORM' in tokens[j] and tokens[j]['FORM']]

            if phrase_tokens != correct_tokens:
                print('Phrase {}'.format(i))
                # Print the previous phrase for context, using separate names
                # so the values under test are not clobbered before the
                # assertion below.
                prev_phrase_tokens = [
                    tokenizer.tokens[j].form for j in u.phrases[i - 1].tokens
                    if re.search(r'[\w]', tokenizer.tokens[j].display,
                                 flags=re.UNICODE)
                    and tokenizer.tokens[j].form]
                prev_correct_tokens = [
                    tokens[j]['FORM'] for j in phrases[i - 1]['TOKEN_ID']
                    if 'FORM' in tokens[j]]
                print(prev_phrase_tokens)
                print(prev_correct_tokens)
            assert phrase_tokens == correct_tokens

        assert len(u.phrases) == len(phrases)

        u.clear()
        tokenizer.clear()

def test_init(token_connection):
    t = GreekTokenizer(token_connection)
    assert t.connection is token_connection
    assert hasattr(t, 'lemmatizer')
    assert isinstance(t.lemmatizer, Lemmata)