def populate_database(search_connection, test_data):
    """Set up the database to conduct searches on the test texts.

    Fixtures
    --------
    search_connection
        TessMongoConnection for search unit tests.
    test_data
        Example data for unit testing.
    """
    for text in test_data['texts']:
        tessfile = TessFile(text['path'], metadata=Text(**text))
        search_connection.insert(tessfile.metadata)
        if text['language'] == 'latin':
            tok = LatinTokenizer(search_connection)
        unitizer = Unitizer()
        tokens, tags, features = tok.tokenize(
            tessfile.read(), text=tessfile.metadata)
        search_connection.update(features)
        lines, phrases = unitizer.unitize(tokens, tags, tessfile.metadata)
        search_connection.insert(lines + phrases)
        search_connection.insert(tokens)

    yield

    search_connection.connection['texts'].delete_many({})
    search_connection.connection['tokens'].delete_many({})
    search_connection.connection['features'].delete_many({})
    search_connection.connection['units'].delete_many({})
    search_connection.connection['matches'].delete_many({})
    search_connection.connection['searches'].delete_many({})
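
# A minimal sketch of a test that consumes the populate_database fixture
# above, assuming it is registered with @pytest.fixture. The test name and
# the assertion are illustrative only; they simply check that one document
# per entry in test_data['texts'] ended up in the 'texts' collection, using
# pymongo's count_documents on the same connection the fixture cleans up.
def test_texts_inserted(populate_database, search_connection, test_data):
    count = search_connection.connection['texts'].count_documents({})
    assert count == len(test_data['texts'])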
def test_unitize_notag_file(unit_connection, tessfiles_latin_path):
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(tessfiles_latin_path.joinpath('test.notag.tess')),
             language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 1
def test_unitize_linebreak_file(unit_connection, tessfiles_latin_path):
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(tessfiles_latin_path.joinpath('test.linebreak.tess')),
             language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 1
    first_tag = phrases[0].tags[0]
    for phrase in phrases[1:]:
        assert phrase.tags[0] == first_tag
def lucvergpop(request, lucverg_metadata):
    conn = TessMongoConnection('localhost', 27017, None, None, 'lucvergtest')
    for metadata in lucverg_metadata:
        text = Text.json_decode(metadata)
        tessfile = TessFile(text.path, metadata=text)
        conn.insert(text)

        tokens, tags, features = \
            LatinTokenizer(conn).tokenize(
                tessfile.read(), text=tessfile.metadata)

        feature_cache = {
            (f.feature, f.token): f
            for f in conn.find(Feature.collection, language=text.language)
        }
        features_for_insert = []
        features_for_update = []

        for f in features:
            if (f.feature, f.token) not in feature_cache:
                features_for_insert.append(f)
                feature_cache[(f.feature, f.token)] = f
            else:
                f.id = feature_cache[(f.feature, f.token)].id
                features_for_update.append(f)
        conn.insert(features_for_insert)
        conn.update(features_for_update)

        unitizer = Unitizer()
        lines, _ = unitizer.unitize(tokens, tags, tessfile.metadata)

        conn.insert_nocheck(lines)

    yield conn

    obliterate(conn)
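
# `obliterate` is called in the teardown above but is not defined in this
# excerpt. Below is a minimal sketch of what such a helper might do, assuming
# it simply empties the collections touched by these fixtures (mirroring the
# per-collection delete_many teardown in populate_database); the real
# implementation may differ, e.g. by dropping the test database instead.
def obliterate_sketch(conn):
    """Empty every collection used by the test fixtures (assumed behavior)."""
    for name in ('texts', 'tokens', 'features', 'units',
                 'matches', 'searches'):
        conn.connection[name].delete_many({})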
def test_unitize_diacrit_in_latin(unit_connection, tessfiles_latin_path):
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(
        path=str(tessfiles_latin_path.joinpath('test.diacrit_in_latin.tess')),
        language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    forms = {f.index: f.token for f in features if f.feature == 'form'}
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    for phrase in phrases:
        for token in phrase.tokens:
            cur_form = token['features']['form'][0]
            if cur_form != -1:
                normalized = tokenizer.normalize(token['display'])[0][0]
                assert normalized == forms[cur_form], phrase.snippet
def test_unitize_linebreak_end(unit_connection, tessfiles_latin_path):
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(
        path=str(tessfiles_latin_path.joinpath('test.linebreak_end.tess')),
        language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    print('# lines')
    for line in lines:
        print(line.snippet)
    print('# phrases')
    for phrase in phrases:
        print(phrase.snippet)
    assert len(lines) == 2
def test_unitize_nopunctuation_file(unit_connection, tessfiles_latin_path):
    # when there is no ending punctuation despite coming to the end of a poem
    # and another poem starts after a blank line
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(
        path=str(tessfiles_latin_path.joinpath('test.nopunctuation.tess')),
        language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 68
    for prev_phrase, cur_phrase in zip(phrases[:-1], phrases[1:]):
        if '2.13' in prev_phrase.tags[0] and '2.14' in cur_phrase.tags[0]:
            assert prev_phrase.snippet == (
                'quin et Prometheus et Pelopis parens / '
                'dulci laborem decipitur sono / '
                'nec curat Orion leones / '
                'aut timidos agitare lyncas / '
                'Eheu fugaces, Postume, Postume, / '
                'labuntur anni nec pietas moram / '
                'rugis et instanti senectae / '
                'adferet indomitaeque morti, / '
                'non, si trecenis quotquot eunt dies, / '
                'amice, places inlacrimabilem / '
                'Plutona tauris, qui ter amplum / '
                'Geryonen Tityonque tristi / '
                'conpescit unda, scilicet omnibus / '
                'quicumque terrae munere vescimur / '
                'enaviganda, sive reges / '
                'sive inopes erimus coloni. / ')
            assert cur_phrase.snippet == (
                'frustra cruento Marte carebimus / '
                'fractisque rauci fluctibus Hadriae, / '
                'frustra per autumnos nocentem / '
                'corporibus metuemus Austrum: / ')
            break
def unitizer_inputs(unit_tessfiles, unit_connection):
    inputs = []
    tokenizer_selector = {
        'latin': LatinTokenizer(unit_connection),
        'greek': GreekTokenizer(unit_connection)
    }
    for t in unit_tessfiles:
        tessfile = TessFile(t.path, metadata=t)
        tokens, tags, features = tokenizer_selector[t.language].tokenize(
            tessfile.read(), text=t)
        features.sort(key=lambda x: x.index)
        inputs.append((tokens, tags, features))
    yield inputs
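
# A minimal sketch of a test built on the unitizer_inputs fixture above. The
# test name and the assertions are illustrative only; the unitize call reuses
# the signature seen elsewhere in this suite (tokens, tags, and the text the
# first token belongs to) and merely sanity-checks that each input yields at
# least one line and one phrase.
def test_unitize_produces_units(unitizer_inputs):
    for tokens, tags, features in unitizer_inputs:
        lines, phrases = Unitizer().unitize(tokens, tags, tokens[0].text)
        assert len(lines) > 0
        assert len(phrases) > 0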
def test_unitize(self, units):
    for unit in units:
        u = Unitizer()
        metadata = unit['metadata']
        tess = TessFile(metadata.path, metadata=metadata)
        tokens = unit['tokens']
        lines = unit['lines']
        phrases = unit['phrases']

        if metadata.language == 'greek':
            tokenizer = GreekTokenizer()
        elif metadata.language == 'latin':
            tokenizer = LatinTokenizer()
        tokenizer.clear()

        for i, line in enumerate(tess.readlines(include_tag=False)):
            stop = (i == len(tess) - 1)
            u.unitize(line, metadata, tokenizer=tokenizer, stop=stop)

        print(metadata.path)

        assert len(u.lines) == len(lines)
        for i in range(len(lines)):
            line_tokens = [
                tokenizer.tokens[j].form for j in u.lines[i].tokens
                if re.search(r'[\w\d]', tokenizer.tokens[j].display,
                             flags=re.UNICODE)
                and tokenizer.tokens[j].form
            ]
            correct_tokens = [
                tokens[j]['FORM'] for j in lines[i]['TOKEN_ID']
                if 'FORM' in tokens[j] and tokens[j]['FORM']
            ]
            if line_tokens != correct_tokens:
                print('Line {}'.format(i))
                print(line_tokens)
                print(correct_tokens)
            assert line_tokens == correct_tokens

        print(u.phrases[-1].tokens)

        assert len(u.phrases) == len(phrases)
        for i in range(len(u.phrases)):
            phrase_tokens = [
                tokenizer.tokens[j].form for j in u.phrases[i].tokens
                if re.search(r'[\w\d]', tokenizer.tokens[j].display,
                             flags=re.UNICODE)
                and tokenizer.tokens[j].form
            ]
            correct_tokens = [
                tokens[j]['FORM'] for j in phrases[i]['TOKEN_ID']
                if 'FORM' in tokens[j] and tokens[j]['FORM']
            ]
            if phrase_tokens != correct_tokens:
                print('Phrase {}'.format(i))
                phrase_tokens = [
                    tokenizer.tokens[j].form for j in u.phrases[i - 1].tokens
                    if re.search(r'[\w]', tokenizer.tokens[j].display,
                                 flags=re.UNICODE)
                    and tokenizer.tokens[j].form
                ]
                correct_tokens = [
                    tokens[j]['FORM'] for j in phrases[i - 1]['TOKEN_ID']
                    if 'FORM' in tokens[j]
                ]
                print(phrase_tokens)
                print(correct_tokens)
            assert phrase_tokens == correct_tokens

        assert len(u.phrases) == len(phrases)
        u.clear()
        tokenizer.clear()