def populate_database(search_connection, test_data):
    """Set up the database to conduct searches on the test texts.

    Fixtures
    --------
    search_connection
        TessMongoConnection for search unit tests.
    test_data
        Example data for unit testing.
    """
    for text in test_data['texts']:
        tessfile = TessFile(text['path'], metadata=Text(**text))
        search_connection.insert(tessfile.metadata)
        if text['language'] == 'latin':
            tok = LatinTokenizer(search_connection)
            unitizer = Unitizer()
            tokens, tags, features = tok.tokenize(
                tessfile.read(), text=tessfile.metadata)
            search_connection.update(features)
            lines, phrases = unitizer.unitize(tokens, tags, tessfile.metadata)
            search_connection.insert(lines + phrases)
            search_connection.insert(tokens)

    yield

    search_connection.connection['texts'].delete_many({})
    search_connection.connection['tokens'].delete_many({})
    search_connection.connection['features'].delete_many({})
    search_connection.connection['units'].delete_many({})
    search_connection.connection['matches'].delete_many({})
    search_connection.connection['searches'].delete_many({})
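# A minimal sketch of how a test might consume the populate_database fixture
# alongside search_connection; the test name and the count_documents check are
# illustrative only and not part of the suite.
def test_search_database_populated(search_connection, populate_database):
    # After the fixture runs, the 'texts' collection should contain the
    # inserted test texts.
    assert search_connection.connection['texts'].count_documents({}) > 0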
def test_unitize_elision_file(unit_connection, tessfiles_greek_path):
    tokenizer = GreekTokenizer(unit_connection)
    t = Text(path=str(tessfiles_greek_path.joinpath('test.elision.tess')),
             language='greek')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 1
def test_unitize_notag_file(unit_connection, tessfiles_latin_path):
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(tessfiles_latin_path.joinpath('test.notag.tess')),
             language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 1
def test_unitize_linebreak_file(unit_connection, tessfiles_latin_path):
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(tessfiles_latin_path.joinpath('test.linebreak.tess')),
             language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 1
    first_tag = phrases[0].tags[0]
    for phrase in phrases[1:]:
        assert phrase.tags[0] == first_tag
def unitizer_inputs(unit_tessfiles, unit_connection):
    inputs = []
    tokenizer_selector = {
        'latin': LatinTokenizer(unit_connection),
        'greek': GreekTokenizer(unit_connection)
    }
    for t in unit_tessfiles:
        tessfile = TessFile(t.path, metadata=t)
        tokens, tags, features = tokenizer_selector[t.language].tokenize(
            tessfile.read(), text=t)
        features.sort(key=lambda x: x.index)
        inputs.append((tokens, tags, features))
    yield inputs
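# A minimal sketch of a test consuming the unitizer_inputs fixture; the test
# name and assertions are illustrative only, checking the (tokens, tags,
# features) tuples built above and the by-index sort applied to features.
def test_unitizer_inputs_sorted(unitizer_inputs):
    for tokens, tags, features in unitizer_inputs:
        assert all(features[i].index <= features[i + 1].index
                   for i in range(len(features) - 1))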
def test_normalize(self, latin_files, latin_tokens):
    la = self.__test_class__()
    for i in range(len(latin_files)):
        fname = latin_files[i]
        ref_tokens = [t for t in latin_tokens[i] if 'FORM' in t]
        t = TessFile(fname)
        tokens = la.normalize(t.read())
        correct = map(
            lambda x: ('FORM' in x[1] and x[0] == x[1]['FORM']) or x[0] == '',
            zip(tokens, ref_tokens))
        assert all(correct)
def test_unitize_diacrit_in_latin(unit_connection, tessfiles_latin_path):
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(
        tessfiles_latin_path.joinpath('test.diacrit_in_latin.tess')),
        language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    forms = {f.index: f.token for f in features if f.feature == 'form'}
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    for phrase in phrases:
        for t in phrase.tokens:
            cur_form = t['features']['form'][0]
            if cur_form != -1:
                normalized = tokenizer.normalize(t['display'])[0][0]
                assert normalized == forms[cur_form], phrase.snippet
def test_unitize_linebreak_end(unit_connection, tessfiles_latin_path):
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(
        tessfiles_latin_path.joinpath('test.linebreak_end.tess')),
        language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    print('# lines')
    for line in lines:
        print(line.snippet)
    print('# phrases')
    for phrase in phrases:
        print(phrase.snippet)
    assert len(lines) == 2
def test_unitize_nopunctuation_file(unit_connection, tessfiles_latin_path):
    # when there is no ending punctuation despite coming to the end of a poem
    # and another poem starts after a blank line
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(
        tessfiles_latin_path.joinpath('test.nopunctuation.tess')),
        language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 68
    for prev_phrase, cur_phrase in zip(phrases[:-1], phrases[1:]):
        if '2.13' in prev_phrase.tags[0] and '2.14' in cur_phrase.tags[0]:
            assert prev_phrase.snippet == 'quin et Prometheus et Pelopis parens / dulci laborem decipitur sono / nec curat Orion leones / aut timidos agitare lyncas / Eheu fugaces, Postume, Postume, / labuntur anni nec pietas moram / rugis et instanti senectae / adferet indomitaeque morti, / non, si trecenis quotquot eunt dies, / amice, places inlacrimabilem / Plutona tauris, qui ter amplum / Geryonen Tityonque tristi / conpescit unda, scilicet omnibus / quicumque terrae munere vescimur / enaviganda, sive reges / sive inopes erimus coloni. / '
            assert cur_phrase.snippet == 'frustra cruento Marte carebimus / fractisque rauci fluctibus Hadriae, / frustra per autumnos nocentem / corporibus metuemus Austrum: / '
            break
def test_tokenize(self, greek_files, greek_tokens, greek_word_frequencies):
    grc = self.__test_class__()
    for k in range(len(greek_files)):
        fname = greek_files[k]
        ref_tokens = [t for t in greek_tokens[k] if 'FORM' in t]
        ref_freqs = greek_word_frequencies[k]
        t = TessFile(fname)
        tokens, frequencies = grc.tokenize(t.read())
        tokens = [
            t for t in tokens
            if re.search(r'[\w]', t.display, flags=re.UNICODE)
        ]
        # Materialize the comparison so it can be both printed on failure and
        # asserted without exhausting an iterator.
        correct = list(map(lambda x: x[0].display == x[1]['DISPLAY'],
                           zip(tokens, ref_tokens)))
        if not all(correct):
            print(fname)
            for j in range(len(tokens)):
                if tokens[j].display != ref_tokens[j]['DISPLAY']:
                    print(ref_tokens[j])
                    print('{}->{}'.format(tokens[j].display,
                                          ref_tokens[j]['DISPLAY']))
                    print('{}->{}'.format(tokens[j].form,
                                          ref_tokens[j]['FORM']))
        assert all(correct)

        correct = list(map(lambda x: x[0].form == x[1]['FORM'],
                           zip(tokens, ref_tokens)))
        if not all(correct):
            print(fname)
            for j in range(len(tokens)):
                if tokens[j].form != ref_tokens[j]['FORM']:
                    print(ref_tokens[j])
                    print('{}->{}'.format(tokens[j].form,
                                          ref_tokens[j]['FORM']))
        assert all(correct)
def test_tokenize(self, latin_files, latin_tokens, latin_word_frequencies):
    la = self.__test_class__()
    for k in range(len(latin_files)):
        fname = latin_files[k]
        ref_tokens = [t for t in latin_tokens[k] if 'FORM' in t]
        ref_freqs = latin_word_frequencies[k]
        t = TessFile(fname)
        tokens, frequencies = la.tokenize(t.read(), text=t.metadata)
        tokens = [
            t for t in tokens
            if re.search(r'^[a-zA-Z]+$', t.display, flags=re.UNICODE)
        ]
        # Materialize the comparison so it can be both printed on failure and
        # asserted without exhausting an iterator.
        correct = list(map(lambda x: x[0].display == x[1]['DISPLAY'],
                           zip(tokens, ref_tokens)))
        if not all(correct):
            print(fname)
            for j in range(len(tokens)):
                if tokens[j].display != ref_tokens[j]['DISPLAY']:
                    print('{}->{}'.format(tokens[j].display,
                                          ref_tokens[j]['DISPLAY']))
        assert all(correct)

        correct = list(map(
            lambda x: ('FORM' in x[1] and x[0].form == x[1]['FORM'])
            or not x[0].form,
            zip(tokens, ref_tokens)))
        if not all(correct):
            print(fname)
            # for j in range(len(tokens)):
            #     if tokens[j].form != ref_tokens[j]['FORM']:
            #         print('{}->{}'.format(tokens[j].form,
            #                               ref_tokens[j]['FORM']))
        assert all(correct)

        for key in ref_freqs:
            assert key in la.frequencies
            assert la.frequencies[key] == ref_freqs[key]

        diff = []
        for word in frequencies:
            if word.form not in ref_freqs and re.search(
                    r'[a-zA-Z]', word.form, flags=re.UNICODE):
                diff.append(word.form)
        print(diff)
        assert len(diff) == 0

        keys = sorted(list(ref_freqs.keys()))
        frequencies.sort(key=lambda x: x.form)
        correct = map(
            lambda x: x[0].form == x[1] and x[0].frequency == ref_freqs[x[1]],
            zip(frequencies, keys))
        assert all(correct)
        la.clear()