def populate_database(search_connection, test_data):
    """Set up the database to conduct searches on the test texts.

    Fixtures
    --------
    search_connection
        TessMongoConnection for search unit tests.
    test_data
        Example data for unit testing.
    """
    for text in test_data['texts']:
        tessfile = TessFile(text['path'], metadata=Text(**text))
        search_connection.insert(tessfile.metadata)
        if text['language'] == 'latin':
            tok = LatinTokenizer(search_connection)
        unitizer = Unitizer()
        tokens, tags, features = tok.tokenize(tessfile.read(),
                                              text=tessfile.metadata)
        search_connection.update(features)
        lines, phrases = unitizer.unitize(tokens, tags, tessfile.metadata)
        search_connection.insert(lines + phrases)
        search_connection.insert(tokens)

    yield

    search_connection.connection['texts'].delete_many({})
    search_connection.connection['tokens'].delete_many({})
    search_connection.connection['features'].delete_many({})
    search_connection.connection['units'].delete_many({})
    search_connection.connection['matches'].delete_many({})
    search_connection.connection['searches'].delete_many({})
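# A minimal sketch (not part of the original suite) of the `test_data`
# structure that populate_database consumes: each entry supplies the keys
# read above ('path', 'language') and must be valid keyword arguments for
# Text. The author, title, and file path below are made-up placeholders.
_example_test_data = {
    'texts': [
        {
            'author': 'vergil',                      # assumed Text field
            'title': 'aeneid',                       # assumed Text field
            'language': 'latin',
            'path': 'tests/data/texts/aeneid.tess',  # placeholder path
        },
    ]
}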
def lucvergpop(request, lucverg_metadata):
    conn = TessMongoConnection('localhost', 27017, None, None, 'lucvergtest')
    for metadata in lucverg_metadata:
        text = Text.json_decode(metadata)
        tessfile = TessFile(text.path, metadata=text)
        conn.insert(text)

        tokens, tags, features = \
            LatinTokenizer(conn).tokenize(
                tessfile.read(), text=tessfile.metadata)

        feature_cache = {
            (f.feature, f.token): f
            for f in conn.find(Feature.collection, language=text.language)
        }
        features_for_insert = []
        features_for_update = []

        for f in features:
            if (f.feature, f.token) not in feature_cache:
                features_for_insert.append(f)
                feature_cache[(f.feature, f.token)] = f
            else:
                f.id = feature_cache[(f.feature, f.token)].id
                features_for_update.append(f)
        conn.insert(features_for_insert)
        conn.update(features_for_update)

        unitizer = Unitizer()
        lines, _ = unitizer.unitize(tokens, tags, tessfile.metadata)

        conn.insert_nocheck(lines)

    yield conn

    obliterate(conn)
def test_unitize_elision_file(unit_connection, tessfiles_greek_path):
    tokenizer = GreekTokenizer(unit_connection)
    t = Text(path=str(tessfiles_greek_path.joinpath('test.elision.tess')),
             language='greek')
    tessfile = TessFile(t.path, metadata=t)

    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)

    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 1
def test_unitize_notag_file(unit_connection, tessfiles_latin_path):
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(tessfiles_latin_path.joinpath('test.notag.tess')),
             language='latin')
    tessfile = TessFile(t.path, metadata=t)

    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)

    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 1
def ingest_text(connection, text):
    """Update database with a new text.

    ``text`` must not already exist in the database.

    Parameters
    ----------
    connection : tesserae.db.TessMongoConnection
        A connection to the database
    text : tesserae.db.entities.Text
        The text to be ingested

    Returns
    -------
    ObjectId
        Database identifier for the Text object just added

    Raises
    ------
    ValueError
        Raised when an unknown language is encountered
    """
    if text.language not in _tokenizers:
        raise ValueError('Unknown language: {}'.format(text.language))
    tessfile = TessFile(text.path, metadata=text)

    result = connection.insert(text)
    text_id = result.inserted_ids[0]

    tokens, tags, features = \
        _tokenizers[tessfile.metadata.language](connection).tokenize(
            tessfile.read(), text=tessfile.metadata)

    feature_cache = {
        (f.feature, f.token): f
        for f in connection.find(Feature.collection, language=text.language)
    }
    features_for_insert = []
    features_for_update = []

    for f in features:
        if (f.feature, f.token) not in feature_cache:
            features_for_insert.append(f)
            feature_cache[(f.feature, f.token)] = f
        else:
            f.id = feature_cache[(f.feature, f.token)].id
            features_for_update.append(f)
    insert_features_result = connection.insert(features_for_insert)
    update_features_result = connection.update(features_for_update)

    unitizer = Unitizer()
    lines, phrases = unitizer.unitize(tokens, tags, tessfile.metadata)

    result = connection.insert_nocheck(tokens)
    result = connection.insert_nocheck(lines + phrases)

    return text_id
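# Usage sketch for ingest_text (not from the original module). The connection
# parameters, database name, and the Text fields/path below are placeholders;
# only the ingest_text call itself mirrors the API defined above.
if __name__ == '__main__':
    example_conn = TessMongoConnection(
        'localhost', 27017, None, None, 'tesserae')  # assumed local MongoDB
    example_text = Text(title='example', author='example',  # assumed fields
                        language='latin',
                        path='tests/data/texts/example.tess')  # placeholder
    example_id = ingest_text(example_conn, example_text)
    print('Ingested text with id:', example_id)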
def test_unitize_linebreak_file(unit_connection, tessfiles_latin_path):
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(tessfiles_latin_path.joinpath('test.linebreak.tess')),
             language='latin')
    tessfile = TessFile(t.path, metadata=t)

    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)

    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 1

    first_tag = phrases[0].tags[0]
    for phrase in phrases[1:]:
        assert phrase.tags[0] == first_tag
def _ingest_tessfile(connection, text, tessfile, enable_multitext=False):
    """Process a .tess file for inclusion in the Tesserae database.

    Parameters
    ----------
    connection : tesserae.db.TessMongoConnection
        A connection to the database
    text : tesserae.db.entities.Text
        Text entity associated with the .tess file to be ingested; must
        already be added to Text.collection but not yet ingested
    tessfile : tesserae.utils.TessFile
        .tess file to be ingested
    enable_multitext : bool (default: False)
        Whether to enable multitext search with this text
    """
    tokens, tags, features = \
        _tokenizers[tessfile.metadata.language](connection).tokenize(
            tessfile.read(), text=tessfile.metadata)

    text.divisions = _extract_divisions(tags)
    connection.update(text)

    feature_cache = {
        (f.feature, f.token): f
        for f in connection.find(Feature.collection, language=text.language)
    }
    features_for_insert = []
    features_for_update = []

    for f in features:
        if (f.feature, f.token) not in feature_cache:
            features_for_insert.append(f)
            feature_cache[(f.feature, f.token)] = f
        else:
            f.id = feature_cache[(f.feature, f.token)].id
            features_for_update.append(f)
    connection.insert(features_for_insert)
    connection.update(features_for_update)

    unitizer = Unitizer()
    lines, phrases = unitizer.unitize(tokens, tags, tessfile.metadata)

    features_ingested = {feature for feature in lines[0].tokens[0]['features']}
    for feature in features_ingested:
        text.update_ingestion_details(feature, NORMAL_SEARCH, TextStatus.DONE,
                                      '')
    connection.update(text)

    connection.insert_nocheck(tokens)
    connection.insert_nocheck(lines + phrases)

    if enable_multitext:
        register_bigrams(connection, text)
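# The feature-cache bookkeeping above is repeated verbatim in lucvergpop,
# ingest_text, and _ingest_tessfile. One possible refactor (a sketch, not part
# of the original module; the helper name is hypothetical) pulls that logic
# into a single function returning the two lists to persist.
def _split_features_by_cache(connection, language, features):
    """Partition ``features`` into (to_insert, to_update) against the DB."""
    feature_cache = {
        (f.feature, f.token): f
        for f in connection.find(Feature.collection, language=language)
    }
    to_insert = []
    to_update = []
    for f in features:
        key = (f.feature, f.token)
        if key not in feature_cache:
            to_insert.append(f)
            feature_cache[key] = f
        else:
            # Reuse the stored id so the update targets the existing record.
            f.id = feature_cache[key].id
            to_update.append(f)
    return to_insert, to_update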
def test_unitize_diacrit_in_latin(unit_connection, tessfiles_latin_path):
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(
        path=str(tessfiles_latin_path.joinpath('test.diacrit_in_latin.tess')),
        language='latin')
    tessfile = TessFile(t.path, metadata=t)

    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    forms = {f.index: f.token for f in features if f.feature == 'form'}

    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    for phrase in phrases:
        for token in phrase.tokens:
            cur_form = token['features']['form'][0]
            if cur_form != -1:
                normalized = tokenizer.normalize(token['display'])[0][0]
                assert normalized == forms[cur_form], phrase.snippet
def test_unitize_linebreak_end(unit_connection, tessfiles_latin_path):
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(
        path=str(tessfiles_latin_path.joinpath('test.linebreak_end.tess')),
        language='latin')
    tessfile = TessFile(t.path, metadata=t)

    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)

    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    print('# lines')
    for line in lines:
        print(line.snippet)
    print('# phrases')
    for phrase in phrases:
        print(phrase.snippet)
    assert len(lines) == 2
def test_unitize_nopunctuation_file(unit_connection, tessfiles_latin_path):
    # when there is no ending punctuation despite coming to the end of a poem
    # and another poem starts after a blank line
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(
        path=str(tessfiles_latin_path.joinpath('test.nopunctuation.tess')),
        language='latin')
    tessfile = TessFile(t.path, metadata=t)

    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)

    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 68

    for prev_phrase, cur_phrase in zip(phrases[:-1], phrases[1:]):
        if '2.13' in prev_phrase.tags[0] and '2.14' in cur_phrase.tags[0]:
            assert prev_phrase.snippet == 'quin et Prometheus et Pelopis parens / dulci laborem decipitur sono / nec curat Orion leones / aut timidos agitare lyncas / Eheu fugaces, Postume, Postume, / labuntur anni nec pietas moram / rugis et instanti senectae / adferet indomitaeque morti, / non, si trecenis quotquot eunt dies, / amice, places inlacrimabilem / Plutona tauris, qui ter amplum / Geryonen Tityonque tristi / conpescit unda, scilicet omnibus / quicumque terrae munere vescimur / enaviganda, sive reges / sive inopes erimus coloni. / '
            assert cur_phrase.snippet == 'frustra cruento Marte carebimus / fractisque rauci fluctibus Hadriae, / frustra per autumnos nocentem / corporibus metuemus Austrum: / '
            break
def test_unitize(self, units):
    for unit in units:
        u = Unitizer()
        metadata = unit['metadata']

        tess = TessFile(metadata.path, metadata=metadata)
        tokens = unit['tokens']
        lines = unit['lines']
        phrases = unit['phrases']

        if metadata.language == 'greek':
            tokenizer = GreekTokenizer()
        elif metadata.language == 'latin':
            tokenizer = LatinTokenizer()
        tokenizer.clear()

        for i, line in enumerate(tess.readlines(include_tag=False)):
            stop = (i == len(tess) - 1)
            u.unitize(line, metadata, tokenizer=tokenizer, stop=stop)

        print(metadata.path)

        assert len(u.lines) == len(lines)
        for i in range(len(lines)):
            line_tokens = \
                [tokenizer.tokens[j].form for j in u.lines[i].tokens
                 if re.search(r'[\w\d]', tokenizer.tokens[j].display,
                              flags=re.UNICODE)
                 and tokenizer.tokens[j].form]
            correct_tokens = \
                [tokens[j]['FORM'] for j in lines[i]['TOKEN_ID']
                 if 'FORM' in tokens[j] and tokens[j]['FORM']]

            if line_tokens != correct_tokens:
                print('Line {}'.format(i))
                print(line_tokens)
                print(correct_tokens)

            assert line_tokens == correct_tokens

        print(u.phrases[-1].tokens)

        assert len(u.phrases) == len(phrases)
        for i in range(len(u.phrases)):
            phrase_tokens = \
                [tokenizer.tokens[j].form for j in u.phrases[i].tokens
                 if re.search(r'[\w\d]', tokenizer.tokens[j].display,
                              flags=re.UNICODE)
                 and tokenizer.tokens[j].form]
            correct_tokens = \
                [tokens[j]['FORM'] for j in phrases[i]['TOKEN_ID']
                 if 'FORM' in tokens[j] and tokens[j]['FORM']]

            if phrase_tokens != correct_tokens:
                print('Phrase {}'.format(i))
                phrase_tokens = \
                    [tokenizer.tokens[j].form
                     for j in u.phrases[i - 1].tokens
                     if re.search(r'[\w]', tokenizer.tokens[j].display,
                                  flags=re.UNICODE)
                     and tokenizer.tokens[j].form]
                correct_tokens = \
                    [tokens[j]['FORM'] for j in phrases[i - 1]['TOKEN_ID']
                     if 'FORM' in tokens[j]]
                print(phrase_tokens)
                print(correct_tokens)

            assert phrase_tokens == correct_tokens

        assert len(u.phrases) == len(phrases)

        u.clear()
        tokenizer.clear()
def test_clear(self):
    u = Unitizer()
    vals = list(range(0, 100))

    u.lines.extend(vals)
    u.clear()
    assert hasattr(u, 'lines')
    assert u.lines == []
    assert hasattr(u, 'phrases')
    assert u.phrases == []

    u.lines.extend(vals)
    u.phrases.extend(vals)
    u.clear()
    assert hasattr(u, 'lines')
    assert u.lines == []
    assert hasattr(u, 'phrases')
    assert u.phrases == []

    u.lines.extend(vals)
    u.phrases.extend(vals)
    u.clear()
    assert hasattr(u, 'lines')
    assert u.lines == []
    assert hasattr(u, 'phrases')
    assert u.phrases == []

    for i in [None, 'a', 1, 1.0, True, False, b'a', r'a']:
        u.lines = i
        u.clear()
        assert hasattr(u, 'lines')
        assert u.lines == []
        assert hasattr(u, 'phrases')
        assert u.phrases == []

        u.phrases = i
        u.clear()
        assert hasattr(u, 'lines')
        assert u.lines == []
        assert hasattr(u, 'phrases')
        assert u.phrases == []

        u.lines = i
        u.phrases = i
        u.clear()
        assert hasattr(u, 'lines')
        assert u.lines == []
        assert hasattr(u, 'phrases')
        assert u.phrases == []
def test_init(self):
    u = Unitizer()
    assert hasattr(u, 'lines')
    assert u.lines == []
    assert hasattr(u, 'phrases')
    assert u.phrases == []
def test_unitize(unitizer_inputs, correct_units):
    correct_lines = correct_units['lines']
    correct_phrases = correct_units['phrases']
    for i, indata in enumerate(unitizer_inputs):
        tokens, tags, features = indata
        feature_dict = {}
        for feature in features:
            if feature.feature in feature_dict:
                feature_dict[feature.feature][feature.index] = feature
            else:
                feature_dict[feature.feature] = {feature.index: feature}
        features = feature_dict

        unitizer = Unitizer()
        lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)

        text_correct_lines = correct_lines[i]
        assert len(lines) == len(text_correct_lines)

        for j, line in enumerate(lines):
            line_snippet = line.snippet
            assert WORD_PATTERN.search(line_snippet[0]) is not None
            assert not line_snippet.endswith(' / ')
            if isinstance(text_correct_lines[j]['locus'], str):
                assert line.tags[0] == text_correct_lines[j]['locus']
            else:
                assert line.tags == text_correct_lines[j]['locus']
            if len(line.tokens) != len(text_correct_lines[j]['tokens']):
                print(list(zip(
                    [t['display'] for t in line.tokens] + [''],
                    [t['display']
                     for t in text_correct_lines[j]['tokens']])))
            assert len(line.tokens) == len(text_correct_lines[j]['tokens'])
            predicted = [
                t for t in line.tokens if re.search(r'[\w]', t['display'])
            ]
            for k in range(len(predicted)):
                token = predicted[k]
                correct = text_correct_lines[j]['tokens'][k]
                assert token['display'] == correct['display']
                if token['features']['form'][0] > -1:
                    form = feature_dict['form'][
                        token['features']['form'][0]].token
                    lemmata = [
                        feature_dict['lemmata'][l].token
                        for l in token['features']['lemmata']
                    ]
                else:
                    form = ''
                    lemmata = ['']
                if form != correct['form']:
                    print(token, correct)
                    print(form, correct['form'])
                assert form == correct['form']
                assert len(lemmata) == len(correct['stem'])
                assert all(map(lambda x: x in correct['stem'], lemmata))

        text_correct_phrases = correct_phrases[i]
        assert len(phrases) == len(text_correct_phrases)

        for j, phrase in enumerate(phrases):
            assert WORD_PATTERN.search(phrase.snippet[0]) is not None
            if isinstance(text_correct_phrases[j]['locus'], str):
                assert phrase.tags[0] == text_correct_phrases[j]['locus']
            else:
                assert phrase.tags == text_correct_phrases[j]['locus']
            if len(phrase.tokens) != len(text_correct_phrases[j]['tokens']):
                print(list(zip(
                    [t['display'] for t in phrase.tokens] + [''],
                    [t['display']
                     for t in text_correct_phrases[j]['tokens']])))
            assert len(phrase.tokens) == len(
                text_correct_phrases[j]['tokens'])
            predicted = [
                t for t in phrase.tokens if re.search(r'[\w]', t['display'])
            ]
            for k in range(len(predicted)):
                token = predicted[k]
                correct = text_correct_phrases[j]['tokens'][k]
                assert token['display'] == correct['display']
                if token['features']['form'][0] > -1:
                    form = feature_dict['form'][
                        token['features']['form'][0]].token
                    lemmata = [
                        feature_dict['lemmata'][l].token
                        for l in token['features']['lemmata']
                    ]
                else:
                    form = ''
                    lemmata = ['']
                if form != correct['form']:
                    print(token, correct)
                    print(form, correct['form'])
                assert form == correct['form']
                assert len(lemmata) == len(correct['stem'])
                assert all(map(lambda x: x in correct['stem'], lemmata))