def minipop(request, mini_greek_metadata, mini_latin_metadata):
    conn = TessMongoConnection('localhost', 27017, None, None, 'minitess')
    for metadata in mini_greek_metadata:
        text = Text.json_decode(metadata)
        ingest_text(conn, text)
    for metadata in mini_latin_metadata:
        text = Text.json_decode(metadata)
        ingest_text(conn, text)
    yield conn
    for coll_name in conn.connection.list_collection_names():
        conn.connection.drop_collection(coll_name)
def minipop(request, mini_greek_metadata, mini_latin_metadata):
    conn = TessMongoConnection('localhost', 27017, None, None, 'minitess')
    conn.create_indices()
    for metadata in mini_greek_metadata:
        text = Text.json_decode(metadata)
        ingest_text(conn, text, enable_multitext=True)
    for metadata in mini_latin_metadata:
        text = Text.json_decode(metadata)
        ingest_text(conn, text, enable_multitext=True)
    yield conn
    obliterate(conn)
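# `obliterate` is the cleanup helper shared by the fixtures below. A minimal
# sketch, assuming it drops every collection in the test database, exactly as
# the inline cleanup in the first `minipop` variant above does:
def obliterate(conn):
    """Drop every collection in the fixture's test database."""
    for coll_name in conn.connection.list_collection_names():
        conn.connection.drop_collection(coll_name)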
def unit_tessfiles(mini_greek_metadata, mini_latin_metadata):
    """Create text entities for the test texts.

    Fixtures
    --------
    mini_greek_metadata
        Metadata for a small set of sample Greek texts.
    mini_latin_metadata
        Metadata for a small set of sample Latin texts.
    """
    tessfiles = []
    for metadata in mini_greek_metadata:
        tessfiles.append(Text.json_decode(metadata))
    for metadata in mini_latin_metadata:
        tessfiles.append(Text.json_decode(metadata))
    tessfiles.sort(key=lambda x: x.path)
    return tessfiles
def main():
    args = parse_args()
    logger = build_logger(args.lfn, args.log)

    with open(args.db_cred) as ifh:
        db_cred = json.load(ifh)
    conn = TessMongoConnection(db_cred['host'], db_cred['port'],
                               db_cred['user'], db_cred['password'],
                               db=db_cred['database'])
    conn.create_indices()

    with open(args.ingest) as ifh:
        texts = [Text.json_decode(t) for t in json.load(ifh)]
    for text in tqdm(texts):
        logger.info(f'Starting ingest: {text.author}\t{text.title}')
        try:
            ingest_text(conn, text)
        except KeyboardInterrupt:
            logger.info('KeyboardInterrupt')
            sys.exit(1)
        # we want to catch all other errors and log them
        except:  # noqa: E722
            logger.exception(f'Failed to ingest: {text.author}\t{text.title}')
            logger.exception(traceback.format_exc())
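# A minimal sketch of the `build_logger` helper assumed above, treating `lfn`
# as a log file path and `log` as a level name; the real helper may configure
# handlers differently.
import logging

def build_logger(lfn, log):
    logger = logging.getLogger('tesserae.ingest')
    logger.setLevel(getattr(logging, str(log).upper(), logging.INFO))
    handler = logging.FileHandler(lfn)
    handler.setFormatter(
        logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
    logger.addHandler(handler)
    return logger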
def test_retrieve_units(self, request, populate):
    conf = request.config
    conn = TessMongoConnection(conf.getoption('db_host'),
                               conf.getoption('db_port'),
                               conf.getoption('db_user'),
                               password=conf.getoption('db_passwd',
                                                       default=None),
                               db=conf.getoption('db_name', default=None))
    m = DefaultMatcher(conn)
    for text in populate['texts']:
        text = Text.json_decode(text)

        # Trim the path down to the language-directory suffix so it matches
        # the paths stored in the test data.
        start = -1
        if text.language == 'latin':
            start = text.path.find('la/')
        if text.language == 'greek':
            start = text.path.find('grc/')
        if start >= 0:
            text.path = text.path[start:]

        correct = [
            u for u in populate['units']
            if u['text'] == text.path and u['unit_type'] == 'line'
        ]
        correct.sort(key=lambda x: x['index'])
        units = m.retrieve_units([text], 'line')
        assert len(units[0]) > 0
        assert len(units[0]) == len(correct)
        for u in units[0]:
            assert u.json_encode() == correct[u.index]
def test_retrieve_frequencies(self, request, populate):
    conf = request.config
    conn = TessMongoConnection(conf.getoption('db_host'),
                               conf.getoption('db_port'),
                               conf.getoption('db_user'),
                               password=conf.getoption('db_passwd',
                                                       default=None),
                               db=conf.getoption('db_name', default=None))
    m = DefaultMatcher(conn)
    for text in populate['texts']:
        text = Text.json_decode(text)
        start = -1
        if text.language == 'latin':
            start = text.path.find('la/')
        if text.language == 'greek':
            start = text.path.find('grc/')
        if start >= 0:
            text.path = text.path[start:]

        tokens = [t for t in populate['tokens'] if t['text'] == text.path]
        correct = [
            f for f in populate['frequencies'] if f['text'] == text.path
        ]
        frequencies, _ = m.retrieve_frequencies([text], tokens, [text])
        assert len(frequencies) > 0
        assert len(frequencies) == len(correct)
        for c in correct:
            assert c['form'] in frequencies
def populate_database(search_connection, test_data):
    """Set up the database to conduct searches on the test texts.

    Fixtures
    --------
    search_connection
        TessMongoConnection for search unit tests.
    test_data
        Example data for unit testing.
    """
    for text in test_data['texts']:
        tessfile = TessFile(text['path'], metadata=Text(**text))
        search_connection.insert(tessfile.metadata)
        if text['language'] == 'latin':
            tok = LatinTokenizer(search_connection)
            unitizer = Unitizer()
            tokens, tags, features = tok.tokenize(tessfile.read(),
                                                  text=tessfile.metadata)
            search_connection.update(features)
            lines, phrases = unitizer.unitize(tokens, tags, tessfile.metadata)
            search_connection.insert(lines + phrases)
            search_connection.insert(tokens)

    yield

    search_connection.connection['texts'].delete_many({})
    search_connection.connection['tokens'].delete_many({})
    search_connection.connection['features'].delete_many({})
    search_connection.connection['units'].delete_many({})
    search_connection.connection['matches'].delete_many({})
    search_connection.connection['searches'].delete_many({})
def engpop(request, eng_metadata, v3checker):
    conn = TessMongoConnection('localhost', 27017, None, None, 'engtest')
    for metadata in eng_metadata:
        text = Text.json_decode(metadata)
        ingest_text(conn, text)
    yield conn
    obliterate(conn)
def g2lpop(request, mini_g2l_metadata):
    conn = TessMongoConnection('localhost', 27017, None, None, 'g2ltest')
    for metadata in mini_g2l_metadata:
        text = Text.json_decode(metadata)
        ingest_text(conn, text)
    yield conn
    obliterate(conn)
def lucvergpop(request, lucverg_metadata):
    conn = TessMongoConnection('localhost', 27017, None, None, 'lucvergtest')
    for metadata in lucverg_metadata:
        text = Text.json_decode(metadata)
        tessfile = TessFile(text.path, metadata=text)
        conn.insert(text)
        tokens, tags, features = LatinTokenizer(conn).tokenize(
            tessfile.read(), text=tessfile.metadata)

        # Split features into new entities and updates to entities already
        # in the database, keyed by (feature type, token form).
        feature_cache = {
            (f.feature, f.token): f
            for f in conn.find(Feature.collection, language=text.language)
        }
        features_for_insert = []
        features_for_update = []
        for f in features:
            if (f.feature, f.token) not in feature_cache:
                features_for_insert.append(f)
                feature_cache[(f.feature, f.token)] = f
            else:
                f.id = feature_cache[(f.feature, f.token)].id
                features_for_update.append(f)
        conn.insert(features_for_insert)
        conn.update(features_for_update)

        unitizer = Unitizer()
        lines, _ = unitizer.unitize(tokens, tags, tessfile.metadata)
        conn.insert_nocheck(lines)
    yield conn
    obliterate(conn)
def main(): """Ingest a text into Tesserae. Takes a .tess files and computes tokens, features, frequencies, and units. All computed components are inserted into the database. """ args = parse_args() if args.password: password = getpass.getpass(prompt='Tesserae MongoDB Password: ') else: password = None connection = TessMongoConnection(args.host, args.port, args.user, password, db=args.database) text = Text(language=args.language, title=args.title, author=args.author, year=args.year, path=args.input, is_prose=args.prose) ingest_text(connection, text, enable_multitext=args.enable_multitext)
def test_unitize_elision_file(unit_connection, tessfiles_greek_path):
    tokenizer = GreekTokenizer(unit_connection)
    t = Text(path=str(tessfiles_greek_path.joinpath('test.elision.tess')),
             language='greek')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 1
def test_unitize_notag_file(unit_connection, tessfiles_latin_path):
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(tessfiles_latin_path.joinpath('test.notag.tess')),
             language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 1
def test_match(self, request, populate, reference_matches):
    conf = request.config
    conn = TessMongoConnection(conf.getoption('db_host'),
                               conf.getoption('db_port'),
                               conf.getoption('db_user'),
                               password=conf.getoption('db_passwd',
                                                       default=None),
                               db=conf.getoption('db_name', default=None))
    for t in populate['texts']:
        start = -1
        if t['language'] == 'latin':
            start = t['path'].find('la/')
        if t['language'] == 'greek':
            start = t['path'].find('grc/')
        if start >= 0:
            t['path'] = t['path'][start:]

    m = DefaultMatcher(conn)
    for ref in reference_matches:
        metadata = ref[0]
        correct = ref[1]
        source = [
            t for t in populate['texts']
            if re.search(metadata['source'], t['path'])
        ]
        target = [
            t for t in populate['texts']
            if re.search(metadata['target'], t['path'])
        ]
        texts = [Text.json_decode(source[0]), Text.json_decode(target[0])]
        matches = m.match(texts,
                          metadata['unit'],
                          metadata['feature'],
                          stopwords=metadata['stopsize'],
                          stopword_basis=metadata['stbasis'],
                          score_basis=metadata['scorebase'],
                          frequency_basis=metadata['freqbasis'],
                          max_distance=metadata['max_dist'],
                          distance_metric=metadata['dibasis'])
        print(matches)
        assert len(matches) == len(correct)
def test_unitize_linebreak_file(unit_connection, tessfiles_latin_path):
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(tessfiles_latin_path.joinpath('test.linebreak.tess')),
             language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 1
    first_tag = phrases[0].tags[0]
    for phrase in phrases[1:]:
        assert phrase.tags[0] == first_tag
def test_unitize_diacrit_in_latin(unit_connection, tessfiles_latin_path):
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(
        tessfiles_latin_path.joinpath('test.diacrit_in_latin.tess')),
        language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    forms = {f.index: f.token for f in features if f.feature == 'form'}
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    for phrase in phrases:
        for t in phrase.tokens:
            cur_form = t['features']['form'][0]
            if cur_form != -1:
                normalized = tokenizer.normalize(t['display'])[0][0]
                assert normalized == forms[cur_form], phrase.snippet
def test_unitize_linebreak_end(unit_connection, tessfiles_latin_path):
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(
        tessfiles_latin_path.joinpath('test.linebreak_end.tess')),
        language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    print('# lines')
    for line in lines:
        print(line.snippet)
    print('# phrases')
    for phrase in phrases:
        print(phrase.snippet)
    assert len(lines) == 2
def main():
    args = parse_args()
    if args.password:
        password = getpass.getpass(prompt='Tesserae MongoDB Password: ')
    else:
        password = None
    connection = TessMongoConnection(args.host, args.port, args.user,
                                     password, db=args.database)
    # The connection setup above and the `args.updates` name below are
    # reconstructed by analogy with the other scripts in this repo; the
    # original middle of this function was obscured by credential masking.
    with open(args.updates, encoding='utf-8') as ifh:
        raw_updates = json.load(ifh)
    connection.update([Text.json_decode(t) for t in raw_updates])
def test_unitize_nopunctuation_file(unit_connection, tessfiles_latin_path):
    # when there is no ending punctuation despite coming to the end of a poem
    # and another poem starts after a blank line
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(
        tessfiles_latin_path.joinpath('test.nopunctuation.tess')),
        language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 68
    for prev_phrase, cur_phrase in zip(phrases[:-1], phrases[1:]):
        if '2.13' in prev_phrase.tags[0] and '2.14' in cur_phrase.tags[0]:
            assert prev_phrase.snippet == (
                'quin et Prometheus et Pelopis parens / '
                'dulci laborem decipitur sono / '
                'nec curat Orion leones / '
                'aut timidos agitare lyncas / '
                'Eheu fugaces, Postume, Postume, / '
                'labuntur anni nec pietas moram / '
                'rugis et instanti senectae / '
                'adferet indomitaeque morti, / '
                'non, si trecenis quotquot eunt dies, / '
                'amice, places inlacrimabilem / '
                'Plutona tauris, qui ter amplum / '
                'Geryonen Tityonque tristi / '
                'conpescit unda, scilicet omnibus / '
                'quicumque terrae munere vescimur / '
                'enaviganda, sive reges / '
                'sive inopes erimus coloni. / ')
            assert cur_phrase.snippet == (
                'frustra cruento Marte carebimus / '
                'fractisque rauci fluctibus Hadriae, / '
                'frustra per autumnos nocentem / '
                'corporibus metuemus Austrum: / ')
            break
def main():
    args = parse_args()
    logger = build_logger(args.lfn, args.log)

    with open(args.db_cred) as ifh:
        db_cred = json.load(ifh)
    conn = TessMongoConnection(db_cred['host'], db_cred['port'],
                               db_cred['user'], db_cred['password'],
                               db=db_cred['database'])

    with open(args.reingest) as ifh:
        texts = []
        for line in ifh:
            line = line.strip()
            if line:
                items = line.split('\t')
                texts.append(Text(author=items[0], title=items[1]))

    # Look up the full database records for the requested author/title pairs.
    texts = conn.aggregate(Text.collection, [{
        '$match': {
            '$or': [{'author': t.author, 'title': t.title} for t in texts]
        }
    }])

    for text in tqdm(texts):
        logger.info(f'Starting reingest: {text.author}\t{text.title}')
        try:
            reingest_text(conn, text)
        except KeyboardInterrupt:
            logger.info('KeyboardInterrupt')
            sys.exit(1)
        # we want to catch all other errors and log them
        except:  # noqa: E722
            logger.exception(
                f'Failed to reingest: {text.author}\t{text.title}')
            logger.exception(traceback.format_exc())
def units(tessfiles):
    data = []
    for root, dirs, files in os.walk(tessfiles):
        if 'new' not in root and re.search(r'poetry|prose', root):
            fdata = {}
            for fname in files:
                parts = fname.split('.')
                if '.tess' in fname:
                    metadata = Text(
                        title=parts[0],
                        author=parts[1],
                        language='greek' if 'grc' in root else 'latin',
                        path=os.path.join(root, fname))
                    fdata['metadata'] = metadata
                if '.json' in fname:
                    feature = parts[2]
                    with open(os.path.join(root, fname), 'r') as f:
                        fdata[feature] = json.load(f)
            data.append(fdata)
    return data
def main():
    args = parse_args()
    logger = build_logger(args.lfn, args.log)

    with open(args.db_cred) as ifh:
        db_cred = json.load(ifh)
    conn = TessMongoConnection(db_cred['host'], db_cred['port'],
                               db_cred['user'], db_cred['password'],
                               db=db_cred['database'])

    with open(args.ingest) as ifh:
        texts = [Text.json_decode(t) for t in json.load(ifh)]
    for text in tqdm(texts):
        logger.info(f'Starting ingest: {text.author}\t{text.title}')
        try:
            ingest_text(conn, text)
        except KeyboardInterrupt:
            logger.info('KeyboardInterrupt')
            sys.exit(1)
        except:  # noqa: E722
            logger.exception(f'Failed to ingest: {text.author}\t{text.title}')
def main(): """Ingest a text into Tesserae. Takes a .tess files and computes tokens, features, frequencies, and units. All computed components are inserted into the database. """ args = parse_args() if args.password: password = getpass(prompt='Tesserae MongoDB Password: ') else: password = None connection = TessMongoConnection( args.host, args.port, args.user, password, db=args.database) text_hash = hashlib.md5() text_hash.update(TessFile(args.input).read().encode()) text_hash = text_hash.hexdigest() text = Text(language=args.language, title=args.title, author=args.author, year=args.year, path=args.input, hash=text_hash, is_prose=args.prose) ingest_text(connection, text)
def insert_text(connection, cts_urn, language, author, title, year,
                unit_types, path):
    """Insert a new text into the database.

    Attempt to insert a new text in the database, sanitized to match the
    fields and data types of existing texts.

    Parameters
    ----------
    connection : TessMongoConnection
        Open connection to the text database.
    cts_urn : str
        Unique collection-level identifier.
    language : str
        Language the text is written in.
    author : str
        Full name of the text author.
    title : str
        Title of the text.
    year : int
        Year of text authorship.
    unit_types : str or list of str
        Valid unit-level delimiters for this text.
    path : str
        Path to the raw text file. May be a remote URL.

    Returns
    -------
    result : `pymongo.InsertOneResult`
        The result of the insert operation.

    Raises
    ------
    TextExistsError
        Raised when attempting to insert a text that already exists in the
        database.

    Notes
    -----
    This function should not be made available to everyone. To properly
    secure the database, ensure that only MongoDB users NOT connected to a
    public-facing client application are able to write to the database. See
    the `MongoDB documentation on role-based access control`_ for more
    information.

    .. _MongoDB documentation on role-based access control: https://docs.mongodb.com/manual/core/authorization/
    """
    # Attempt to load the file and any database entry with the same CTS URN
    text_file = TessFile(path)
    db_texts = retrieve_text_list(connection, cts_urn=cts_urn,
                                  hash=text_file.hash)

    # If no entries with the same CTS URN were found in the database, insert.
    # Otherwise, raise an exception.
    if len(db_texts) == 0:
        text = Text(cts_urn=cts_urn, language=language, author=author,
                    title=title, year=year, unit_types=unit_types, path=path,
                    hash=text_file.hash)
        result = connection.texts.insert_one(
            text.json_encode(exclude=['_id']))
        return result
    else:
        raise TextExistsError(cts_urn, text_file.hash)
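# A hypothetical usage sketch for `insert_text`, assuming a
# TessMongoConnection opened with write access; the CTS URN, metadata
# values, and file path below are placeholders, not real repository data.
if __name__ == '__main__':
    conn = TessMongoConnection('localhost', 27017, None, None, 'tesserae')
    try:
        insert_text(conn,
                    cts_urn='urn:cts:latinLit:phi0690.phi003',
                    language='latin',
                    author='Vergil',
                    title='Aeneid',
                    year=-19,
                    unit_types=['line', 'phrase'],
                    path='texts/la/vergil.aeneid.tess')
    except TextExistsError:
        # The text was already ingested under this CTS URN; nothing to do.
        pass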