def main():
    """Ingest a batch of texts listed in a JSON file into Tesserae.

    Reads DB credentials and the text list from the paths given on the
    command line; failures on individual texts are logged and skipped so
    the rest of the batch still runs.
    """
    args = parse_args()
    logger = build_logger(args.lfn, args.log)
    # Credentials file is JSON with host/port/user/password/database keys.
    with open(args.db_cred) as ifh:
        db_cred = json.load(ifh)
    conn = TessMongoConnection(db_cred['host'], db_cred['port'],
                               db_cred['user'], db_cred['password'],
                               db=db_cred['database'])
    conn.create_indices()
    with open(args.ingest) as ifh:
        texts = [Text.json_decode(t) for t in json.load(ifh)]
    for text in tqdm(texts):
        logger.info(f'Starting ingest: {text.author}\t{text.title}')
        try:
            ingest_text(conn, text)
        except KeyboardInterrupt:
            logger.info('KeyboardInterrupt')
            sys.exit(1)
        # we want to catch all other errors and log them; Exception rather
        # than a bare except so SystemExit is not swallowed
        except Exception:
            # logger.exception already appends the traceback, so the extra
            # traceback.format_exc() call was redundant and has been dropped.
            logger.exception(f'Failed to ingest: {text.author}\t{text.title}')
def main():
    """Register bigram data for every text that needs multitext enabled.

    Failures on individual texts are logged and skipped.
    """
    args = parse_args()
    logger = build_logger(args.lfn, args.log)
    with open(args.db_cred) as ifh:
        db_cred = json.load(ifh)
    conn = TessMongoConnection(db_cred['host'], db_cred['port'],
                               db_cred['user'], db_cred['password'],
                               db=db_cred['database'])
    os.environ['HOME'] = args.home
    # Deferred import: HOME is set immediately before, presumably because the
    # multitext module depends on it at import time — keep this ordering.
    from tesserae.utils.multitext import register_bigrams, MULTITEXT_SEARCH
    texts = conn.find(Text.collection)
    for text in tqdm(texts):
        if needs_multitext_enabled(text):
            logger.info(f'Extracting bigrams: {text.author}\t{text.title}')
            try:
                register_bigrams(conn, text)
            except KeyboardInterrupt:
                logger.info('KeyboardInterrupt')
                sys.exit(1)
            # we want to catch all other errors and log them; Exception
            # rather than a bare except so SystemExit is not swallowed
            except Exception:
                # logger.exception already records the traceback
                logger.exception(f'Failed: {text.author}\t{text.title}')
def minipop(request, mini_greek_metadata, mini_latin_metadata):
    """Fixture: populate a throwaway 'minitess' database with the mini Greek
    and Latin corpora (multitext enabled); obliterate it on teardown."""
    conn = TessMongoConnection('localhost', 27017, None, None, 'minitess')
    conn.create_indices()
    # Greek first, then Latin, preserving the original ingest order.
    combined = list(mini_greek_metadata) + list(mini_latin_metadata)
    for raw in combined:
        ingest_text(conn, Text.json_decode(raw), enable_multitext=True)
    yield conn
    obliterate(conn)
def minipop(request, mini_greek_metadata, mini_latin_metadata):
    """Fixture: populate a throwaway 'minitess' database with the mini Greek
    and Latin corpora; drop every collection on teardown."""
    conn = TessMongoConnection('localhost', 27017, None, None, 'minitess')
    conn.create_indices()
    # Greek first, then Latin, preserving the original ingest order.
    combined = list(mini_greek_metadata) + list(mini_latin_metadata)
    for raw in combined:
        ingest_text(conn, Text.json_decode(raw))
    yield conn
    # teardown: wipe the database so the next test run starts clean
    for name in conn.connection.list_collection_names():
        conn.connection.drop_collection(name)
def main():
    """Create the standard Tesserae indices on the configured database."""
    args = parse_args()
    with open(args.db_cred) as ifh:
        creds = json.load(ifh)
    connection = TessMongoConnection(creds['host'], creds['port'],
                                     creds['user'], creds['password'],
                                     db=creds['database'])
    connection.create_indices()
def test_retrieve_units(self, request, populate):
    """Units retrieved for each text match the fixture data line for line."""
    conf = request.config
    conn = TessMongoConnection(conf.getoption('db_host'),
                               conf.getoption('db_port'),
                               conf.getoption('db_user'),
                               password=conf.getoption('db_passwd',
                                                       default=None),
                               db=conf.getoption('db_name', default=None))
    matcher = DefaultMatcher(conn)
    for raw in populate['texts']:
        text = Text.json_decode(raw)
        # Strip everything before the language directory from the path.
        idx = -1
        if text.language == 'latin':
            idx = text.path.find('la/')
        if text.language == 'greek':
            idx = text.path.find('grc/')
        if idx >= 0:
            text.path = text.path[idx:]
        expected = sorted((u for u in populate['units']
                           if u['text'] == text.path
                           and u['unit_type'] == 'line'),
                          key=lambda u: u['index'])
        units = matcher.retrieve_units([text], 'line')
        assert len(units[0]) > 0
        assert len(units[0]) == len(expected)
        for unit in units[0]:
            assert unit.json_encode() == expected[unit.index]
def test_retrieve_frequencies(self, request, populate):
    """Retrieved frequencies cover exactly the forms in the fixture data."""
    conf = request.config
    conn = TessMongoConnection(conf.getoption('db_host'),
                               conf.getoption('db_port'),
                               conf.getoption('db_user'),
                               password=conf.getoption('db_passwd',
                                                       default=None),
                               db=conf.getoption('db_name', default=None))
    matcher = DefaultMatcher(conn)
    for raw in populate['texts']:
        text = Text.json_decode(raw)
        # Strip everything before the language directory from the path.
        idx = -1
        if text.language == 'latin':
            idx = text.path.find('la/')
        if text.language == 'greek':
            idx = text.path.find('grc/')
        if idx >= 0:
            text.path = text.path[idx:]
        tokens = [t for t in populate['tokens'] if t['text'] == text.path]
        expected = [f for f in populate['frequencies']
                    if f['text'] == text.path]
        frequencies, _ = matcher.retrieve_frequencies([text], tokens, [text])
        assert len(frequencies) > 0
        assert len(frequencies) == len(expected)
        for entry in expected:
            assert entry['form'] in frequencies
def main():
    """Update Text metadata in Tesserae from a JSON file of Text records.

    NOTE(review): the original line was corrupted by credential scrubbing
    ('******' replaced a span of code). The connection boilerplate below is
    reconstructed from the intact sibling scripts in this file; confirm the
    argument name for the updates file against parse_args().
    """
    args = parse_args()
    if args.password:
        password = getpass.getpass(prompt='Tesserae MongoDB Password: ')
    else:
        password = None
    connection = TessMongoConnection(args.host, args.port, args.user,
                                     password, db=args.database)
    # The scrubbed fragment ended with "...'utf-8') as ifh:", so the file was
    # opened with an explicit encoding; args.updates is the presumed argument.
    with open(args.updates, encoding='utf-8') as ifh:
        raw_updates = json.load(ifh)
    connection.update([Text.json_decode(t) for t in raw_updates])
def main():
    """Delete Search results that have not been queried in the last week."""
    args = parse_args()
    logger = build_logger(args.lfn, args.log)
    with open(args.db_cred) as ifh:
        db_cred = json.load(ifh)
    conn = TessMongoConnection(db_cred['host'], db_cred['port'],
                               db_cred['user'], db_cred['password'],
                               db=db_cred['database'])
    # Naive UTC datetime; assumes 'last_queried' is stored naive in UTC —
    # TODO confirm against the code that writes Search documents.
    cutoff = datetime.datetime.utcnow() - datetime.timedelta(days=7)
    for_deletion = [
        Search.json_decode(s)
        for s in conn.connection[Search.collection].find(
            # https://stackoverflow.com/questions/11957595/mongodb-pymongo-query-with-datetime
            {'last_queried': {'$lt': cutoff}})
    ]
    logger.info('Number of Search entities out of date: {}'.format(
        len(for_deletion)))
    try:
        remove_results(conn, for_deletion)
    except KeyboardInterrupt:
        logger.info('KeyboardInterrupt')
        sys.exit(1)
    # we want to catch all other errors and log them; Exception rather than
    # a bare except so SystemExit is not swallowed
    except Exception:
        # logger.exception already records the traceback, so the redundant
        # traceback.format_exc() log call has been dropped.
        logger.exception('Failed to delete out of date Search entities')
def engpop(request, eng_metadata, v3checker):
    """Fixture: ingest the English test corpus into 'engtest'; wipe it on
    teardown."""
    conn = TessMongoConnection('localhost', 27017, None, None, 'engtest')
    for raw in eng_metadata:
        ingest_text(conn, Text.json_decode(raw))
    yield conn
    obliterate(conn)
def main():
    """Ingest a text into Tesserae.

    Takes a .tess files and computes tokens, features, frequencies, and
    units. All computed components are inserted into the database.
    """
    args = parse_args()
    password = (getpass.getpass(prompt='Tesserae MongoDB Password: ')
                if args.password else None)
    connection = TessMongoConnection(args.host, args.port, args.user,
                                     password, db=args.database)
    new_text = Text(language=args.language,
                    title=args.title,
                    author=args.author,
                    year=args.year,
                    path=args.input,
                    is_prose=args.prose)
    ingest_text(connection, new_text, enable_multitext=args.enable_multitext)
def g2lpop(request, mini_g2l_metadata):
    """Fixture: ingest the Greek-to-Latin test corpus into 'g2ltest';
    obliterate it on teardown."""
    conn = TessMongoConnection('localhost', 27017, None, None, 'g2ltest')
    for raw in mini_g2l_metadata:
        ingest_text(conn, Text.json_decode(raw))
    yield conn
    obliterate(conn)
def removedb(mini_latin_metadata):
    """Fixture: ingest the mini Latin corpus into 'removedb'; drop every
    collection on teardown."""
    conn = TessMongoConnection('localhost', 27017, None, None, 'removedb')
    for raw in mini_latin_metadata:
        ingest_text(conn, Text.json_decode(raw))
    yield conn
    # teardown: wipe the database so the next test run starts clean
    for name in conn.connection.list_collection_names():
        conn.connection.drop_collection(name)
def punctpop(request, mini_punctuation_metadata):
    """Fixture: ingest the punctuation test corpus into 'minitess'; drop
    every collection on teardown."""
    conn = TessMongoConnection('localhost', 27017, None, None, 'minitess')
    for raw in mini_punctuation_metadata:
        ingest_text(conn, Text.json_decode(raw))
    yield conn
    # teardown: wipe the database so the next test run starts clean
    for name in conn.connection.list_collection_names():
        conn.connection.drop_collection(name)
def await_job(self, db_cred, queue):
    """Block on *queue* and run search jobs until a sentinel arrives.

    An item whose first element is None shuts the worker down.
    """
    connection = TessMongoConnection(**db_cred)
    while True:
        job = queue.get(block=True)
        if job[0] is None:
            # sentinel: stop the worker loop
            break
        results_id, search_type, search_params = job
        self.run_search(connection, results_id, search_type, search_params)
def await_job(self, db_cred, queue):
    """Block on *queue* and execute queued callables until a sentinel arrives.

    An item whose first element is None shuts the worker down.
    """
    connection = TessMongoConnection(**db_cred)
    while True:
        job = queue.get(block=True)
        if job[0] is None:
            # sentinel: stop the worker loop
            break
        instructions, kwargs = job
        instructions(connection, **kwargs)
def test_init(self, request):
    """DefaultMatcher resolves the expected database name whether the db
    name is omitted (default 'tesserae'), taken from config, or explicit."""
    conf = request.config
    # (extra constructor kwargs, expected database name)
    cases = [
        ({}, 'tesserae'),
        ({'db': conf.getoption('db_name', default=None)}, 'tess_test'),
        ({'db': 'foobar'}, 'foobar'),
    ]
    for extra_kwargs, expected_name in cases:
        conn = TessMongoConnection(conf.getoption('db_host'),
                                   conf.getoption('db_port'),
                                   conf.getoption('db_user'),
                                   password=conf.getoption('db_passwd',
                                                           default=None),
                                   **extra_kwargs)
        m = DefaultMatcher(conn)
        assert isinstance(m.connection.connection, pymongo.database.Database)
        assert m.connection.connection.client.address == \
            (conf.getoption('db_host'), conf.getoption('db_port'))
        assert m.connection.connection.name == expected_name
        assert m.matches == []
def main():
    """Delete a text from Tesserae.

    NOTE(review): the original line was corrupted by credential scrubbing
    ('******' replaced a span of code). The connection boilerplate and the
    lookup are reconstructed from the intact sibling scripts; confirm the
    find() arguments against parse_args() and the Text schema.
    """
    args = parse_args()
    if args.password:
        password = getpass.getpass(prompt='Tesserae MongoDB Password: ')
    else:
        password = None
    connection = TessMongoConnection(args.host, args.port, args.user,
                                     password, db=args.database)
    # Presumed lookup by ObjectId string; the intact tail shows found[0]
    # being removed and an f-string error mentioning args.text_id.
    found = connection.find(Text.collection, _id=ObjectId(args.text_id))
    if not found:
        raise ValueError(f'Could not find text with ID {args.text_id}')
    remove_text(connection, found[0])
def main():
    """Re-ingest texts listed (author<TAB>title per line) in a file."""
    args = parse_args()
    logger = build_logger(args.lfn, args.log)
    with open(args.db_cred) as ifh:
        db_cred = json.load(ifh)
    conn = TessMongoConnection(db_cred['host'], db_cred['port'],
                               db_cred['user'], db_cred['password'],
                               db=db_cred['database'])
    with open(args.reingest) as ifh:
        requested = []
        for line in ifh:
            line = line.strip()
            if line:
                items = line.split('\t')
                requested.append(Text(author=items[0], title=items[1]))
    if not requested:
        # Guard: MongoDB rejects an empty '$or' array, and there is nothing
        # to do anyway.
        logger.info('No texts listed for reingest')
        return
    texts = conn.aggregate(Text.collection, [{
        '$match': {
            '$or': [{
                'author': t.author,
                'title': t.title
            } for t in requested]
        }
    }])
    for text in tqdm(texts):
        logger.info(f'Starting reingest: {text.author}\t{text.title}')
        try:
            reingest_text(conn, text)
        except KeyboardInterrupt:
            logger.info('KeyboardInterrupt')
            sys.exit(1)
        # we want to catch all other errors and log them; Exception rather
        # than a bare except so SystemExit is not swallowed
        except Exception:
            # logger.exception already records the traceback
            logger.exception(
                f'Failed to reingest: {text.author}\t{text.title}')
def main():
    """Initial database setup: create indices and register Greek-to-Latin
    lemma mappings."""
    args = parse_args()
    logger = build_logger(args.lfn, args.log)
    with open(args.db_cred) as ifh:
        db_cred = json.load(ifh)
    conn = TessMongoConnection(db_cred['host'], db_cred['port'],
                               db_cred['user'], db_cred['password'],
                               db=db_cred['database'])
    try:
        logger.info('Indexing database')
        conn.create_indices()
        logger.info('Registering Greek to Latin Lemmata')
        register_greek_to_latin_lemmata(conn)
    except KeyboardInterrupt:
        logger.info('KeyboardInterrupt')
        sys.exit(1)
    # we want to catch all other errors and log them; Exception rather than
    # a bare except so SystemExit is not swallowed
    except Exception:
        # logger.exception already records the traceback, so the redundant
        # traceback.format_exc() log call has been dropped.
        logger.exception('Failed initial set up of database')
def search_connection(request):
    """Create a new TessMongoConnection for this task.

    Fixtures
    --------
    request
        The configuration to connect to the MongoDB test server.
    """
    conf = request.config
    host = conf.getoption('db_host')
    port = conf.getoption('db_port')
    user = conf.getoption('db_user')
    return TessMongoConnection(host, port, user,
                               password=conf.getoption('db_passwd',
                                                       default=None),
                               db=conf.getoption('db_name', default=None))
def resultsdb():
    """Fixture: a 'resultdb' database seeded with one finished Search plus
    100 matches; every collection is dropped on teardown."""
    conn = TessMongoConnection('localhost', 27017, None, None, 'resultdb')
    seeded_search = Search(results_id=uuid.uuid4(),
                           search_type=NORMAL_SEARCH,
                           status=Search.DONE)
    conn.insert(seeded_search)
    conn.insert([_create_match(seeded_search) for _ in range(100)])
    yield conn
    # teardown: wipe the database so the next test run starts clean
    for name in conn.connection.list_collection_names():
        conn.connection.drop_collection(name)
def test_match(self, request, populate, reference_matches):
    """DefaultMatcher.match returns the reference number of matches for
    every source/target pair in the fixtures."""
    conf = request.config
    conn = TessMongoConnection(conf.getoption('db_host'),
                               conf.getoption('db_port'),
                               conf.getoption('db_user'),
                               password=conf.getoption('db_passwd',
                                                       default=None),
                               db=conf.getoption('db_name', default=None))
    for t in populate['texts']:
        start = -1
        if t['language'] == 'latin':
            start = t['path'].find('la/')
        if t['language'] == 'greek':
            start = t['path'].find('grc/')
        # >= 0, not > 0: a path that begins exactly with the language
        # directory (find() == 0) must also be normalized, matching the
        # sibling retrieve_units/retrieve_frequencies tests.
        if start >= 0:
            t['path'] = t['path'][start:]
    m = DefaultMatcher(conn)
    for ref in reference_matches:
        metadata = ref[0]
        correct = ref[1]
        source = [
            t for t in populate['texts']
            if re.search(metadata['source'], t['path'])
        ]
        target = [
            t for t in populate['texts']
            if re.search(metadata['target'], t['path'])
        ]
        texts = [Text.json_decode(source[0]), Text.json_decode(target[0])]
        matches = m.match(texts, metadata['unit'], metadata['feature'],
                          stopwords=metadata['stopsize'],
                          stopword_basis=metadata['stbasis'],
                          score_basis=metadata['scorebase'],
                          frequency_basis=metadata['freqbasis'],
                          max_distance=metadata['max_dist'],
                          distance_metric=metadata['dibasis'])
        print(matches)
        assert len(matches) == len(correct)
def main():
    """Look for a text in the Tesserae database.

    NOTE(review): the original line was corrupted by credential scrubbing
    ('******' replaced a span of code). The connection boilerplate and the
    kwargs initialization are reconstructed from the intact sibling
    scripts; confirm against parse_args().
    """
    args = parse_args()
    if args.password:
        password = getpass.getpass(prompt='Tesserae MongoDB Password: ')
    else:
        password = None
    connection = TessMongoConnection(args.host, args.port, args.user,
                                     password, db=args.database)
    # Build the Mongo filter from whichever lookup options were supplied.
    kwargs = {}
    if args.title:
        kwargs['title'] = args.title
    if args.author:
        kwargs['author'] = args.author
    if args.language:
        kwargs['language'] = args.language
    pprint([t for t in connection.connection[Text.collection].find(kwargs)])
def main():
    """Ingest a text into Tesserae.

    Takes a .tess files and computes tokens, features, frequencies, and
    units. All computed components are inserted into the database.
    """
    args = parse_args()
    password = (getpass(prompt='Tesserae MongoDB Password: ')
                if args.password else None)
    connection = TessMongoConnection(args.host, args.port, args.user,
                                     password, db=args.database)
    # MD5 fingerprint of the raw file contents, stored on the Text record.
    digest = hashlib.md5()
    digest.update(TessFile(args.input).read().encode())
    new_text = Text(language=args.language,
                    title=args.title,
                    author=args.author,
                    year=args.year,
                    path=args.input,
                    hash=digest.hexdigest(),
                    is_prose=args.prose)
    ingest_text(connection, new_text)
def main():
    """Ingest a batch of texts listed in a JSON file into Tesserae.

    Failures on individual texts are logged and skipped so the rest of the
    batch still runs.
    """
    args = parse_args()
    logger = build_logger(args.lfn, args.log)
    with open(args.db_cred) as ifh:
        db_cred = json.load(ifh)
    conn = TessMongoConnection(
        db_cred['host'], db_cred['port'], db_cred['user'],
        db_cred['password'], db=db_cred['database']
    )
    with open(args.ingest) as ifh:
        texts = [Text.json_decode(t) for t in json.load(ifh)]
    for text in tqdm(texts):
        logger.info(f'Starting ingest: {text.author}\t{text.title}')
        try:
            ingest_text(conn, text)
        except KeyboardInterrupt:
            logger.info('KeyboardInterrupt')
            sys.exit(1)
        # Exception rather than a bare except so SystemExit is not swallowed
        except Exception:
            logger.exception(f'Failed to ingest: {text.author}\t{text.title}')
def main():
    """Perform Tesserae search and display the top 10 results.

    NOTE(review): part of this function was corrupted by credential
    scrubbing ('******' replaced a span of code). The connection
    boilerplate is reconstructed from the intact sibling scripts, and
    _run_search is placed inside the cache-miss branch since it must not
    re-run a cached search — confirm both against the original.
    """
    args = parse_args()
    if args.password:
        password = getpass(prompt='Tesserae MongoDB Password: ')
    else:
        password = None
    connection = TessMongoConnection(args.host, args.port, args.user,
                                     password, db=args.database)
    # NOTE(review): source args normalize '-' while target args normalize
    # '_' — confirm this asymmetry is intentional.
    source_author = args.source_author.lower().replace('-', ' ')
    source_title = args.source_title.lower().replace('-', ' ')
    source = TextOptions(text=connection.find('texts',
                                              author=source_author,
                                              title=source_title)[0],
                         unit_type=args.source_unit)
    target_author = args.target_author.lower().replace('_', ' ')
    target_title = args.target_title.lower().replace('_', ' ')
    target = TextOptions(text=connection.find('texts',
                                              author=target_author,
                                              title=target_title)[0],
                         unit_type=args.target_unit)
    start = time.time()
    stopword_indices = create_stoplist(
        connection, args.n_stopwords, args.feature, source.text.language,
        basis='corpus' if args.stopword_basis == 'corpus' else
        [source.text.id, target.text.id])
    stopword_tokens = get_stoplist_tokens(connection, stopword_indices,
                                          args.feature,
                                          source.text.language)
    parameters = {
        'source': {
            'object_id': str(source.text.id),
            'units': source.unit_type
        },
        'target': {
            'object_id': str(target.text.id),
            'units': target.unit_type
        },
        'method': {
            'name': SparseMatrixSearch.matcher_type,
            'feature': args.feature,
            'stopwords': stopword_tokens,
            'freq_basis': args.freq_basis,
            'max_distance': args.max_distance,
            'distance_basis': args.distance_basis
        }
    }
    results_id = check_cache(connection, parameters['source'],
                             parameters['target'], parameters['method'])
    if results_id:
        print('Cached results found.')
        search = connection.find(Search.collection,
                                 results_id=results_id,
                                 search_type=NORMAL_SEARCH)[0]
    else:
        search = Search(results_id=uuid.uuid4().hex,
                        search_type=NORMAL_SEARCH,
                        parameters=parameters)
        connection.insert(search)
        search_params = {
            'source': source,
            'target': target,
            'feature': parameters['method']['feature'],
            'stopwords': parameters['method']['stopwords'],
            'freq_basis': parameters['method']['freq_basis'],
            'max_distance': parameters['method']['max_distance'],
            'distance_basis': parameters['method']['distance_basis'],
            'min_score': 0
        }
        _run_search(connection, search, SparseMatrixSearch.matcher_type,
                    search_params)
    matches = get_results(connection, search.id, PageOptions())
    end = time.time() - start
    matches.sort(key=lambda x: x['score'], reverse=True)
    print(f'Search found {len(matches)} matches in {end}s.')
    display_count = 10 if len(matches) >= 10 else len(matches)
    print(f'The Top {display_count} Matches')
    print('------------------')
    print()
    print("Result\tScore\tSource Locus\tTarget Locus\tShared")
    for i, m in enumerate(matches[:10]):
        shared = m['matched_features']
        print(f'{i}.\t{m["score"]}\t{m["source_tag"]}\t{m["target_tag"]}\t'
              f'{[t for t in shared]}')
def lucvergpop(request, lucverg_metadata):
    """Fixture: build a 'lucvergtest' database by hand — insert each text,
    tokenize it, reconcile features against the Feature collection, and
    unitize its lines; obliterate the database on teardown."""
    conn = TessMongoConnection('localhost', 27017, None, None, 'lucvergtest')
    for metadata in lucverg_metadata:
        text = Text.json_decode(metadata)
        tessfile = TessFile(text.path, metadata=text)
        conn.insert(text)
        tokens, tags, features = LatinTokenizer(conn).tokenize(
            tessfile.read(), text=tessfile.metadata)
        # Map (feature-type, token) -> existing Feature so we can tell new
        # features (insert) apart from known ones (update with existing id).
        cache = {(f.feature, f.token): f
                 for f in conn.find(Feature.collection,
                                    language=text.language)}
        to_insert = []
        to_update = []
        for feat in features:
            key = (feat.feature, feat.token)
            if key not in cache:
                to_insert.append(feat)
                cache[key] = feat
            else:
                feat.id = cache[key].id
                to_update.append(feat)
        conn.insert(to_insert)
        conn.update(to_update)
        lines, _ = Unitizer().unitize(tokens, tags, tessfile.metadata)
        conn.insert_nocheck(lines)
    yield conn
    obliterate(conn)
filepath : str, optional The file to write. If not provided, the contents will be written to `sys.stdout`. delimiter : str, optional The column delimiter for CSV-like files. Only used when ``format`` is 'csv'. """ export(connection, search_id, file_format, filepath=None, delimiter=',') if __name__ == '__main__': args = parse_args() if args.password: password = getpass.getpass(prompt='Tesserae MongoDB Password: ') else: password = None connection = TessMongoConnection(args.host, args.port, args.user, password, db=args.database) search_id = ObjectId(args.search) main(connection, search_id, args.format, filepath=args.path, delimiter=args.delimiter)