def lucvergpop(request, lucverg_metadata):
    """Populate the ``lucvergtest`` database and hand out its connection.

    This is a generator. For every entry of ``lucverg_metadata`` it decodes
    the entry into a ``Text``, inserts it, tokenizes the backing file with
    ``LatinTokenizer``, inserts or updates the resulting features, and stores
    the units returned first by ``Unitizer.unitize``. The connection is then
    yielded; when the generator is resumed, ``obliterate`` is called on it.

    NOTE(review): this looks like a pytest yield-fixture, but no decorator is
    visible in this view -- confirm. ``request`` is accepted but never used.
    """
    connection = TessMongoConnection('localhost', 27017, None, None,
                                     'lucvergtest')

    for raw_metadata in lucverg_metadata:
        text = Text.json_decode(raw_metadata)
        source_file = TessFile(text.path, metadata=text)
        connection.insert(text)

        tokenizer = LatinTokenizer(connection)
        tokens, tags, features = tokenizer.tokenize(
            source_file.read(), text=source_file.metadata)

        # Index the features already stored for this language by their
        # (feature, token) pair.
        known = {}
        for stored in connection.find(Feature.collection,
                                      language=text.language):
            known[(stored.feature, stored.token)] = stored

        to_insert = []
        to_update = []
        for feature in features:
            key = (feature.feature, feature.token)
            if key in known:
                # Already seen: reuse the known entry's id and update it.
                feature.id = known[key].id
                to_update.append(feature)
                continue
            # First sighting: queue for insertion and remember it so later
            # duplicates in this batch take the update path.
            to_insert.append(feature)
            known[key] = feature

        connection.insert(to_insert)
        connection.update(to_update)

        line_units, _ = Unitizer().unitize(tokens, tags, source_file.metadata)
        connection.insert_nocheck(line_units)

    yield connection

    obliterate(connection)
def main():
    """Register bigrams for every text that needs multitext enabled.

    Reads database credentials from the JSON file at ``args.db_cred``,
    connects to MongoDB, and walks all texts in ``Text.collection``. For each
    text for which ``needs_multitext_enabled`` is true, ``register_bigrams``
    is called. A failure on one text is logged with its traceback and the
    loop continues with the next text.

    Raises:
        SystemExit: with status 1 when interrupted by ``KeyboardInterrupt``
            while a text is being processed.
    """
    args = parse_args()
    logger = build_logger(args.lfn, args.log)

    with open(args.db_cred) as ifh:
        db_cred = json.load(ifh)
    conn = TessMongoConnection(db_cred['host'],
                               db_cred['port'],
                               db_cred['user'],
                               db_cred['password'],
                               db=db_cred['database'])

    # HOME is overridden before the tesserae import below; the import is kept
    # at function scope to preserve that ordering.
    # NOTE(review): presumably tesserae resolves paths from HOME at import
    # time -- confirm.
    os.environ['HOME'] = args.home
    from tesserae.utils.multitext import register_bigrams

    for text in tqdm(conn.find(Text.collection)):
        if not needs_multitext_enabled(text):
            continue
        logger.info(f'Extracting bigrams: {text.author}\t{text.title}')
        try:
            register_bigrams(conn, text)
        except KeyboardInterrupt:
            logger.info('KeyboardInterrupt')
            sys.exit(1)
        except Exception:
            # Best effort: log the failure (logger.exception already attaches
            # the active traceback) and move on to the next text.
            # SystemExit and GeneratorExit are no longer swallowed here.
            logger.exception(f'Failed: {text.author}\t{text.title}')
def main():
    """Delete a text from Tesserae"""
    # NOTE(review): this function is damaged. A scrubbing artifact (`******`)
    # replaced everything between the getpass prompt and the "Could not find
    # text" message. The assignments of `connection` and `found` used below
    # are in the lost span, and the message string appears to have lost its
    # f-prefix along with its enclosing call. Restore the original body from
    # version control; do not reconstruct it by hand.
    args = parse_args()
    if args.password:
        # Prompts for the MongoDB password when args.password is truthy.
        password = getpass.getpass(prompt='Tesserae MongoDB Password: '******'Could not find text with ID {args.text_id}')
    # Removes the first entry of `found` through `connection`.
    remove_text(connection, found[0])
def main():
    """Perform Tesserae search and display the top 10 results"""
    # NOTE(review): this function is damaged. A scrubbing artifact (`******`)
    # replaced everything between the getpass prompt and the first
    # `.replace('-', ' ')` call. The assignments of `connection` and
    # `source_author` used below are in the lost span. Restore it from
    # version control; do not reconstruct it by hand.
    # NOTE(review): indentation was not preserved in this view; the nesting
    # shown here (in particular what belongs to the `else:` branch) is
    # inferred from the logic and should be confirmed.
    args = parse_args()
    if args.password:
        password = getpass(prompt='Tesserae MongoDB Password: '******'-', ' ')
    # Source title: lower-cased, hyphens replaced by spaces.
    source_title = args.source_title.lower().replace('-', ' ')
    # The first text matching author and title is used.
    source = TextOptions(text=connection.find('texts', author=source_author, title=source_title)[0], unit_type=args.source_unit)
    # NOTE(review): the target side replaces underscores, while the source
    # title above replaces hyphens -- confirm which separator is intended.
    target_author = args.target_author.lower().replace('_', ' ')
    target_title = args.target_title.lower().replace('_', ' ')
    target = TextOptions(text=connection.find('texts', author=target_author, title=target_title)[0], unit_type=args.target_unit)
    # Timing starts here, so stoplist creation and the cache lookup are
    # included in the reported duration.
    start = time.time()
    # The stoplist basis is the literal 'corpus' when requested, otherwise
    # the IDs of the two texts being compared.
    stopword_indices = create_stoplist( connection, args.n_stopwords, args.feature, source.text.language, basis='corpus' if args.stopword_basis == 'corpus' else [source.text.id, target.text.id])
    stopword_tokens = get_stoplist_tokens(connection, stopword_indices, args.feature, source.text.language)
    # These parameters are passed to check_cache and stored on a new Search.
    parameters = {
        'source': {
            'object_id': str(source.text.id),
            'units': source.unit_type
        },
        'target': {
            'object_id': str(target.text.id),
            'units': target.unit_type
        },
        'method': {
            'name': SparseMatrixSearch.matcher_type,
            'feature': args.feature,
            'stopwords': stopword_tokens,
            'freq_basis': args.freq_basis,
            'max_distance': args.max_distance,
            'distance_basis': args.distance_basis
        }
    }
    results_id = check_cache(connection, parameters['source'], parameters['target'], parameters['method'])
    if results_id:
        # Cache hit: reuse the stored Search record.
        print('Cached results found.')
        search = connection.find(Search.collection, results_id=results_id, search_type=NORMAL_SEARCH)[0]
    else:
        # Cache miss: insert a new Search record, then run the search.
        search = Search(results_id=uuid.uuid4().hex, search_type=NORMAL_SEARCH, parameters=parameters)
        connection.insert(search)
        search_params = {
            'source': source,
            'target': target,
            'feature': parameters['method']['feature'],
            'stopwords': parameters['method']['stopwords'],
            'freq_basis': parameters['method']['freq_basis'],
            'max_distance': parameters['method']['max_distance'],
            'distance_basis': parameters['method']['distance_basis'],
            'min_score': 0
        }
        _run_search(connection, search, SparseMatrixSearch.matcher_type, search_params)
    matches = get_results(connection, search.id, PageOptions())
    # `end` holds the elapsed seconds since `start`, not a timestamp.
    end = time.time() - start
    # Highest score first.
    matches.sort(key=lambda x: x['score'], reverse=True)
    print(f'Search found {len(matches)} matches in {end}s.')
    # Number of rows actually shown: at most 10.
    display_count = 10 if len(matches) >= 10 else len(matches)
    print(f'The Top {display_count} Matches')
    print('------------------')
    print()
    print("Result\tScore\tSource Locus\tTarget Locus\tShared")
    # enumerate starts at 0, so the first printed result is labelled "0.".
    for i, m in enumerate(matches[:10]):
        shared = m['matched_features']
        print(f'{i}.\t{m["score"]}\t{m["source_tag"]}\t{m["target_tag"]}\t' f'{[t for t in shared]}')