def test_latin_trigrams(minipop, mini_latin_metadata):
    """Visualize Latin sound trigrams stored in the database.

    Mirrors ``test_greek_trigrams``: dumps v5 trigram features and v3
    reference results side by side, then fails on purpose (``assert
    False``) so pytest shows the captured output.  Note that v5 results
    do not have stopwords filtered out, while v3 results probably do.
    """
    texts = minipop.find(
        Text.collection,
        title=[m['title'] for m in mini_latin_metadata])
    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)
    # Collect raw v5 feature rows from the first text's line units.
    target_units = _get_units(minipop, TextOptions(texts[0], 'line'), 'sound')
    raw_v5_results = [unit['features'] for unit in target_units]
    # Collect the v3 reference matches shipped alongside the corpus.
    raw_v3_results = _load_v3_results(
        texts[0].path, 'mini_latin_results_3gr.tab')
    v3_results = [match['matched_features'] for match in raw_v3_results]
    v5_results = []
    print('v5 results:')
    for feature_row in raw_v5_results:
        print(feature_row)
        for feature in feature_row:
            print(feature)
            feature = np.asarray(feature)
            print('array', feature)
            print(np.shape(feature))
            tokens = get_stoplist_tokens(minipop, feature, 'sound', 'latin')
            v5_results.append(tokens)
    print(v5_results)
    print('v3 results:')
    for match in v3_results:
        print(match)
    print('v5 length:', len(v5_results), 'v3 length:', len(v3_results))
    # Deliberate failure: forces pytest to display the printed comparison.
    assert False
def test_greek_trigrams(minipop, mini_greek_metadata):
    """Visualize Greek sound trigrams stored in the database.

    For the purpose of visualization.  Use to confirm that trigrams are
    being stored in the database correctly.  It should be noted that v5
    results do not have stopwords filtered out, while v3 results
    probably do.  Ends with a deliberate ``assert False`` so pytest
    shows the captured output.
    """
    texts = minipop.find(
        Text.collection,
        title=[m['title'] for m in mini_greek_metadata])
    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)
    # Collect raw v5 feature rows from the first text's line units.
    target_units = _get_units(minipop, TextOptions(texts[0], 'line'), 'sound')
    raw_v5_results = [unit['features'] for unit in target_units]
    # Collect the v3 reference matches shipped alongside the corpus.
    raw_v3_results = _load_v3_results(
        texts[0].path, 'mini_greek_results_3gr.tab')
    v3_results = [match['matched_features'] for match in raw_v3_results]
    v5_results = []
    print('v5 results:')
    for feature_row in raw_v5_results:
        print(feature_row)
        for feature in feature_row:
            feature = np.asarray(feature)
            tokens = get_stoplist_tokens(minipop, feature, 'sound', 'greek')
            v5_results.append(tokens)
    print(v5_results)
    print('v3 results:')
    for match in v3_results:
        print(match)
    print('v5 length:', len(v5_results), 'v3 length:', len(v3_results))
    # Deliberate failure: forces pytest to display the printed comparison.
    assert False
def main():
    """Perform Tesserae search and display the top 10 results.

    Reads source/target text selection and matcher settings from the
    command line, reuses cached results when an identical search has
    already been stored, and prints the top-scoring matches as a
    tab-separated table.
    """
    args = parse_args()
    # NOTE(review): the original credential/connection code here was
    # scrubbed by a secrets filter ('******'), leaving a syntax error.
    # Reconstructed from the surrounding argument usage — TODO confirm
    # against version control that this matches the original.
    if args.password:
        password = getpass(prompt='Tesserae MongoDB Password: ')
    else:
        password = ''
    connection = TessMongoConnection(
        args.host, args.port, args.user, password, db=args.database)
    # NOTE(review): source args normalize '-' but target args normalize
    # '_' — presumably intentional CLI convention; verify.
    source_author = args.source_author.lower().replace('-', ' ')
    source_title = args.source_title.lower().replace('-', ' ')
    source = TextOptions(
        text=connection.find('texts', author=source_author,
                             title=source_title)[0],
        unit_type=args.source_unit)
    target_author = args.target_author.lower().replace('_', ' ')
    target_title = args.target_title.lower().replace('_', ' ')
    target = TextOptions(
        text=connection.find('texts', author=target_author,
                             title=target_title)[0],
        unit_type=args.target_unit)

    start = time.time()
    # 'corpus' basis uses corpus-wide frequencies; otherwise restrict the
    # stoplist computation to the two texts being searched.
    stopword_indices = create_stoplist(
        connection, args.n_stopwords, args.feature, source.text.language,
        basis='corpus' if args.stopword_basis == 'corpus' else [
            source.text.id, target.text.id])
    stopword_tokens = get_stoplist_tokens(
        connection, stopword_indices, args.feature, source.text.language)
    parameters = {
        'source': {
            'object_id': str(source.text.id),
            'units': source.unit_type
        },
        'target': {
            'object_id': str(target.text.id),
            'units': target.unit_type
        },
        'method': {
            'name': SparseMatrixSearch.matcher_type,
            'feature': args.feature,
            'stopwords': stopword_tokens,
            'freq_basis': args.freq_basis,
            'max_distance': args.max_distance,
            'distance_basis': args.distance_basis
        }
    }
    results_id = check_cache(connection, parameters['source'],
                             parameters['target'], parameters['method'])
    if results_id:
        print('Cached results found.')
        search = connection.find(Search.collection, results_id=results_id,
                                 search_type=NORMAL_SEARCH)[0]
    else:
        search = Search(results_id=uuid.uuid4().hex,
                        search_type=NORMAL_SEARCH, parameters=parameters)
        connection.insert(search)
    search_params = {
        'source': source,
        'target': target,
        'feature': parameters['method']['feature'],
        'stopwords': parameters['method']['stopwords'],
        'freq_basis': parameters['method']['freq_basis'],
        'max_distance': parameters['method']['max_distance'],
        'distance_basis': parameters['method']['distance_basis'],
        'min_score': 0
    }
    # NOTE(review): _run_search is invoked even on a cache hit, as in the
    # original — presumably it is a no-op for completed searches; confirm.
    _run_search(connection, search, SparseMatrixSearch.matcher_type,
                search_params)
    matches = get_results(connection, search.id, PageOptions())
    end = time.time() - start
    matches.sort(key=lambda x: x['score'], reverse=True)
    print(f'Search found {len(matches)} matches in {end}s.')
    display_count = min(10, len(matches))
    print(f'The Top {display_count} Matches')
    print('------------------')
    print()
    print("Result\tScore\tSource Locus\tTarget Locus\tShared")
    # Slice by display_count (was a hard-coded [:10]); identical result,
    # but keeps the count and the slice in agreement.
    for i, m in enumerate(matches[:display_count]):
        shared = m['matched_features']
        print(f'{i}.\t{m["score"]}\t{m["source_tag"]}\t{m["target_tag"]}\t'
              f'{[t for t in shared]}')
def query_stopwords():
    """Build a stopwords list.

    Query parameters (all optional):
      * ``feature`` — feature type to rank (default ``'lemmata'``).
      * ``list_size`` — number of stopwords to return (default 10).
      * ``language`` — compute the list over all texts in this language;
        takes precedence over ``works``.
      * ``works`` — comma-separated text ObjectIds to use as the basis.

    Returns a JSON ``{'stopwords': [...]}`` payload, or a 400 error when
    the arguments are malformed or insufficient.
    """
    if not flask.request.args:
        # default response when no arguments are given
        return flask.jsonify({'stopwords': []})

    feature = flask.request.args.get('feature', 'lemmata')
    list_size = flask.request.args.get('list_size', 10)
    try:
        list_size = int(list_size)
    except ValueError:
        return apitess.errors.error(
            400,
            data={k: v for k, v in flask.request.args.items()},
            message='"list_size" must be an integer')

    # language takes precedence over works
    language = flask.request.args.get('language', None)
    if language:
        stopword_indices = create_stoplist(flask.g.db, list_size, feature,
                                           language)
        if len(stopword_indices) == 0:
            return apitess.errors.error(
                400,
                data={k: v for k, v in flask.request.args.items()},
                message='No stopwords found for feature "{}" in language "{}".'
                .format(feature, language))
        return flask.jsonify({
            'stopwords':
            get_stoplist_tokens(flask.g.db, stopword_indices, feature,
                                language)
        })

    works = flask.request.args.get('works', None)
    if works:
        oids, fails = apitess.utils.parse_works_arg(works)
        if fails:
            return apitess.errors.bad_object_ids(fails, flask.request.args)
        text_results = flask.g.db.find(tesserae.db.entities.Text.collection,
                                       _id=oids)
        if len(text_results) != len(oids):
            # figure out which works were not found in the database and report
            found = {str(r.id) for r in text_results}
            not_found = [obj_id for obj_id in oids if obj_id not in found]
            return apitess.errors.error(
                400,
                data={k: v for k, v in flask.request.args.items()},
                message=('The following works could not be found '
                         f'in the database: {not_found}'))
        stopword_indices = create_stoplist(
            flask.g.db, list_size, feature, text_results[0].language,
            basis=[str(t.id) for t in text_results])
        # BUGFIX: the original passed `language` here, which is always
        # None/empty in this branch (the `if language:` branch above was
        # not taken).  Use the language of the requested works, matching
        # the create_stoplist call just above.
        return flask.jsonify({
            'stopwords':
            get_stoplist_tokens(flask.g.db, stopword_indices, feature,
                                text_results[0].language)
        })

    # if we get here, then we didn't get enough information
    return apitess.errors.error(
        400,
        data={k: v for k, v in flask.request.args.items()},
        message=(
            'Insufficient information was given to calculate a stopwords '
            'list (Perhaps you forgot to specify "language" or "works").'))