def index(self, source_path, source_reader, index_location, index_type,
          features_location=None, db_location='localhost', db_name=None,
          db_type='mongo', limit=None):
    """Index a source collection, optionally storing per-item metadata in a database.

    A reader is built for the source, an index is built over the reader, and the
    resulting async generator of indexed items is either stored in the configured
    database or simply drained. Errors from the framework are logged, not raised.
    """
    try:
        reader = Reader.factory(
            source_path, source_reader,
            features_location=features_location, limit=limit)
        loop = asyncio.get_event_loop()
        try:
            index = Index.factory(reader, index_location, index_type, loop)
            items = index.index(features_location=features_location)
            if all((db_location, db_name, db_type)):
                # A database is fully configured: persist indexed items.
                db = Database.factory(db_location, db_name, db_type, loop)
                pending = db.store(items)
            else:
                # No database: just drain the async generator so indexing runs.
                pending = Index.no_store(items)
            loop.run_until_complete(pending)
        finally:
            # Always shut down async generators before closing the loop.
            loop.run_until_complete(loop.shutdown_asyncgens())
            loop.close()
    except ArmyAntException as e:
        logger.error(e)
async def autocomplete(request):
    """Serve JSON autocomplete matches for a query substring.

    Uses the engine given in the query string (falling back to the first
    configured engine) and returns either {'matches': [...]} or
    {'error': '...'} when the index raises a known error.
    """
    engines = request.app['engines']
    engine = request.GET.get('engine')
    if engine is None:
        # Default to the first configured engine.
        engine = list(engines.keys())[0]

    substring = request.GET.get('substring')
    matches = []
    error = None
    if substring:
        try:
            loop = asyncio.get_event_loop()
            index_conf = engines[engine]['index']
            index = Index.open(index_conf['location'], index_conf['type'], loop)
            matches = await index.autocomplete(substring)
        except (ArmyAntException, ClientOSError) as e:
            error = e

    response = {'error': str(error)} if error else {'matches': matches}
    return web.json_response(
        response, dumps=lambda obj: json.dumps(obj, default=to_serializable))
def __init__(self, task, eval_location, query_type, retrieval_task):
    """Initialize the evaluator: open the task's index and record query settings."""
    super().__init__(task, eval_location)
    self.query_type = query_type
    self.retrieval_task = retrieval_task
    # Open the index configured on the task using the current event loop.
    self.loop = asyncio.get_event_loop()
    self.index = Index.open(
        self.task.index_location, self.task.index_type, self.loop)
def inspect(self, index_location, index_type, workdir='.', feature=None, interactive=False):
    """Run one inspection feature over an index, or an interactive inspection shell.

    In interactive mode, reads feature names from a `feature>` prompt (with
    readline tab completion) until `\\quit`, EOF or Ctrl-C; otherwise runs the
    single `feature` given. Framework errors are logged and, in interactive
    mode, do not end the session.
    """
    # One of the two input modes is required.
    if feature is None and not interactive:
        logger.error("Must either use --feature or --interactive")
        return
    if interactive:
        # Offer the known inspection commands via readline tab completion.
        completer = SimpleCompleter([
            r'\quit', 'summary', 'synonym-summary', 'context-summary',
            'list-nodes', 'list-hyperedges'
        ])
        readline.parse_and_bind("tab: complete")
        readline.set_completer(completer.complete)
        # Remove backslash from delimiters so commands like \quit complete whole.
        readline.set_completer_delims(
            readline.get_completer_delims().replace('\\', ''))
    try:
        loop = asyncio.get_event_loop()
        while True:
            try:
                if interactive:
                    feature = input('feature> ')
                    if feature == r'\quit':
                        break
                    if feature.strip() == '':
                        continue
                # Re-open the index each iteration and run the inspection.
                index = Index.open(index_location, index_type, loop)
                loop.run_until_complete(index.inspect(feature, workdir))
            except ArmyAntException as e:
                # Log and keep the interactive session alive.
                logger.error(e)
            except (EOFError, KeyboardInterrupt):
                # Treat EOF/Ctrl-C as an explicit quit.
                print("\\quit")
                break
            if not interactive:
                # Single-shot mode: one feature, then exit the loop.
                break
    finally:
        # Drain async generators before closing the loop.
        loop.run_until_complete(loop.shutdown_asyncgens())
        loop.close()
def __init__(self, task, eval_location):
    """Set up a Living Labs evaluator from a packed eval_location string.

    `eval_location` packs three values as "<base_url>::<api_key>::<run_id>".
    Opens the task's index, installs an HTTP response cache, and prepares a
    per-run pickle cache directory.
    """
    super().__init__(task, eval_location)

    parts = eval_location.split('::')
    if len(parts) != 3:
        raise ArmyAntException(
            "Must provide the base_url, api_key and run_id, separated by '::'"
        )
    base_url, api_key, run_id = parts

    self.base_url = urljoin(base_url, 'api/v2/participant/')
    # The API authenticates with the key as username and an empty password.
    self.auth = HTTPBasicAuth(api_key, '')
    self.headers = {'Content-Type': 'application/json'}

    # Cache Living Labs API responses for 3 hours (10800 s).
    requests_cache.install_cache('living_labs_cache', expire_after=10800)

    self.loop = asyncio.get_event_loop()
    self.index = Index.open(task.index_location, task.index_type, self.loop)
    self.run_id = run_id

    # Per-run directory for pickled intermediate results.
    self.pickle_dir = '/opt/army-ant/cache/%s' % run_id
    if not os.path.exists(self.pickle_dir):
        os.mkdir(self.pickle_dir)
def __init__(self, task, eval_location):
    """Initialize the evaluator and open the index configured on the task."""
    super().__init__(task, eval_location)
    event_loop = asyncio.get_event_loop()
    self.loop = event_loop
    self.index = Index.open(
        self.task.index_location, self.task.index_type, event_loop)
def search(self, index_location, index_type, query_type=None, task=None,
           base_index_location=None, base_index_type=None,
           ranking_function=None, ranking_params=None,
           db_location='localhost', db_name=None, db_type='mongo',
           query=None, offset=0, limit=10, interactive=False):
    """Run a search against an index and print ranked results to stdout.

    Either a single `query` is run, or `interactive` opens a `query>` prompt
    (with readline completion) that also accepts `\\quit` and
    `\\set_ranking_<name>` commands. When a database is fully configured,
    per-document metadata is retrieved and printed under each result.
    Framework errors are logged; they end a single-shot run but not an
    interactive session.
    """
    if query is None and not interactive:
        logger.error("Must either use --query or --interactive")
        return

    if interactive:
        completer = SimpleCompleter([
            r'\quit', r'\set_ranking_random_walk',
            r'\set_ranking_biased_random_walk'
        ])
        readline.parse_and_bind("tab: complete")
        readline.set_completer(completer.complete)
        # Remove backslash from delimiters so \commands complete as a whole.
        readline.set_completer_delims(
            readline.get_completer_delims().replace('\\', ''))

    try:
        loop = asyncio.get_event_loop()
        while True:
            try:
                if interactive:
                    query = input('query> ')
                    if query == r'\quit':
                        break
                    if query.strip() == '':
                        continue
                    ranking = re.match(r'\\set_ranking_(.*)', query)
                    if ranking:
                        ranking_function = ranking.group(1)
                        print("==> Switched to '%s' ranking function" % ranking_function)
                        continue

                # Parse "k1=v1,k2=v2" into a dict once. BUG FIX: previously
                # this re-parsed on every interactive iteration; the second
                # parse failed on the already-built dict and silently reset
                # the ranking parameters to {}.
                if not isinstance(ranking_params, dict):
                    try:
                        ranking_params = dict(
                            tuple(param.split('='))
                            for param in ranking_params.split(','))
                    except Exception:
                        logger.warning("Empty ranking parameters for %s" % ranking_function)
                        ranking_params = {}

                index = Index.open(index_location, index_type, loop)
                response = loop.run_until_complete(
                    index.search(
                        query, offset, limit,
                        query_type=query_type,
                        task=task,
                        base_index_location=base_index_location,
                        # BUG FIX: base_index_type was previously passed
                        # base_index_location by mistake.
                        base_index_type=base_index_type,
                        ranking_function=ranking_function,
                        ranking_params=ranking_params))

                if db_location and db_name and db_type:
                    db = Database.factory(db_location, db_name, db_type, loop)
                    metadata = loop.run_until_complete(
                        db.retrieve(response['results']))
                else:
                    # Use an empty mapping (not a list) so the membership test
                    # and item lookup below are type-consistent.
                    metadata = {}

                for (result, i) in zip(response['results'],
                                       range(offset, offset + limit)):
                    print("==> %3d %7.2f [%s] %s" % (
                        i + 1, result['score'], result['type'],
                        result['name'] if result['name'] else result['id']))
                    doc_id = result['id']
                    if doc_id in metadata:
                        for item in metadata[doc_id].items():
                            print("\t%10s: %s" % item)
                    print()
            except ArmyAntException as e:
                logger.error(e)
            except (EOFError, KeyboardInterrupt):
                print("\\quit")
                break
            if not interactive:
                break
    finally:
        # Drain async generators before closing the loop.
        loop.run_until_complete(loop.shutdown_asyncgens())
        loop.close()
async def search(request):
    """Web search handler: run a query against the selected engine and render results.

    Reads engine, query, ranking and paging options from the query string,
    searches the engine's index, fetches per-result metadata from its
    database, and returns either an aiohttp JSON response (format=json) or a
    plain dict for the HTML template renderer.
    """
    start_time = time.time()
    engine = request.GET.get('engine')
    if engine is None:
        # Default to the first configured engine.
        engine = list(request.app['engines'].keys())[0]
    index_features = request.app['engines'][engine].get('features', [])
    ranking_function = request.GET.get('ranking_function')
    if ranking_function is None:
        # Fall back to the engine's configured default ranking function id.
        ranking_function = request.app['engines'][engine].get(
            'ranking', {}).get('default', {}).get('id')
    # Collect ranking_param_<name>=<value> query-string entries into a dict.
    ranking_params = {}
    for k in request.GET.keys():
        if k.startswith('ranking_param_'):
            _, _, param_name = k.split('_', 2)
            param_value = request.GET.get(k)
            ranking_params[param_name] = param_value
    debug = request.GET.get('debug', 'off')
    task = request.GET.get('task')
    base_index_location = request.GET.get('base_index_location')
    base_index_type = request.GET.get('base_index_type')
    query_type = request.GET.get('type')
    query = request.GET.get('query')
    error = None
    trace = None
    trace_ascii = None
    offset = 0
    # Debug mode shows a larger page by default.
    limit = 30 if debug == 'on' else 5
    if query:
        offset = int(request.GET.get('offset', str(offset)))
        limit = int(request.GET.get('limit', str(limit)))
        try:
            loop = asyncio.get_event_loop()
            index = Index.open(
                request.app['engines'][engine]['index']['location'],
                request.app['engines'][engine]['index']['type'], loop)
            engine_response = await index.search(
                query, offset, limit, query_type=query_type, task=task,
                base_index_location=base_index_location,
                base_index_type=base_index_type,
                ranking_function=ranking_function,
                ranking_params=(ranking_params or None),
                debug=(debug == 'on'))
            # Prefer the engine-reported total when available; otherwise use
            # the number of results actually returned.
            num_docs = len(engine_response['results'])
            if engine_response['numDocs']:
                num_docs = engine_response['numDocs']
            # JVM-backed engines may return a java.lang.Long; unbox it so
            # arithmetic and JSON serialization below work.
            if type(num_docs) is java.lang.Long:
                num_docs = num_docs.longValue()
            if 'trace' in engine_response:
                trace = engine_response['trace']
            if 'traceASCII' in engine_response:
                trace_ascii = engine_response['traceASCII']
            results = engine_response['results']
            # Current 1-based page number and total page count.
            page = int((offset + limit) / limit)
            pages = math.ceil(num_docs / limit)
            db = Database.factory(
                request.app['engines'][engine]['db']['location'],
                request.app['engines'][engine]['db']['name'],
                request.app['engines'][engine]['db']['type'], loop)
            metadata = await db.retrieve(results)
        except (ArmyAntException, ClientOSError) as e:
            error = e
    else:
        # No query: render an empty result page.
        results = []
        num_docs = 0
        page = None
        pages = None
        metadata = {}
    end_time = time.time()
    if error:
        response = {
            'engine': engine,
            'index_features': index_features,
            'task': task,
            'query': query,
            'query_type': query_type,
            'debug': debug,
            'time': end_time - start_time,
            'error': str(error)
        }
    else:
        response = {
            'engine': engine,
            'index_features': index_features,
            'task': task,
            'rankingFunction': ranking_function,
            'rankingParams': ranking_params,
            'query': query,
            'query_type': query_type,
            'debug': debug,
            'time': end_time - start_time,
            'offset': offset,
            'limit': limit,
            'numDocs': num_docs,
            'page': page,
            'pages': pages,
            'results': results,
            'metadata': metadata,
            'trace': trace,
            'trace_ascii': trace_ascii
        }
    fmt = request.GET.get('format', 'html')
    if fmt == 'json':
        return web.json_response(
            response,
            dumps=lambda obj: json.dumps(obj, default=to_serializable))
    else:
        # HTML path: the template layer consumes the raw dict.
        return response
async def _rank_correlation_repeat(index, query, cutoff, ranking_function,
                                   ranking_params, filename, repeat, label,
                                   force):
    """Return the ranked DataFrame for one repeat of one index.

    Loads a previously saved ranking from `filename` when it exists (unless
    `force`); otherwise runs the query and saves the ranking as CSV with
    columns rank, score, id. `label` is 'A' or 'B', used only for logging.
    """
    if not force and os.path.exists(filename):
        df = pd.read_csv(filename, converters={'id': lambda d: str(d)})
        logger.warning(
            "Loaded existing file for repeat %d of index %s: %s (use --force to recompute)" % (
                repeat, label, filename))
        return df

    result_set = await index.search(
        query, 0, cutoff, task=Index.RetrievalTask.document_retrieval,
        ranking_function=ranking_function, ranking_params=ranking_params)

    # Build the frame in one pass (avoids the quadratic row-by-row append).
    df = pd.DataFrame(
        [{'score': result.score, 'id': result.id} for result in result_set],
        columns=['score', 'id'])
    df.index += 1
    df['rank'] = df.index
    df = df[['rank', 'score', 'id']]
    df.to_csv(filename, index=False)
    logger.info("Saved repeat %d for index %s in %s" % (repeat, label, filename))
    return df


async def rank_correlation(index_a_location, index_a_type, index_b_location,
                           index_b_type, ranking_fun_a, ranking_fun_b,
                           ranking_params_a, ranking_params_b, topics_path,
                           output_path, cutoff, repeats, method, force, loop):
    """Compare two index/ranking configurations per topic via rank correlation.

    For each topic, runs `repeats` rankings on both configurations (cached as
    CSV under `output_path`), computes Spearman's rho and the Jaccard index
    between the A and B rankings, and writes per-topic averages plus overall
    means to `output_path`.
    """
    # BUG FIX: previously `method in 'spearman'` — a substring test that also
    # accepted '', 'pear', etc. Require an exact match.
    assert method == 'spearman'

    index_a = Index.open(index_a_location, index_a_type, loop)
    index_b = Index.open(index_b_location, index_b_type, loop)
    topics = etree.parse(topics_path)

    os.makedirs(output_path, exist_ok=True)

    # NOTE: 'ranking_funtion_*' (sic) column names are kept as-is, since
    # downstream consumers may depend on these exact CSV headers.
    columns = [
        'topic_id', 'index_type_a', 'ranking_funtion_a', 'ranking_params_a',
        'avg_num_results_a', 'index_type_b', 'ranking_funtion_b',
        'ranking_params_b', 'avg_num_results_b', 'avg_rho', 'avg_jaccard']
    correlation_rows = []

    for topic in topics.xpath('//topic'):
        topic_id = get_first(topic.xpath('@id'))
        query = get_first(topic.xpath('title/text()'))
        logger.info("Processing topic %s [ %s ]" % (topic_id, query))

        path = os.path.join(output_path, 'topic_%s' % topic_id)
        os.makedirs(path, exist_ok=True)

        rhos = []
        jaccards = []
        num_results_a = []
        num_results_b = []

        for repeat in range(1, repeats + 1):
            # Zero-pad repeat numbers to the width of the largest repeat.
            filename_a = os.path.join(
                path, 'a_repeat_%%0%dd.csv' % len(str(repeats)) % repeat)
            filename_b = os.path.join(
                path, 'b_repeat_%%0%dd.csv' % len(str(repeats)) % repeat)

            df_a = await _rank_correlation_repeat(
                index_a, query, cutoff, ranking_fun_a, ranking_params_a,
                filename_a, repeat, 'A', force)
            df_b = await _rank_correlation_repeat(
                index_b, query, cutoff, ranking_fun_b, ranking_params_b,
                filename_b, repeat, 'B', force)

            num_results_a.append(len(df_a))
            num_results_b.append(len(df_b))
            rhos.append(spearman_rho(df_a, df_b))
            jaccards.append(jaccard_index(df_a, df_b))

        correlation_rows.append({
            'topic_id': topic_id,
            'index_type_a': index_a_type,
            'ranking_funtion_a': ranking_fun_a,
            'ranking_params_a': '_'.join(
                '_'.join(d) for d in ranking_params_a.items()),
            'avg_num_results_a': np.mean(num_results_a),
            'index_type_b': index_b_type,
            'ranking_funtion_b': ranking_fun_b,
            'ranking_params_b': '_'.join(
                '_'.join(d) for d in ranking_params_b.items()),
            'avg_num_results_b': np.mean(num_results_b),
            'avg_rho': np.mean(rhos),
            'avg_jaccard': np.mean(jaccards)
        })

    correlations = pd.DataFrame(correlation_rows, columns=columns)

    correlations_filename = os.path.join(
        output_path, 'comparison_per_topic-%d_repeats.csv' % repeats)
    correlations.to_csv(correlations_filename, index=False)
    logger.info(
        "Saved correlations per topic (%d repeats) to %s"
        % (repeats, correlations_filename))

    mean_correlation = np.mean(correlations['avg_rho'])
    mean_correlation_filename = os.path.join(
        output_path, 'mean_correlation-%d_repeats' % repeats)
    # BUG FIX: use a context manager so the file handle is closed
    # deterministically (previously leaked via open(...).write(...)).
    with open(mean_correlation_filename, 'w') as f:
        f.write('%15f' % mean_correlation)
    logger.info("Saved mean correlation (%d repeats) to %s"
                % (repeats, mean_correlation_filename))

    mean_jaccard = np.mean(correlations['avg_jaccard'])
    mean_jaccard_filename = os.path.join(
        output_path, 'mean_jaccard-%d_repeats' % repeats)
    with open(mean_jaccard_filename, 'w') as f:
        f.write('%15f' % mean_jaccard)
    logger.info("Saved mean Jaccard index (%d repeats) to %s"
                % (repeats, mean_jaccard_filename))
async def rws_rank_concordance(index_location, index_type, rw_length,
                               rw_repeats, topics_path, output_path, cutoff,
                               repeats, method, force, loop):
    """Measure random-walk ranking stability via Kendall's W concordance.

    For every combination of random-walk length `l` and repeats `r`, each
    topic is searched `repeats` times with the 'random_walk' ranking function
    (runs cached as CSV under `output_path`), and the concordance of the
    repeated rankings is computed with Kendall's W. Per-topic concordances
    and their geometric means per (l, r) are written to `output_path`.
    """
    # BUG FIX: previously `method in 'kendall_w'` — a substring test that
    # also accepted '', 'ken', etc. Require an exact match.
    assert method == 'kendall_w'

    index = Index.open(index_location, index_type, loop)
    topics = etree.parse(topics_path)
    os.makedirs(output_path, exist_ok=True)

    concordance_rows = []

    for length in rw_length:
        for walk_repeats in rw_repeats:
            for topic in topics.xpath('//topic'):
                topic_id = get_first(topic.xpath('@id'))
                query = get_first(topic.xpath('title/text()'))
                logger.info(
                    "Processing topic %s [ %s ], using l = %d and r = %d" % (
                        topic_id, query, length, walk_repeats))

                path = os.path.join(
                    output_path, 'l_%d-r_%d' % (length, walk_repeats),
                    'topic_%s' % topic_id)
                os.makedirs(path, exist_ok=True)

                df_repeats = []
                num_results = []

                for repeat in range(1, repeats + 1):
                    # Zero-pad repeat numbers to the width of the largest.
                    filename = os.path.join(
                        path, 'repeat_%%0%dd.csv' % len(str(repeats)) % repeat)

                    if not force and os.path.exists(filename):
                        df = pd.read_csv(filename)
                        df_repeats.append(df)
                        num_results.append(len(df))
                        logger.warning(
                            "Loaded existing file for repeat %d: %s (use --force to recompute)" % (
                                repeat, filename))
                        continue

                    result_set = await index.search(
                        query, 0, cutoff,
                        task=Index.RetrievalTask.document_retrieval,
                        ranking_function='random_walk',
                        ranking_params={'l': str(length),
                                        'r': str(walk_repeats)})

                    # Build the frame in one pass (avoids the quadratic
                    # row-by-row append).
                    df = pd.DataFrame(
                        [{'score': result.score, 'id': result.id}
                         for result in result_set],
                        columns=['score', 'id'])
                    df.index += 1
                    df.to_csv(filename, index_label='rank')
                    logger.info("Saved repeat %d in %s" % (repeat, filename))

                    # Re-read the saved CSV so cached and fresh runs feed
                    # kendall_w with identically-typed frames.
                    df_repeats.append(pd.read_csv(filename))
                    num_results.append(len(result_set))

                concordance_rows.append({
                    'l': length,
                    'r': walk_repeats,
                    'topic_id': topic_id,
                    'avg_num_results': np.mean(num_results),
                    'w': kendall_w(df_repeats)
                })

    correlations = pd.DataFrame(
        concordance_rows,
        columns=['l', 'r', 'topic_id', 'avg_num_results', 'w'])

    correlations_filename = os.path.join(
        output_path, 'concordances_per_topic-%d_repeats.csv' % repeats)
    correlations.to_csv(correlations_filename, index=False)
    logger.info(
        "Saved concordances per topic (%d repeats) to %s"
        % (repeats, correlations_filename))

    # Geometric mean of W per (l, r) combination.
    mean_correlation = correlations[['r', 'l', 'w']] \
        .groupby(['l', 'r']) \
        .agg(lambda x: gmean(x.values))

    mean_correlation_filename = os.path.join(
        output_path, 'gmean_concordances-%d_repeats.csv' % repeats)
    mean_correlation.to_csv(mean_correlation_filename)
    logger.info("Saved geometric mean concordances (%d repeats) to %s"
                % (repeats, mean_correlation_filename))