예제 #1
0
async def autocomplete(request):
    """Suggest index terms that match a query-string ``substring``.

    Uses the engine named in the ``engine`` query parameter, falling back
    to the first configured engine. Returns a JSON response containing
    either ``matches`` or an ``error`` message.
    """
    engine = request.GET.get('engine')
    if engine is None:
        engine = list(request.app['engines'].keys())[0]

    substring = request.GET.get('substring')

    matches = []
    error = None

    if substring:
        index_conf = request.app['engines'][engine]['index']
        try:
            index = Index.open(index_conf['location'], index_conf['type'],
                               asyncio.get_event_loop())
            matches = await index.autocomplete(substring)
        except (ArmyAntException, ClientOSError) as e:
            error = e

    response = {'error': str(error)} if error else {'matches': matches}

    return web.json_response(
        response, dumps=lambda obj: json.dumps(obj, default=to_serializable))
예제 #2
0
    def __init__(self, task, eval_location, query_type, retrieval_task):
        """Initialize the evaluator and open its index.

        Args:
            task: Task descriptor providing ``index_location`` and
                ``index_type`` for the index to open.
            eval_location: Evaluation data location (passed to the
                superclass constructor).
            query_type: Stored for later use — presumably forwarded to
                search calls; confirm against callers.
            retrieval_task: Stored for later use — presumably selects the
                retrieval task for search calls; confirm against callers.
        """
        super().__init__(task, eval_location)

        # Open the index on the current event loop.
        self.loop = asyncio.get_event_loop()
        self.index = Index.open(self.task.index_location, self.task.index_type,
                                self.loop)
        self.query_type = query_type
        self.retrieval_task = retrieval_task
예제 #3
0
    def inspect(self,
                index_location,
                index_type,
                workdir='.',
                feature=None,
                interactive=False):
        """Inspect an index, either for a single feature or interactively.

        Args:
            index_location: Location of the index to open.
            index_type: Index backend type, passed to ``Index.open``.
            workdir: Working directory handed to ``Index.inspect``.
            feature: Inspection feature to run; required unless
                ``interactive`` is True.
            interactive: When True, repeatedly prompt for feature names
                (with tab completion) until ``\\quit`` or EOF/Ctrl-C.
        """
        if feature is None and not interactive:
            logger.error("Must either use --feature or --interactive")
            return

        if interactive:
            # Tab completion over the known inspection commands.
            completer = SimpleCompleter([
                r'\quit', 'summary', 'synonym-summary', 'context-summary',
                'list-nodes', 'list-hyperedges'
            ])
            readline.parse_and_bind("tab: complete")
            readline.set_completer(completer.complete)
            # Drop backslash from the completer delimiters so '\quit' is
            # treated as a single completable token.
            readline.set_completer_delims(
                readline.get_completer_delims().replace('\\', ''))

        try:
            # NOTE(review): if get_event_loop() ever raised, 'loop' would be
            # unbound in the finally clause below.
            loop = asyncio.get_event_loop()
            while True:
                try:
                    if interactive:
                        feature = input('feature> ')
                        if feature == r'\quit':
                            break
                        if feature.strip() == '':
                            continue

                    index = Index.open(index_location, index_type, loop)
                    loop.run_until_complete(index.inspect(feature, workdir))
                except ArmyAntException as e:
                    logger.error(e)
                except (EOFError, KeyboardInterrupt):
                    # Ctrl-D / Ctrl-C behaves like an explicit \quit.
                    print("\\quit")
                    break

                # One-shot mode: run the single requested feature and stop.
                if not interactive:
                    break
        finally:
            # Drain async generators before closing the loop.
            loop.run_until_complete(loop.shutdown_asyncgens())
            loop.close()
예제 #4
0
    def __init__(self, task, eval_location):
        """Initialize a Living Labs evaluator from a composite location.

        Args:
            task: Task descriptor providing ``index_location`` and
                ``index_type`` for the index to open.
            eval_location: String of the form ``base_url::api_key::run_id``.

        Raises:
            ArmyAntException: If ``eval_location`` does not split into
                exactly three ``::``-separated parts.
        """
        super().__init__(task, eval_location)
        try:
            base_url, api_key, run_id = eval_location.split('::')
        except ValueError:
            raise ArmyAntException(
                "Must provide the base_url, api_key and run_id, separated by '::'"
            )

        self.base_url = urljoin(base_url, 'api/v2/participant/')
        self.auth = HTTPBasicAuth(api_key, '')
        self.headers = {'Content-Type': 'application/json'}

        # Cache HTTP responses for 3 hours (10800 s) to spare the API.
        requests_cache.install_cache('living_labs_cache', expire_after=10800)

        self.loop = asyncio.get_event_loop()
        self.index = Index.open(task.index_location, task.index_type,
                                self.loop)

        self.run_id = run_id
        self.pickle_dir = '/opt/army-ant/cache/%s' % run_id
        # makedirs(exist_ok=True) avoids the exists()/mkdir() race of the
        # previous code and also creates missing parent directories.
        os.makedirs(self.pickle_dir, exist_ok=True)
예제 #5
0
    def __init__(self, task, eval_location):
        """Initialize the evaluator and open its index.

        Args:
            task: Task descriptor providing ``index_location`` and
                ``index_type`` for the index to open.
            eval_location: Evaluation data location (passed to the
                superclass constructor).
        """
        super().__init__(task, eval_location)

        # Open the index on the current event loop.
        self.loop = asyncio.get_event_loop()
        self.index = Index.open(self.task.index_location, self.task.index_type,
                                self.loop)
예제 #6
0
    def search(self,
               index_location,
               index_type,
               query_type=None,
               task=None,
               base_index_location=None,
               base_index_type=None,
               ranking_function=None,
               ranking_params=None,
               db_location='localhost',
               db_name=None,
               db_type='mongo',
               query=None,
               offset=0,
               limit=10,
               interactive=False):
        """Search an index from the command line, one-shot or interactively.

        Args:
            index_location: Location of the index to open.
            index_type: Index backend type, passed to ``Index.open``.
            query_type: Query type forwarded to ``Index.search``.
            task: Retrieval task forwarded to ``Index.search``.
            base_index_location: Location of an optional base index.
            base_index_type: Type of the optional base index.
            ranking_function: Ranking function identifier.
            ranking_params: Comma-separated ``k=v`` pairs, parsed per query.
            db_location: Host of the metadata database.
            db_name: Name of the metadata database.
            db_type: Database backend type (default ``mongo``).
            query: Query string; required unless ``interactive`` is True.
            offset: Result offset for pagination.
            limit: Maximum number of results to print.
            interactive: When True, repeatedly prompt for queries until
                ``\\quit`` or EOF/Ctrl-C.
        """
        if query is None and not interactive:
            logger.error("Must either use --query or --interactive")
            return

        if interactive:
            # Tab completion over the REPL commands.
            completer = SimpleCompleter([
                r'\quit', r'\set_ranking_random_walk',
                r'\set_ranking_biased_random_walk'
            ])
            readline.parse_and_bind("tab: complete")
            readline.set_completer(completer.complete)
            # Drop backslash from the completer delimiters so '\quit' is
            # treated as a single completable token.
            readline.set_completer_delims(
                readline.get_completer_delims().replace('\\', ''))

        # Acquire the loop before the try block so the finally clause can
        # never hit an unbound 'loop' if setup fails.
        loop = asyncio.get_event_loop()
        try:
            while True:
                try:
                    if interactive:
                        query = input('query> ')
                        if query == r'\quit':
                            break
                        if query.strip() == '':
                            continue

                        # '\set_ranking_<name>' switches the ranking function
                        # without running a search.
                        ranking = re.match(r'\\set_ranking_(.*)', query)
                        if ranking:
                            ranking_function = ranking.group(1)
                            print("==> Switched to '%s' ranking function" %
                                  ranking_function)
                            continue

                    try:
                        # Parse 'k1=v1,k2=v2' into a dict of ranking params.
                        ranking_params = dict(
                            tuple(param.split('='))
                            for param in ranking_params.split(','))
                    except Exception:
                        # Best effort: None or malformed params become {}.
                        logger.warning("Empty ranking parameters for %s" %
                                       ranking_function)
                        ranking_params = {}

                    index = Index.open(index_location, index_type, loop)
                    response = loop.run_until_complete(
                        index.search(query,
                                     offset,
                                     limit,
                                     query_type=query_type,
                                     task=task,
                                     base_index_location=base_index_location,
                                     # BUG FIX: base_index_location was passed
                                     # here instead of base_index_type.
                                     base_index_type=base_index_type,
                                     ranking_function=ranking_function,
                                     ranking_params=ranking_params))

                    if db_location and db_name and db_type:
                        db = Database.factory(db_location, db_name, db_type,
                                              loop)
                        metadata = loop.run_until_complete(
                            db.retrieve(response['results']))
                    else:
                        metadata = []

                    # Print ranked results, falling back to the id when a
                    # result has no name, followed by any stored metadata.
                    for (result, i) in zip(response['results'],
                                           range(offset, offset + limit)):
                        print("==> %3d %7.2f [%s] %s" %
                              (i + 1, result['score'], result['type'],
                               result['name']
                               if result['name'] else result['id']))
                        doc_id = result['id']
                        if doc_id in metadata:
                            for item in metadata[doc_id].items():
                                print("\t%10s: %s" % item)
                            print()
                except ArmyAntException as e:
                    logger.error(e)
                except (EOFError, KeyboardInterrupt):
                    # Ctrl-D / Ctrl-C behaves like an explicit \quit.
                    print("\\quit")
                    break

                # One-shot mode: run the single query and stop.
                if not interactive:
                    break
        finally:
            # Drain async generators before closing the loop.
            loop.run_until_complete(loop.shutdown_asyncgens())
            loop.close()
예제 #7
0
async def search(request):
    """Handle a web search request against one of the configured engines.

    Reads engine, query, ranking and paging parameters from the query
    string, runs the search, attaches document metadata from the engine's
    database, and returns either a JSON response (``format=json``) or a
    plain dict (presumably for template rendering — confirm with the
    route setup).
    """
    start_time = time.time()

    engine = request.GET.get('engine')

    # Default to the first configured engine.
    if engine is None:
        engine = list(request.app['engines'].keys())[0]

    index_features = request.app['engines'][engine].get('features', [])

    ranking_function = request.GET.get('ranking_function')
    if ranking_function is None:
        # Fall back to the engine's configured default ranking function.
        ranking_function = request.app['engines'][engine].get(
            'ranking', {}).get('default', {}).get('id')

    # Collect 'ranking_param_<name>' query args into a name -> value dict.
    ranking_params = {}
    for k in request.GET.keys():
        if k.startswith('ranking_param_'):
            _, _, param_name = k.split('_', 2)
            param_value = request.GET.get(k)
            ranking_params[param_name] = param_value

    debug = request.GET.get('debug', 'off')

    task = request.GET.get('task')
    base_index_location = request.GET.get('base_index_location')
    base_index_type = request.GET.get('base_index_type')
    query_type = request.GET.get('type')
    query = request.GET.get('query')

    error = None
    trace = None
    trace_ascii = None
    offset = 0
    # Debug mode shows more results per page.
    limit = 30 if debug == 'on' else 5

    if query:
        offset = int(request.GET.get('offset', str(offset)))
        limit = int(request.GET.get('limit', str(limit)))
        try:
            loop = asyncio.get_event_loop()
            index = Index.open(
                request.app['engines'][engine]['index']['location'],
                request.app['engines'][engine]['index']['type'], loop)
            engine_response = await index.search(
                query,
                offset,
                limit,
                query_type=query_type,
                task=task,
                base_index_location=base_index_location,
                base_index_type=base_index_type,
                ranking_function=ranking_function,
                ranking_params=(ranking_params or None),
                debug=(debug == 'on'))

            # Prefer the engine-reported total over the page length.
            num_docs = len(engine_response['results'])
            if engine_response['numDocs']:
                num_docs = engine_response['numDocs']

            # Presumably a JVM-bridged engine can return a java.lang.Long;
            # unwrap it into a Python int.
            if type(num_docs) is java.lang.Long:
                num_docs = num_docs.longValue()

            if 'trace' in engine_response:
                trace = engine_response['trace']

            if 'traceASCII' in engine_response:
                trace_ascii = engine_response['traceASCII']

            results = engine_response['results']
            # 1-based current page number and total page count.
            page = int((offset + limit) / limit)
            pages = math.ceil(num_docs / limit)

            db = Database.factory(
                request.app['engines'][engine]['db']['location'],
                request.app['engines'][engine]['db']['name'],
                request.app['engines'][engine]['db']['type'], loop)
            metadata = await db.retrieve(results)
        except (ArmyAntException, ClientOSError) as e:
            error = e
    else:
        # No query: render an empty result page.
        results = []
        num_docs = 0
        page = None
        pages = None
        metadata = {}

    end_time = time.time()

    if error:
        response = {
            'engine': engine,
            'index_features': index_features,
            'task': task,
            'query': query,
            'query_type': query_type,
            'debug': debug,
            'time': end_time - start_time,
            'error': str(error)
        }
    else:
        response = {
            'engine': engine,
            'index_features': index_features,
            'task': task,
            'rankingFunction': ranking_function,
            'rankingParams': ranking_params,
            'query': query,
            'query_type': query_type,
            'debug': debug,
            'time': end_time - start_time,
            'offset': offset,
            'limit': limit,
            'numDocs': num_docs,
            'page': page,
            'pages': pages,
            'results': results,
            'metadata': metadata,
            'trace': trace,
            'trace_ascii': trace_ascii
        }

    fmt = request.GET.get('format', 'html')
    if fmt == 'json':
        return web.json_response(
            response,
            dumps=lambda obj: json.dumps(obj, default=to_serializable))
    else:
        return response
예제 #8
0
async def rank_correlation(index_a_location, index_a_type, index_b_location, index_b_type,
                           ranking_fun_a, ranking_fun_b, ranking_params_a, ranking_params_b,
                           topics_path, output_path, cutoff, repeats, method, force, loop):
    """Compute per-topic rank correlation between two index/ranker pairs.

    For each topic in ``topics_path``, runs the query ``repeats`` times
    against both indexes (caching each run as a CSV under ``output_path``),
    then records the average Spearman's rho and Jaccard index across
    repeats. Writes a per-topic CSV plus overall mean correlation and
    mean Jaccard files to ``output_path``.

    Args:
        index_a_location, index_a_type: First index to open.
        index_b_location, index_b_type: Second index to open.
        ranking_fun_a, ranking_fun_b: Ranking functions for each index.
        ranking_params_a, ranking_params_b: Ranking parameter dicts.
        topics_path: Path to an XML topics file with //topic elements.
        output_path: Directory for cached runs and summary files.
        cutoff: Number of results to retrieve per run.
        repeats: Number of repeated runs per topic.
        method: Correlation method; only 'spearman' is supported.
        force: Recompute runs even if a cached CSV exists.
        loop: Event loop passed to ``Index.open``.
    """
    # BUG FIX: 'method in "spearman"' was a substring test, so values like
    # 'man' or 'spear' would pass; require exact equality instead.
    assert method == 'spearman'

    index_a = Index.open(index_a_location, index_a_type, loop)
    index_b = Index.open(index_b_location, index_b_type, loop)
    topics = etree.parse(topics_path)

    os.makedirs(output_path, exist_ok=True)

    # NOTE(review): 'ranking_funtion' (sic) is kept as-is to preserve the
    # existing CSV column names downstream consumers may rely on.
    correlations = pd.DataFrame(columns=[
        'topic_id', 'index_type_a', 'ranking_funtion_a', 'ranking_params_a', 'avg_num_results_a',
        'index_type_b', 'ranking_funtion_b', 'ranking_params_b', 'avg_num_results_b', 'avg_rho'])

    for topic in topics.xpath('//topic'):
        topic_id = get_first(topic.xpath('@id'))
        query = get_first(topic.xpath('title/text()'))

        logger.info("Processing topic %s [ %s ]" % (topic_id, query))

        path = os.path.join(output_path, 'topic_%s' % topic_id)
        os.makedirs(path, exist_ok=True)

        rhos = []
        jaccards = []
        num_results_a = []
        num_results_b = []

        for repeat in range(1, repeats + 1):
            # Zero-pad the repeat number to the width of 'repeats'.
            filename_a = os.path.join(path, 'a_repeat_%%0%dd.csv' % len(str(repeats)) % repeat)
            filename_b = os.path.join(path, 'b_repeat_%%0%dd.csv' % len(str(repeats)) % repeat)

            if not force and os.path.exists(filename_a):
                # Keep ids as strings so numeric-looking ids round-trip.
                df_a = pd.read_csv(filename_a, converters={'id': lambda d: str(d)})
                logger.warning("Loaded existing file for repeat %d of index A: %s (use --force to recompute)" % (
                    repeat, filename_a))
            else:
                result_set_a = await index_a.search(
                    query, 0, cutoff, task=Index.RetrievalTask.document_retrieval,
                    ranking_function=ranking_fun_a, ranking_params=ranking_params_a)
                df_a = pd.DataFrame(columns=['score', 'id'])

                for result in result_set_a:
                    df_a = df_a.append({
                        'score': result.score,
                        'id': result.id
                    }, ignore_index=True)

                # 1-based ranks derived from the result order.
                df_a.index += 1
                df_a['rank'] = df_a.index
                df_a = df_a[['rank', 'score', 'id']]
                df_a.to_csv(filename_a, index=False)

                logger.info("Saved repeat %d for index A in %s" % (repeat, filename_a))

            if not force and os.path.exists(filename_b):
                df_b = pd.read_csv(filename_b, converters={'id': lambda d: str(d)})
                logger.warning("Loaded existing file for repeat %d of index B: %s (use --force to recompute)" % (
                    repeat, filename_b))
            else:
                result_set_b = await index_b.search(
                    query, 0, cutoff, task=Index.RetrievalTask.document_retrieval,
                    ranking_function=ranking_fun_b, ranking_params=ranking_params_b)
                df_b = pd.DataFrame(columns=['score', 'id'])

                for result in result_set_b:
                    df_b = df_b.append({
                        'score': result.score,
                        'id': result.id
                    }, ignore_index=True)

                df_b.index += 1
                df_b['rank'] = df_b.index
                df_b = df_b[['rank', 'score', 'id']]
                df_b.to_csv(filename_b, index=False)

                logger.info("Saved repeat %d for index B in %s" % (repeat, filename_b))

            num_results_a.append(len(df_a))
            num_results_b.append(len(df_b))
            rhos.append(spearman_rho(df_a, df_b))
            jaccards.append(jaccard_index(df_a, df_b))

        correlations = correlations.append({
            'topic_id': topic_id,
            'index_type_a': index_a_type,
            'ranking_funtion_a': ranking_fun_a,
            'ranking_params_a': '_'.join('_'.join(d) for d in ranking_params_a.items()),
            'avg_num_results_a': np.mean(num_results_a),
            'index_type_b': index_b_type,
            'ranking_funtion_b': ranking_fun_b,
            'ranking_params_b': '_'.join('_'.join(d) for d in ranking_params_b.items()),
            'avg_num_results_b': np.mean(num_results_b),
            'avg_rho': np.mean(rhos),
            'avg_jaccard': np.mean(jaccards)
        }, ignore_index=True)

    correlations_filename = os.path.join(output_path, 'comparison_per_topic-%d_repeats.csv' % repeats)
    correlations.to_csv(correlations_filename, index=False)
    logger.info(
        "Saved correlations per topic (%d repeats) to %s" % (repeats, correlations_filename))

    mean_correlation = np.mean(correlations['avg_rho'])
    mean_correlation_filename = os.path.join(output_path, 'mean_correlation-%d_repeats' % repeats)
    # BUG FIX: 'open(...).write(...)' leaked the file handle; use a context
    # manager so the file is flushed and closed deterministically.
    with open(mean_correlation_filename, 'w') as f:
        f.write('%15f' % mean_correlation)
    logger.info("Saved mean correlation (%d repeats) to %s" % (repeats, mean_correlation_filename))

    mean_jaccard = np.mean(correlations['avg_jaccard'])
    mean_jaccard_filename = os.path.join(output_path, 'mean_jaccard-%d_repeats' % repeats)
    with open(mean_jaccard_filename, 'w') as f:
        f.write('%15f' % mean_jaccard)
    logger.info("Saved mean Jaccard index (%d repeats) to %s" % (repeats, mean_jaccard_filename))
예제 #9
0
async def rws_rank_concordance(index_location, index_type, rw_length, rw_repeats, topics_path, output_path,
                               cutoff, repeats, method, force, loop):
    """Measure random-walk ranking concordance over (l, r) parameter grids.

    For each combination of walk length ``l`` (from ``rw_length``) and
    walk repeats ``r`` (from ``rw_repeats``), and for each topic, runs the
    query ``repeats`` times with the 'random_walk' ranking function,
    caching each run as a CSV under ``output_path``, and records Kendall's
    W concordance across the repeated rankings. Writes a per-topic CSV and
    a geometric-mean summary grouped by (l, r).

    Args:
        index_location, index_type: Index to open.
        rw_length: Sequence of random-walk lengths to test.
        rw_repeats: Sequence of random-walk repeat counts to test.
        topics_path: Path to an XML topics file with //topic elements.
        output_path: Directory for cached runs and summary files.
        cutoff: Number of results to retrieve per run.
        repeats: Number of repeated runs per (l, r, topic) cell.
        method: Concordance method; only 'kendall_w' is supported.
        force: Recompute runs even if a cached CSV exists.
        loop: Event loop passed to ``Index.open``.
    """
    # BUG FIX: 'method in "kendall_w"' was a substring test, so values like
    # 'w' or 'ken' would pass; require exact equality instead.
    assert method == 'kendall_w'

    index = Index.open(index_location, index_type, loop)
    topics = etree.parse(topics_path)

    os.makedirs(output_path, exist_ok=True)

    correlations = pd.DataFrame(columns=['l', 'r', 'topic_id'])

    for i in range(len(rw_length)):
        for j in range(len(rw_repeats)):
            for topic in topics.xpath('//topic'):
                topic_id = get_first(topic.xpath('@id'))
                query = get_first(topic.xpath('title/text()'))

                logger.info("Processing topic %s [ %s ], using l = %d and r = %d" % (
                    topic_id, query, rw_length[i], rw_repeats[j]))

                path = os.path.join(output_path, 'l_%d-r_%d' % (rw_length[i], rw_repeats[j]), 'topic_%s' % topic_id)
                os.makedirs(path, exist_ok=True)

                df_repeats = []
                num_results = []

                for repeat in range(1, repeats + 1):
                    # Zero-pad the repeat number to the width of 'repeats'.
                    filename = os.path.join(path, 'repeat_%%0%dd.csv' % len(str(repeats)) % repeat)

                    if not force and os.path.exists(filename):
                        df = pd.read_csv(filename)
                        df_repeats.append(df)
                        num_results.append(len(df))
                        logger.warning("Loaded existing file for repeat %d: %s (use --force to recompute)" % (
                            repeat, filename))
                        continue

                    result_set = await index.search(query, 0, cutoff,
                                                    task=Index.RetrievalTask.document_retrieval,
                                                    ranking_function='random_walk',
                                                    ranking_params={'l': str(rw_length[i]), 'r': str(rw_repeats[j])})
                    df = pd.DataFrame(columns=['score', 'id'])

                    for result in result_set:
                        df = df.append({
                            'score': result.score,
                            'id': result.id
                        }, ignore_index=True)

                    # 1-based ranks; persisted with a 'rank' index column.
                    df.index += 1
                    df.to_csv(filename, index_label='rank')

                    logger.info("Saved repeat %d in %s" % (repeat, filename))

                    df_repeats.append(pd.read_csv(filename))
                    num_results.append(len(result_set))

                correlations = correlations.append({
                    'l': rw_length[i],
                    'r': rw_repeats[j],
                    'topic_id': topic_id,
                    'avg_num_results': np.mean(num_results),
                    'w': kendall_w(df_repeats)
                }, ignore_index=True)

    correlations_filename = os.path.join(output_path, 'concordances_per_topic-%d_repeats.csv' % repeats)
    correlations.to_csv(correlations_filename, index=False)
    logger.info(
        "Saved concordances per topic (%d repeats) to %s" % (repeats, correlations_filename))

    # Geometric mean of Kendall's W per (l, r) cell, across topics.
    mean_correlation = correlations[['r', 'l', 'w']] \
        .groupby(['l', 'r']) \
        .agg(lambda x: gmean(x.values))

    mean_correlation_filename = os.path.join(output_path, 'gmean_concordances-%d_repeats.csv' % repeats)
    mean_correlation.to_csv(mean_correlation_filename)
    logger.info("Saved geometric mean concordances (%d repeats) to %s" % (repeats, mean_correlation_filename))