Example 1
def simple_query(endpoint,
                 query,
                 fields,
                 filters,
                 size=None,
                 aggregations=None,
                 response_mode=False,
                 **kwargs):
    """Perform a simple query on Elasticsearch.

    Args:
        endpoint (str): The Elasticsearch endpoint.
        query (str): The query to make to ES.
        fields (list): List of fields to query.
        filters (list): List of ES filters.
        size (int): Number of documents to return.
        aggregations: Do not use this directly. See :obj:`clio_keywords`.
        response_mode: Do not use this directly. See :obj:`clio_lite_searchkit_lambda`.
    Returns:
        {total, docs} (tuple): {total number of docs}, {top :obj:`size` docs}
    """
    _query = {"_source": False}
    if isinstance(query, dict):
        _query['query'] = query
    else:
        _query['query'] = {
            "bool": {
                "must": [{
                    "multi_match": {
                        "query": query.lower(),
                        "fields": fields
                    }
                }],
                "filter":
                filters
            }
        }

    # Assume that if you want aggregations, you don't want anything else
    if aggregations is not None:
        _query['aggregations'] = aggregations
        _query['size'] = 0
        _query.pop('_source')
    elif size is not None:
        _query['size'] = size
    # Make the query
    logging.debug(_query)
    r = requests.post(url=endpoint,
                      data=json.dumps(_query),
                      params={"search_type": "dfs_query_then_fetch"},
                      **kwargs)
    # "Aggregation mode"
    if aggregations is not None:
        return extract_keywords(r)

    total, docs = extract_docs(r)
    # "Response mode"
    if response_mode and total == 0:
        return total, r
    return total, docs
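
A minimal usage sketch for simple_query. The endpoint URL, index and field
names below are illustrative assumptions, not values from the library; note
that any extra keyword arguments (e.g. headers) are forwarded to requests.post.

# Hypothetical usage: full-text query over two fields, keeping the top 10 hits.
endpoint = "https://my-es-cluster.example.com/articles/_search"  # assumed URL
total, docs = simple_query(endpoint,
                           query="machine learning",
                           fields=["title", "abstract"],
                           filters=[{"term": {"year": 2020}}],
                           size=10,
                           headers={"Content-Type": "application/json"})
print(f"{total} matching documents, top {len(docs)} returned")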
Example 2
def clio_search_iter(url, index, chunksize=1000, scroll='1m', **kwargs):
    """Perform a *bulk* (streamed) contextual search of Elasticsearch data.

    Args:
        url (str): URL path to bare ES endpoint.
        index (str): Index to query.
        chunksize (int): Chunk size to retrieve from Elasticsearch.
        query (str): The simple text query to Elasticsearch.
        fields (list): List of fields to query.
        n_seed_docs (int): Number of seed documents to retrieve.
        min_term_freq (int): Only consider seed terms which occur at least
                             this many times in the seed documents.
        max_query_terms (int): Maximum number of important terms to
                               identify in the seed documents.
        min_doc_frac (float): Only consider seed terms which appear in more
                              than this fraction of the seed docs.
        max_doc_frac (float): Only consider seed terms which appear in less
                              than this fraction of the seed docs.
        min_should_match (float): Fraction of important terms from the
                                  seed docs explicitly required to match.
        {pre,post}_filters (list): ES filters to supply to the
                                   {seed,expanded} queries.
        stop_words (list): A supplementary list of terms to ignore. Defaults
                           to standard English stop words.
        scroll (str): ES scroll time window (e.g. '1m').
    Yields:
        Single rows of data
    """
    try_pop(kwargs, 'limit')  # Ignore limit and offset
    try_pop(kwargs, 'offset')
    if chunksize > MAX_CHUNKSIZE:
        logging.warning(
            f'Will not consider chunksize greater than {MAX_CHUNKSIZE}. '
            f'Reverting to chunksize={MAX_CHUNKSIZE}.')
        chunksize = MAX_CHUNKSIZE
    # First search
    scroll_id, docs = clio_search(url=url,
                                  index=index,
                                  limit=chunksize,
                                  scroll=scroll,
                                  **kwargs)
    for row in docs:
        yield row
    # Keep scrolling if required
    endpoint = urllib.parse.urljoin(f'{url}/', '_search/scroll')
    while len(docs) == chunksize:
        r = requests.post(endpoint,
                          data=json.dumps({
                              'scroll': scroll,
                              'scroll_id': scroll_id
                          }),
                          headers={'Content-Type': 'application/json'})
        _, docs = extract_docs(r)
        for row in docs:
            yield row
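
A usage sketch for clio_search_iter, streaming all hits in chunks. The URL,
index and query parameters are illustrative assumptions; the remaining keyword
arguments are passed through to clio_search for the first (seed) request.

# Hypothetical usage: iterate over every expanded hit, 1000 docs per scroll page.
for row in clio_search_iter(url="https://my-es-cluster.example.com",  # assumed
                            index="articles",
                            query="machine learning",
                            fields=["title", "abstract"],
                            chunksize=1000,
                            scroll="1m"):
    handle_row(row)  # placeholder for your own row handler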
Example 3
def test_extract_docs(mocked_json):
    mocked_response = mock.MagicMock()
    hits = [{
        '_id': 'something',
        '_index': 'something',
        '_source': {
            'something': 'else'
        }
    }] * 100
    _total = 10
    mocked_json.loads.return_value = {'hits': {'total': _total, 'hits': hits}}
    total, docs = extract_docs(mocked_response)
    assert total == _total
    assert len(docs) == len(hits)
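
The test depends on a mocked_json fixture that is not shown in this snippet.
A plausible pytest sketch, assuming the functions under test live in a module
named clio_utils (a hypothetical name) which imports json at module level:

import pytest
from unittest import mock

@pytest.fixture
def mocked_json():
    # Patch the json module as seen by the code under test, so that
    # extract_docs parses the canned payload instead of a real response body.
    with mock.patch('clio_utils.json') as mocked:
        yield mocked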
Example 4
def more_like_this(endpoint,
                   docs,
                   fields,
                   limit,
                   offset,
                   min_term_freq,
                   max_query_terms,
                   min_doc_frac,
                   max_doc_frac,
                   min_should_match,
                   total,
                   stop_words=STOP_WORDS,
                   filters=[],
                   scroll=None,
                   response_mode=False,
                   post_aggregation={},
                   **kwargs):
    """Make an MLT query

    Args:
        endpoint (str): URL path to _search endpoint
        docs (list): Document index and ids to expand from.
        fields (list): List of fields to query.
        limit (int): Number of documents to return.
        offset (int): Offset from the highest ranked document.
        n_seed_docs (int): Use a maximum of this many seed documents.
        min_term_freq (int): Only consider seed terms which occur at least
                             this many times in the seed documents.
        max_query_terms (int): Maximum number of important terms to
                               identify in the seed documents.
        min_doc_frac (float): Only consider seed terms which appear in more
                              than this fraction of the seed docs.
        max_doc_frac (float): Only consider seed terms which appear in less
                              than this fraction of the seed docs.
        min_should_match (float): Fraction of important terms from the
                                  seed docs explicitly required to match.
        stop_words (list): A supplementary list of terms to ignore. Defaults
                           to standard English stop words.
        filters (list): ES filters to supply to the query.
        scroll (str): ES scroll time window (e.g. '1m').
    Returns:
        {total, docs} (tuple): {total number of docs}, {top :obj:`size` docs}.
    """
    # If there are no documents to expand from
    if total == 0:
        return (0, [])
    # Check that the fractions are fractions, to avoid weird behaviour
    assert_fraction(min_should_match)
    assert_fraction(min_doc_frac)
    assert_fraction(max_doc_frac)

    # Formulate the MLT query
    msm = int(min_should_match * 100)
    max_doc_freq = int(max_doc_frac * total)
    min_doc_freq = int(min_doc_frac * total)
    mlt = {
        "more_like_this": {
            "fields": fields if fields != [] else None,
            "like": docs,
            "min_term_freq": min_term_freq,
            "max_query_terms": max_query_terms,
            "min_doc_freq": min_doc_freq,
            "max_doc_freq": max_doc_freq,
            "boost_terms": 1,
            "stop_words": stop_words,
            "minimum_should_match": f'{msm}%',
            "include": True,
        }
    }
    _query = {"query": {"bool": {"filter": filters, "must": [mlt]}}}
    params = {"search_type": "dfs_query_then_fetch"}
    # Offset assumes no scrolling (since it would be invalid)
    if offset is not None and offset < total:
        _query['from'] = offset
    # If scrolling was specified
    elif scroll is not None:
        params['scroll'] = scroll
    # The number of docs returned
    if limit is not None:
        _query['size'] = limit
    # Make the query
    logging.debug(_query)
    r = requests.post(url=endpoint,
                      data=json.dumps(dict(**post_aggregation, **_query)),
                      params=params,
                      **kwargs)
    if response_mode:
        return None, r
    # If successful, return
    return extract_docs(r, scroll=scroll, include_score=True)
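
A usage sketch for more_like_this, expanding from two seed documents. The
endpoint, index and document ids are illustrative assumptions; total is the
number of seed documents found by the initial query.

# Hypothetical usage: expand a search from two seed documents.
seed_docs = [{"_index": "articles", "_id": "doc-1"},
             {"_index": "articles", "_id": "doc-2"}]
total, docs = more_like_this(endpoint="https://my-es-cluster.example.com/articles/_search",
                             docs=seed_docs,
                             fields=["title", "abstract"],
                             limit=25,
                             offset=0,
                             min_term_freq=1,
                             max_query_terms=10,
                             min_doc_frac=0.001,
                             max_doc_frac=0.9,
                             min_should_match=0.2,
                             total=len(seed_docs),
                             headers={"Content-Type": "application/json"})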
def lambda_handler(event, context=None):
    """The 'main' function: Process the API Gateway Event
    passed to Lambda by
    performing an expansion on the original ES query."""

    query = json.loads(event['body'])

    # Strip out any extreme upper limits from the post_filter
    try:
        post_filter = query['post_filter']
    except KeyError:
        pass
    else:
        print(post_filter)
        if 'range' in post_filter:
            pop_upper_lim(post_filter['range'])
        elif 'bool' in post_filter:
            for row in post_filter['bool']['must']:
                if 'range' not in row:
                    continue
                pop_upper_lim(row['range'])

    # Generate the endpoint URL, and validate
    endpoint = event['headers'].pop('es-endpoint')
    if endpoint not in os.environ['ALLOWED_ENDPOINTS'].split(";"):
        raise ValueError(f'{endpoint} has not been registered')

    url = f"https://{endpoint}/{event['pathParameters']['proxy']}"
    # If not a search query, return
    if not url.endswith("_search") or 'query' not in query:
        r = requests.post(url, data=json.dumps(query),
                          headers=event['headers'])
        return format_response(r)

    # Extract info from the query as required
    _from = try_pop(query, 'from')
    _size = try_pop(query, 'size')
    min_term_freq = try_pop(query, 'min_term_freq', 1)
    max_query_terms = try_pop(query, 'max_query_terms', 10)
    min_doc_freq = try_pop(query, 'min_doc_freq', 0.001)
    max_doc_frac = try_pop(query, 'max_doc_frac', 0.90)
    minimum_should_match = try_pop(query, 'minimum_should_match', '20%')

    # Make the initial request
    old_query = deepcopy(try_pop(query, 'query'))
    fields = extract_fields(old_query)
    r = simple_query(url, old_query, event, fields)
    total, docs = extract_docs(r)
    # If no results, give up
    if total == 0:
        return format_response(r)

    # Formulate the MLT query
    max_doc_freq = int(max_doc_frac*total)
    min_doc_freq = int(min_doc_freq*total)
    mlt_query = {"query":
                 {"more_like_this":
                  {"fields": fields,
                   "like": docs,
                   "min_term_freq": min_term_freq,
                   "max_query_terms": max_query_terms,
                   "min_doc_freq": min_doc_freq,
                   "max_doc_freq": max_doc_freq,
                   "boost_terms": 1.,
                   "minimum_should_match": minimum_should_match,
                   "include": True}}}
    if _from is not None and _from < total:
        mlt_query['from'] = _from
    if _size is not None:
        mlt_query['size'] = _size

    # Make the new query and return
    r_mlt = requests.post(url, data=json.dumps(dict(**query,
                                                    **mlt_query)),
                          headers=event['headers'],
                          params={"search_type": "dfs_query_then_fetch"})
    # If successful, return
    return format_response(r_mlt)
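
A local invocation sketch for lambda_handler with a hand-built API Gateway
event. The domain name is a placeholder that must appear in ALLOWED_ENDPOINTS
and resolve to a reachable Elasticsearch cluster; the exact shape of the
return value depends on format_response, which is not shown here.

import os
import json

os.environ['ALLOWED_ENDPOINTS'] = 'my-es-domain.example.com'  # assumed domain
event = {
    'body': json.dumps({'query': {'match': {'title': 'machine learning'}},
                        'size': 10}),
    'headers': {'es-endpoint': 'my-es-domain.example.com',
                'Content-Type': 'application/json'},
    'pathParameters': {'proxy': 'articles/_search'},
}
response = lambda_handler(event)
print(response)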