Example #1
import json
import logging
import urllib.parse

import requests

# try_pop, clio_search, extract_docs and MAX_CHUNKSIZE are helpers
# defined elsewhere in the source module.

def clio_search_iter(url, index, chunksize=1000, scroll='1m', **kwargs):
    """Perform a *bulk* (streamed) contextual search of Elasticsearch data.

    Args:
        url (str): URL path to bare ES endpoint.
        index (str): Index to query.
        chunksize (int): Chunk size to retrieve from Elasticsearch.
        query (str): The simple text query to Elasticsearch.
        fields (list): List of fields to query.
        n_seed_docs (int): Number of seed documents to retrieve.
        min_term_freq (int): Only consider seed terms which occur in the
                             seed documents with at least this frequency.
        max_query_terms (int): Maximum number of important terms to
                               identify in the seed documents.
        min_doc_frac (float): Only consider seed terms which appear in more
                              than this fraction of the seed docs.
        max_doc_frac (float): Only consider seed terms which appear in less
                              than this fraction of the seed docs.
        min_should_match (float): Fraction of important terms from the
                                  seed docs explicitly required to match.
        {pre,post}_filters (list): ES filters to supply to the
                                   {seed,expanded} queries.
        stop_words (list): A supplementary list of terms to ignore. Defaults
                           to standard English stop words.
        scroll (str): ES scroll time window (e.g. '1m').
    Yields:
        Single rows of data
    """
    try_pop(kwargs, 'limit')  # Ignore limit and offset
    try_pop(kwargs, 'offset')
    if chunksize > MAX_CHUNKSIZE:
        logging.warning(
            f'Will not consider chunksize greater than {MAX_CHUNKSIZE}. '
            f'Reverting to chunksize={MAX_CHUNKSIZE}.')
        chunksize = MAX_CHUNKSIZE  # Actually apply the cap the warning describes
    # First search
    scroll_id, docs = clio_search(url=url,
                                  index=index,
                                  limit=chunksize,
                                  scroll=scroll,
                                  **kwargs)
    yield from docs
    # Keep scrolling if required
    endpoint = urllib.parse.urljoin(f'{url}/', '_search/scroll')
    while len(docs) == chunksize:
        r = requests.post(endpoint,
                          data=json.dumps({
                              'scroll': scroll,
                              'scroll_id': scroll_id
                          }),
                          headers={'Content-Type': 'application/json'})
        _, docs = extract_docs(r)
        yield from docs
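A minimal usage sketch: the generator streams scrolled pages transparently, so the caller simply iterates. The endpoint URL, index name and query below are illustrative placeholders, not values from the source.

if __name__ == '__main__':
    # Hypothetical endpoint and index, shown only for illustration
    for row in clio_search_iter(url='https://es.example.com:9243',
                                index='my-index',
                                query='machine learning',
                                chunksize=500):
        print(row)  # stand-in for the caller's own row handling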
Example #2
def test_try_pop():
    data = {'a': 'A', 'b': 'B', 'c': 'C'}
    assert try_pop(data, 'a', 'AA') == 'A'
    assert try_pop(data, 'b', 'BB') == 'B'
    assert try_pop(data, 'c', 'CC') == 'C'

    assert try_pop(data, 'a', 'BB') == 'BB'
    assert try_pop(data, 'b', 'AB') == 'AB'
    assert try_pop(data, 'c', 'CA') == 'CA'

    assert try_pop(data, 'd', 'DD') == 'DD'
    assert try_pop(data, 'd') is None
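The try_pop helper under test is not shown on this page; a minimal implementation consistent with the assertions above (a sketch, not necessarily the project's actual code) would be:

def try_pop(data, key, default=None):
    """Pop `key` from `data`, returning `default` if the key is absent."""
    try:
        return data.pop(key)
    except KeyError:
        return default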
Example #3
import json
import os
from copy import deepcopy

import requests

# try_pop, pop_upper_lim, extract_fields, clio_search and format_response
# are helpers defined elsewhere in the source module.

def lambda_handler(event, context=None):
    """The 'main' function: process the API Gateway event passed to
    Lambda by performing an expansion on the original ES query."""

    query = json.loads(event['body'])

    # Strip out any extreme upper limits from the post_filter
    try:
        post_filter = query['post_filter']
    except KeyError:
        pass
    else:
        if 'range' in post_filter:
            pop_upper_lim(post_filter['range'])
        elif 'bool' in post_filter:
            for row in post_filter['bool']['must']:
                if 'range' not in row:
                    continue
                pop_upper_lim(row['range'])

    # Drop the 'Host' header, since its value won't match the Lambda
    # host and would break host-based authentication
    if 'Host' in event['headers']:
        event['headers'].pop('Host')

    # Generate the endpoint URL, and validate
    endpoint = event['headers'].pop('es-endpoint')
    if endpoint not in os.environ['ALLOWED_ENDPOINTS'].split(";"):
        raise ValueError(f'{endpoint} has not been registered')
    slug = event['pathParameters']['proxy']
    # If not a search query, return
    if not slug.endswith("_search") or 'query' not in query:
        url = f"https://{endpoint}/{slug}"
        r = requests.post(url,
                          data=json.dumps(query),
                          params={"rest_total_hits_as_int": "true"},
                          headers=event['headers'])
        return format_response(r)

    # Convert the request info ready for clio_search
    index = slug[:-8]  # removes "/_search"
    limit = try_pop(query, 'size')
    offset = try_pop(query, 'from')
    min_term_freq = try_pop(query, 'min_term_freq', 1)
    max_query_terms = try_pop(query, 'max_query_terms', 10)
    min_doc_frac = try_pop(query, 'min_doc_frac', 0.001)
    max_doc_frac = try_pop(query, 'max_doc_frac', 0.90)
    min_should_match = try_pop(query, 'minimum_should_match', 0.2)
    old_query = deepcopy(try_pop(query, 'query'))
    fields = extract_fields(old_query)

    # Make the search
    _, r = clio_search(f"https://{endpoint}",
                       index,
                       old_query,
                       fields=fields,
                       limit=limit,
                       offset=offset,
                       min_term_freq=min_term_freq,
                       max_query_terms=max_query_terms,
                       min_doc_frac=min_doc_frac,
                       max_doc_frac=max_doc_frac,
                       min_should_match=min_should_match,
                       post_aggregation=query,
                       response_mode=True,
                       headers=event['headers'])

    return format_response(r)
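For context, a sketch of invoking such a handler locally with a hand-built API Gateway-style event. All field values here are illustrative placeholders, not values from the source.

if __name__ == '__main__':
    # Register the hypothetical endpoint before calling the handler
    os.environ['ALLOWED_ENDPOINTS'] = 'search-example.eu-west-1.es.amazonaws.com'
    event = {
        'body': json.dumps({'query': {'match': {'text': 'genomics'}},
                            'size': 10}),
        'headers': {'es-endpoint': 'search-example.eu-west-1.es.amazonaws.com',
                    'Content-Type': 'application/json'},
        'pathParameters': {'proxy': 'my-index/_search'},
    }
    print(lambda_handler(event))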
Example #4
import json
import os
from copy import deepcopy

import requests

# try_pop, pop_upper_lim, extract_fields, simple_query, extract_docs
# and format_response are helpers defined elsewhere in the source module.

def lambda_handler(event, context=None):
    """The 'main' function: process the API Gateway event passed to
    Lambda by performing an expansion on the original ES query."""

    query = json.loads(event['body'])

    # Strip out any extreme upper limits from the post_filter
    try:
        post_filter = query['post_filter']
    except KeyError:
        pass
    else:
        if 'range' in post_filter:
            pop_upper_lim(post_filter['range'])
        elif 'bool' in post_filter:
            for row in post_filter['bool']['must']:
                if 'range' not in row:
                    continue
                pop_upper_lim(row['range'])

    # Generate the endpoint URL, and validate
    endpoint = event['headers'].pop('es-endpoint')
    if endpoint not in os.environ['ALLOWED_ENDPOINTS'].split(";"):
        raise ValueError(f'{endpoint} has not been registered')

    url = f"https://{endpoint}/{event['pathParameters']['proxy']}"
    # If not a search query, return
    if not url.endswith("_search") or 'query' not in query:
        r = requests.post(url, data=json.dumps(query),
                          headers=event['headers'])
        return format_response(r)

    # Extract info from the query as required
    _from = try_pop(query, 'from')
    _size = try_pop(query, 'size')
    min_term_freq = try_pop(query, 'min_term_freq', 1)
    max_query_terms = try_pop(query, 'max_query_terms', 10)
    min_doc_freq = try_pop(query, 'min_doc_freq', 0.001)
    max_doc_frac = try_pop(query, 'max_doc_frac', 0.90)
    minimum_should_match = try_pop(query, 'minimum_should_match', '20%')

    # Make the initial request
    old_query = deepcopy(try_pop(query, 'query'))
    fields = extract_fields(old_query)
    r = simple_query(url, old_query, event, fields)
    total, docs = extract_docs(r)
    # If no results, give up
    if total == 0:
        return format_response(r)

    # Formulate the MLT query, converting the fractional document
    # thresholds into absolute counts (note that min_doc_freq is reused:
    # first as a fraction, then as a count)
    max_doc_freq = int(max_doc_frac * total)
    min_doc_freq = int(min_doc_freq * total)
    mlt_query = {"query":
                 {"more_like_this":
                  {"fields": fields,
                   "like": docs,
                   "min_term_freq": min_term_freq,
                   "max_query_terms": max_query_terms,
                   "min_doc_freq": min_doc_freq,
                   "max_doc_freq": max_doc_freq,
                   "boost_terms": 1.,
                   "minimum_should_match": minimum_should_match,
                   "include": True}}}
    if _from is not None and _from < total:
        mlt_query['from'] = _from
    if _size is not None:
        mlt_query['size'] = _size

    # Make the new query and return
    r_mlt = requests.post(url, data=json.dumps(dict(**query,
                                                    **mlt_query)),
                          headers=event['headers'],
                          params={"search_type": "dfs_query_then_fetch"})
    # Return the formatted MLT response
    return format_response(r_mlt)
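Both handlers call pop_upper_lim to strip extreme upper limits from range filters before querying. The helper is not shown on this page; a plausible minimal sketch consistent with that usage (the name of the argument and the exact rule are assumptions) might be:

def pop_upper_lim(range_filter):
    """Remove upper-bound clauses ('lt'/'lte') from each field's range
    filter. A sketch only: the real helper may apply a different rule,
    e.g. only dropping bounds above some sentinel value."""
    for field_range in range_filter.values():
        for bound in ('lt', 'lte'):
            field_range.pop(bound, None)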