import json
import logging
import os
import urllib.parse
from copy import deepcopy
from unittest import mock

import requests

# Helper utilities referenced below (clio_search, extract_docs,
# extract_keywords, extract_fields, try_pop, pop_upper_lim, assert_fraction,
# format_response, STOP_WORDS and MAX_CHUNKSIZE) are defined elsewhere in
# the package.


def simple_query(endpoint, query, fields, filters, size=None,
                 aggregations=None, response_mode=False, **kwargs):
    """Perform a simple query on Elasticsearch.

    Args:
        endpoint (str): The Elasticsearch endpoint.
        query (str or dict): The query to make to ES, either as plain text
            or as a ready-made ES query body.
        fields (list): List of fields to query.
        filters (list): List of ES filters.
        size (int): Number of documents to return.
        aggregations: Do not use this directly. See :obj:`clio_keywords`.
        response_mode: Do not use this directly.
            See :obj:`clio_lite_searchkit_lambda`.
    Returns:
        {total, docs} (tuple): {total number of docs}, {top :obj:`size` docs}
    """
    _query = {"_source": False}
    if isinstance(query, dict):
        _query['query'] = query
    else:
        _query['query'] = {
            "bool": {
                "must": [{
                    "multi_match": {
                        "query": query.lower(),
                        "fields": fields
                    }
                }],
                "filter": filters
            }
        }
    # Assume that if you want aggregations, you don't want anything else
    if aggregations is not None:
        _query['aggregations'] = aggregations
        _query['size'] = 0
        _query.pop('_source')
    elif size is not None:
        _query['size'] = size
    # Make the query
    logging.debug(_query)
    r = requests.post(url=endpoint, data=json.dumps(_query),
                      params={"search_type": "dfs_query_then_fetch"},
                      **kwargs)
    # "Aggregation mode"
    if aggregations is not None:
        return extract_keywords(r)
    total, docs = extract_docs(r)
    # "Response mode": hand back the raw response so that the caller
    # can forward it when there is nothing to expand from
    if response_mode and total == 0:
        return total, r
    return total, docs
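

# Usage sketch for `simple_query` (illustrative, not from the original
# source): the endpoint URL, index, field names and filter below are
# hypothetical placeholders, and a reachable Elasticsearch cluster is
# assumed.
def example_simple_query():
    total, docs = simple_query(
        endpoint="https://es.example.com/articles/_search",
        query="machine learning",
        fields=["title", "abstract"],
        filters=[{"term": {"year": 2020}}],
        size=10)
    logging.info("Matched %d docs in total; returned %d", total, len(docs))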


def clio_search_iter(url, index, chunksize=1000, scroll='1m', **kwargs):
    """Perform a *bulk* (streamed) contextual search of Elasticsearch data.

    Args:
        url (str): URL path to bare ES endpoint.
        index (str): Index to query.
        chunksize (int): Chunk size to retrieve from Elasticsearch.
        query (str): The simple text query to Elasticsearch.
        fields (list): List of fields to query.
        n_seed_docs (int): Number of seed documents to retrieve.
        min_term_freq (int): Only consider seed terms which occur in all
            documents with this frequency.
        max_query_terms (int): Maximum number of important terms to
            identify in the seed documents.
        min_doc_frac (float): Only consider seed terms which appear in
            more than this fraction of the seed docs.
        max_doc_frac (float): Only consider seed terms which appear in
            less than this fraction of the seed docs.
        min_should_match (float): Fraction of important terms from the
            seed docs explicitly required to match.
        {pre,post}_filters (list): ES filters to supply to the
            {seed,expanded} queries.
        stop_words (list): A supplementary list of terms to ignore.
            Defaults to standard English stop words.
        scroll (str): ES scroll time window (e.g. '1m').
    Yields:
        Single rows of data.
    """
    try_pop(kwargs, 'limit')  # Ignore limit and offset
    try_pop(kwargs, 'offset')
    if chunksize > MAX_CHUNKSIZE:
        logging.warning(f'Will not consider chunksize greater than '
                        f'{MAX_CHUNKSIZE}. Reverting to '
                        f'chunksize={MAX_CHUNKSIZE}.')
        chunksize = MAX_CHUNKSIZE  # Cap the chunksize, as warned above
    # First search
    scroll_id, docs = clio_search(url=url, index=index, limit=chunksize,
                                  scroll=scroll, **kwargs)
    yield from docs
    # Keep scrolling if required
    endpoint = urllib.parse.urljoin(f'{url}/', '_search/scroll')
    while len(docs) == chunksize:
        r = requests.post(endpoint,
                          data=json.dumps({'scroll': scroll,
                                           'scroll_id': scroll_id}),
                          headers={'Content-Type': 'application/json'})
        _, docs = extract_docs(r)
        yield from docs
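

# Usage sketch for `clio_search_iter` (illustrative): streams every row of
# an expanded search in scroll chunks. The URL, index, query and fields are
# hypothetical placeholders, passed through to `clio_search` via kwargs.
def example_clio_search_iter():
    for row in clio_search_iter(url="https://es.example.com",
                                index="articles",
                                query="synthetic biology",
                                fields=["title", "abstract"],
                                chunksize=500):
        logging.debug(row)  # each row is a single document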


def test_extract_docs(mocked_json):
    mocked_response = mock.MagicMock()
    hits = [{'_id': 'something',
             '_index': 'something',
             '_source': {'something': 'else'}}] * 100
    _total = 10
    mocked_json.loads.return_value = {'hits': {'total': _total,
                                               'hits': hits}}
    total, docs = extract_docs(mocked_response)
    assert total == _total
    assert len(docs) == len(hits)
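

# Companion test sketch (an assumption, not from the original source): this
# mirrors the mocked pattern above and presumes that extract_docs returns a
# zero total and an empty list when Elasticsearch reports no hits.
def test_extract_docs_empty(mocked_json):
    mocked_response = mock.MagicMock()
    mocked_json.loads.return_value = {'hits': {'total': 0, 'hits': []}}
    total, docs = extract_docs(mocked_response)
    assert total == 0
    assert docs == []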


def more_like_this(endpoint, docs, fields, limit, offset,
                   min_term_freq, max_query_terms,
                   min_doc_frac, max_doc_frac, min_should_match,
                   total, stop_words=STOP_WORDS, filters=[],
                   scroll=None, response_mode=False,
                   post_aggregation={}, **kwargs):
    """Make a "more like this" (MLT) query.

    Args:
        endpoint (str): URL path to the _search endpoint.
        docs (list): Document index and ids to expand from.
        fields (list): List of fields to query.
        limit (int): Number of documents to return.
        offset (int): Offset from the highest ranked document.
        min_term_freq (int): Only consider seed terms which occur in all
            documents with this frequency.
        max_query_terms (int): Maximum number of important terms to
            identify in the seed documents.
        min_doc_frac (float): Only consider seed terms which appear in
            more than this fraction of the seed docs.
        max_doc_frac (float): Only consider seed terms which appear in
            less than this fraction of the seed docs.
        min_should_match (float): Fraction of important terms from the
            seed docs explicitly required to match.
        total (int): Total number of seed documents.
        stop_words (list): A supplementary list of terms to ignore.
            Defaults to standard English stop words.
        filters (list): ES filters to supply to the query.
        scroll (str): ES scroll time window (e.g. '1m').
    Returns:
        {total, docs} (tuple): {total number of docs},
        {top :obj:`limit` docs}.
    """
    # If there are no documents to expand from
    if total == 0:
        return (0, [])
    # Check that the fractions are fractions, to avoid weird behaviour
    assert_fraction(min_should_match)
    assert_fraction(min_doc_frac)
    assert_fraction(max_doc_frac)
    # Formulate the MLT query
    msm = int(min_should_match * 100)
    max_doc_freq = int(max_doc_frac * total)
    min_doc_freq = int(min_doc_frac * total)
    mlt = {
        "more_like_this": {
            "fields": fields if fields != [] else None,
            "like": docs,
            "min_term_freq": min_term_freq,
            "max_query_terms": max_query_terms,
            "min_doc_freq": min_doc_freq,
            "max_doc_freq": max_doc_freq,
            "boost_terms": 1,
            "stop_words": stop_words,
            "minimum_should_match": f'{msm}%',
            "include": True,
        }
    }
    _query = {"query": {"bool": {"filter": filters, "must": [mlt]}}}
    params = {"search_type": "dfs_query_then_fetch"}
    # Offset assumes no scrolling (since it would be invalid)
    if offset is not None and offset < total:
        _query['from'] = offset
    # If scrolling was specified
    elif scroll is not None:
        params['scroll'] = scroll
    # The number of docs returned
    if limit is not None:
        _query['size'] = limit
    # Make the query
    logging.debug(_query)
    r = requests.post(url=endpoint,
                      data=json.dumps(dict(**post_aggregation, **_query)),
                      params=params, **kwargs)
    if response_mode:
        return None, r
    # If successful, return
    return extract_docs(r, scroll=scroll, include_score=True)
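

# Usage sketch for `more_like_this` (illustrative): expands outwards from
# two seed documents, e.g. as returned by `simple_query`. The endpoint,
# index and ids are hypothetical placeholders, and `assert_fraction` is
# presumed to accept values in [0, 1].
def example_more_like_this():
    seed_docs = [{"_index": "articles", "_id": "doc-1"},
                 {"_index": "articles", "_id": "doc-2"}]
    total, docs = more_like_this(
        endpoint="https://es.example.com/articles/_search",
        docs=seed_docs,
        fields=["title", "abstract"],
        limit=25, offset=0,
        min_term_freq=1, max_query_terms=10,
        min_doc_frac=0.001, max_doc_frac=0.9,
        min_should_match=0.2,
        total=len(seed_docs))
    logging.info("Expanded from %d seeds to %d docs", len(seed_docs), total)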


def lambda_handler(event, context=None):
    """The 'main' function: process the API Gateway event passed to Lambda
    by performing an expansion on the original ES query."""
    query = json.loads(event['body'])
    # Strip out any extreme upper limits from the post_filter
    try:
        post_filter = query['post_filter']
    except KeyError:
        pass
    else:
        logging.debug(post_filter)
        if 'range' in post_filter:
            pop_upper_lim(post_filter['range'])
        elif 'bool' in post_filter:
            for row in post_filter['bool']['must']:
                if 'range' not in row:
                    continue
                pop_upper_lim(row['range'])
    # Generate the endpoint URL, and validate
    endpoint = event['headers'].pop('es-endpoint')
    if endpoint not in os.environ['ALLOWED_ENDPOINTS'].split(";"):
        raise ValueError(f'{endpoint} has not been registered')
    url = f"https://{endpoint}/{event['pathParameters']['proxy']}"
    # If not a search query, pass the request straight through
    if not url.endswith("_search") or 'query' not in query:
        r = requests.post(url, data=json.dumps(query),
                          headers=event['headers'])
        return format_response(r)
    # Extract info from the query as required
    _from = try_pop(query, 'from')
    _size = try_pop(query, 'size')
    min_term_freq = try_pop(query, 'min_term_freq', 1)
    max_query_terms = try_pop(query, 'max_query_terms', 10)
    min_doc_frac = try_pop(query, 'min_doc_frac', 0.001)
    max_doc_frac = try_pop(query, 'max_doc_frac', 0.90)
    minimum_should_match = try_pop(query, 'minimum_should_match', '20%')
    # Make the initial (seed) request
    old_query = deepcopy(try_pop(query, 'query'))
    fields = extract_fields(old_query)
    total, docs = simple_query(url, old_query, fields, [],
                               response_mode=True,
                               headers=event['headers'])
    # If no results, give up (in response mode, `docs` is the raw response)
    if total == 0:
        return format_response(docs)
    # Formulate the MLT query
    max_doc_freq = int(max_doc_frac * total)
    min_doc_freq = int(min_doc_frac * total)
    mlt_query = {"query":
                 {"more_like_this":
                  {"fields": fields,
                   "like": docs,
                   "min_term_freq": min_term_freq,
                   "max_query_terms": max_query_terms,
                   "min_doc_freq": min_doc_freq,
                   "max_doc_freq": max_doc_freq,
                   "boost_terms": 1.,
                   "minimum_should_match": minimum_should_match,
                   "include": True}}}
    if _from is not None and _from < total:
        mlt_query['from'] = _from
    if _size is not None:
        mlt_query['size'] = _size
    # Make the new query and return
    r_mlt = requests.post(url,
                          data=json.dumps(dict(**query, **mlt_query)),
                          headers=event['headers'],
                          params={"search_type": "dfs_query_then_fetch"})
    # If successful, return
    return format_response(r_mlt)
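

# Invocation sketch for `lambda_handler` (illustrative): a minimal API
# Gateway-style proxy event matching the keys read above. All values are
# hypothetical, and 'es.example.com' would need to be listed in the
# ALLOWED_ENDPOINTS environment variable.
def example_lambda_event():
    event = {
        "body": json.dumps({"query": {"match": {"title": "genomics"}},
                            "size": 10}),
        "headers": {"es-endpoint": "es.example.com",
                    "Content-Type": "application/json"},
        "pathParameters": {"proxy": "articles/_search"},
    }
    return lambda_handler(event)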