import json
import logging
import urllib.parse

import requests


def clio_search_iter(url, index, chunksize=1000, scroll='1m', **kwargs):
    """Perform a *bulk* (streamed) contextual search of Elasticsearch data.

    Args:
        url (str): URL path to bare ES endpoint.
        index (str): Index to query.
        chunksize (int): Chunk size to retrieve from Elasticsearch.
        query (str): The simple text query to Elasticsearch.
        fields (list): List of fields to query.
        n_seed_docs (int): Number of seed documents to retrieve.
        min_term_freq (int): Only consider seed terms which occur in the
            seed documents with at least this frequency.
        max_query_terms (int): Maximum number of important terms to
            identify in the seed documents.
        min_doc_frac (float): Only consider seed terms which appear in
            more than this fraction of the seed docs.
        max_doc_frac (float): Only consider seed terms which appear in
            less than this fraction of the seed docs.
        min_should_match (float): Fraction of important terms from the
            seed docs explicitly required to match.
        {pre,post}_filters (list): ES filters to supply to the
            {seed,expanded} queries.
        stop_words (list): A supplementary list of terms to ignore.
            Defaults to standard English stop words.
        scroll (str): ES scroll time window (e.g. '1m').

    Yields:
        Single rows of data.
    """
    try_pop(kwargs, 'limit')  # Ignore limit and offset
    try_pop(kwargs, 'offset')
    if chunksize > MAX_CHUNKSIZE:
        logging.warning(f'Will not consider chunksize greater than '
                        f'{MAX_CHUNKSIZE}. Reverting to '
                        f'chunksize={MAX_CHUNKSIZE}.')
        chunksize = MAX_CHUNKSIZE  # Actually apply the advertised cap
    # First search
    scroll_id, docs = clio_search(url=url, index=index, limit=chunksize,
                                  scroll=scroll, **kwargs)
    yield from docs
    # Keep scrolling if required
    endpoint = urllib.parse.urljoin(f'{url}/', '_search/scroll')
    while len(docs) == chunksize:
        r = requests.post(endpoint,
                          data=json.dumps({'scroll': scroll,
                                           'scroll_id': scroll_id}),
                          headers={'Content-Type': 'application/json'})
        _, docs = extract_docs(r)
        yield from docs
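# A minimal usage sketch of clio_search_iter (not part of the module),
# assuming a reachable ES cluster; the endpoint URL, index name, and query
# below are hypothetical placeholders.
if __name__ == '__main__':
    for i, row in enumerate(clio_search_iter(url='https://es.example.com',
                                             index='articles',
                                             query='machine learning',
                                             chunksize=500)):
        print(row)
        if i >= 5:  # just peek at the first few streamed rows
            break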
def test_try_pop():
    data = {'a': 'A', 'b': 'B', 'c': 'C'}
    # Existing keys are popped and their values returned
    assert try_pop(data, 'a', 'AA') == 'A'
    assert try_pop(data, 'b', 'BB') == 'B'
    assert try_pop(data, 'c', 'CC') == 'C'
    # Once popped, the default is returned instead
    assert try_pop(data, 'a', 'BB') == 'BB'
    assert try_pop(data, 'b', 'AB') == 'AB'
    assert try_pop(data, 'c', 'CA') == 'CA'
    # Missing keys fall back to the default, which defaults to None
    assert try_pop(data, 'd', 'DD') == 'DD'
    assert try_pop(data, 'd') is None
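# try_pop itself is not shown in this section; a minimal sketch consistent
# with the behaviour asserted above (pop the key if present, otherwise
# return the default) would be:
def try_pop(data, key, default=None):
    """Pop `key` from `data` if present, otherwise return `default`."""
    try:
        return data.pop(key)
    except KeyError:
        return default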
import json
import os
from copy import deepcopy

import requests


def lambda_handler(event, context=None):
    """The 'main' function: process the API Gateway event passed to Lambda
    by performing an expansion on the original ES query."""
    query = json.loads(event['body'])
    # Strip out any extreme upper limits from the post_filter
    try:
        post_filter = query['post_filter']
    except KeyError:
        pass
    else:
        if 'range' in post_filter:
            pop_upper_lim(post_filter['range'])
        elif 'bool' in post_filter:
            for row in post_filter['bool']['must']:
                if 'range' not in row:
                    continue
                pop_upper_lim(row['range'])
    # Fixes host-authentication permissions, as this value
    # won't match the lambda host
    if 'Host' in event['headers']:
        event['headers'].pop('Host')
    # Generate the endpoint URL, and validate
    endpoint = event['headers'].pop('es-endpoint')
    if endpoint not in os.environ['ALLOWED_ENDPOINTS'].split(';'):
        raise ValueError(f'{endpoint} has not been registered')
    slug = event['pathParameters']['proxy']
    # If not a search query, pass the request through unchanged
    if not slug.endswith('_search') or 'query' not in query:
        url = f'https://{endpoint}/{slug}'
        r = requests.post(url, data=json.dumps(query),
                          params={'rest_total_hits_as_int': 'true'},
                          headers=event['headers'])
        return format_response(r)
    # Convert the request info ready for clio_search
    index = slug[:-8]  # removes the trailing "/_search"
    limit = try_pop(query, 'size')
    offset = try_pop(query, 'from')
    min_term_freq = try_pop(query, 'min_term_freq', 1)
    max_query_terms = try_pop(query, 'max_query_terms', 10)
    min_doc_frac = try_pop(query, 'min_doc_frac', 0.001)
    max_doc_frac = try_pop(query, 'max_doc_frac', 0.90)
    min_should_match = try_pop(query, 'minimum_should_match', 0.2)
    old_query = deepcopy(try_pop(query, 'query'))
    fields = extract_fields(old_query)
    # Make the search
    _, r = clio_search(f'https://{endpoint}', index, old_query,
                       fields=fields, limit=limit, offset=offset,
                       min_term_freq=min_term_freq,
                       max_query_terms=max_query_terms,
                       min_doc_frac=min_doc_frac,
                       max_doc_frac=max_doc_frac,
                       min_should_match=min_should_match,
                       post_aggregation=query,
                       response_mode=True,
                       headers=event['headers'])
    return format_response(r)
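# pop_upper_lim is referenced but not defined in this section. A plausible
# sketch, assuming it drops upper bounds at or above a sentinel value from
# each field's range clause; UPPER_LIM and the exact condition are
# assumptions, not the project's actual logic.
UPPER_LIM = 10_000_000  # hypothetical "effectively unbounded" sentinel


def pop_upper_lim(range_filter):
    """Remove extreme upper limits ('lt'/'lte') from an ES range filter."""
    for limits in range_filter.values():
        for op in ('lt', 'lte'):
            if op in limits and limits[op] >= UPPER_LIM:
                limits.pop(op)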
def lambda_handler(event, context=None):
    """The 'main' function: process the API Gateway event passed to Lambda
    by performing an expansion on the original ES query."""
    query = json.loads(event['body'])
    # Strip out any extreme upper limits from the post_filter
    try:
        post_filter = query['post_filter']
    except KeyError:
        pass
    else:
        if 'range' in post_filter:
            pop_upper_lim(post_filter['range'])
        elif 'bool' in post_filter:
            for row in post_filter['bool']['must']:
                if 'range' not in row:
                    continue
                pop_upper_lim(row['range'])
    # Generate the endpoint URL, and validate
    endpoint = event['headers'].pop('es-endpoint')
    if endpoint not in os.environ['ALLOWED_ENDPOINTS'].split(';'):
        raise ValueError(f'{endpoint} has not been registered')
    url = f"https://{endpoint}/{event['pathParameters']['proxy']}"
    # If not a search query, pass the request through unchanged
    if not url.endswith('_search') or 'query' not in query:
        r = requests.post(url, data=json.dumps(query),
                          headers=event['headers'])
        return format_response(r)
    # Extract info from the query as required. Note that 'min_doc_freq'
    # is supplied as a fraction; it is converted to a count below.
    _from = try_pop(query, 'from')
    _size = try_pop(query, 'size')
    min_term_freq = try_pop(query, 'min_term_freq', 1)
    max_query_terms = try_pop(query, 'max_query_terms', 10)
    min_doc_frac = try_pop(query, 'min_doc_freq', 0.001)
    max_doc_frac = try_pop(query, 'max_doc_frac', 0.90)
    minimum_should_match = try_pop(query, 'minimum_should_match', '20%')
    # Make the initial request
    old_query = deepcopy(try_pop(query, 'query'))
    fields = extract_fields(old_query)
    r = simple_query(url, old_query, event, fields)
    total, docs = extract_docs(r)
    # If no results, give up
    if total == 0:
        return format_response(r)
    # Formulate the MLT query, converting document fractions into counts
    max_doc_freq = int(max_doc_frac * total)
    min_doc_freq = int(min_doc_frac * total)
    mlt_query = {"query":
                 {"more_like_this":
                  {"fields": fields,
                   "like": docs,
                   "min_term_freq": min_term_freq,
                   "max_query_terms": max_query_terms,
                   "min_doc_freq": min_doc_freq,
                   "max_doc_freq": max_doc_freq,
                   "boost_terms": 1.,
                   "minimum_should_match": minimum_should_match,
                   "include": True}}}
    if _from is not None and _from < total:
        mlt_query['from'] = _from
    if _size is not None:
        mlt_query['size'] = _size
    # Make the new query and return
    r_mlt = requests.post(url, data=json.dumps(dict(**query, **mlt_query)),
                          headers=event['headers'],
                          params={"search_type": "dfs_query_then_fetch"})
    return format_response(r_mlt)
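# format_response is referenced but not defined in this section; a minimal
# sketch, assuming it repackages a requests.Response into the shape that
# API Gateway's Lambda proxy integration expects (the CORS header is an
# assumption):
def format_response(response):
    """Convert a requests.Response into an API Gateway proxy response."""
    return {'statusCode': response.status_code,
            'headers': {'Content-Type': 'application/json',
                        'Access-Control-Allow-Origin': '*'},
            'body': response.text}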