def append_query(self, es_query, start): self.start = start parts = self.edge.domain.partitions filters = [] notty = [] for p in parts: w = p.where filters.append( AndOp("and", [w] + notty).to_es14_filter(self.schema)) notty.append(NotOp("not", w)) missing_filter = None if self.edge.allowNulls: # TODO: Use Expression.missing().esfilter() TO GET OPTIMIZED FILTER missing_filter = set_default( {"filter": AndOp("and", notty).to_es14_filter(self.schema)}, es_query) return wrap({ "aggs": { "_match": set_default({"filters": { "filters": filters }}, es_query), "_missing": missing_filter } })
def append_query(self, es_query, start): self.start = start domain = self.domain domain_key = domain.key include, text_include = transpose( *((float(v) if isinstance(v, (int, float)) else v, text_type(float(v)) if isinstance(v, (int, float)) else v) for v in (p[domain_key] for p in domain.partitions))) value = self.edge.value exists = AndOp( "and", [value.exists(), InOp("in", [value, Literal("literal", include)])]).partial_eval() limit = coalesce(self.limit, len(domain.partitions)) if isinstance(value, Variable): es_field = first(self.query.frum.schema.leaves( value.var)).es_column # ALREADY CHECKED THERE IS ONLY ONE terms = set_default( { "terms": { "field": es_field, "size": limit, "order": { "_term": self.sorted } if self.sorted else None } }, es_query) else: terms = set_default( { "terms": { "script": value.to_es14_script(self.schema).script(self.schema), "size": limit } }, es_query) if self.edge.allowNulls: missing = set_default( {"filter": NotOp("not", exists).to_es14_filter(self.schema)}, es_query) else: missing = None return wrap({ "aggs": { "_match": { "filter": exists.to_es14_filter(self.schema), "aggs": { "_filter": terms } }, "_missing": missing } })
def append_query(self, es_query, start): self.start = start domain = self.domain domain_key = domain.key include, text_include = transpose(*( ( float(v) if isinstance(v, (int, float)) else v, text_type(float(v)) if isinstance(v, (int, float)) else v ) for v in (p[domain_key] for p in domain.partitions) )) value = self.edge.value exists = AndOp("and", [ value.exists(), InOp("in", [value, Literal("literal", include)]) ]).partial_eval() limit = coalesce(self.limit, len(domain.partitions)) if isinstance(value, Variable): es_field = self.query.frum.schema.leaves(value.var)[0].es_column # ALREADY CHECKED THERE IS ONLY ONE terms = set_default({"terms": { "field": es_field, "size": limit, "order": {"_term": self.sorted} if self.sorted else None }}, es_query) else: terms = set_default({"terms": { "script": value.to_es_script(self.schema).script(self.schema), "size": limit }}, es_query) if self.edge.allowNulls: missing = set_default( {"filter": NotOp("not", exists).to_esfilter(self.schema)}, es_query ) else: missing = None return wrap({"aggs": { "_match": { "filter": exists.to_esfilter(self.schema), "aggs": { "_filter": terms } }, "_missing": missing }})
def _range_composer(edge, domain, es_query, to_float, schema): # USE RANGES _min = coalesce(domain.min, MIN(domain.partitions.min)) _max = coalesce(domain.max, MAX(domain.partitions.max)) if edge.allowNulls: missing_filter = set_default( { "filter": NotOp("not", AndOp("and", [ edge.value.exists(), InequalityOp("gte", [edge.value, Literal(None, to_float(_min))]), InequalityOp("lt", [edge.value, Literal(None, to_float(_max))]) ]).partial_eval()).to_esfilter(schema) }, es_query ) else: missing_filter = None if isinstance(edge.value, Variable): calc = {"field": schema.leaves(edge.value.var)[0].es_column} else: calc = {"script": edge.value.to_es_script(schema).script(schema)} return wrap({"aggs": { "_match": set_default( {"range": calc}, {"range": {"ranges": [{"from": to_float(p.min), "to": to_float(p.max)} for p in domain.partitions]}}, es_query ), "_missing": missing_filter }})
def append_query(self, es_query, start): self.start = start edge = self.edge range = edge.range domain = edge.domain aggs = {} for i, p in enumerate(domain.partitions): filter_ = AndOp("and", [ InequalityOp("lte", [range.min, Literal("literal", self.to_float(p.min))]), InequalityOp("gt", [range.max, Literal("literal", self.to_float(p.min))]) ]) aggs["_join_" + text_type(i)] = set_default( {"filter": filter_.to_esfilter(self.schema)}, es_query ) return wrap({"aggs": aggs})
def es_aggsop(es, frum, query): query = query.copy() # WE WILL MARK UP THIS QUERY schema = frum.schema select = listwrap(query.select) es_query = Data() new_select = Data() # MAP FROM canonical_name (USED FOR NAMES IN QUERY) TO SELECT MAPPING formula = [] for s in select: if s.aggregate == "count" and isinstance(s.value, Variable) and s.value.var == ".": if schema.query_path == ".": s.pull = jx_expression_to_function("doc_count") else: s.pull = jx_expression_to_function({"coalesce": ["_nested.doc_count", "doc_count", 0]}) elif isinstance(s.value, Variable): if s.aggregate == "count": new_select["count_"+literal_field(s.value.var)] += [s] else: new_select[literal_field(s.value.var)] += [s] else: formula.append(s) for canonical_name, many in new_select.items(): for s in many: es_cols = frum.schema.values(s.value.var) if s.aggregate == "count": canonical_names = [] for es_col in es_cols: cn = literal_field(es_col.es_column + "_count") canonical_names.append(cn) es_query.aggs[cn].value_count.field = es_col.es_column if len(es_cols) == 1: s.pull = jx_expression_to_function(canonical_names[0] + ".value") else: s.pull = jx_expression_to_function({"add": [cn + ".value" for cn in canonical_names]}) elif s.aggregate == "median": if len(es_cols) > 1: Log.error("Do not know how to count columns with more than one type (script probably)") # ES USES DIFFERENT METHOD FOR PERCENTILES key = literal_field(canonical_name + " percentile") es_query.aggs[key].percentiles.field = es_cols[0].es_column es_query.aggs[key].percentiles.percents += [50] s.pull = jx_expression_to_function(key + ".values.50\.0") elif s.aggregate == "percentile": if len(es_cols) > 1: Log.error("Do not know how to count columns with more than one type (script probably)") # ES USES DIFFERENT METHOD FOR PERCENTILES key = literal_field(canonical_name + " percentile") if isinstance(s.percentile, text_type) or s.percetile < 0 or 1 < s.percentile: Log.error("Expecting percentile to be a float from 0.0 to 1.0") percent = Math.round(s.percentile * 100, decimal=6) es_query.aggs[key].percentiles.field = es_cols[0].es_column es_query.aggs[key].percentiles.percents += [percent] s.pull = jx_expression_to_function(key + ".values." + literal_field(text_type(percent))) elif s.aggregate == "cardinality": canonical_names = [] for es_col in es_cols: cn = literal_field(es_col.es_column + "_cardinality") canonical_names.append(cn) es_query.aggs[cn].cardinality.field = es_col.es_column if len(es_cols) == 1: s.pull = jx_expression_to_function(canonical_names[0] + ".value") else: s.pull = jx_expression_to_function({"add": [cn + ".value" for cn in canonical_names], "default": 0}) elif s.aggregate == "stats": if len(es_cols) > 1: Log.error("Do not know how to count columns with more than one type (script probably)") # REGULAR STATS stats_name = literal_field(canonical_name) es_query.aggs[stats_name].extended_stats.field = es_cols[0].es_column # GET MEDIAN TOO! median_name = literal_field(canonical_name + "_percentile") es_query.aggs[median_name].percentiles.field = es_cols[0].es_column es_query.aggs[median_name].percentiles.percents += [50] s.pull = get_pull_stats(stats_name, median_name) elif s.aggregate == "union": pulls = [] for es_col in es_cols: stats_name = encode_property(es_col.es_column) if es_col.nested_path[0] == ".": es_query.aggs[stats_name] = {"terms": { "field": es_col.es_column, "size": Math.min(s.limit, MAX_LIMIT) }} pulls.append(get_bucket_keys(stats_name)) else: es_query.aggs[stats_name] = { "nested": {"path": es_col.nested_path[0]}, "aggs": {"_nested": {"terms": { "field": es_col.es_column, "size": Math.min(s.limit, MAX_LIMIT) }}} } pulls.append(get_bucket_keys(stats_name+"._nested")) if len(pulls) == 0: s.pull = NULL elif len(pulls) == 1: s.pull = pulls[0] else: s.pull = lambda row: UNION( p(row) for p in pulls ) else: if len(es_cols) > 1: Log.error("Do not know how to count columns with more than one type (script probably)") # PULL VALUE OUT OF THE stats AGGREGATE es_query.aggs[literal_field(canonical_name)].extended_stats.field = es_cols[0].es_column s.pull = jx_expression_to_function({"coalesce": [literal_field(canonical_name) + "." + aggregates[s.aggregate], s.default]}) for i, s in enumerate(formula): canonical_name = literal_field(s.name) if isinstance(s.value, TupleOp): if s.aggregate == "count": # TUPLES ALWAYS EXIST, SO COUNTING THEM IS EASY s.pull = "doc_count" else: Log.error("{{agg}} is not a supported aggregate over a tuple", agg=s.aggregate) elif s.aggregate == "count": es_query.aggs[literal_field(canonical_name)].value_count.script = s.value.partial_eval().to_ruby(schema).script(schema) s.pull = jx_expression_to_function(literal_field(canonical_name) + ".value") elif s.aggregate == "median": # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT key = literal_field(canonical_name + " percentile") es_query.aggs[key].percentiles.script = s.value.to_ruby(schema).script(schema) es_query.aggs[key].percentiles.percents += [50] s.pull = jx_expression_to_function(key + ".values.50\.0") elif s.aggregate == "percentile": # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT key = literal_field(canonical_name + " percentile") percent = Math.round(s.percentile * 100, decimal=6) es_query.aggs[key].percentiles.script = s.value.to_ruby(schema).script(schema) es_query.aggs[key].percentiles.percents += [percent] s.pull = jx_expression_to_function(key + ".values." + literal_field(text_type(percent))) elif s.aggregate == "cardinality": # ES USES DIFFERENT METHOD FOR CARDINALITY key = canonical_name + " cardinality" es_query.aggs[key].cardinality.script = s.value.to_ruby(schema).script(schema) s.pull = jx_expression_to_function(key + ".value") elif s.aggregate == "stats": # REGULAR STATS stats_name = literal_field(canonical_name) es_query.aggs[stats_name].extended_stats.script = s.value.to_ruby(schema).script(schema) # GET MEDIAN TOO! median_name = literal_field(canonical_name + " percentile") es_query.aggs[median_name].percentiles.script = s.value.to_ruby(schema).script(schema) es_query.aggs[median_name].percentiles.percents += [50] s.pull = get_pull_stats(stats_name, median_name) elif s.aggregate=="union": # USE TERMS AGGREGATE TO SIMULATE union stats_name = literal_field(canonical_name) es_query.aggs[stats_name].terms.script_field = s.value.to_ruby(schema).script(schema) s.pull = jx_expression_to_function(stats_name + ".buckets.key") else: # PULL VALUE OUT OF THE stats AGGREGATE s.pull = jx_expression_to_function(canonical_name + "." + aggregates[s.aggregate]) es_query.aggs[canonical_name].extended_stats.script = s.value.to_ruby(schema).script(schema) decoders = get_decoders_by_depth(query) start = 0 #<TERRIBLE SECTION> THIS IS WHERE WE WEAVE THE where CLAUSE WITH nested split_where = split_expression_by_depth(query.where, schema=frum.schema) if len(split_field(frum.name)) > 1: if any(split_where[2::]): Log.error("Where clause is too deep") for d in decoders[1]: es_query = d.append_query(es_query, start) start += d.num_columns if split_where[1]: #TODO: INCLUDE FILTERS ON EDGES filter_ = AndOp("and", split_where[1]).to_esfilter(schema) es_query = Data( aggs={"_filter": set_default({"filter": filter_}, es_query)} ) es_query = wrap({ "aggs": {"_nested": set_default( { "nested": { "path": schema.query_path } }, es_query )} }) else: if any(split_where[1::]): Log.error("Where clause is too deep") if decoders: for d in jx.reverse(decoders[0]): es_query = d.append_query(es_query, start) start += d.num_columns if split_where[0]: #TODO: INCLUDE FILTERS ON EDGES filter = AndOp("and", split_where[0]).to_esfilter(schema) es_query = Data( aggs={"_filter": set_default({"filter": filter}, es_query)} ) # </TERRIBLE SECTION> if not es_query: es_query = wrap({"query": {"match_all": {}}}) es_query.size = 0 with Timer("ES query time") as es_duration: result = es_post(es, es_query, query.limit) try: format_time = Timer("formatting") with format_time: decoders = [d for ds in decoders for d in ds] result.aggregations.doc_count = coalesce(result.aggregations.doc_count, result.hits.total) # IT APPEARS THE OLD doc_count IS GONE formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[query.format] if query.edges: output = formatter(decoders, result.aggregations, start, query, select) elif query.groupby: output = groupby_formatter(decoders, result.aggregations, start, query, select) else: output = aggop_formatter(decoders, result.aggregations, start, query, select) output.meta.timing.formatting = format_time.duration output.meta.timing.es_search = es_duration.duration output.meta.content_type = mime_type output.meta.es_query = es_query return output except Exception as e: if query.format not in format_dispatch: Log.error("Format {{format|quote}} not supported yet", format=query.format, cause=e) Log.error("Some problem", cause=e)
def es_deepop(es, query): schema = query.frum.schema columns = schema.columns query_path = schema.query_path # TODO: FIX THE GREAT SADNESS CAUSED BY EXECUTING post_expressions # THE EXPRESSIONS SHOULD BE PUSHED TO THE CONTAINER: ES ALLOWS # {"inner_hit":{"script_fields":[{"script":""}...]}}, BUT THEN YOU # LOOSE "_source" BUT GAIN "fields", FORCING ALL FIELDS TO BE EXPLICIT post_expressions = {} es_query, es_filters = es_query_template(query_path) # SPLIT WHERE CLAUSE BY DEPTH wheres = split_expression_by_depth(query.where, schema) for i, f in enumerate(es_filters): script = AndOp("and", wheres[i]).partial_eval().to_esfilter(schema) set_default(f, script) if not wheres[1]: # WITHOUT NESTED CONDITIONS, WE MUST ALSO RETURN DOCS WITH NO NESTED RECORDS more_filter = { "and": [ es_filters[0], { "missing": { "field": untype_path(query_path) + "." + EXISTS_TYPE } } ] } else: more_filter = None es_query.size = coalesce(query.limit, DEFAULT_LIMIT) # es_query.sort = jx_sort_to_es_sort(query.sort) map_to_es_columns = schema.map_to_es() # {c.names["."]: c.es_column for c in schema.leaves(".")} query_for_es = query.map(map_to_es_columns) es_query.sort = jx_sort_to_es_sort(query_for_es.sort, schema) es_query.fields = [] is_list = isinstance(query.select, list) new_select = FlatList() i = 0 for s in listwrap(query.select): if isinstance(s.value, LeavesOp) and isinstance( s.value.term, Variable): # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS leaves = schema.leaves(s.value.term.var) col_names = set() for c in leaves: if c.nested_path[0] == ".": if c.type == NESTED: continue es_query.fields += [c.es_column] c_name = untype_path(c.names[query_path]) col_names.add(c_name) new_select.append({ "name": concat_field(s.name, c_name), "nested_path": c.nested_path[0], "put": { "name": concat_field(s.name, literal_field(c_name)), "index": i, "child": "." }, "pull": get_pull_function(c) }) i += 1 # REMOVE DOTS IN PREFIX IF NAME NOT AMBIGUOUS for n in new_select: if n.name.startswith("..") and n.name.lstrip( ".") not in col_names: n.put.name = n.name = n.name.lstrip(".") col_names.add(n.name) elif isinstance(s.value, Variable): net_columns = schema.leaves(s.value.var) if not net_columns: new_select.append({ "name": s.name, "nested_path": ".", "put": { "name": s.name, "index": i, "child": "." }, "pull": NULL }) else: for n in net_columns: pull = get_pull_function(n) if n.nested_path[0] == ".": if n.type == NESTED: continue es_query.fields += [n.es_column] # WE MUST FIGURE OUT WHICH NAMESSPACE s.value.var IS USING SO WE CAN EXTRACT THE child for np in n.nested_path: c_name = untype_path(n.names[np]) if startswith_field(c_name, s.value.var): child = relative_field(c_name, s.value.var) break else: child = relative_field( untype_path(n.names[n.nested_path[0]]), s.value.var) new_select.append({ "name": s.name, "pull": pull, "nested_path": n.nested_path[0], "put": { "name": s.name, "index": i, "child": child } }) i += 1 else: expr = s.value for v in expr.vars(): for c in schema[v]: if c.nested_path[0] == ".": es_query.fields += [c.es_column] # else: # Log.error("deep field not expected") pull_name = EXPRESSION_PREFIX + s.name map_to_local = { untype_path(k): get_pull(cc) for k, c in schema.lookup.items() for cc in c if cc.type not in STRUCT } pull = jx_expression_to_function(pull_name) post_expressions[pull_name] = compile_expression( expr.map(map_to_local).to_python()) new_select.append({ "name": s.name if is_list else ".", "pull": pull, "value": expr.__data__(), "put": { "name": s.name, "index": i, "child": "." } }) i += 1 # <COMPLICATED> ES needs two calls to get all documents more = [] def get_more(please_stop): more.append( es_post( es, Data(query={"filtered": { "filter": more_filter }}, fields=es_query.fields), query.limit)) if more_filter: need_more = Thread.run("get more", target=get_more) with Timer("call to ES") as call_timer: data = es_post(es, es_query, query.limit) # EACH A HIT IS RETURNED MULTIPLE TIMES FOR EACH INNER HIT, WITH INNER HIT INCLUDED def inners(): for t in data.hits.hits: for i in t.inner_hits[literal_field(query_path)].hits.hits: t._inner = i._source for k, e in post_expressions.items(): t[k] = e(t) yield t if more_filter: Thread.join(need_more) for t in more[0].hits.hits: yield t #</COMPLICATED> try: formatter, groupby_formatter, mime_type = format_dispatch[query.format] output = formatter(inners(), new_select, query) output.meta.timing.es = call_timer.duration output.meta.content_type = mime_type output.meta.es_query = es_query return output except Exception as e: Log.error("problem formatting", e)