def append_query(self, query_path, es_query):
    """
    Wrap es_query in a terms aggregation restricted to this edge's known partitions.

    :param query_path: the nested path this part of the query is scoped to
    :param es_query: aggregation subtree to nest inside the new "_filter"/"_match" aggs
    :return: Aggs tree; when the edge allows nulls, extra "_missing0"/"_missing1"
             branches are added for each nested level
    """
    domain = self.domain
    domain_key = domain.key
    value = self.edge.value
    # convert each partition's key to the edge value's type so the InOp compares like-with-like
    cnv = pull_functions[value.type]
    include = tuple(cnv(p[domain_key]) for p in domain.partitions)
    # "exists" here means: value is one of the known partitions
    exists = AndOp("and", [
        InOp("in", [value, Literal("literal", include)])
    ]).partial_eval()

    limit = coalesce(self.limit, len(domain.partitions))

    if isinstance(value, Variable):
        es_field = first(self.query.frum.schema.leaves(value.var)).es_column  # ALREADY CHECKED THERE IS ONLY ONE
        match = TermsAggs(
            "_match",
            {
                "field": es_field,
                "size": limit,
                # only request term-ordering when the edge asked to be sorted
                "order": {"_term": self.sorted} if self.sorted else None
            },
            self
        )
    else:
        # edge value is a general expression: compute the term with a painless script
        match = TermsAggs(
            "_match",
            {
                "script": {
                    "lang": "painless",
                    "inline": value.to_es_script(self.schema).script(self.schema)
                },
                "size": limit
            },
            self
        )
    output = Aggs().add(FilterAggs("_filter", exists, None).add(match.add(es_query)))

    if self.edge.allowNulls:
        # FIND NULLS AT EACH NESTED LEVEL
        for p in self.schema.query_path:
            if p == query_path:
                # MISSING AT THE QUERY DEPTH
                output.add(
                    NestedAggs(p).add(FilterAggs("_missing0", NotOp(None, exists), self).add(es_query))
                )
            else:
                # PARENT HAS NO CHILDREN, SO MISSING
                # NOTE(review): uses query_path (not p) to find the object/exists column;
                # presumably deeper levels are represented by their existence column — confirm
                column = first(self.schema.values(query_path, (OBJECT, EXISTS)))
                output.add(
                    NestedAggs(column.nested_path[0]).add(
                        FilterAggs(
                            "_missing1",
                            NotOp(None, ExistsOp(None, Variable(column.es_column.replace(NESTED_TYPE, EXISTS_TYPE)))),
                            self
                        ).add(es_query)
                    )
                )
    return output
def append_query(self, query_path, es_query):
    """
    Attach a terms aggregation for this edge, plus a "_missing" branch for
    documents that have no value.
    """
    edge_value = self.edge.value
    # ES USES THE FIRST TERM FOR {"terms": } AGGREGATION, so unwrap FirstOp(Variable)
    if isinstance(edge_value, FirstOp) and isinstance(edge_value.term, Variable):
        self.edge.value = edge_value = edge_value.term

    if isinstance(edge_value, Variable):
        spec = {
            "field": first(self.schema.leaves(edge_value.var)).es_column,
            "size": self.domain.limit,
            "order": self.es_order
        }
    else:
        # general expression: evaluate with the pre-built painless script
        spec = {
            "script": {"lang": "painless", "inline": self.script.expr},
            "size": self.domain.limit,
            "order": self.es_order
        }
    match = TermsAggs("_match", spec, self)

    output = Aggs()
    output.add(FilterAggs("_filter", self.exists, None).add(match.add(es_query)))
    output.add(FilterAggs("_missing", self.missing, self).add(es_query))
    return output
def leaves(self, prefix):
    """
    Return the set of leaf columns at-or-below the column named by prefix.
    Returns Null when prefix is not in the namespace.
    """
    matched = self.namespace.get(prefix, None)
    if not matched:
        return Null
    full_name = first(matched).name

    # same predicate as the original comprehension:
    # (descendant of full_name, excluding GUID) or full_name itself;
    # object/exists columns are structural, not leaves
    result = set()
    for name, columns in self.namespace.items():
        if (startswith_field(name, full_name) and name != GUID) or name == full_name:
            for col in columns:
                if col.jx_type not in (OBJECT, EXISTS):
                    result.add(col)
    return result
def get_decoders_by_path(query):
    """
    RETURN MAP FROM QUERY PATH TO LIST OF DECODER ARRAYS

    :param query:
    :return:
    """
    schema = query.frum.schema
    output = Data()

    if query.edges:
        if query.sort and query.format != "cube":
            # REORDER EDGES/GROUPBY TO MATCH THE SORT
            query.edges = sort_edges(query, "edges")
    elif query.groupby:
        if query.sort and query.format != "cube":
            query.groupby = sort_edges(query, "groupby")

    for edge in wrap(coalesce(query.edges, query.groupby, [])):
        limit = coalesce(edge.domain.limit, query.limit, DEFAULT_LIMIT)
        if edge.value != None and not isinstance(edge.value, NullOp):
            # edge defined by an expression: verify every variable it uses exists
            edge = edge.copy()
            vars_ = edge.value.vars()
            for v in vars_:
                if not schema.leaves(v.var):
                    Log.error("{{var}} does not exist in schema", var=v)
        elif edge.range:
            # range edge: collect variables from both endpoints
            vars_ = edge.range.min.vars() | edge.range.max.vars()
            for v in vars_:
                if not schema[v.var]:
                    Log.error("{{var}} does not exist in schema", var=v)
        elif edge.domain.dimension:
            # dimension edge: rewrite field names to their es_column equivalents
            vars_ = edge.domain.dimension.fields
            edge.domain.dimension = edge.domain.dimension.copy()
            edge.domain.dimension.fields = [schema[v].es_column for v in vars_]
        elif all(edge.domain.partitions.where):
            # partitioned domain: variables come from each partition's where clause
            vars_ = set()
            for p in edge.domain.partitions:
                vars_ |= p.where.vars()
            # NOTE(review): edge.value is falsy/NullOp in this branch, so this
            # presumably contributes an empty set — confirm NullOp.vars() == set()
            vars_ |= edge.value.vars()

        # all variables of one edge must live at a single nesting depth
        depths = set(c.nested_path[0] for v in vars_ for c in schema.leaves(v.var))
        if not depths:
            Log.error(
                "Do not know of column {{column}}",
                column=unwraplist([v for v in vars_ if schema[v] == None])
            )
        if len(depths) > 1:
            Log.error("expression {{expr|quote}} spans tables, can not handle", expr=edge.value)

        decoder = AggsDecoder(edge, query, limit)
        output[literal_field(first(depths))] += [decoder]
    return output
def append_query(self, query_path, es_query):
    """
    Nest one terms aggregation per field of this dimension around es_query.

    Each field level gets a "_match" (terms) branch and a "_missing" branch;
    only the innermost (first) level carries this decoder — outer levels pass
    None so results are attributed once.

    :param query_path: nested path this query is scoped to (unused here)
    :param es_query: aggregation subtree to wrap
    :return: the wrapped aggregation tree
    """
    # fix: original iterated `for i, v in enumerate(self.fields)` but never used i
    decoder = self
    for v in self.fields:
        exists = v.exists().partial_eval()
        nest = Aggs()
        nest.add(TermsAggs("_match", {
            "field": first(self.schema.leaves(v.var)).es_column,
            "size": self.domain.limit
        }, decoder).add(es_query))
        nest.add(FilterAggs("_missing", NotOp("not", exists), decoder).add(es_query))
        es_query = nest
        decoder = None  # ONLY THE FIRST LEVEL GETS THE DECODER

    if self.domain.where:
        es_query = FilterAggs("_filter", self.domain.where, None).add(es_query)

    return es_query
def _range_composer(self, edge, domain, es_query, to_float, schema):
    """
    Build an ES `range` aggregation over domain.partitions; when the edge
    allows nulls, also add a "_missing" branch for values outside [min, max)
    or absent entirely.
    """
    # USE RANGES
    lo = coalesce(domain.min, MIN(domain.partitions.min))
    hi = coalesce(domain.max, MAX(domain.partitions.max))

    output = Aggs()
    if edge.allowNulls:
        # "missing" == value does not exist, or falls outside [lo, hi)
        in_range = AndOp("and", [
            edge.value.exists(),
            InequalityOp("gte", [edge.value, Literal(None, to_float(lo))]),
            InequalityOp("lt", [edge.value, Literal(None, to_float(hi))])
        ]).partial_eval()
        missing_branch = FilterAggs("_missing", NotOp("not", in_range), self)
        output.add(missing_branch.add(es_query))

    if isinstance(edge.value, Variable):
        calc = {"field": first(schema.leaves(edge.value.var)).es_column}
    else:
        calc = {"script": edge.value.to_es_script(schema).script(schema)}
    calc["ranges"] = [
        {"from": to_float(p.min), "to": to_float(p.max)}
        for p in domain.partitions
    ]

    return output.add(RangeAggs("_match", calc, self).add(es_query))
def append_query(self, query_path, es_query):
    """
    Aggregate a multivalued field by scripting its values into a single
    pipe-delimited term.
    """
    column = first(self.query.frum.schema.leaves(self.var))
    accessor = 'doc[' + quote(column.es_column) + '].values'
    list_script = expand_template(LIST_TO_PIPE, {"expr": accessor})
    terms = TermsAggs("_match", {"script": list_script}, self)
    return Aggs().add(terms.add(es_query))
def __new__(cls, e=None, query=None, *args, **kwargs):
    """
    Dispatch: pick the Decoder subclass that can handle edge `e` of `query`.

    Side effects: may mutate `e` in place (allowNulls default, domain rewrite)
    before handing it to the chosen subclass.
    """
    e.allowNulls = coalesce(e.allowNulls, True)

    if e.value and e.domain.type == "default":
        # if query.groupby:
        #     return object.__new__(DefaultDecoder, e)

        if isinstance(e.value, text_type):
            Log.error("Expecting Variable or Expression, not plain string")

        if isinstance(e.value, LeavesOp):
            return object.__new__(ObjectDecoder)
        elif isinstance(e.value, TupleOp):
            # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields
            # JUST PULL THE FIELDS
            if not all(isinstance(t, Variable) for t in e.value.terms):
                Log.error("Can only handle variables in tuples")
            e.domain = Data(
                dimension={"fields": e.value.terms}
            )
            return object.__new__(DimFieldListDecoder)
        elif isinstance(e.value, Variable):
            schema = query.frum.schema
            cols = schema.leaves(e.value.var)
            if not cols:
                return object.__new__(DefaultDecoder)
            if len(cols) != 1:
                # multiple typed columns behind one name: treat as object
                return object.__new__(ObjectDecoder)
            col = first(cols)
            limit = coalesce(e.domain.limit, query.limit, DEFAULT_LIMIT)

            if col.cardinality == None:
                # column metadata not collected yet; fall back to dynamic domain
                DEBUG and Log.warning(
                    "metadata for column {{name|quote}} (id={{id}}) is not ready",
                    name=concat_field(col.es_index, col.es_column),
                    id=id(col)
                )
                e.domain = set_default(DefaultDomain(limit=limit), e.domain.__data__())
                return object.__new__(DefaultDecoder)
            elif col.partitions == None:
                e.domain = set_default(DefaultDomain(limit=limit), e.domain.__data__())
                return object.__new__(DefaultDecoder)
            else:
                DEBUG and Log.note("id={{id}} has parts!!!", id=id(col))
                if col.multi > 1 and len(col.partitions) < 10:
                    return object.__new__(MultivalueDecoder)

                # known partitions: pin the domain so SetDecoder can be used below
                partitions = col.partitions[:limit:]
                if e.domain.sort == -1:
                    partitions = list(reversed(sorted(partitions)))
                else:
                    partitions = sorted(partitions)
                e.domain = SimpleSetDomain(partitions=partitions, limit=limit)
        else:
            return object.__new__(DefaultDecoder)

    if e.value and e.domain.type in PARTITION:
        return object.__new__(SetDecoder)
    if isinstance(e.domain.dimension, Dimension):
        e.domain = e.domain.dimension.getDomain()
        return object.__new__(SetDecoder)
    if e.value and e.domain.type == "time":
        return object.__new__(TimeDecoder)
    if e.range:
        return object.__new__(GeneralRangeDecoder)
    if e.value and e.domain.type == "duration":
        return object.__new__(DurationDecoder)
    elif e.value and e.domain.type == "range":
        return object.__new__(RangeDecoder)
    elif not e.value and e.domain.dimension.fields:
        # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields
        # JUST PULL THE FIELDS
        fields = e.domain.dimension.fields
        if isinstance(fields, Mapping):
            Log.error("No longer allowed: All objects are expressions")
        else:
            return object.__new__(DimFieldListDecoder)
    elif not e.value and all(e.domain.partitions.where):
        return object.__new__(GeneralSetDecoder)
    else:
        Log.error("domain type of {{type}} is not supported yet", type=e.domain.type)
def es_aggsop(es, frum, query):
    """
    Translate a JX aggregation query into an ES aggs request, POST it, and
    format the response.

    Fixes vs original:
    * `s.percetile` typo in the percentile bounds check — on a mo-dots Data a
      misspelled attribute returns Null, so the `< 0` half of the validation
      silently never fired.
    * renamed locals that shadowed builtins (`complex`, `dir`).
    * removed unused locals (`start`, the unused enumerate index).

    :param es: ES cluster/connection used by es_post
    :param frum: the table (with .schema) being queried
    :param query: JX query with select/edges/groupby/where/format
    :return: formatted output (shape chosen by query.format)
    """
    query = query.copy()  # WE WILL MARK UP THIS QUERY
    schema = frum.schema
    query_path = schema.query_path[0]
    select = listwrap(query.select)

    new_select = Data()  # MAP FROM canonical_name (USED FOR NAMES IN QUERY) TO SELECT MAPPING
    formula = []
    for s in select:
        if isinstance(s.value, Variable):
            s.query_path = query_path
            if s.aggregate == "count":
                new_select["count_" + literal_field(s.value.var)] += [s]
            else:
                new_select[literal_field(s.value.var)] += [s]
        elif s.aggregate:
            # general expression: it must resolve to exactly one nesting depth
            split_select = split_expression_by_path(s.value, schema)
            for si_key, si_value in split_select.items():
                if si_value:
                    if s.query_path:
                        Log.error("can not handle more than one depth per select")
                    s.query_path = si_key
            formula.append(s)

    acc = Aggs()
    # SIMPLE VARIABLE SELECTS
    for _, many in new_select.items():
        for s in many:
            canonical_name = s.name
            if s.aggregate in ("value_count", "count"):
                columns = frum.schema.values(s.value.var, exclude_type=(OBJECT, NESTED))
            else:
                columns = frum.schema.values(s.value.var)

            if s.aggregate == "count":
                canonical_names = []
                for column in columns:
                    if column.jx_type == EXISTS:
                        if column.nested_path[0] == query_path:
                            canonical_names.append("doc_count")
                            acc.add(NestedAggs(column.nested_path[0]).add(ComplexAggs(s)))
                    else:
                        es_name = column.es_column + "_count"
                        canonical_names.append("value")
                        acc.add(NestedAggs(column.nested_path[0]).add(
                            ExprAggs(es_name, {"value_count": {"field": column.es_column}}, s)
                        ))
                if len(canonical_names) == 1:
                    s.pull = jx_expression_to_function(canonical_names[0])
                else:
                    # sum the per-column counts
                    s.pull = jx_expression_to_function({"add": canonical_names})
            elif s.aggregate == "median":
                if len(columns) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")
                # ES USES DIFFERENT METHOD FOR PERCENTILES
                key = canonical_name + " percentile"
                acc.add(ExprAggs(key, {"percentiles": {
                    "field": first(columns).es_column,
                    "percents": [50]
                }}, s))
                s.pull = jx_expression_to_function("values.50\\.0")
            elif s.aggregate == "percentile":
                if len(columns) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")
                # ES USES DIFFERENT METHOD FOR PERCENTILES
                key = canonical_name + " percentile"
                # fix: was `s.percetile < 0` (typo), which always compared Null
                if isinstance(s.percentile, text_type) or s.percentile < 0 or 1 < s.percentile:
                    Log.error("Expecting percentile to be a float from 0.0 to 1.0")
                percent = Math.round(s.percentile * 100, decimal=6)
                acc.add(ExprAggs(key, {"percentiles": {
                    "field": first(columns).es_column,
                    "percents": [percent],
                    "tdigest": {"compression": 2}
                }}, s))
                s.pull = jx_expression_to_function(join_field(["values", text_type(percent)]))
            elif s.aggregate == "cardinality":
                for column in columns:
                    path = column.es_column + "_cardinality"
                    acc.add(ExprAggs(path, {"cardinality": {"field": column.es_column}}, s))
                s.pull = jx_expression_to_function("value")
            elif s.aggregate == "stats":
                if len(columns) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")
                # REGULAR STATS
                stats_name = literal_field(canonical_name)
                stats_agg = ComplexAggs(s).add(ExprAggs(canonical_name, {
                    "extended_stats": {"field": first(columns).es_column}
                }, None))
                # GET MEDIAN TOO!
                median_name = literal_field(canonical_name + "_percentile")
                stats_agg.add(ExprAggs(canonical_name + "_percentile", {
                    "percentiles": {"field": first(columns).es_column, "percents": [50]}
                }, None))
                acc.add(stats_agg)
                # NOTE(review): here get_pull_stats takes 2 args, but the formula
                # branch below calls it with 3 — one of the call sites is likely
                # stale; confirm against get_pull_stats' signature
                s.pull = get_pull_stats(stats_name, median_name)
            elif s.aggregate == "union":
                for column in columns:
                    script = {"scripted_metric": {
                        'init_script': 'params._agg.terms = new HashSet()',
                        'map_script': 'for (v in doc[' + quote(column.es_column) + '].values) params._agg.terms.add(v);',
                        'combine_script': 'return params._agg.terms.toArray()',
                        'reduce_script': 'HashSet output = new HashSet(); for (a in params._aggs) { if (a!=null) for (v in a) {output.add(v)} } return output.toArray()',
                    }}
                    stats_name = column.es_column
                    acc.add(NestedAggs(column.nested_path[0]).add(ExprAggs(stats_name, script, s)))
                s.pull = jx_expression_to_function("value")
            elif s.aggregate == "count_values":
                # RETURN MAP FROM VALUE TO THE NUMBER OF TIMES FOUND IN THE DOCUMENTS
                # NOT A NESTED DOC, RATHER A MULTIVALUE FIELD
                for column in columns:
                    script = {"scripted_metric": {
                        'params': {"_agg": {}},
                        'init_script': 'params._agg.terms = new HashMap()',
                        'map_script': 'for (v in doc[' + quote(column.es_column) + '].values) params._agg.terms.put(v, Optional.ofNullable(params._agg.terms.get(v)).orElse(0)+1);',
                        'combine_script': 'return params._agg.terms',
                        'reduce_script': '''
                            HashMap output = new HashMap();
                            for (agg in params._aggs) {
                                if (agg!=null){
                                    for (e in agg.entrySet()) {
                                        String key = String.valueOf(e.getKey());
                                        output.put(key, e.getValue() + Optional.ofNullable(output.get(key)).orElse(0));
                                    }
                                }
                            }
                            return output;
                        '''
                    }}
                    stats_name = encode_property(column.es_column)
                    acc.add(NestedAggs(column.nested_path[0]).add(ExprAggs(stats_name, script, s)))
                s.pull = jx_expression_to_function("value")
            else:
                if not columns:
                    s.pull = jx_expression_to_function(NULL)
                else:
                    for c in columns:
                        acc.add(NestedAggs(c.nested_path[0]).add(
                            ExprAggs(canonical_name, {"extended_stats": {"field": c.es_column}}, s)
                        ))
                    s.pull = jx_expression_to_function(aggregates[s.aggregate])

    # EXPRESSION (FORMULA) SELECTS
    for s in formula:
        s_path = [k for k, v in split_expression_by_path(s.value, schema=schema).items() if v]
        if len(s_path) == 0:
            # FOR CONSTANTS
            nest = NestedAggs(query_path)
            acc.add(nest)
        elif len(s_path) == 1:
            nest = NestedAggs(first(s_path))
            acc.add(nest)
        else:
            Log.error("do not know how to handle")

        canonical_name = s.name
        if isinstance(s.value, TupleOp):
            if s.aggregate == "count":
                # TUPLES ALWAYS EXIST, SO COUNTING THEM IS EASY
                s.pull = jx_expression_to_function("doc_count")
            elif s.aggregate in ('max', 'maximum', 'min', 'minimum'):
                if s.aggregate in ('max', 'maximum'):
                    direction = 1
                    op = "max"
                else:
                    direction = -1
                    op = 'min'

                nully = TupleOp("tuple", [NULL] * len(s.value.terms)).partial_eval().to_es_script(schema).expr
                selfy = s.value.partial_eval().to_es_script(schema).expr

                script = {"scripted_metric": {
                    'init_script': 'params._agg.best = ' + nully + ';',
                    'map_script': 'params._agg.best = ' + expand_template(MAX_OF_TUPLE, {
                        "expr1": "params._agg.best",
                        "expr2": selfy,
                        "dir": direction,
                        "op": op
                    }) + ";",
                    'combine_script': 'return params._agg.best',
                    'reduce_script': 'return params._aggs.stream().' + op + '(' + expand_template(COMPARE_TUPLE, {
                        "dir": direction,
                        "op": op
                    }) + ').get()',
                }}
                nest.add(NestedAggs(query_path).add(ExprAggs(canonical_name, script, s)))
                s.pull = jx_expression_to_function("value")
            else:
                Log.error("{{agg}} is not a supported aggregate over a tuple", agg=s.aggregate)
        elif s.aggregate == "count":
            nest.add(ExprAggs(canonical_name, {"value_count": {
                "script": s.value.partial_eval().to_es_script(schema).script(schema)
            }}, s))
            s.pull = jx_expression_to_function("value")
        elif s.aggregate == "median":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")
            nest.add(ExprAggs(key, {"percentiles": {
                "script": s.value.to_es_script(schema).script(schema),
                "percents": [50]
            }}, s))
            s.pull = jx_expression_to_function(join_field(["50.0"]))
        elif s.aggregate == "percentile":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")
            percent = Math.round(s.percentile * 100, decimal=6)
            nest.add(ExprAggs(key, {"percentiles": {
                "script": s.value.to_es_script(schema).script(schema),
                "percents": [percent]
            }}, s))
            s.pull = jx_expression_to_function(join_field(["values", text_type(percent)]))
        elif s.aggregate == "cardinality":
            # ES USES DIFFERENT METHOD FOR CARDINALITY
            key = canonical_name + " cardinality"
            nest.add(ExprAggs(key, {"cardinality": {
                "script": s.value.to_es_script(schema).script(schema)
            }}, s))
            s.pull = jx_expression_to_function("value")
        elif s.aggregate == "stats":
            # REGULAR STATS
            stats_name = canonical_name
            nest.add(ComplexAggs(s).add(ExprAggs(stats_name, {
                "extended_stats": {"script": s.value.to_es_script(schema).script(schema)}
            }, None)))
            # GET MEDIAN TOO!
            median_name = canonical_name + " percentile"
            nest.add(ExprAggs(median_name, {"percentiles": {
                "script": s.value.to_es_script(schema).script(schema),
                "percents": [50]
            }}, s))
            # NOTE(review): 3-arg call, vs the 2-arg call in the variable branch
            # above — confirm get_pull_stats' signature and reconcile
            s.pull = get_pull_stats(None, stats_name, median_name)
        elif s.aggregate == "union":
            # USE TERMS AGGREGATE TO SIMULATE union
            nest.add(TermsAggs(canonical_name, {
                "script_field": s.value.to_es_script(schema).script(schema)
            }, s))
            s.pull = jx_expression_to_function("key")
        else:
            # PULL VALUE OUT OF THE stats AGGREGATE
            s.pull = jx_expression_to_function(aggregates[s.aggregate])
            nest.add(ExprAggs(canonical_name, {
                "extended_stats": {"script": s.value.to_es_script(schema).script(schema)}
            }, s))

    acc = NestedAggs(query_path).add(acc)
    split_decoders = get_decoders_by_path(query)
    split_wheres = split_expression_by_path(query.where, schema=frum.schema)

    # BUILD THE AGGS TREE FROM DEEPEST PATH OUTWARD
    decoders = [None] * (len(query.edges) + len(query.groupby))
    paths = list(reversed(sorted(split_wheres.keys() | split_decoders.keys())))
    for path in paths:
        literal_path = literal_field(path)
        decoder = split_decoders[literal_path]
        where = split_wheres[literal_path]

        for d in decoder:
            decoders[d.edge.dim] = d
            acc = d.append_query(path, acc)

        if where:
            acc = FilterAggs("_filter", AndOp("and", where), None).add(acc)
        acc = NestedAggs(path).add(acc)
    acc = NestedAggs('.').add(acc)
    acc = simplify(acc)

    es_query = wrap(acc.to_es(schema))
    es_query.size = 0  # aggregations only; no hits

    with Timer("ES query time", silent=not DEBUG) as es_duration:
        result = es_post(es, es_query, query.limit)

    try:
        format_time = Timer("formatting", silent=not DEBUG)
        with format_time:
            # result.aggregations.doc_count = coalesce(result.aggregations.doc_count, result.hits.total)  # IT APPEARS THE OLD doc_count IS GONE
            aggs = unwrap(result.aggregations)
            formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[query.format]

            if query.edges:
                output = formatter(aggs, acc, query, decoders, select)
            elif query.groupby:
                output = groupby_formatter(aggs, acc, query, decoders, select)
            else:
                output = aggop_formatter(aggs, acc, query, decoders, select)

        output.meta.timing.formatting = format_time.duration
        output.meta.timing.es_search = es_duration.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        if query.format not in format_dispatch:
            Log.error("Format {{format|quote}} not supported yet", format=query.format, cause=e)
        Log.error("Some problem", cause=e)