def es_aggsop(es, frum, query): query = query.copy() # WE WILL MARK UP THIS QUERY schema = frum.schema select = listwrap(query.select) es_query = Data() new_select = Data() # MAP FROM canonical_name (USED FOR NAMES IN QUERY) TO SELECT MAPPING formula = [] for s in select: if s.aggregate == "count" and isinstance(s.value, Variable) and s.value.var == ".": if schema.query_path == ".": s.pull = jx_expression_to_function("doc_count") else: s.pull = jx_expression_to_function({"coalesce": ["_nested.doc_count", "doc_count", 0]}) elif isinstance(s.value, Variable): if s.aggregate == "count": new_select["count_"+literal_field(s.value.var)] += [s] else: new_select[literal_field(s.value.var)] += [s] elif s.aggregate: formula.append(s) for canonical_name, many in new_select.items(): for s in many: columns = frum.schema.values(s.value.var) if s.aggregate == "count": canonical_names = [] for column in columns: cn = literal_field(column.es_column + "_count") if column.jx_type == EXISTS: canonical_names.append(cn + ".doc_count") es_query.aggs[cn].filter.range = {column.es_column: {"gt": 0}} else: canonical_names.append(cn+ ".value") es_query.aggs[cn].value_count.field = column.es_column if len(canonical_names) == 1: s.pull = jx_expression_to_function(canonical_names[0]) else: s.pull = jx_expression_to_function({"add": canonical_names}) elif s.aggregate == "median": if len(columns) > 1: Log.error("Do not know how to count columns with more than one type (script probably)") # ES USES DIFFERENT METHOD FOR PERCENTILES key = literal_field(canonical_name + " percentile") es_query.aggs[key].percentiles.field = columns[0].es_column es_query.aggs[key].percentiles.percents += [50] s.pull = jx_expression_to_function(key + ".values.50\\.0") elif s.aggregate == "percentile": if len(columns) > 1: Log.error("Do not know how to count columns with more than one type (script probably)") # ES USES DIFFERENT METHOD FOR PERCENTILES key = literal_field(canonical_name + " percentile") if isinstance(s.percentile, text_type) or s.percetile < 0 or 1 < s.percentile: Log.error("Expecting percentile to be a float from 0.0 to 1.0") percent = Math.round(s.percentile * 100, decimal=6) es_query.aggs[key].percentiles.field = columns[0].es_column es_query.aggs[key].percentiles.percents += [percent] es_query.aggs[key].percentiles.tdigest.compression = 2 s.pull = jx_expression_to_function(key + ".values." + literal_field(text_type(percent))) elif s.aggregate == "cardinality": canonical_names = [] for column in columns: cn = literal_field(column.es_column + "_cardinality") canonical_names.append(cn) es_query.aggs[cn].cardinality.field = column.es_column if len(columns) == 1: s.pull = jx_expression_to_function(canonical_names[0] + ".value") else: s.pull = jx_expression_to_function({"add": [cn + ".value" for cn in canonical_names], "default": 0}) elif s.aggregate == "stats": if len(columns) > 1: Log.error("Do not know how to count columns with more than one type (script probably)") # REGULAR STATS stats_name = literal_field(canonical_name) es_query.aggs[stats_name].extended_stats.field = columns[0].es_column # GET MEDIAN TOO! median_name = literal_field(canonical_name + "_percentile") es_query.aggs[median_name].percentiles.field = columns[0].es_column es_query.aggs[median_name].percentiles.percents += [50] s.pull = get_pull_stats(stats_name, median_name) elif s.aggregate == "union": pulls = [] for column in columns: script = {"scripted_metric": { 'init_script': 'params._agg.terms = new HashSet()', 'map_script': 'for (v in doc['+quote(column.es_column)+'].values) params._agg.terms.add(v);', 'combine_script': 'return params._agg.terms.toArray()', 'reduce_script': 'HashSet output = new HashSet(); for (a in params._aggs) { if (a!=null) for (v in a) {output.add(v)} } return output.toArray()', }} stats_name = encode_property(column.es_column) if column.nested_path[0] == ".": es_query.aggs[stats_name] = script pulls.append(jx_expression_to_function(stats_name + ".value")) else: es_query.aggs[stats_name] = { "nested": {"path": column.nested_path[0]}, "aggs": {"_nested": script} } pulls.append(jx_expression_to_function(stats_name + "._nested.value")) if len(pulls) == 0: s.pull = NULL elif len(pulls) == 1: s.pull = pulls[0] else: s.pull = lambda row: UNION(p(row) for p in pulls) else: if len(columns) > 1: Log.error("Do not know how to count columns with more than one type (script probably)") elif len(columns) <1: # PULL VALUE OUT OF THE stats AGGREGATE s.pull = jx_expression_to_function({"null":{}}) else: # PULL VALUE OUT OF THE stats AGGREGATE es_query.aggs[literal_field(canonical_name)].extended_stats.field = columns[0].es_column s.pull = jx_expression_to_function({"coalesce": [literal_field(canonical_name) + "." + aggregates[s.aggregate], s.default]}) for i, s in enumerate(formula): canonical_name = literal_field(s.name) if isinstance(s.value, TupleOp): if s.aggregate == "count": # TUPLES ALWAYS EXIST, SO COUNTING THEM IS EASY s.pull = "doc_count" elif s.aggregate in ('max', 'maximum', 'min', 'minimum'): if s.aggregate in ('max', 'maximum'): dir = 1 op = "max" else: dir = -1 op = 'min' nully = TupleOp("tuple", [NULL]*len(s.value.terms)).partial_eval().to_es_script(schema).expr selfy = s.value.partial_eval().to_es_script(schema).expr script = {"scripted_metric": { 'init_script': 'params._agg.best = ' + nully + ';', 'map_script': 'params._agg.best = ' + expand_template(MAX_OF_TUPLE, {"expr1": "params._agg.best", "expr2": selfy, "dir": dir, "op": op}) + ";", 'combine_script': 'return params._agg.best', 'reduce_script': 'return params._aggs.stream().max(' + expand_template(COMPARE_TUPLE, {"dir": dir, "op": op}) + ').get()', }} if schema.query_path[0] == ".": es_query.aggs[canonical_name] = script s.pull = jx_expression_to_function(literal_field(canonical_name) + ".value") else: es_query.aggs[canonical_name] = { "nested": {"path": schema.query_path[0]}, "aggs": {"_nested": script} } s.pull = jx_expression_to_function(literal_field(canonical_name) + "._nested.value") else: Log.error("{{agg}} is not a supported aggregate over a tuple", agg=s.aggregate) elif s.aggregate == "count": es_query.aggs[literal_field(canonical_name)].value_count.script = s.value.partial_eval().to_es_script(schema).script(schema) s.pull = jx_expression_to_function(literal_field(canonical_name) + ".value") elif s.aggregate == "median": # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT key = literal_field(canonical_name + " percentile") es_query.aggs[key].percentiles.script = s.value.to_es_script(schema).script(schema) es_query.aggs[key].percentiles.percents += [50] s.pull = jx_expression_to_function(key + ".values.50\\.0") elif s.aggregate == "percentile": # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT key = literal_field(canonical_name + " percentile") percent = Math.round(s.percentile * 100, decimal=6) es_query.aggs[key].percentiles.script = s.value.to_es_script(schema).script(schema) es_query.aggs[key].percentiles.percents += [percent] s.pull = jx_expression_to_function(key + ".values." + literal_field(text_type(percent))) elif s.aggregate == "cardinality": # ES USES DIFFERENT METHOD FOR CARDINALITY key = canonical_name + " cardinality" es_query.aggs[key].cardinality.script = s.value.to_es_script(schema).script(schema) s.pull = jx_expression_to_function(key + ".value") elif s.aggregate == "stats": # REGULAR STATS stats_name = literal_field(canonical_name) es_query.aggs[stats_name].extended_stats.script = s.value.to_es_script(schema).script(schema) # GET MEDIAN TOO! median_name = literal_field(canonical_name + " percentile") es_query.aggs[median_name].percentiles.script = s.value.to_es_script(schema).script(schema) es_query.aggs[median_name].percentiles.percents += [50] s.pull = get_pull_stats(stats_name, median_name) elif s.aggregate == "union": # USE TERMS AGGREGATE TO SIMULATE union stats_name = literal_field(canonical_name) es_query.aggs[stats_name].terms.script_field = s.value.to_es_script(schema).script(schema) s.pull = jx_expression_to_function(stats_name + ".buckets.key") else: # PULL VALUE OUT OF THE stats AGGREGATE s.pull = jx_expression_to_function(canonical_name + "." + aggregates[s.aggregate]) es_query.aggs[canonical_name].extended_stats.script = s.value.to_es_script(schema).script(schema) decoders = get_decoders_by_depth(query) start = 0 # <TERRIBLE SECTION> THIS IS WHERE WE WEAVE THE where CLAUSE WITH nested split_where = split_expression_by_depth(query.where, schema=frum.schema) if len(split_field(frum.name)) > 1: if any(split_where[2::]): Log.error("Where clause is too deep") for d in decoders[1]: es_query = d.append_query(es_query, start) start += d.num_columns if split_where[1]: #TODO: INCLUDE FILTERS ON EDGES filter_ = AndOp("and", split_where[1]).to_esfilter(schema) es_query = Data( aggs={"_filter": set_default({"filter": filter_}, es_query)} ) es_query = wrap({ "aggs": {"_nested": set_default( {"nested": {"path": schema.query_path[0]}}, es_query )} }) else: if any(split_where[1::]): Log.error("Where clause is too deep") if decoders: for d in jx.reverse(decoders[0]): es_query = d.append_query(es_query, start) start += d.num_columns if split_where[0]: #TODO: INCLUDE FILTERS ON EDGES filter = AndOp("and", split_where[0]).to_esfilter(schema) es_query = Data( aggs={"_filter": set_default({"filter": filter}, es_query)} ) # </TERRIBLE SECTION> if not es_query: es_query = wrap({"query": {"match_all": {}}}) es_query.size = 0 with Timer("ES query time") as es_duration: result = es_post(es, es_query, query.limit) try: format_time = Timer("formatting") with format_time: decoders = [d for ds in decoders for d in ds] result.aggregations.doc_count = coalesce(result.aggregations.doc_count, result.hits.total) # IT APPEARS THE OLD doc_count IS GONE formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[query.format] if query.edges: output = formatter(decoders, result.aggregations, start, query, select) elif query.groupby: output = groupby_formatter(decoders, result.aggregations, start, query, select) else: output = aggop_formatter(decoders, result.aggregations, start, query, select) output.meta.timing.formatting = format_time.duration output.meta.timing.es_search = es_duration.duration output.meta.content_type = mime_type output.meta.es_query = es_query return output except Exception as e: if query.format not in format_dispatch: Log.error("Format {{format|quote}} not supported yet", format=query.format, cause=e) Log.error("Some problem", cause=e)
def es_aggsop(es, frum, query): select = wrap([s.copy() for s in listwrap(query.select)]) # [0] is a cheat; each es_column should be a dict of columns keyed on type, like in sqlite es_column_map = {v: frum.schema[v][0].es_column for v in query.vars()} es_query = Data() new_select = Data() #MAP FROM canonical_name (USED FOR NAMES IN QUERY) TO SELECT MAPPING formula = [] for s in select: if s.aggregate == "count" and isinstance(s.value, Variable) and s.value.var == ".": s.pull = "doc_count" elif isinstance(s.value, Variable): if s.value.var == ".": if frum.typed: # STATISITCAL AGGS IMPLY $value, WHILE OTHERS CAN BE ANYTHING if s.aggregate in NON_STATISTICAL_AGGS: #TODO: HANDLE BOTH $value AND $objects TO COUNT Log.error("do not know how to handle") else: s.value.var = "$value" new_select["$value"] += [s] else: if s.aggregate in NON_STATISTICAL_AGGS: #TODO: WE SHOULD BE ABLE TO COUNT, BUT WE MUST *OR* ALL LEAF VALUES TO DO IT Log.error("do not know how to handle") else: Log.error('Not expecting ES to have a value at "." which {{agg}} can be applied', agg=s.aggregate) elif s.aggregate == "count": s.value = s.value.map(es_column_map) new_select["count_"+literal_field(s.value.var)] += [s] else: s.value = s.value.map(es_column_map) new_select[literal_field(s.value.var)] += [s] else: formula.append(s) for canonical_name, many in new_select.items(): representative = many[0] if representative.value.var == ".": Log.error("do not know how to handle") else: field_name = representative.value.var # canonical_name=literal_field(many[0].name) for s in many: if s.aggregate == "count": es_query.aggs[literal_field(canonical_name)].value_count.field = field_name s.pull = literal_field(canonical_name) + ".value" elif s.aggregate == "median": # ES USES DIFFERENT METHOD FOR PERCENTILES key = literal_field(canonical_name + " percentile") es_query.aggs[key].percentiles.field = field_name es_query.aggs[key].percentiles.percents += [50] s.pull = key + ".values.50\.0" elif s.aggregate == "percentile": # ES USES DIFFERENT METHOD FOR PERCENTILES key = literal_field(canonical_name + " percentile") if isinstance(s.percentile, basestring) or s.percetile < 0 or 1 < s.percentile: Log.error("Expecting percentile to be a float from 0.0 to 1.0") percent = Math.round(s.percentile * 100, decimal=6) es_query.aggs[key].percentiles.field = field_name es_query.aggs[key].percentiles.percents += [percent] s.pull = key + ".values." + literal_field(unicode(percent)) elif s.aggregate == "cardinality": # ES USES DIFFERENT METHOD FOR CARDINALITY key = literal_field(canonical_name + " cardinality") es_query.aggs[key].cardinality.field = field_name s.pull = key + ".value" elif s.aggregate == "stats": # REGULAR STATS stats_name = literal_field(canonical_name) es_query.aggs[stats_name].extended_stats.field = field_name # GET MEDIAN TOO! median_name = literal_field(canonical_name + " percentile") es_query.aggs[median_name].percentiles.field = field_name es_query.aggs[median_name].percentiles.percents += [50] s.pull = { "count": stats_name + ".count", "sum": stats_name + ".sum", "min": stats_name + ".min", "max": stats_name + ".max", "avg": stats_name + ".avg", "sos": stats_name + ".sum_of_squares", "std": stats_name + ".std_deviation", "var": stats_name + ".variance", "median": median_name + ".values.50\.0" } elif s.aggregate == "union": # USE TERMS AGGREGATE TO SIMULATE union stats_name = literal_field(canonical_name) es_query.aggs[stats_name].terms.field = field_name es_query.aggs[stats_name].terms.size = Math.min(s.limit, MAX_LIMIT) s.pull = stats_name + ".buckets.key" else: # PULL VALUE OUT OF THE stats AGGREGATE es_query.aggs[literal_field(canonical_name)].extended_stats.field = field_name s.pull = literal_field(canonical_name) + "." + aggregates1_4[s.aggregate] for i, s in enumerate(formula): canonical_name = literal_field(s.name) abs_value = s.value.map(es_column_map) if isinstance(abs_value, TupleOp): if s.aggregate == "count": # TUPLES ALWAYS EXIST, SO COUNTING THEM IS EASY s.pull = "doc_count" else: Log.error("{{agg}} is not a supported aggregate over a tuple", agg=s.aggregate) elif s.aggregate == "count": es_query.aggs[literal_field(canonical_name)].value_count.script = abs_value.to_ruby() s.pull = literal_field(canonical_name) + ".value" elif s.aggregate == "median": # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT key = literal_field(canonical_name + " percentile") es_query.aggs[key].percentiles.script = abs_value.to_ruby() es_query.aggs[key].percentiles.percents += [50] s.pull = key + ".values.50\.0" elif s.aggregate == "percentile": # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT key = literal_field(canonical_name + " percentile") percent = Math.round(s.percentile * 100, decimal=6) es_query.aggs[key].percentiles.script = abs_value.to_ruby() es_query.aggs[key].percentiles.percents += [percent] s.pull = key + ".values." + literal_field(unicode(percent)) elif s.aggregate == "cardinality": # ES USES DIFFERENT METHOD FOR CARDINALITY key = canonical_name + " cardinality" es_query.aggs[key].cardinality.script = abs_value.to_ruby() s.pull = key + ".value" elif s.aggregate == "stats": # REGULAR STATS stats_name = literal_field(canonical_name) es_query.aggs[stats_name].extended_stats.script = abs_value.to_ruby() # GET MEDIAN TOO! median_name = literal_field(canonical_name + " percentile") es_query.aggs[median_name].percentiles.script = abs_value.to_ruby() es_query.aggs[median_name].percentiles.percents += [50] s.pull = { "count": stats_name + ".count", "sum": stats_name + ".sum", "min": stats_name + ".min", "max": stats_name + ".max", "avg": stats_name + ".avg", "sos": stats_name + ".sum_of_squares", "std": stats_name + ".std_deviation", "var": stats_name + ".variance", "median": median_name + ".values.50\.0" } elif s.aggregate=="union": # USE TERMS AGGREGATE TO SIMULATE union stats_name = literal_field(canonical_name) es_query.aggs[stats_name].terms.script_field = abs_value.to_ruby() s.pull = stats_name + ".buckets.key" else: # PULL VALUE OUT OF THE stats AGGREGATE s.pull = canonical_name + "." + aggregates1_4[s.aggregate] es_query.aggs[canonical_name].extended_stats.script = abs_value.to_ruby() decoders = get_decoders_by_depth(query) start = 0 vars_ = query.where.vars() #<TERRIBLE SECTION> THIS IS WHERE WE WEAVE THE where CLAUSE WITH nested split_where = split_expression_by_depth(query.where, schema=frum.schema) if len(split_field(frum.name)) > 1: if any(split_where[2::]): Log.error("Where clause is too deep") for d in decoders[1]: es_query = d.append_query(es_query, start) start += d.num_columns if split_where[1]: #TODO: INCLUDE FILTERS ON EDGES filter_ = simplify_esfilter(AndOp("and", split_where[1]).to_esfilter()) es_query = Data( aggs={"_filter": set_default({"filter": filter_}, es_query)} ) es_query = wrap({ "aggs": {"_nested": set_default( { "nested": { "path": frum.query_path } }, es_query )} }) else: if any(split_where[1::]): Log.error("Where clause is too deep") if decoders: for d in jx.reverse(decoders[0]): es_query = d.append_query(es_query, start) start += d.num_columns if split_where[0]: #TODO: INCLUDE FILTERS ON EDGES filter = simplify_esfilter(AndOp("and", split_where[0]).to_esfilter()) es_query = Data( aggs={"_filter": set_default({"filter": filter}, es_query)} ) # </TERRIBLE SECTION> if not es_query: es_query = wrap({"query": {"match_all": {}}}) es_query.size = 0 with Timer("ES query time") as es_duration: result = es09.util.post(es, es_query, query.limit) try: format_time = Timer("formatting") with format_time: decoders = [d for ds in decoders for d in ds] result.aggregations.doc_count = coalesce(result.aggregations.doc_count, result.hits.total) # IT APPEARS THE OLD doc_count IS GONE formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[query.format] if query.edges: output = formatter(decoders, result.aggregations, start, query, select) elif query.groupby: output = groupby_formatter(decoders, result.aggregations, start, query, select) else: output = aggop_formatter(decoders, result.aggregations, start, query, select) output.meta.timing.formatting = format_time.duration output.meta.timing.es_search = es_duration.duration output.meta.content_type = mime_type output.meta.es_query = es_query return output except Exception as e: if query.format not in format_dispatch: Log.error("Format {{format|quote}} not supported yet", format=query.format, cause=e) Log.error("Some problem", e)
def es_aggsop(es, frum, query): query = query.copy() # WE WILL MARK UP THIS QUERY schema = frum.schema select = listwrap(query.select) es_query = Data() new_select = Data() # MAP FROM canonical_name (USED FOR NAMES IN QUERY) TO SELECT MAPPING formula = [] for s in select: if s.aggregate == "count" and isinstance(s.value, Variable) and s.value.var == ".": if schema.query_path == ".": s.pull = jx_expression_to_function("doc_count") else: s.pull = jx_expression_to_function({"coalesce": ["_nested.doc_count", "doc_count", 0]}) elif isinstance(s.value, Variable): if s.aggregate == "count": new_select["count_"+literal_field(s.value.var)] += [s] else: new_select[literal_field(s.value.var)] += [s] elif s.aggregate: formula.append(s) for canonical_name, many in new_select.items(): for s in many: columns = frum.schema.values(s.value.var) if s.aggregate == "count": canonical_names = [] for column in columns: cn = literal_field(column.es_column + "_count") if column.jx_type == EXISTS: canonical_names.append(cn + ".doc_count") es_query.aggs[cn].filter.range = {column.es_column: {"gt": 0}} else: canonical_names.append(cn+ ".value") es_query.aggs[cn].value_count.field = column.es_column if len(canonical_names) == 1: s.pull = jx_expression_to_function(canonical_names[0]) else: s.pull = jx_expression_to_function({"add": canonical_names}) elif s.aggregate == "median": if len(columns) > 1: Log.error("Do not know how to count columns with more than one type (script probably)") # ES USES DIFFERENT METHOD FOR PERCENTILES key = literal_field(canonical_name + " percentile") es_query.aggs[key].percentiles.field = columns[0].es_column es_query.aggs[key].percentiles.percents += [50] s.pull = jx_expression_to_function(key + ".values.50\\.0") elif s.aggregate == "percentile": if len(columns) > 1: Log.error("Do not know how to count columns with more than one type (script probably)") # ES USES DIFFERENT METHOD FOR PERCENTILES key = literal_field(canonical_name + " percentile") if isinstance(s.percentile, text_type) or s.percetile < 0 or 1 < s.percentile: Log.error("Expecting percentile to be a float from 0.0 to 1.0") percent = Math.round(s.percentile * 100, decimal=6) es_query.aggs[key].percentiles.field = columns[0].es_column es_query.aggs[key].percentiles.percents += [percent] s.pull = jx_expression_to_function(key + ".values." + literal_field(text_type(percent))) elif s.aggregate == "cardinality": canonical_names = [] for column in columns: cn = literal_field(column.es_column + "_cardinality") canonical_names.append(cn) es_query.aggs[cn].cardinality.field = column.es_column if len(columns) == 1: s.pull = jx_expression_to_function(canonical_names[0] + ".value") else: s.pull = jx_expression_to_function({"add": [cn + ".value" for cn in canonical_names], "default": 0}) elif s.aggregate == "stats": if len(columns) > 1: Log.error("Do not know how to count columns with more than one type (script probably)") # REGULAR STATS stats_name = literal_field(canonical_name) es_query.aggs[stats_name].extended_stats.field = columns[0].es_column # GET MEDIAN TOO! median_name = literal_field(canonical_name + "_percentile") es_query.aggs[median_name].percentiles.field = columns[0].es_column es_query.aggs[median_name].percentiles.percents += [50] s.pull = get_pull_stats(stats_name, median_name) elif s.aggregate == "union": pulls = [] for column in columns: script = {"scripted_metric": { 'init_script': 'params._agg.terms = new HashSet()', 'map_script': 'for (v in doc['+quote(column.es_column)+'].values) params._agg.terms.add(v)', 'combine_script': 'return params._agg.terms.toArray()', 'reduce_script': 'HashSet output = new HashSet(); for (a in params._aggs) { if (a!=null) for (v in a) {output.add(v)} } return output.toArray()', }} stats_name = encode_property(column.es_column) if column.nested_path[0] == ".": es_query.aggs[stats_name] = script pulls.append(jx_expression_to_function(stats_name + ".value")) else: es_query.aggs[stats_name] = { "nested": {"path": column.nested_path[0]}, "aggs": {"_nested": script} } pulls.append(jx_expression_to_function(stats_name + "._nested.value")) if len(pulls) == 0: s.pull = NULL elif len(pulls) == 1: s.pull = pulls[0] else: s.pull = lambda row: UNION(p(row) for p in pulls) else: if len(columns) > 1: Log.error("Do not know how to count columns with more than one type (script probably)") elif len(columns) <1: # PULL VALUE OUT OF THE stats AGGREGATE s.pull = jx_expression_to_function({"null":{}}) else: # PULL VALUE OUT OF THE stats AGGREGATE es_query.aggs[literal_field(canonical_name)].extended_stats.field = columns[0].es_column s.pull = jx_expression_to_function({"coalesce": [literal_field(canonical_name) + "." + aggregates[s.aggregate], s.default]}) for i, s in enumerate(formula): canonical_name = literal_field(s.name) if isinstance(s.value, TupleOp): if s.aggregate == "count": # TUPLES ALWAYS EXIST, SO COUNTING THEM IS EASY s.pull = "doc_count" elif s.aggregate in ('max', 'maximum', 'min', 'minimum'): if s.aggregate in ('max', 'maximum'): dir = 1 op = "max" else: dir = -1 op = 'min' nully = TupleOp("tuple", [NULL]*len(s.value.terms)).partial_eval().to_es_script(schema).expr selfy = s.value.partial_eval().to_es_script(schema).expr script = {"scripted_metric": { 'init_script': 'params._agg.best = ' + nully + ';', 'map_script': 'params._agg.best = ' + expand_template(MAX_OF_TUPLE, {"expr1": "params._agg.best", "expr2": selfy, "dir": dir, "op": op}) + ";", 'combine_script': 'return params._agg.best', 'reduce_script': 'return params._aggs.stream().max(' + expand_template(COMPARE_TUPLE, {"dir": dir, "op": op}) + ').get()', }} if schema.query_path[0] == ".": es_query.aggs[canonical_name] = script s.pull = jx_expression_to_function(literal_field(canonical_name) + ".value") else: es_query.aggs[canonical_name] = { "nested": {"path": schema.query_path[0]}, "aggs": {"_nested": script} } s.pull = jx_expression_to_function(literal_field(canonical_name) + "._nested.value") else: Log.error("{{agg}} is not a supported aggregate over a tuple", agg=s.aggregate) elif s.aggregate == "count": es_query.aggs[literal_field(canonical_name)].value_count.script = s.value.partial_eval().to_es_script(schema).script(schema) s.pull = jx_expression_to_function(literal_field(canonical_name) + ".value") elif s.aggregate == "median": # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT key = literal_field(canonical_name + " percentile") es_query.aggs[key].percentiles.script = s.value.to_es_script(schema).script(schema) es_query.aggs[key].percentiles.percents += [50] s.pull = jx_expression_to_function(key + ".values.50\\.0") elif s.aggregate == "percentile": # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT key = literal_field(canonical_name + " percentile") percent = Math.round(s.percentile * 100, decimal=6) es_query.aggs[key].percentiles.script = s.value.to_es_script(schema).script(schema) es_query.aggs[key].percentiles.percents += [percent] s.pull = jx_expression_to_function(key + ".values." + literal_field(text_type(percent))) elif s.aggregate == "cardinality": # ES USES DIFFERENT METHOD FOR CARDINALITY key = canonical_name + " cardinality" es_query.aggs[key].cardinality.script = s.value.to_es_script(schema).script(schema) s.pull = jx_expression_to_function(key + ".value") elif s.aggregate == "stats": # REGULAR STATS stats_name = literal_field(canonical_name) es_query.aggs[stats_name].extended_stats.script = s.value.to_es_script(schema).script(schema) # GET MEDIAN TOO! median_name = literal_field(canonical_name + " percentile") es_query.aggs[median_name].percentiles.script = s.value.to_es_script(schema).script(schema) es_query.aggs[median_name].percentiles.percents += [50] s.pull = get_pull_stats(stats_name, median_name) elif s.aggregate == "union": # USE TERMS AGGREGATE TO SIMULATE union stats_name = literal_field(canonical_name) es_query.aggs[stats_name].terms.script_field = s.value.to_es_script(schema).script(schema) s.pull = jx_expression_to_function(stats_name + ".buckets.key") else: # PULL VALUE OUT OF THE stats AGGREGATE s.pull = jx_expression_to_function(canonical_name + "." + aggregates[s.aggregate]) es_query.aggs[canonical_name].extended_stats.script = s.value.to_es_script(schema).script(schema) decoders = get_decoders_by_depth(query) start = 0 #<TERRIBLE SECTION> THIS IS WHERE WE WEAVE THE where CLAUSE WITH nested split_where = split_expression_by_depth(query.where, schema=frum.schema) if len(split_field(frum.name)) > 1: if any(split_where[2::]): Log.error("Where clause is too deep") for d in decoders[1]: es_query = d.append_query(es_query, start) start += d.num_columns if split_where[1]: #TODO: INCLUDE FILTERS ON EDGES filter_ = AndOp("and", split_where[1]).to_esfilter(schema) es_query = Data( aggs={"_filter": set_default({"filter": filter_}, es_query)} ) es_query = wrap({ "aggs": {"_nested": set_default( {"nested": {"path": schema.query_path[0]}}, es_query )} }) else: if any(split_where[1::]): Log.error("Where clause is too deep") if decoders: for d in jx.reverse(decoders[0]): es_query = d.append_query(es_query, start) start += d.num_columns if split_where[0]: #TODO: INCLUDE FILTERS ON EDGES filter = AndOp("and", split_where[0]).to_esfilter(schema) es_query = Data( aggs={"_filter": set_default({"filter": filter}, es_query)} ) # </TERRIBLE SECTION> if not es_query: es_query = wrap({"query": {"match_all": {}}}) es_query.size = 0 with Timer("ES query time") as es_duration: result = es_post(es, es_query, query.limit) try: format_time = Timer("formatting") with format_time: decoders = [d for ds in decoders for d in ds] result.aggregations.doc_count = coalesce(result.aggregations.doc_count, result.hits.total) # IT APPEARS THE OLD doc_count IS GONE formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[query.format] if query.edges: output = formatter(decoders, result.aggregations, start, query, select) elif query.groupby: output = groupby_formatter(decoders, result.aggregations, start, query, select) else: output = aggop_formatter(decoders, result.aggregations, start, query, select) output.meta.timing.formatting = format_time.duration output.meta.timing.es_search = es_duration.duration output.meta.content_type = mime_type output.meta.es_query = es_query return output except Exception as e: if query.format not in format_dispatch: Log.error("Format {{format|quote}} not supported yet", format=query.format, cause=e) Log.error("Some problem", cause=e)
def es_aggsop(es, frum, query): select = wrap([s.copy() for s in listwrap(query.select)]) es_column_map = { c.name: unwraplist(c.es_column) for c in frum.schema.columns } es_query = Data() new_select = Data( ) #MAP FROM canonical_name (USED FOR NAMES IN QUERY) TO SELECT MAPPING formula = [] for s in select: if s.aggregate == "count" and isinstance( s.value, Variable) and s.value.var == ".": s.pull = "doc_count" elif isinstance(s.value, Variable): if s.value.var == ".": if frum.typed: # STATISITCAL AGGS IMPLY $value, WHILE OTHERS CAN BE ANYTHING if s.aggregate in NON_STATISTICAL_AGGS: #TODO: HANDLE BOTH $value AND $objects TO COUNT Log.error("do not know how to handle") else: s.value.var = "$value" new_select["$value"] += [s] else: if s.aggregate in NON_STATISTICAL_AGGS: #TODO: WE SHOULD BE ABLE TO COUNT, BUT WE MUST *OR* ALL LEAF VALUES TO DO IT Log.error("do not know how to handle") else: Log.error( 'Not expecting ES to have a value at "." which {{agg}} can be applied', agg=s.aggregate) elif s.aggregate == "count": s.value = s.value.map(es_column_map) new_select["count_" + literal_field(s.value.var)] += [s] else: s.value = s.value.map(es_column_map) new_select[literal_field(s.value.var)] += [s] else: formula.append(s) for canonical_name, many in new_select.items(): representative = many[0] if representative.value.var == ".": Log.error("do not know how to handle") else: field_name = representative.value.var # canonical_name=literal_field(many[0].name) for s in many: if s.aggregate == "count": es_query.aggs[literal_field( canonical_name)].value_count.field = field_name s.pull = literal_field(canonical_name) + ".value" elif s.aggregate == "median": # ES USES DIFFERENT METHOD FOR PERCENTILES key = literal_field(canonical_name + " percentile") es_query.aggs[key].percentiles.field = field_name es_query.aggs[key].percentiles.percents += [50] s.pull = key + ".values.50\.0" elif s.aggregate == "percentile": # ES USES DIFFERENT METHOD FOR PERCENTILES key = literal_field(canonical_name + " percentile") if isinstance( s.percentile, basestring) or s.percetile < 0 or 1 < s.percentile: Log.error( "Expecting percentile to be a float from 0.0 to 1.0") percent = Math.round(s.percentile * 100, decimal=6) es_query.aggs[key].percentiles.field = field_name es_query.aggs[key].percentiles.percents += [percent] s.pull = key + ".values." + literal_field(unicode(percent)) elif s.aggregate == "cardinality": # ES USES DIFFERENT METHOD FOR CARDINALITY key = literal_field(canonical_name + " cardinality") es_query.aggs[key].cardinality.field = field_name s.pull = key + ".value" elif s.aggregate == "stats": # REGULAR STATS stats_name = literal_field(canonical_name) es_query.aggs[stats_name].extended_stats.field = field_name # GET MEDIAN TOO! median_name = literal_field(canonical_name + " percentile") es_query.aggs[median_name].percentiles.field = field_name es_query.aggs[median_name].percentiles.percents += [50] s.pull = { "count": stats_name + ".count", "sum": stats_name + ".sum", "min": stats_name + ".min", "max": stats_name + ".max", "avg": stats_name + ".avg", "sos": stats_name + ".sum_of_squares", "std": stats_name + ".std_deviation", "var": stats_name + ".variance", "median": median_name + ".values.50\.0" } elif s.aggregate == "union": # USE TERMS AGGREGATE TO SIMULATE union stats_name = literal_field(canonical_name) es_query.aggs[stats_name].terms.field = field_name es_query.aggs[stats_name].terms.size = Math.min( s.limit, MAX_LIMIT) s.pull = stats_name + ".buckets.key" else: # PULL VALUE OUT OF THE stats AGGREGATE es_query.aggs[literal_field( canonical_name)].extended_stats.field = field_name s.pull = literal_field(canonical_name) + "." + aggregates1_4[ s.aggregate] for i, s in enumerate(formula): canonical_name = literal_field(s.name) abs_value = s.value.map(es_column_map) if s.aggregate == "count": es_query.aggs[literal_field( canonical_name)].value_count.script = abs_value.to_ruby() s.pull = literal_field(canonical_name) + ".value" elif s.aggregate == "median": # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT key = literal_field(canonical_name + " percentile") es_query.aggs[key].percentiles.script = abs_value.to_ruby() es_query.aggs[key].percentiles.percents += [50] s.pull = key + ".values.50\.0" elif s.aggregate == "percentile": # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT key = literal_field(canonical_name + " percentile") percent = Math.round(s.percentile * 100, decimal=6) es_query.aggs[key].percentiles.script = abs_value.to_ruby() es_query.aggs[key].percentiles.percents += [percent] s.pull = key + ".values." + literal_field(unicode(percent)) elif s.aggregate == "cardinality": # ES USES DIFFERENT METHOD FOR CARDINALITY key = canonical_name + " cardinality" es_query.aggs[key].cardinality.script = abs_value.to_ruby() s.pull = key + ".value" elif s.aggregate == "stats": # REGULAR STATS stats_name = literal_field(canonical_name) es_query.aggs[ stats_name].extended_stats.script = abs_value.to_ruby() # GET MEDIAN TOO! median_name = literal_field(canonical_name + " percentile") es_query.aggs[median_name].percentiles.script = abs_value.to_ruby() es_query.aggs[median_name].percentiles.percents += [50] s.pull = { "count": stats_name + ".count", "sum": stats_name + ".sum", "min": stats_name + ".min", "max": stats_name + ".max", "avg": stats_name + ".avg", "sos": stats_name + ".sum_of_squares", "std": stats_name + ".std_deviation", "var": stats_name + ".variance", "median": median_name + ".values.50\.0" } elif s.aggregate == "union": # USE TERMS AGGREGATE TO SIMULATE union stats_name = literal_field(canonical_name) es_query.aggs[stats_name].terms.script_field = abs_value.to_ruby() s.pull = stats_name + ".buckets.key" else: # PULL VALUE OUT OF THE stats AGGREGATE s.pull = canonical_name + "." + aggregates1_4[s.aggregate] es_query.aggs[ canonical_name].extended_stats.script = abs_value.to_ruby() decoders = get_decoders_by_depth(query) start = 0 vars_ = query.where.vars() #<TERRIBLE SECTION> THIS IS WHERE WE WEAVE THE where CLAUSE WITH nested split_where = split_expression_by_depth(query.where, schema=frum.schema, map_=es_column_map) if len(split_field(frum.name)) > 1: if any(split_where[2::]): Log.error("Where clause is too deep") for d in decoders[1]: es_query = d.append_query(es_query, start) start += d.num_columns if split_where[1]: #TODO: INCLUDE FILTERS ON EDGES filter_ = simplify_esfilter( AndOp("and", split_where[1]).to_esfilter()) es_query = Data( aggs={"_filter": set_default({"filter": filter_}, es_query)}) es_query = wrap({ "aggs": { "_nested": set_default({"nested": { "path": frum.query_path }}, es_query) } }) else: if any(split_where[1::]): Log.error("Where clause is too deep") for d in decoders[0]: es_query = d.append_query(es_query, start) start += d.num_columns if split_where[0]: #TODO: INCLUDE FILTERS ON EDGES filter = simplify_esfilter(AndOp("and", split_where[0]).to_esfilter()) es_query = Data( aggs={"_filter": set_default({"filter": filter}, es_query)}) # </TERRIBLE SECTION> if not es_query: es_query = wrap({"query": {"match_all": {}}}) es_query.size = 0 with Timer("ES query time") as es_duration: result = es09.util.post(es, es_query, query.limit) try: format_time = Timer("formatting") with format_time: decoders = [d for ds in decoders for d in ds] result.aggregations.doc_count = coalesce( result.aggregations.doc_count, result.hits.total) # IT APPEARS THE OLD doc_count IS GONE formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[ query.format] if query.edges: output = formatter(decoders, result.aggregations, start, query, select) elif query.groupby: output = groupby_formatter(decoders, result.aggregations, start, query, select) else: output = aggop_formatter(decoders, result.aggregations, start, query, select) output.meta.timing.formatting = format_time.duration output.meta.timing.es_search = es_duration.duration output.meta.content_type = mime_type output.meta.es_query = es_query return output except Exception, e: if query.format not in format_dispatch: Log.error("Format {{format|quote}} not supported yet", format=query.format, cause=e) Log.error("Some problem", e)