def append_query(self, es_query, start):
    """
    Wrap `es_query` in a terms aggregation over this edge's value, matching
    only documents whose value is one of the domain's partition keys.

    :param es_query: aggregation built so far; combined into the new
                     aggregations through set_default()
    :param start: column offset, recorded on self.start
    :return: wrapped query holding a "_match" filter aggregation and a
             "_missing" aggregation (None unless edge.allowNulls is set)
    """
    self.start = start
    partitions = self.domain.partitions
    key_name = self.domain.key

    # NUMERIC KEYS ARE NORMALIZED TO float; EVERYTHING ELSE IS PASSED THROUGH
    pairs = []
    for part in partitions:
        raw = part[key_name]
        if isinstance(raw, (int, float)):
            as_float = float(raw)
            pairs.append((as_float, text_type(as_float)))
        else:
            pairs.append((raw, raw))
    include, text_include = transpose(*pairs)

    edge_value = self.edge.value
    exists = AndOp(
        "and",
        [
            edge_value.exists(),
            InOp("in", [edge_value, Literal("literal", include)]),
        ],
    ).partial_eval()
    size = coalesce(self.limit, len(partitions))

    if isinstance(edge_value, Variable):
        # ALREADY CHECKED THERE IS ONLY ONE LEAF
        leaf = first(self.query.frum.schema.leaves(edge_value.var))
        terms_spec = {
            "field": leaf.es_column,
            "size": size,
            "order": {"_term": self.sorted} if self.sorted else None,
        }
    else:
        terms_spec = {
            "script": edge_value.to_es14_script(self.schema).script(self.schema),
            "size": size,
        }
    terms = set_default({"terms": terms_spec}, es_query)

    missing = None
    if self.edge.allowNulls:
        not_exists = NotOp("not", exists).to_es14_filter(self.schema)
        missing = set_default({"filter": not_exists}, es_query)

    return wrap({
        "aggs": {
            "_match": {
                "filter": exists.to_es14_filter(self.schema),
                "aggs": {"_filter": terms},
            },
            "_missing": missing,
        }
    })
def to_es14_filter(self, schema):
    """
    Build the ES filter for a "ends with suffix" test.

    :param schema: used to resolve the variable to its ES column
    :return: match_all when there is no suffix, a regexp filter when testing a
             Variable against a Literal suffix, otherwise a script filter
    """
    suffix = self.suffix
    if not suffix:
        return {"match_all": {}}

    if not (isinstance(self.expr, Variable) and isinstance(suffix, Literal)):
        # GENERAL CASE: DELEGATE TO THE SCRIPT FORM OF THIS EXPRESSION
        script = self.to_es14_script(schema).script(schema)
        return ScriptOp("script", script).to_es14_filter(schema)

    es_column = first(schema.leaves(self.expr.var)).es_column
    pattern = ".*" + string2regexp(suffix.value)
    return {"regexp": {es_column: pattern}}
def to_es14_filter(self, schema):
    """
    Build the ES filter for a "starts with prefix" test.

    :param schema: used to resolve the variable to its ES column
    :return: match_all when self.expr is falsy, a prefix filter when testing a
             Variable against a Literal prefix, otherwise a script filter
    """
    expr = self.expr
    if not expr:
        return {"match_all": {}}

    if not (isinstance(expr, Variable) and isinstance(self.prefix, Literal)):
        # GENERAL CASE: DELEGATE TO THE SCRIPT FORM OF THIS EXPRESSION
        script = self.to_es14_script(schema).script(schema)
        return ScriptOp("script", script).to_es14_filter(schema)

    es_column = first(schema.leaves(expr.var)).es_column
    return {"prefix": {es_column: self.prefix.value}}
def to_es14_filter(self, schema):
    """
    Build the ES filter for a membership ("in") test.

    :param schema: used to resolve the variable to its ES column
    :return: a terms filter when the tested value is a Variable, otherwise a
             script filter
    """
    if not isinstance(self.value, Variable):
        script = self.to_es14_script(schema).script(schema)
        return ScriptOp("script", script).to_es14_filter(schema)

    # PREFER THE ES COLUMN NAME; FALL BACK TO THE VARIABLE NAME WHEN THE SCHEMA HAS NO LEAVES
    field = self.value.var
    leaves = schema.leaves(field)
    if leaves:
        field = first(leaves).es_column
    return {"terms": {field: self.superset.value}}
def to_es14_filter(self, schema):
    """
    Build the ES filter for a negation.

    :param schema: used to resolve the variable to its ES column
    :return: an exists filter when negating "missing" on a Variable, otherwise
             es_not() applied to the operand's own filter
    """
    term = self.term
    if not (isinstance(term, MissingOp) and isinstance(term.expr, Variable)):
        return es_not(term.to_es14_filter(schema))

    # not(missing(var)) IS EXPRESSED DIRECTLY AS exists
    field = term.expr.var
    leaves = schema.leaves(field)
    if leaves:
        field = first(leaves).es_column
    return {"exists": {"field": field}}
def to_es14_filter(self, schema):
    """
    Build the ES regexp filter for a Variable matched against a Literal pattern.

    :param schema: used to resolve the variable to its ES column
    :return: MATCH_NONE when the variable has no leaves, a regexp filter when
             it has exactly one; every other case is reported via Log.error
    """
    if not (isinstance(self.pattern, Literal) and isinstance(self.var, Variable)):
        Log.error("regex only accepts a variable and literal pattern")
        return None  # ONLY REACHED IF Log.error RETURNS

    leaves = schema.leaves(self.var.var)
    count = len(leaves)
    if count == 0:
        return MATCH_NONE
    if count == 1:
        es_column = first(leaves).es_column
        return {"regexp": {es_column: self.pattern.value}}
    Log.error("regex on not supported ")
def to_es14_filter(self, schema):
    """
    Build the ES filter for a "missing" test.

    :param schema: used to resolve the variable to its ES columns
    :return: for a Variable: match_all when it has no leaves, a single
             es_missing() for one leaf, es_and() of es_missing() for several;
             for anything else, a script filter
    """
    if not isinstance(self.expr, Variable):
        script = self.to_es14_script(schema).script(schema)
        return ScriptOp("script", script).to_es14_filter(schema)

    leaves = schema.leaves(self.expr.var)
    if not leaves:
        return {"match_all": {}}
    if len(leaves) == 1:
        return es_missing(first(leaves).es_column)

    # MISSING ONLY WHEN EVERY LEAF COLUMN IS MISSING
    all_missing = []
    for leaf in leaves:
        all_missing.append(es_missing(leaf.es_column))
    return es_and(all_missing)
def to_es14_filter(self, schema):
    """
    Build the ES filter for an inequality (self.op names the range operator).

    :param schema: used to resolve the variable to its ES column
    :return: a range filter for Variable-vs-Literal, otherwise a script filter
    """
    if not (isinstance(self.lhs, Variable) and isinstance(self.rhs, Literal)):
        # GENERAL CASE: USE A SCRIPT, WHICH MUST NEVER EVALUATE TO MISSING
        script = self.to_es14_script(schema)
        if script.miss is not FALSE:
            Log.error("inequality must be decisive")
        return {"script": es_script(script.expr)}

    leaves = schema.leaves(self.lhs.var)
    if not leaves:
        # NO KNOWN COLUMN (HAPPENS DURING DEBUGGING, AND MAYBE IN REAL LIFE TOO)
        field = self.lhs.var
    elif len(leaves) == 1:
        field = first(leaves).es_column
    else:
        Log.error("operator {{op|quote}} does not work on objects", op=self.op)
    return {"range": {field: {self.op: self.rhs.value}}}
def to_es14_filter(self, schema):
    """
    Build the ES filter for a basic equality test.

    :param schema: used to resolve the variable to its ES column
    :return: a term filter (scalar or one-element list), a terms filter (any
             other list), or the filter of the script form when this is not a
             Variable-vs-Literal comparison
    """
    if not (isinstance(self.lhs, Variable) and isinstance(self.rhs, Literal)):
        return self.to_es14_script(schema).to_es14_filter(schema)

    field = self.lhs.var
    leaves = schema.leaves(field)
    if leaves:
        field = first(leaves).es_column

    value = self.rhs.value
    if not isinstance(value, list):
        return {"term": {field: value}}
    if len(value) == 1:
        return {"term": {field: value[0]}}
    return {"terms": {field: value}}
def _range_composer(edge, domain, es_query, to_float, schema):
    """
    Wrap `es_query` in a range aggregation with one range per domain partition.

    :param edge: edge being decoded; edge.value and edge.allowNulls are read
    :param domain: supplies min, max and the partitions
    :param es_query: aggregation built so far; combined via set_default()
    :param to_float: converts partition/domain bounds to the values sent to ES
    :param schema: used to resolve columns and build scripts/filters
    :return: wrapped query holding "_match" (the range aggregation) and
             "_missing" (None unless edge.allowNulls is set)
    """
    # OVERALL BOUNDS: EXPLICIT DOMAIN LIMITS WIN OVER THE PARTITION EXTREMES
    lower = coalesce(domain.min, MIN(domain.partitions.min))
    upper = coalesce(domain.max, MAX(domain.partitions.max))

    missing_filter = None
    if edge.allowNulls:
        in_range = AndOp("and", [
            edge.value.exists(),
            InequalityOp("gte", [edge.value, Literal(None, to_float(lower))]),
            InequalityOp("lt", [edge.value, Literal(None, to_float(upper))]),
        ]).partial_eval()
        out_of_range = NotOp("not", in_range).to_es14_filter(schema)
        missing_filter = set_default({"filter": out_of_range}, es_query)

    if isinstance(edge.value, Variable):
        calc = {"field": first(schema.leaves(edge.value.var)).es_column}
    else:
        calc = {"script": edge.value.to_es14_script(schema).script(schema)}

    ranges = []
    for part in domain.partitions:
        ranges.append({"from": to_float(part.min), "to": to_float(part.max)})

    match = set_default({"range": calc}, {"range": {"ranges": ranges}}, es_query)
    return wrap({"aggs": {"_match": match, "_missing": missing_filter}})
def append_query(self, es_query, start):
    """
    Wrap `es_query` in a terms aggregation keyed by a script over all the
    values of this decoder's field.

    :param es_query: aggregation built so far; combined via set_default()
    :param start: column offset, recorded on self.start
    :return: wrapped query holding the "_match" terms aggregation
    """
    self.start = start
    leaf = first(self.query.frum.schema.leaves(self.var))
    values_expr = 'doc[' + quote(leaf.es_column) + '].values'
    script = expand_template(LIST_TO_PIPE, {"expr": values_expr})
    match = set_default({"terms": {"script": script}}, es_query)
    return wrap({"aggs": {"_match": match}})
def to_es14_script(self, schema, not_null=False, boolean=False, many=True):
    """
    Build the boolean ES script for a "missing" test.

    :param schema: used to resolve the variable to its ES columns
    :param not_null: unused here
    :param boolean: unused here
    :param many: unused here
    :return: an EsScript; for a Variable it tests doc[column].isEmpty() on
             each leaf column ("_id" is never missing), otherwise it is the
             script of the operand's own missing() expression
    """
    def is_empty(column):
        # SCRIPT THAT IS TRUE WHEN THE GIVEN COLUMN HAS NO VALUE IN THE DOC
        return EsScript(
            type=BOOLEAN,
            expr="doc[" + quote(column.es_column) + "].isEmpty()",
            frum=self,
        )

    expr = self.expr
    if not isinstance(expr, Variable):
        missing = expr.missing()
        if not isinstance(expr, Literal):
            missing = missing.partial_eval()
        return missing.to_es14_script(schema)

    if expr.var == "_id":
        return EsScript(type=BOOLEAN, expr="false", frum=self)

    columns = schema.leaves(expr.var)
    if len(columns) == 1:
        return is_empty(first(columns))

    checks = [is_empty(c) for c in columns]
    return AndOp("and", checks).partial_eval().to_es14_script(schema)
def to_es14_filter(self, schema):
    """
    Build the ES filter for an equality test.

    :param schema: used to resolve the variable to its ES column
    :return: a term/terms filter for Variable-vs-Literal; otherwise the filter
             of a case expression that handles a missing side explicitly
    """
    if not (isinstance(self.lhs, Variable) and isinstance(self.rhs, Literal)):
        # lhs MISSING => EQUAL ONLY IF rhs IS MISSING TOO; rhs MISSING => NOT EQUAL
        null_safe = CaseOp("case", [
            WhenOp("when", self.lhs.missing(), **{"then": self.rhs.missing()}),
            WhenOp("when", self.rhs.missing(), **{"then": FALSE}),
            BasicEqOp("eq", [self.lhs, self.rhs]),
        ])
        return null_safe.partial_eval().to_es14_filter(schema)

    value = self.rhs.value
    field = self.lhs.var
    leaves = schema.leaves(field)
    if leaves:
        field = first(leaves).es_column

    if not isinstance(value, list):
        return {"term": {field: value}}
    if len(value) == 1:
        return {"term": {field: value[0]}}
    return {"terms": {field: value}}
def append_query(self, es_query, start):
    """
    Nest `es_query` inside one terms aggregation per field of this decoder,
    then inside the domain's `where` filter when one is given.

    :param es_query: aggregation built so far; combined via set_default()
    :param start: column offset, recorded on self.start
    :return: the nested query
    """
    # TODO: USE "reverse_nested" QUERY TO PULL THESE
    self.start = start
    schema = self.schema

    for field in self.fields:
        exists = field.exists().partial_eval()
        exists_filter = exists.to_es14_filter(schema)
        terms = set_default(
            {
                "terms": {
                    "field": first(schema.leaves(field.var)).es_column,
                    "size": self.domain.limit,
                }
            },
            es_query,
        )
        nest = wrap({
            "aggs": {
                "_match": {
                    "filter": exists_filter,
                    "aggs": {"_filter": terms},
                }
            }
        })
        nest.aggs._missing = set_default(
            {"filter": NotOp("not", exists).to_es14_filter(schema)},
            es_query,
        )
        # EACH FIELD WRAPS THE AGGREGATION BUILT FOR THE PREVIOUS ONES
        es_query = nest

    if self.domain.where:
        where_filter = self.domain.where.partial_eval().to_es14_filter(schema)
        es_query = {
            "aggs": {"_filter": set_default({"filter": where_filter}, es_query)}
        }

    return es_query
def to_es14_filter(self, schema):
    """
    Build the ES filter for a "not equal" test.

    Every branch is expressed as es_not(<equality test>).

    :param schema: used to resolve the variable to its ES column
    :return: for Variable-vs-Literal: match_all when the variable has no
             columns, a negated term filter for exactly one column (several
             columns are reported via Log.error); otherwise a negated script
             filter chosen by whether each side is multi-valued
    """
    if isinstance(self.lhs, Variable) and isinstance(self.rhs, Literal):
        columns = schema.values(self.lhs.var)
        if len(columns) == 0:
            return {"match_all": {}}
        elif len(columns) == 1:
            return es_not({"term": {first(columns).es_column: self.rhs.value}})
        else:
            Log.error("column split to multiple, not handled")
    else:
        lhs = self.lhs.partial_eval().to_es14_script(schema)
        rhs = self.rhs.partial_eval().to_es14_script(schema)

        if lhs.many:
            if rhs.many:
                # BOTH MULTI-VALUED: EQUAL MEANS SAME SIZE AND SAME MEMBERS
                equal = (
                    "(" + lhs.expr + ").size()==(" + rhs.expr + ").size() && " +
                    "(" + rhs.expr + ").containsAll(" + lhs.expr + ")"
                )
            else:
                equal = "(" + lhs.expr + ").contains(" + rhs.expr + ")"
        else:
            if rhs.many:
                equal = "(" + rhs.expr + ").contains(" + lhs.expr + ")"
            else:
                # FIX: THIS BRANCH USED "!=" INSIDE es_not(), WHICH NEGATED TWICE
                # AND SO MATCHED EQUAL VALUES; NEGATE AN EQUALITY LIKE THE OTHERS
                equal = "(" + lhs.expr + ") == (" + rhs.expr + ")"

        return es_not(ScriptOp("script", equal).to_es14_filter(schema))
def _union_pull(pulls):
    """
    Return a pull function that unions the results of every function in `pulls`.

    `pulls` is captured as an argument so that each returned function keeps
    its own list, even when the caller rebinds its local variable afterwards.
    """
    def pull(row):
        return UNION(p(row) for p in pulls)

    return pull


def es_aggsop(es, frum, query):
    """
    Translate the aggregates of `query` into an Elasticsearch aggregation
    request, post it, and format the response.

    Each select clause is given a `pull` that extracts its value from the
    response; edges/groupby are added by the decoders' append_query(), and the
    where clause is woven in as filter aggregations.

    :param es: passed through to es_post()
    :param frum: container being queried; frum.schema and frum.name are read
    :param query: the query (a copy is marked up, the argument is not rebound)
    :return: output of the formatter selected by query.format, with
             meta.timing, meta.content_type and meta.es_query filled in
    """
    query = query.copy()  # WE WILL MARK UP THIS QUERY
    schema = frum.schema
    select = listwrap(query.select)

    es_query = Data()
    new_select = Data()  # MAP FROM canonical_name (USED FOR NAMES IN QUERY) TO SELECT MAPPING
    formula = []
    for s in select:
        if s.aggregate == "count" and isinstance(s.value, Variable) and s.value.var == ".":
            if schema.query_path == ".":
                s.pull = jx_expression_to_function("doc_count")
            else:
                s.pull = jx_expression_to_function({"coalesce": ["_nested.doc_count", "doc_count", 0]})
        elif isinstance(s.value, Variable):
            if s.aggregate == "count":
                new_select["count_"+literal_field(s.value.var)] += [s]
            else:
                new_select[literal_field(s.value.var)] += [s]
        elif s.aggregate:
            formula.append(s)

    # AGGREGATES OVER PLAIN VARIABLES
    for canonical_name, many in new_select.items():
        for s in many:
            columns = frum.schema.values(s.value.var)

            if s.aggregate == "count":
                canonical_names = []
                for column in columns:
                    cn = literal_field(column.es_column + "_count")
                    if column.jx_type == EXISTS:
                        canonical_names.append(cn + ".doc_count")
                        es_query.aggs[cn].filter.range = {column.es_column: {"gt": 0}}
                    else:
                        canonical_names.append(cn + ".value")
                        es_query.aggs[cn].value_count.field = column.es_column
                if len(canonical_names) == 1:
                    s.pull = jx_expression_to_function(canonical_names[0])
                else:
                    s.pull = jx_expression_to_function({"add": canonical_names})
            elif s.aggregate == "median":
                if len(columns) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")
                # ES USES DIFFERENT METHOD FOR PERCENTILES
                key = literal_field(canonical_name + " percentile")

                es_query.aggs[key].percentiles.field = first(columns).es_column
                es_query.aggs[key].percentiles.percents += [50]
                s.pull = jx_expression_to_function(key + ".values.50\\.0")
            elif s.aggregate == "percentile":
                if len(columns) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")
                # ES USES DIFFERENT METHOD FOR PERCENTILES
                key = literal_field(canonical_name + " percentile")
                # FIX: THE LOWER-BOUND CHECK READ THE MISSPELLED ATTRIBUTE "percetile"
                if isinstance(s.percentile, text_type) or s.percentile < 0 or 1 < s.percentile:
                    Log.error("Expecting percentile to be a float from 0.0 to 1.0")
                percent = Math.round(s.percentile * 100, decimal=6)

                es_query.aggs[key].percentiles.field = first(columns).es_column
                es_query.aggs[key].percentiles.percents += [percent]
                es_query.aggs[key].percentiles.compression = 2
                s.pull = jx_expression_to_function(key + ".values." + literal_field(text_type(percent)))
            elif s.aggregate == "cardinality":
                canonical_names = []
                for column in columns:
                    cn = literal_field(column.es_column + "_cardinality")
                    canonical_names.append(cn)
                    es_query.aggs[cn].cardinality.field = column.es_column
                if len(columns) == 1:
                    s.pull = jx_expression_to_function(canonical_names[0] + ".value")
                else:
                    s.pull = jx_expression_to_function({"add": [cn + ".value" for cn in canonical_names], "default": 0})
            elif s.aggregate == "stats":
                if len(columns) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")
                # REGULAR STATS
                stats_name = literal_field(canonical_name)
                es_query.aggs[stats_name].extended_stats.field = first(columns).es_column

                # GET MEDIAN TOO!
                median_name = literal_field(canonical_name + "_percentile")
                es_query.aggs[median_name].percentiles.field = first(columns).es_column
                es_query.aggs[median_name].percentiles.percents += [50]

                s.pull = get_pull_stats(stats_name, median_name)
            elif s.aggregate == "union":
                pulls = []
                for column in columns:
                    stats_name = encode_property(column.es_column)

                    if column.nested_path[0] == ".":
                        es_query.aggs[stats_name] = {"terms": {
                            "field": column.es_column,
                            "size": Math.min(s.limit, MAX_LIMIT)
                        }}
                        pulls.append(get_bucket_keys(stats_name))
                    else:
                        es_query.aggs[stats_name] = {
                            "nested": {"path": column.nested_path[0]},
                            "aggs": {"_nested": {"terms": {
                                "field": column.es_column,
                                "size": Math.min(s.limit, MAX_LIMIT)
                            }}}
                        }
                        pulls.append(get_bucket_keys(stats_name+"._nested"))
                if len(pulls) == 0:
                    s.pull = NULL
                elif len(pulls) == 1:
                    s.pull = pulls[0]
                else:
                    # FIX: A lambda HERE CLOSED OVER THE LOCAL `pulls`, WHICH IS REBOUND
                    # FOR EVERY LATER union SELECT; BIND THIS SELECT'S LIST EXPLICITLY
                    s.pull = _union_pull(pulls)
            else:
                if len(columns) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")

                # PULL VALUE OUT OF THE stats AGGREGATE
                es_query.aggs[literal_field(canonical_name)].extended_stats.field = first(columns).es_column
                s.pull = jx_expression_to_function({"coalesce": [literal_field(canonical_name) + "." + aggregates[s.aggregate], s.default]})

    # AGGREGATES OVER EXPRESSIONS (SENT TO ES AS SCRIPTS)
    for s in formula:
        canonical_name = literal_field(s.name)

        if isinstance(s.value, TupleOp):
            if s.aggregate == "count":
                # TUPLES ALWAYS EXIST, SO COUNTING THEM IS EASY
                s.pull = "doc_count"
            else:
                Log.error("{{agg}} is not a supported aggregate over a tuple", agg=s.aggregate)
        elif s.aggregate == "count":
            es_query.aggs[literal_field(canonical_name)].value_count.script = s.value.partial_eval().to_es14_script(schema).script(schema)
            s.pull = jx_expression_to_function(literal_field(canonical_name) + ".value")
        elif s.aggregate == "median":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")

            es_query.aggs[key].percentiles.script = s.value.to_es14_script(schema).script(schema)
            es_query.aggs[key].percentiles.percents += [50]
            s.pull = jx_expression_to_function(key + ".values.50\\.0")
        elif s.aggregate == "percentile":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")
            percent = Math.round(s.percentile * 100, decimal=6)

            es_query.aggs[key].percentiles.script = s.value.to_es14_script(schema).script(schema)
            es_query.aggs[key].percentiles.percents += [percent]
            s.pull = jx_expression_to_function(key + ".values." + literal_field(text_type(percent)))
        elif s.aggregate == "cardinality":
            # ES USES DIFFERENT METHOD FOR CARDINALITY
            key = canonical_name + " cardinality"

            es_query.aggs[key].cardinality.script = s.value.to_es14_script(schema).script(schema)
            s.pull = jx_expression_to_function(key + ".value")
        elif s.aggregate == "stats":
            # REGULAR STATS
            stats_name = literal_field(canonical_name)
            es_query.aggs[stats_name].extended_stats.script = s.value.to_es14_script(schema).script(schema)

            # GET MEDIAN TOO!
            median_name = literal_field(canonical_name + " percentile")
            es_query.aggs[median_name].percentiles.script = s.value.to_es14_script(schema).script(schema)
            es_query.aggs[median_name].percentiles.percents += [50]

            s.pull = get_pull_stats(stats_name, median_name)
        elif s.aggregate == "union":
            # USE TERMS AGGREGATE TO SIMULATE union
            stats_name = literal_field(canonical_name)
            es_query.aggs[stats_name].terms.script_field = s.value.to_es14_script(schema).script(schema)
            s.pull = jx_expression_to_function(stats_name + ".buckets.key")
        else:
            # PULL VALUE OUT OF THE stats AGGREGATE
            s.pull = jx_expression_to_function(canonical_name + "." + aggregates[s.aggregate])
            es_query.aggs[canonical_name].extended_stats.script = s.value.to_es14_script(schema).script(schema)

    decoders = get_decoders_by_depth(query)
    start = 0

    # <TERRIBLE SECTION> THIS IS WHERE WE WEAVE THE where CLAUSE WITH nested
    split_where = split_expression_by_depth(query.where, schema=frum.schema)

    if len(split_field(frum.name)) > 1:
        if any(split_where[2::]):
            Log.error("Where clause is too deep")

        for d in decoders[1]:
            es_query = d.append_query(es_query, start)
            start += d.num_columns

        if split_where[1]:
            # TODO: INCLUDE FILTERS ON EDGES
            filter_ = AndOp("and", split_where[1]).to_es14_filter(schema)
            es_query = Data(
                aggs={"_filter": set_default({"filter": filter_}, es_query)}
            )

        es_query = wrap({
            "aggs": {"_nested": set_default(
                {"nested": {"path": schema.query_path[0]}},
                es_query
            )}
        })
    else:
        if any(split_where[1::]):
            Log.error("Where clause is too deep")

    if decoders:
        for d in jx.reverse(decoders[0]):
            es_query = d.append_query(es_query, start)
            start += d.num_columns

    if split_where[0]:
        # TODO: INCLUDE FILTERS ON EDGES
        # NAMED filter_ (NOT filter) SO THE BUILTIN IS NOT SHADOWED
        filter_ = AndOp("and", split_where[0]).to_es14_filter(schema)
        es_query = Data(
            aggs={"_filter": set_default({"filter": filter_}, es_query)}
        )
    # </TERRIBLE SECTION>

    if not es_query:
        es_query = wrap({"query": {"match_all": {}}})

    es_query.size = 0

    with Timer("ES query time", silent=True) as es_duration:
        result = es_post(es, es_query, query.limit)

    try:
        format_time = Timer("formatting", silent=True)
        with format_time:
            # FLATTEN THE PER-DEPTH DECODER LISTS
            decoders = [d for ds in decoders for d in ds]
            result.aggregations.doc_count = coalesce(result.aggregations.doc_count, result.hits.total)  # IT APPEARS THE OLD doc_count IS GONE

            formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[query.format]
            if query.edges:
                output = formatter(decoders, result.aggregations, start, query, select)
            elif query.groupby:
                output = groupby_formatter(decoders, result.aggregations, start, query, select)
            else:
                output = aggop_formatter(decoders, result.aggregations, start, query, select)

        output.meta.timing.formatting = format_time.duration
        output.meta.timing.es_search = es_duration.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        if query.format not in format_dispatch:
            Log.error("Format {{format|quote}} not supported yet", format=query.format, cause=e)
        Log.error("Some problem", cause=e)
def append_query(self, es_query, start):
    """
    Wrap `es_query` in a terms aggregation over this edge's value, using the
    ES field for a Variable and self.script otherwise.

    :param es_query: aggregation built so far; combined via set_default()
    :param start: column offset, recorded on self.start
    :return: wrapped query holding "_match" and, except when self.exists is
             TRUE for a scripted value, "_missing"
    """
    self.start = start
    schema = self.schema

    def script_terms():
        # TERMS AGGREGATION KEYED BY THE EDGE'S SCRIPT
        return set_default(
            {
                "terms": {
                    "script": self.script.expr,
                    "size": self.domain.limit,
                    "order": self.es_order,
                }
            },
            es_query,
        )

    if isinstance(self.edge.value, Variable):
        field_terms = set_default(
            {
                "terms": {
                    "field": first(schema.leaves(self.edge.value.var)).es_column,
                    "size": self.domain.limit,
                    "order": self.es_order,
                }
            },
            es_query,
        )
        missing = set_default(
            {"filter": self.missing.to_es14_filter(schema)}, es_query
        )
        return wrap({"aggs": {"_match": field_terms, "_missing": missing}})

    if self.exists is TRUE:
        # IF True THEN WE DO NOT NEED THE _filter OR THE _missing (THIS RARELY HAPPENS THOUGH)
        return wrap({"aggs": {"_match": script_terms()}})

    # _match AND _filter REVERSED SO _match LINES UP WITH _missing
    exists_filter = self.exists.to_es14_filter(schema)
    inner = script_terms()
    missing = set_default(
        {"filter": self.missing.to_es14_filter(schema)}, es_query
    )
    return wrap({
        "aggs": {
            "_match": {
                "filter": exists_filter,
                "aggs": {"_filter": inner},
            },
            "_missing": missing,
        }
    })