def __init__(self, edge, query, limit):
    """
    Prepare decoder state for `edge`.

    Caps the domain limit, resets the part bookkeeping, compiles the edge
    value to an ES script, and decides whether terms should be ordered.

    :param edge: edge being decoded; its `domain` is adopted by this decoder
    :param query: query being run; `query.limit` is a fallback for the domain limit
    :param limit: forwarded unchanged to `AggsDecoder.__init__`
    """
    AggsDecoder.__init__(self, edge, query, limit)

    # DOMAIN LIMIT: FIRST OF (domain.limit, query.limit, 10), CAPPED AT MAX_LIMIT
    self.domain = edge.domain
    requested = coalesce(self.domain.limit, query.limit, 10)
    self.domain.limit = Math.min(requested, MAX_LIMIT)

    self.parts = []
    self.key2index = {}
    self.computed_domain = False

    # COMPILE THE EDGE VALUE ONCE; KEEP ITS missing/exists EXPRESSIONS
    self.script = self.edge.value.partial_eval().to_es_script(self.schema)
    self.pull = pull_functions[self.script.data_type]
    self.missing = self.script.miss.partial_eval()
    self.exists = NotOp("not", self.missing).partial_eval()

    # WHEN SORT VALUE AND EDGE VALUE MATCHES, WE SORT BY TERM
    for s in self.query.sort:
        if s.value == self.edge.value:
            # ONLY THE FIRST MATCHING SORT CLAUSE IS USED
            self.es_order = {"_term": {1: "asc", -1: "desc"}[s.sort]}
            break
    else:
        self.es_order = None
def append_query(self, query_path, es_query):
    """
    Nest `es_query` under a terms aggregation limited to the known
    partitions of this edge's domain.

    When the edge allows nulls, one extra nested "missing" aggregation is
    added for every path in `self.schema.query_path`.

    :param query_path: path compared against each entry of `self.schema.query_path`
    :param es_query: aggregation placed under every aggregation added here
    :return: the Aggs holding the `_filter`/`_match` branch and any `_missing*` branches
    """
    domain = self.domain
    key = domain.key
    value = self.edge.value
    convert = pull_functions[value.type]
    include = tuple(convert(p[key]) for p in domain.partitions)

    # ONLY DOCUMENTS WHOSE VALUE IS ONE OF THE KNOWN PARTITION KEYS
    exists = AndOp("and", [
        InOp("in", [value, Literal("literal", include)])
    ]).partial_eval()

    limit = coalesce(self.limit, len(domain.partitions))

    if isinstance(value, Variable):
        # ALREADY CHECKED THERE IS ONLY ONE
        es_field = first(self.query.frum.schema.leaves(value.var)).es_column
        terms = {
            "field": es_field,
            "size": limit,
            "order": {"_term": self.sorted} if self.sorted else None
        }
    else:
        terms = {
            "script": {
                "lang": "painless",
                "inline": value.to_es_script(self.schema).script(self.schema)
            },
            "size": limit
        }
    match = TermsAggs("_match", terms, self)

    output = Aggs().add(
        FilterAggs("_filter", exists, None).add(match.add(es_query))
    )

    if not self.edge.allowNulls:
        return output

    # FIND NULLS AT EACH NESTED LEVEL
    for p in self.schema.query_path:
        if p == query_path:
            # MISSING AT THE QUERY DEPTH
            missing = NestedAggs(p).add(
                FilterAggs("_missing0", NotOp(None, exists), self).add(es_query)
            )
        else:
            # PARENT HAS NO CHILDREN, SO MISSING
            column = first(self.schema.values(query_path, (OBJECT, EXISTS)))
            exists_column = Variable(
                column.es_column.replace(NESTED_TYPE, EXISTS_TYPE)
            )
            missing = NestedAggs(column.nested_path[0]).add(
                FilterAggs(
                    "_missing1",
                    NotOp(None, ExistsOp(None, exists_column)),
                    self
                ).add(es_query)
            )
        output.add(missing)

    return output
def append_query(self, es_query, start):
    """
    Wrap `es_query` in a `filters` aggregation with one filter per domain
    partition.

    Each partition's filter also negates every earlier partition's `where`,
    so the filters are mutually exclusive in partition order.

    :param es_query: aggregation merged (via set_default) under each bucket
    :param start: stored on `self.start`
    :return: wrapped dict with `aggs._match` and `aggs._missing`
             (`_missing` is None unless the edge allows nulls)
    """
    self.start = start

    filters = []
    negated = []  # NOT(where) OF EVERY PARTITION SEEN SO FAR
    for part in self.edge.domain.partitions:
        where = part.where
        filters.append(AndOp("and", [where] + negated).to_esfilter(self.schema))
        negated.append(NotOp("not", where))

    if self.edge.allowNulls:
        # TODO: Use Expression.missing().esfilter() TO GET OPTIMIZED FILTER
        missing_filter = set_default(
            {"filter": AndOp("and", negated).to_esfilter(self.schema)},
            es_query
        )
    else:
        missing_filter = None

    match = set_default({"filters": {"filters": filters}}, es_query)
    return wrap({"aggs": {"_match": match, "_missing": missing_filter}})
def _range_composer(edge, domain, es_query, to_float, schema):
    """
    Build a `range` aggregation over `domain.partitions` for `edge.value`.

    :param edge: edge whose `value` is bucketed; `edge.allowNulls` enables `_missing`
    :param domain: supplies `min`, `max` and the `partitions` (each with min/max)
    :param es_query: aggregation merged (via set_default) under each bucket
    :param to_float: converter applied to every boundary value
    :param schema: used to resolve the ES column or to compile the script
    :return: wrapped dict with `aggs._match` and `aggs._missing`
             (`_missing` is None unless the edge allows nulls)
    """
    # USE RANGES
    _min = coalesce(domain.min, MIN(domain.partitions.min))
    _max = coalesce(domain.max, MAX(domain.partitions.max))

    missing_filter = None
    if edge.allowNulls:
        # "MISSING" MEANS: NOT (EXISTS AND _min <= value < _max)
        in_range = AndOp("and", [
            edge.value.exists(),
            InequalityOp("gte", [edge.value, Literal(None, to_float(_min))]),
            InequalityOp("lt", [edge.value, Literal(None, to_float(_max))])
        ]).partial_eval()
        missing_filter = set_default(
            {"filter": NotOp("not", in_range).to_esfilter(schema)},
            es_query
        )

    if isinstance(edge.value, Variable):
        calc = {"field": schema.leaves(edge.value.var)[0].es_column}
    else:
        calc = {"script": edge.value.to_painless(schema).script(schema)}

    match = set_default(
        {"range": calc},
        {"range": {"ranges": [
            {"from": to_float(p.min), "to": to_float(p.max)}
            for p in domain.partitions
        ]}},
        es_query
    )
    return wrap({"aggs": {"_match": match, "_missing": missing_filter}})
def _range_composer(self, edge, domain, es_query, to_float, schema):
    """
    Build a range aggregation over `domain.partitions` for `edge.value`.

    :param edge: edge whose `value` is bucketed; `edge.allowNulls` adds `_missing`
    :param domain: supplies `min`, `max` and the `partitions` (each with min/max)
    :param es_query: aggregation added under each aggregation created here
    :param to_float: converter applied to every boundary value
    :param schema: used to resolve the ES column or to compile the script
    :return: result of adding the `_match` RangeAggs to the output Aggs
    """
    # USE RANGES
    _min = coalesce(domain.min, MIN(domain.partitions.min))
    _max = coalesce(domain.max, MAX(domain.partitions.max))

    output = Aggs()
    if edge.allowNulls:
        # "MISSING" MEANS: NOT (EXISTS AND _min <= value < _max)
        in_range = AndOp([
            edge.value.exists(),
            GteOp([edge.value, Literal(to_float(_min))]),
            LtOp([edge.value, Literal(to_float(_max))])
        ]).partial_eval()
        output.add(FilterAggs("_missing", NotOp(in_range), self).add(es_query))

    if is_op(edge.value, Variable):
        calc = {"field": first(schema.leaves(edge.value.var)).es_column}
    else:
        calc = {"script": text_type(Painless[edge.value].to_es_script(schema))}

    ranges = []
    for p in domain.partitions:
        ranges.append({"from": to_float(p.min), "to": to_float(p.max)})
    calc['ranges'] = ranges

    match = RangeAggs("_match", calc, self).add(es_query)
    return output.add(match)
def append_query(self, es_query, start):
    """
    Wrap `es_query` in a terms aggregation restricted to the known
    partition keys of this edge's domain.

    :param es_query: aggregation merged (via set_default) under each bucket
    :param start: stored on `self.start`
    :return: wrapped dict with `aggs._match` (filter + `_filter` terms) and
             `aggs._missing` (None unless the edge allows nulls)
    """
    self.start = start
    domain = self.domain
    domain_key = domain.key

    # NUMERIC KEYS ARE NORMALIZED TO float; EVERYTHING ELSE IS PASSED THROUGH.
    # BUILT DIRECTLY AS A TUPLE: THE PREVIOUS zip(*...) UNPACKING RAISED
    # ValueError ON AN EMPTY PARTITION LIST, AND ITS SECOND (TEXT) COLUMN
    # WAS NEVER USED
    include = tuple(
        float(v) if isinstance(v, (int, float)) else v
        for v in (p[domain_key] for p in domain.partitions)
    )

    value = self.edge.value
    exists = AndOp("and", [
        value.exists(),
        InOp("in", [value, Literal("literal", include)])
    ]).partial_eval()

    limit = coalesce(self.limit, len(domain.partitions))

    if isinstance(value, Variable):
        # ALREADY CHECKED THERE IS ONLY ONE
        es_field = self.query.frum.schema.leaves(value.var)[0].es_column
        terms = set_default({"terms": {
            "field": es_field,
            "size": limit,
            "order": {"_term": self.sorted} if self.sorted else None
        }}, es_query)
    else:
        terms = set_default({"terms": {
            "script": {
                "lang": "painless",
                "inline": value.to_painless(self.schema).script(self.schema)
            },
            "size": limit
        }}, es_query)

    if self.edge.allowNulls:
        missing = set_default(
            {"filter": NotOp("not", exists).to_esfilter(self.schema)},
            es_query
        )
    else:
        missing = None

    return wrap({"aggs": {
        "_match": {
            "filter": exists.to_esfilter(self.schema),
            "aggs": {
                "_filter": terms
            }
        },
        "_missing": missing
    }})
def append_query(self, query_path, es_query):
    """
    Nest `es_query` under a FiltersAggs with one filter per domain partition.

    Each partition's filter also negates every earlier partition's `where`,
    so the filters are mutually exclusive in partition order.

    :param query_path: not used by this implementation
    :param es_query: aggregation added under each aggregation created here
    :return: Aggs with `_match`, plus `_missing` when the edge allows nulls
    """
    filters = []
    negated = []  # NOT(where) OF EVERY PARTITION SEEN SO FAR
    for part in self.edge.domain.partitions:
        where = part.where
        filters.append(AndOp([where] + negated))
        negated.append(NotOp(where))

    match = FiltersAggs("_match", filters, self).add(es_query)
    output = Aggs().add(match)

    if self.edge.allowNulls:
        # TODO: Use Expression.missing().esfilter() TO GET OPTIMIZED FILTER
        missing = FilterAggs("_missing", AndOp(negated), self).add(es_query)
        output.add(missing)

    return output
def append_query(self, query_path, es_query):
    """
    Wrap `es_query` in one level of terms/missing aggregations per field.

    The result of each pass becomes the `es_query` nested under the next
    field's aggregations. Only the aggregations built for the first field
    carry this decoder; later ones are given None.

    :param query_path: not used by this implementation
    :param es_query: innermost aggregation
    :return: the outermost aggregation (wrapped in `_filter` when the
             domain has a `where`)
    """
    owner = self
    for field in self.fields:
        exists = field.exists().partial_eval()

        level = Aggs()
        terms = {
            "field": first(self.schema.leaves(field.var)).es_column,
            "size": self.domain.limit
        }
        level.add(TermsAggs("_match", terms, owner).add(es_query))
        level.add(FilterAggs("_missing", NotOp(exists), owner).add(es_query))

        # THIS LEVEL IS NOW THE PAYLOAD FOR THE NEXT FIELD
        es_query = level
        owner = None

    if self.domain.where:
        es_query = FilterAggs("_filter", self.domain.where, None).add(es_query)

    return es_query
def append_query(self, query_path, es_query):
    """
    Nest `es_query` under a terms aggregation limited to the known
    partition keys of this edge's domain.

    When the edge allows nulls, the negated membership test is split with
    `split_expression` and one aggregation chain is added per resulting term.

    :param query_path: not used by this implementation
    :param es_query: aggregation placed under every aggregation added here
    :return: the Aggs holding the `_filter`/`_match` branch and any missing branches
    """
    domain = self.domain
    key = domain.key
    value = self.edge.value
    convert = pull_functions[value.type]
    include = tuple(convert(p[key]) for p in domain.partitions)
    schema = self.schema

    # ONLY DOCUMENTS WHOSE VALUE IS ONE OF THE KNOWN PARTITION KEYS
    exists = InOp([value, Literal(include)]).partial_eval()

    limit = coalesce(self.limit, len(domain.partitions))

    if is_op(value, Variable):
        # ALREADY CHECKED THERE IS ONLY ONE
        es_field = first(schema.leaves(value.var)).es_column
        terms = {
            "field": es_field,
            "size": limit,
            "order": {"_term": self.sorted} if self.sorted else None
        }
    else:
        terms = {
            "script": text(Painless[value].to_es_script(schema)),
            "size": limit
        }
    match = TermsAggs("_match", terms, self)

    output = Aggs().add(
        FilterAggs("_filter", exists, None).add(match.add(es_query))
    )

    if not self.edge.allowNulls:
        return output

    # IF ALL NESTED COLUMNS ARE NULL, DOES THE FILTER PASS?
    # MISSING AT THE QUERY DEPTH
    # columns = schema[value.var]
    concat_inner = split_expression(NotOp(exists), self.query)
    for i, term in enumerate(concat_inner.terms):
        # START FROM es_query AND WRAP IT ONCE FOR EVERY NEST THAT HAS A REAL FILTER
        chain = es_query
        for nest in term.nests:
            if nest.where is TRUE:
                continue
            chain = NestedAggs(nest.path.var).add(
                FilterAggs("_missing" + text(i), nest.where, self).add(chain)
            )
        output.add(chain)

    return output
def __init__(self, edge, query, limit):
    """
    Prepare decoder state for `edge`.

    :param edge: edge being decoded; its `domain` is adopted by this decoder
    :param query: query being run; `query.limit` is a fallback for the domain limit
    :param limit: forwarded unchanged to `AggsDecoder.__init__`
    """
    AggsDecoder.__init__(self, edge, query, limit)

    self.domain = edge.domain
    # FIRST OF (domain.limit, query.limit, 10), NEVER MORE THAN MAX_LIMIT
    self.domain.limit = Math.min(
        coalesce(self.domain.limit, query.limit, 10),
        MAX_LIMIT
    )

    # BOOKKEEPING FOR PARTS DISCOVERED WHILE READING RESULTS
    self.parts = []
    self.key2index = {}
    self.computed_domain = False

    # COMPILED EDGE VALUE, AND THE EXPRESSIONS FOR ITS ABSENCE/PRESENCE
    self.script = self.edge.value.partial_eval().to_es_script(self.schema)
    self.pull = pull_functions[self.script.data_type]
    self.missing = self.script.miss.partial_eval()
    self.exists = NotOp("not", self.missing).partial_eval()

    # WHEN SORT VALUE AND EDGE VALUE MATCHES, WE SORT BY TERM
    directions = [s.sort for s in self.query.sort if s.value == self.edge.value]
    if directions:
        # ONLY THE FIRST MATCHING SORT CLAUSE IS USED
        self.es_order = {"_term": {1: "asc", -1: "desc"}[directions[0]]}
    else:
        self.es_order = None
def append_query(self, query_path, es_query):
    """
    Nest `es_query` under a terms aggregation limited to the known
    partition keys of this edge's domain.

    When the edge allows nulls, the negated membership test is split by
    path and one nested `_missing<i>` aggregation is added for each path
    that has a truthy expression.

    :param query_path: not used by this implementation
    :param es_query: aggregation placed under every aggregation added here
    :return: the Aggs holding the `_filter`/`_match` branch and any `_missing*` branches
    """
    domain = self.domain
    key = domain.key
    value = Painless[self.edge.value]
    convert = pull_functions[value.type]
    include = tuple(convert(p[key]) for p in domain.partitions)
    schema = self.schema

    # ONLY DOCUMENTS WHOSE VALUE IS ONE OF THE KNOWN PARTITION KEYS
    exists = Painless[AndOp([InOp([value, Literal(include)])])].partial_eval()

    limit = coalesce(self.limit, len(domain.partitions))

    if is_op(value, Variable):
        # ALREADY CHECKED THERE IS ONLY ONE
        es_field = first(schema.leaves(value.var)).es_column
        terms = {
            "field": es_field,
            "size": limit,
            "order": {"_term": self.sorted} if self.sorted else None
        }
    else:
        terms = {
            "script": text(value.to_es_script(schema)),
            "size": limit
        }
    match = TermsAggs("_match", terms, self)

    output = Aggs().add(
        FilterAggs("_filter", exists, None).add(match.add(es_query))
    )

    if not self.edge.allowNulls:
        return output

    # IF ALL NESTED COLUMNS ARE NULL, DOES THE FILTER PASS?
    # MISSING AT THE QUERY DEPTH
    _, by_path = split_expression_by_path(NotOp(exists), schema)
    # PATHS ARE VISITED IN DESCENDING SORT ORDER; THE INDEX COUNTS SKIPPED PATHS TOO
    descending_paths = sorted(by_path.keys())[::-1]
    for i, path in enumerate(descending_paths):
        expr = by_path.get(path)
        if not expr:
            continue
        not_match = NestedAggs(path).add(
            FilterAggs("_missing" + text(i), expr, self).add(es_query)
        )
        output.add(not_match)

    return output
def append_query(self, es_query, start):
    """
    Wrap `es_query` in one level of filter/terms aggregations per field.

    The result of each pass becomes the `es_query` merged under the next
    field's aggregations.

    :param es_query: innermost aggregation
    :param start: stored on `self.start`
    :return: the outermost aggregation (a plain dict wrapped in `_filter`
             when the domain has a `where`)
    """
    # TODO: USE "reverse_nested" QUERY TO PULL THESE
    self.start = start

    for field in self.fields:
        exists = field.exists().partial_eval()

        exists_filter = exists.to_esfilter(self.schema)
        terms = set_default(
            {"terms": {
                "field": self.schema.leaves(field.var)[0].es_column,
                "size": self.domain.limit
            }},
            es_query
        )
        level = wrap({"aggs": {
            "_match": {
                "filter": exists_filter,
                "aggs": {"_filter": terms}
            }
        }})

        if self.edge.allowNulls:
            level.aggs._missing = set_default(
                {"filter": NotOp("not", exists).to_esfilter(self.schema)},
                es_query
            )

        # THIS LEVEL IS NOW THE PAYLOAD FOR THE NEXT FIELD
        es_query = level

    if self.domain.where:
        filter_ = self.domain.where.partial_eval().to_esfilter(self.schema)
        es_query = {"aggs": {
            "_filter": set_default({"filter": filter_}, es_query)
        }}

    return es_query
def _range_composer(self, edge, domain, es_query, to_float, schema):
    """
    Build a range aggregation over `domain.partitions` for `edge.value`.

    :param edge: edge whose `value` is bucketed; `edge.allowNulls` adds `_missing`
    :param domain: supplies `min`, `max` and the `partitions` (each with min/max)
    :param es_query: aggregation added under each aggregation created here
    :param to_float: converter applied to every boundary value
    :param schema: used to resolve the ES column or to compile the script
    :return: result of adding the `_match` RangeAggs to the output Aggs
    """
    # USE RANGES
    _min = coalesce(domain.min, MIN(domain.partitions.min))
    _max = coalesce(domain.max, MAX(domain.partitions.max))

    output = Aggs()
    if edge.allowNulls:
        # "MISSING" MEANS: NOT (EXISTS AND _min <= value < _max)
        in_range = AndOp("and", [
            edge.value.exists(),
            InequalityOp("gte", [edge.value, Literal(None, to_float(_min))]),
            InequalityOp("lt", [edge.value, Literal(None, to_float(_max))])
        ]).partial_eval()
        missing = FilterAggs("_missing", NotOp("not", in_range), self)
        output.add(missing.add(es_query))

    if isinstance(edge.value, Variable):
        calc = {"field": first(schema.leaves(edge.value.var)).es_column}
    else:
        calc = {"script": edge.value.to_es_script(schema).script(schema)}

    ranges = []
    for p in domain.partitions:
        ranges.append({"from": to_float(p.min), "to": to_float(p.max)})
    calc['ranges'] = ranges

    match = RangeAggs("_match", calc, self).add(es_query)
    return output.add(match)
def append_query(self, es_query, start):
    """
    Wrap `es_query` in a terms aggregation on the edge value, plus a
    `_missing` filter aggregation.

    Three shapes are produced:
      * edge value is not a Variable: scripted terms under an "exists" filter
      * edge value is a Variable that also appears in the query sort:
        field terms ordered by `_term`
      * otherwise: plain field terms

    :param es_query: aggregation merged (via set_default) under each bucket
    :param start: stored on `self.start`
    :return: wrapped dict with `aggs._match` and `aggs._missing`
    """
    self.start = start

    value = self.edge.value.partial_eval()
    script = value.to_painless(self.schema)
    exists = NotOp("not", script.miss).partial_eval()

    edge_value = self.edge.value
    if not isinstance(edge_value, Variable):
        # SCRIPTED TERMS, GUARDED BY THE "exists" FILTER
        match = {
            "filter": exists.to_esfilter(self.schema),
            "aggs": {
                "_filter": set_default(
                    {"terms": {
                        "script": {
                            "lang": "painless",
                            "inline": script.expr
                        },
                        "size": self.domain.limit,
                        "order": {"_term": self.sorted} if self.sorted else None
                    }},
                    es_query
                )
            }
        }
    else:
        if edge_value.var in [s.value.var for s in self.query.sort]:
            # SORTING ON THE EDGE VARIABLE: FIRST MATCHING SORT CLAUSE DECIDES DIRECTION
            sort_dir = [
                s.sort
                for s in self.query.sort
                if s.value.var == edge_value.var
            ][0]
            terms = {
                "field": self.schema.leaves(edge_value.var)[0].es_column,
                "size": self.domain.limit,
                "order": {"_term": "asc" if sort_dir == 1 else "desc"}
            }
        else:
            terms = {
                "field": self.schema.leaves(edge_value.var)[0].es_column,
                "size": self.domain.limit
            }
        match = set_default({"terms": terms}, es_query)

    # SAME _missing AGGREGATION FOR ALL THREE SHAPES
    missing = set_default(
        {"filter": NotOp("not", exists).to_esfilter(self.schema)},
        es_query
    )

    output = wrap({"aggs": {"_match": match, "_missing": missing}})
    return output
class DefaultDecoder(SetDecoder):
    """
    FOR DECODING THE default DOMAIN TYPE (UNKNOWN-AT-QUERY-TIME SET OF VALUES)
    """

    def __init__(self, edge, query, limit):
        """
        :param edge: edge being decoded; its `domain` is adopted by this decoder
        :param query: query being run; `query.limit` is a fallback for the domain limit
        :param limit: forwarded unchanged to `AggsDecoder.__init__`
        """
        AggsDecoder.__init__(self, edge, query, limit)

        # DOMAIN LIMIT: FIRST OF (domain.limit, query.limit, 10), CAPPED AT MAX_LIMIT
        self.domain = edge.domain
        requested = coalesce(self.domain.limit, query.limit, 10)
        self.domain.limit = Math.min(requested, MAX_LIMIT)

        self.parts = []
        self.key2index = {}
        self.computed_domain = False

        self.script = self.edge.value.partial_eval().to_es_script(self.schema)
        self.pull = pull_functions[self.script.data_type]
        self.missing = self.script.miss.partial_eval()
        self.exists = NotOp("not", self.missing).partial_eval()

        # WHEN SORT VALUE AND EDGE VALUE MATCHES, WE SORT BY TERM
        for s in self.query.sort:
            if s.value == self.edge.value:
                self.es_order = {"_term": {1: "asc", -1: "desc"}[s.sort]}
                break
        else:
            self.es_order = None

    def append_query(self, es_query, start):
        """
        Wrap `es_query` in a terms aggregation on the edge value.

        :param es_query: aggregation merged (via set_default) under each bucket
        :param start: stored on `self.start`
        :return: wrapped dict with `aggs._match` and, except when
                 `self.exists is TRUE`, `aggs._missing`
        """
        self.start = start

        if isinstance(self.edge.value, Variable):
            field_terms = set_default(
                {"terms": {
                    "field": self.schema.leaves(self.edge.value.var)[0].es_column,
                    "size": self.domain.limit,
                    "order": self.es_order
                }},
                es_query
            )
            missing = set_default(
                {"filter": self.missing.to_esfilter(self.schema)},
                es_query
            )
            return wrap({"aggs": {"_match": field_terms, "_missing": missing}})

        def script_terms():
            # SCRIPTED terms AGGREGATION WITH es_query MERGED IN
            return set_default(
                {"terms": {
                    "script": {
                        "lang": "painless",
                        "inline": self.script.expr
                    },
                    "size": self.domain.limit,
                    "order": self.es_order
                }},
                es_query
            )

        if self.exists is TRUE:
            # IF True THEN WE DO NOT NEED THE _filter OR THE _missing (THIS RARELY HAPPENS THOUGH)
            return wrap({"aggs": {"_match": script_terms()}})

        # _match AND _filter REVERSED SO _match LINES UP WITH _missing
        exists_filter = self.exists.to_esfilter(self.schema)
        match = {
            "filter": exists_filter,
            "aggs": {"_filter": script_terms()}
        }
        missing = set_default(
            {"filter": self.missing.to_esfilter(self.schema)},
            es_query
        )
        return wrap({"aggs": {"_match": match, "_missing": missing}})

    def count(self, row):
        """
        Record the key of a non-empty bucket, or allow nulls on the edge
        when a non-empty bucket has no key.

        :param row: indexable by `self.start`; the bucket found there is inspected
        """
        part = row[self.start]
        if not part['doc_count']:
            return

        key = part.get('key')
        # `!= None` IS KEPT AS WRITTEN; PRESUMABLY IT ALSO CATCHES NULL-LIKE WRAPPERS - TODO CONFIRM
        if key != None:
            self.parts.append(self.pull(key))
        else:
            self.edge.allowNulls = True  # OK! WE WILL ALLOW NULLS

    def done_count(self):
        """
        Replace the domain (on both the edge and this decoder) with a
        SimpleSetDomain built from the sorted, de-duplicated counted parts.
        """
        partitions = jx.sort(set(self.parts))
        self.edge.domain = self.domain = SimpleSetDomain(partitions=partitions)
        self.parts = None
        self.computed_domain = True

    def get_index(self, row):
        """
        Return the index of the part that `row`'s bucket belongs to.

        With a computed domain the index comes from the domain; otherwise
        unseen keys are assigned the next index as they are encountered.
        Any failure is reported through `Log.error`.

        :param row: indexable by `self.start`
        """
        use_domain = self.computed_domain
        try:
            key = self.pull(row[self.start].get('key'))
            if use_domain:
                return self.domain.getIndexByKey(key)

            index = self.key2index.get(key)
            if index is None:
                # FIRST TIME THIS KEY IS SEEN: APPEND A NEW PART
                index = len(self.parts)
                self.parts.append({"key": key, "dataIndex": index})
                self.key2index[key] = index
            return index
        except Exception as e:
            Log.error("problem", cause=e)

    @property
    def num_columns(self):
        """This decoder always reports a single column."""
        return 1
class DefaultDecoder(SetDecoder):
    """
    FOR DECODING THE default DOMAIN TYPE (UNKNOWN-AT-QUERY-TIME SET OF VALUES)
    """

    def __init__(self, edge, query, limit):
        """
        :param edge: edge being decoded; its `domain` is adopted by this decoder
        :param query: query being run; `query.limit` is a fallback for the domain limit
        :param limit: forwarded unchanged to `AggsDecoder.__init__`
        """
        AggsDecoder.__init__(self, edge, query, limit)

        self.domain = edge.domain
        # FIRST OF (domain.limit, query.limit, 10), NEVER MORE THAN MAX_LIMIT
        self.domain.limit = Math.min(
            coalesce(self.domain.limit, query.limit, 10),
            MAX_LIMIT
        )

        # BOOKKEEPING FOR PARTS DISCOVERED WHILE READING RESULTS
        self.parts = []
        self.key2index = {}
        self.computed_domain = False

        self.script = self.edge.value.partial_eval().to_es_script(self.schema)
        self.pull = pull_functions[self.script.data_type]
        self.missing = self.script.miss.partial_eval()
        self.exists = NotOp("not", self.missing).partial_eval()

        # WHEN SORT VALUE AND EDGE VALUE MATCHES, WE SORT BY TERM
        directions = [s.sort for s in self.query.sort if s.value == self.edge.value]
        if directions:
            self.es_order = {"_term": {1: "asc", -1: "desc"}[directions[0]]}
        else:
            self.es_order = None

    def append_query(self, es_query, start):
        """
        Wrap `es_query` in a terms aggregation on the edge value.

        :param es_query: aggregation merged (via set_default) under each bucket
        :param start: stored on `self.start`
        :return: wrapped dict with `aggs._match` and, except when
                 `self.exists is TRUE`, `aggs._missing`
        """
        self.start = start

        if isinstance(self.edge.value, Variable):
            match = set_default(
                {"terms": {
                    "field": self.schema.leaves(self.edge.value.var)[0].es_column,
                    "size": self.domain.limit,
                    "order": self.es_order
                }},
                es_query
            )
            missing = set_default(
                {"filter": self.missing.to_esfilter(self.schema)},
                es_query
            )
            return wrap({"aggs": {"_match": match, "_missing": missing}})

        def scripted():
            # SCRIPTED terms AGGREGATION WITH es_query MERGED IN
            return set_default(
                {"terms": {
                    "script": {"lang": "painless", "inline": self.script.expr},
                    "size": self.domain.limit,
                    "order": self.es_order
                }},
                es_query
            )

        if self.exists is TRUE:
            # IF True THEN WE DO NOT NEED THE _filter OR THE _missing (THIS RARELY HAPPENS THOUGH)
            output = wrap({"aggs": {"_match": scripted()}})
        else:
            # _match AND _filter REVERSED SO _match LINES UP WITH _missing
            exists_filter = self.exists.to_esfilter(self.schema)
            inner = scripted()
            missing = set_default(
                {"filter": self.missing.to_esfilter(self.schema)},
                es_query
            )
            output = wrap({"aggs": {
                "_match": {"filter": exists_filter, "aggs": {"_filter": inner}},
                "_missing": missing
            }})
        return output

    def count(self, row):
        """
        Record the key of a non-empty bucket, or allow nulls on the edge
        when a non-empty bucket has no key.

        :param row: indexable by `self.start`; the bucket found there is inspected
        """
        part = row[self.start]
        if not part['doc_count']:
            return

        key = part.get('key')
        # `!= None` IS KEPT AS WRITTEN; PRESUMABLY IT ALSO CATCHES NULL-LIKE WRAPPERS - TODO CONFIRM
        if key != None:
            self.parts.append(self.pull(key))
        else:
            self.edge.allowNulls = True  # OK! WE WILL ALLOW NULLS

    def done_count(self):
        """
        Replace the domain (on both the edge and this decoder) with a
        SimpleSetDomain built from the sorted, de-duplicated counted parts.
        """
        unique_sorted = jx.sort(set(self.parts))
        self.edge.domain = self.domain = SimpleSetDomain(partitions=unique_sorted)
        self.parts = None
        self.computed_domain = True

    def get_index(self, row):
        """
        Return the index of the part that `row`'s bucket belongs to.

        With a computed domain the index comes from the domain; otherwise
        unseen keys are assigned the next index as they are encountered.
        Any failure is reported through `Log.error`.

        :param row: indexable by `self.start`
        """
        use_domain = self.computed_domain
        try:
            key = self.pull(row[self.start].get('key'))
            if use_domain:
                return self.domain.getIndexByKey(key)

            position = self.key2index.get(key)
            if position is None:
                # FIRST TIME THIS KEY IS SEEN: APPEND A NEW PART
                position = len(self.parts)
                self.parts.append({"key": key, "dataIndex": position})
                self.key2index[key] = position
            return position
        except Exception as e:
            Log.error("problem", cause=e)

    @property
    def num_columns(self):
        """This decoder always reports a single column."""
        return 1