def _normalize_select_no_context(select, schema=None):
    """
    SAME NORMALIZE, BUT NO SOURCE OF COLUMNS

    Convert a select clause (bare text or dict-like) into a normalized Data
    object with name, value (a jx expression), aggregate and default filled in.
    Returns Null for a completely empty select.
    """
    if not _Column:
        _late_import()

    # accept a bare string as shorthand for {"value": string}
    if is_text(select):
        select = Data(value=select)
    else:
        select = wrap(select)

    output = select.copy()
    if not select.value:
        # no value given: name (or aggregate) implies value "."
        output.name = coalesce(select.name, select.aggregate)
        if output.name:
            output.value = jx_expression(".", schema=schema)
        elif len(select):
            # select has other properties but no usable name/value
            Log.error(BAD_SELECT, select=select)
        else:
            return Null
    elif is_text(select.value):
        if select.value.endswith(".*"):
            # "prefix.*" expands to all leaves under prefix
            name = select.value[:-2].lstrip(".")
            output.name = coalesce(select.name, name)
            output.value = LeavesOp(Variable(name), prefix=coalesce(select.prefix, name))
        else:
            if select.value == ".":
                output.name = coalesce(select.name, select.aggregate, ".")
                output.value = jx_expression(select.value, schema=schema)
            elif select.value == "*":
                output.name = coalesce(select.name, select.aggregate, ".")
                output.value = LeavesOp(Variable("."))
            else:
                output.name = coalesce(select.name, select.value.lstrip("."), select.aggregate)
                output.value = jx_expression(select.value, schema=schema)
    elif is_number(output.value):
        # numeric constant select; name defaults to its textual form
        if not output.name:
            output.name = text(output.value)
        output.value = jx_expression(select.value, schema=schema)
    else:
        output.value = jx_expression(select.value, schema=schema)

    if not output.name:
        Log.error("expecting select to have a name: {{select}}", select=select)
    if output.name.endswith(".*"):
        Log.error("{{name|quote}} is invalid select", name=output.name)

    # map to canonical aggregate name, defaulting to "none"
    output.aggregate = coalesce(canonical_aggregates[select.aggregate].name, select.aggregate, "none")
    output.default = coalesce(select.default, canonical_aggregates[output.aggregate].default)
    return output
def _normalize_select_no_context(select, schema=None):
    """
    SAME NORMALIZE, BUT NO SOURCE OF COLUMNS

    Older (Python 2 era: basestring/text_type) variant of select
    normalization; note jx_expression is called WITHOUT schema here.
    """
    if not _Column:
        _late_import()

    if isinstance(select, basestring):
        select = Data(value=select)
    else:
        select = wrap(select)

    output = select.copy()
    if not select.value:
        # no value: name (or aggregate) implies value "."
        output.name = coalesce(select.name, select.aggregate)
        if output.name:
            output.value = jx_expression(".")
        else:
            # nothing usable; return the copy as-is
            return output
    elif isinstance(select.value, basestring):
        if select.value.endswith(".*"):
            # "prefix.*" expands to leaves under the prefix
            output.name = coalesce(select.name, select.value[:-2], select.aggregate)
            output.value = LeavesOp("leaves", Variable(select.value[:-2]))
        else:
            if select.value == ".":
                output.name = coalesce(select.name, select.aggregate, ".")
                output.value = jx_expression(select.value)
            elif select.value == "*":
                output.name = coalesce(select.name, select.aggregate, ".")
                output.value = LeavesOp("leaves", Variable("."))
            else:
                output.name = coalesce(select.name, select.value, select.aggregate)
                output.value = jx_expression(select.value)
    elif isinstance(select.value, (int, float)):
        # numeric constant select; name defaults to its textual form
        if not output.name:
            output.name = text_type(select.value)
        output.value = jx_expression(select.value)
    else:
        output.value = jx_expression(select.value)

    if not output.name:
        Log.error("expecting select to have a name: {{select}}", select=select)
    if output.name.endswith(".*"):
        Log.error("{{name|quote}} is invalid select", name=output.name)

    output.aggregate = coalesce(canonical_aggregates[select.aggregate].name, select.aggregate, "none")
    output.default = coalesce(select.default, canonical_aggregates[output.aggregate].default)
    return output
def query_to_outer_joins(query, all_paths, split_select, var_to_columns):
    """
    CONVERT FROM JSON QUERY EXPRESSION TO A NUMBER OF OUTER JOINS

    :param query: the jx query (provides frum and where)
    :param all_paths: ordered nested paths to cover
    :param split_select: map from path to the select for that depth
    :param var_to_columns: map from variable name to candidate columns
    :return: ConcatOp of the surviving OuterJoinOps
    """
    table = query.frum
    depth = table.schema.query_path[0]

    # MAP TO es_columns, INCLUDE NESTED EXISTENCE IN EACH VARIABLE
    expanded_where = split_nested_inner_variables(query.where, depth, var_to_columns)
    split_terms = _split_expression(expanded_where, table.schema, all_paths)

    # ATTACH SELECTS, KEEPING ONLY JOINS THAT DO NOT SIMPLIFY AWAY
    joins = []
    for and_terms in split_terms:
        nests = [
            NestedOp(
                Variable(path),
                select=coalesce(split_select.get(path), NULL),
                where=AndOp(terms),
            )
            for path, terms in zip(all_paths, and_terms)
        ]
        candidate = OuterJoinOp(table, nests).partial_eval()
        if candidate is not NULL:
            joins.append(candidate)
    return ConcatOp(joins)
def to_ruby(self, schema):
    """
    Render this Variable as a Ruby-script accessor for ES documents.

    NOTE(review): the "." branch returns a bare string while the others
    return Ruby objects — confirm callers accept both.
    """
    if self.var == ".":
        return "_source"
    else:
        if self.var == "_id":
            # _uid is "type#id"; strip the type prefix
            return Ruby(
                type=STRING,
                expr='doc["_uid"].value.substring(doc["_uid"].value.indexOf(\'#\')+1)',
                frum=self)
        # one accessor per es_column that realizes this variable
        columns = schema.values(self.var)
        acc = []
        for c in columns:
            varname = c.es_column
            frum = Variable(c.es_column)
            q = quote(varname)
            acc.append(
                Ruby(miss=frum.missing(),
                     type=c.type,
                     expr="doc[" + q + "].values",
                     frum=frum,
                     many=True))

        if len(acc) == 0:
            return NULL.to_ruby(schema)
        elif len(acc) == 1:
            return acc[0]
        else:
            # multiple candidate columns: first non-missing wins
            return CoalesceOp("coalesce", acc).to_ruby(schema)
def split_nested_inner_variables(where, focal_path, var_to_columns):
    """
    SOME VARIABLES ARE BOTH NESTED AND INNER, EXPAND QUERY TO HANDLE BOTH

    :param where: the filter expression to expand
    :param focal_path: the query's nested path
    :param var_to_columns: map from variable name to candidate columns
    :return: OrOp of the expanded expressions
    """
    wheres = [where]
    # WE DO THIS EXPANSION TO CAPTURE A VARIABLE OVER DIFFERENT NESTED LEVELS
    # EXPAND VARS TO COLUMNS, MULTIPLY THE EXPRESSIONS
    for v, cols in var_to_columns.items():
        more_exprs = []
        if not cols:
            # variable has no columns: it is always null
            for e in wheres:
                more_exprs.append(e.map({v: NULL}))
        else:
            for c in cols:
                deepest = c.nested_path[0]
                for e in wheres:
                    if startswith_field(focal_path, deepest):
                        # column is at (or above) the query depth: plain substitution
                        more_exprs.append(
                            e.map({
                                v: Variable(c.es_column, type=c.jx_type, multi=c.multi)
                            }))
                    else:
                        # column is deeper than the query: wrap in a NestedOp
                        more_exprs.append(
                            e.map({
                                v: NestedOp(
                                    path=Variable(deepest),
                                    select=Variable(c.es_column),
                                    where=Variable(c.es_column).exists(),
                                )
                            }))
        wheres = more_exprs

    # NOTE(review): this rebinds the local name only; the caller's mapping is
    # not updated and the value is unused below — confirm intent
    var_to_columns = {
        c.es_column: [c]
        for cs in var_to_columns.values() for c in cs
    }
    return OrOp(wheres)
def outer_to_inner(expr, paths_to_cols):
    """
    Rewrite outer joins as a concatenation of inner joins.

    JSON QUERY EXPRESSIONS ASSUME OUTER JOIN
    ES ONLY HAS INNER JOIN
    ACCOUNT FOR WHEN NESTED RECORDS ARE MISSING
    """
    if expr is NULL:
        return NULL
    elif is_op(expr, ConcatOp):
        # flatten: convert each term, then concatenate all resulting inner joins
        output = []
        for outer in expr.terms:
            for inner in outer_to_inner(outer, paths_to_cols).terms:
                output.append(inner)
        return ConcatOp(output)
    elif is_op(expr, OuterJoinOp):
        # THE MAIN INNER JOIN
        output = [InnerJoinOp(expr.frum, expr.nests)]

        # ALL THE OUTER JOIN RESIDUES: one residual join per nest level that
        # may have no matching nested records
        for deepest in expr.nests[:-1]:  # LAST '.' NOT NEEDED
            deepest_path = deepest.path.var
            inner_join = InnerJoinOp(expr.frum, [])
            deeper_conditions = TRUE
            for nest in expr.nests:
                nest_path = nest.path.var
                # NOTE(review): string length is used as a proxy for nesting
                # depth here — holds only if deeper paths are longer strings
                if len(nest_path) < len(deepest_path):
                    new_nest = NestedOp(
                        path=nest.path,
                        select=nest.select,
                        where=AndOp([deeper_conditions, nest.where]),
                        sort=nest.sort,
                        limit=nest.limit,
                    )
                    inner_join.nests.append(new_nest)
                    deeper_conditions = TRUE
                elif nest_path == deepest_path:
                    # assume the deeper is null
                    set_null = {
                        d.es_column: NULL
                        for d in paths_to_cols[deepest_path]
                    }
                    set_null[deepest_path] = NULL
                    deeper_exists = nest.where.map(set_null).partial_eval()
                    if deeper_exists is FALSE:
                        # WHERE CAN NOT BE SATISFIED IF NESTED IS NULL
                        deeper_conditions = FALSE
                    else:
                        # ENSURE THIS IS NOT "OPTIMIZED" TO FALSE
                        deeper_conditions = NotOp(
                            NestedOp(path=Variable(nest_path), where=TRUE, select=NULL))
                        deeper_conditions.simplified = True
            inner_join = inner_join.partial_eval()
            # drop residual joins that can never produce rows
            if inner_join.missing() is not TRUE:
                output.append(inner_join)
        return ConcatOp(output)
    else:
        Log.error("do not know what to do yet")
def get_decoders_by_path(query, schema):
    """
    RETURN MAP FROM QUERY PATH TO LIST OF DECODER ARRAYS

    :param query: jx query with edges or groupby
    :param schema: schema used to resolve variables to columns
    :return: dict mapping nested path -> list of AggsDecoder
    """
    output = {}

    if query.edges:
        if query.sort and query.format != "cube":
            # REORDER EDGES/GROUPBY TO MATCH THE SORT
            query.edges = sort_edges(query, "edges")
    elif query.groupby:
        if query.sort and query.format != "cube":
            query.groupby = sort_edges(query, "groupby")

    for edge in to_data(coalesce(query.edges, query.groupby, [])):
        limit = coalesce(edge.domain.limit, query.limit, DEFAULT_LIMIT)
        vars_ = coalesce(edge.value.vars(), set())

        if edge.range:
            # range edges use both endpoints' variables
            vars_ |= edge.range.min.vars() | edge.range.max.vars()
            for v in vars_:
                if not schema[v.var]:
                    Log.error("{{var}} does not exist in schema", var=v)
        elif edge.domain.dimension:
            # dimension edges: translate field names to es_columns on a copy
            vars_ |= set(Variable(v) for v in edge.domain.dimension.fields)
            edge.domain.dimension = edge.domain.dimension.copy()
            edge.domain.dimension.fields = [
                schema[v.var].es_column for v in vars_
            ]
        elif edge.domain.partitions.where and all(
                edge.domain.partitions.where):
            # partitioned domain: collect variables from each partition filter
            for p in edge.domain.partitions:
                vars_ |= p.where.vars()
        else:
            # SIMPLE edge.value
            decoder = AggsDecoder(edge, query, limit)
            depths = set(c.nested_path[0] for v in vars_
                         for c in schema.leaves(v.var))
            output.setdefault(first(depths), []).append(decoder)
            continue

        depths = set(c.nested_path[0] for v in vars_
                     for c in schema.leaves(v.var))
        if not depths:
            Log.error("Do not know of column {{column}}",
                      column=unwraplist(
                          [v for v in vars_ if schema[v.var] == None]))
        if len(depths) > 1:
            Log.error("expression {{expr|quote}} spans tables, can not handle",
                      expr=edge.value)

        decoder = AggsDecoder(edge, query, limit)
        output.setdefault(first(depths), []).append(decoder)
    return output
def append_query(self, query_path, es_query):
    """
    Wrap es_query in a terms aggregation restricted to this edge's known
    partitions, plus "missing" branches when nulls are allowed.
    """
    domain = self.domain
    domain_key = domain.key
    value = Painless[self.edge.value]
    cnv = pull_functions[value.type]
    # the set of partition keys this edge will match
    include = tuple(cnv(p[domain_key]) for p in domain.partitions)
    exists = Painless[AndOp([InOp([value, Literal(include)])])].partial_eval()

    limit = coalesce(self.limit, len(domain.partitions))

    if is_op(value, Variable):
        es_field = first(self.query.frum.schema.leaves(
            value.var)).es_column  # ALREADY CHECKED THERE IS ONLY ONE
        match = TermsAggs(
            "_match", {
                "field": es_field,
                "size": limit,
                "order": {
                    "_term": self.sorted
                } if self.sorted else None
            }, self)
    else:
        # non-variable edge value: aggregate on a computed script
        match = TermsAggs(
            "_match", {
                "script": text_type(value.to_es_script(self.schema)),
                "size": limit
            }, self)
    output = Aggs().add(
        FilterAggs("_filter", exists, None).add(match.add(es_query)))

    if self.edge.allowNulls:
        # FIND NULLS AT EACH NESTED LEVEL
        for p in self.schema.query_path:
            if p == query_path:
                # MISSING AT THE QUERY DEPTH
                output.add(
                    NestedAggs(p).add(
                        FilterAggs("_missing0", NotOp(exists), self).add(es_query)))
            else:
                # PARENT HAS NO CHILDREN, SO MISSING
                column = first(
                    self.schema.values(query_path, (OBJECT, EXISTS)))
                output.add(
                    NestedAggs(column.nested_path[0]).add(
                        FilterAggs(
                            "_missing1",
                            NotOp(
                                ExistsOp(
                                    Variable(
                                        column.es_column.replace(
                                            NESTED_TYPE, EXISTS_TYPE)))),
                            self).add(es_query)))
    return output
def to_sql(self, schema, not_null=False, boolean=False):
    """
    Expand this leaves term into one SQL select per mapped column; each
    entry's name is the column path relative to the term's variable.
    """
    if not isinstance(self.term, Variable):
        Log.error("Can only handle Variable")
    base = self.term.var
    depth = len(split_field(base))

    selects = []
    for _, columns in schema.map_to_sql(base).items():
        for col in columns:
            full_name = schema.get_column_name(col)
            selects.append({
                "name": join_field(split_field(full_name)[depth:]),
                "sql": Variable(full_name).to_sql(schema)[0].sql,
            })
    return wrap(selects)
def _normalize_group(edge, dim_index, limit, schema=None):
    """
    :param edge: Not normalized groupby
    :param dim_index: Dimensions are ordered; this is this groupby's index into that order
    :param schema: for context
    :return: a normalized groupby
    """
    if is_text(edge):
        if edge.endswith(".*"):
            prefix = edge[:-2]
            if schema:
                output = wrap([
                    {  # BECAUSE THIS IS A GROUPBY, EARLY SPLIT INTO LEAVES WORKS JUST FINE
                        "name": concat_field(prefix, literal_field(relative_field(untype_path(c.name), prefix))),
                        "put": {"name": literal_field(untype_path(c.name))},
                        "value": jx_expression(c.es_column, schema=schema),
                        "allowNulls": True,
                        "domain": {"type": "default"}
                    }
                    for c in schema.leaves(prefix)
                ])
                return output
            else:
                # no schema: defer leaf expansion with a LeavesOp
                return wrap([{
                    "name": untype_path(prefix),
                    "put": {"name": literal_field(untype_path(prefix))},
                    "value": LeavesOp(Variable(prefix)),
                    "allowNulls": True,
                    "dim": dim_index,
                    "domain": {"type": "default"}
                }])
        # simple named groupby
        return wrap([{
            "name": edge,
            "value": jx_expression(edge, schema=schema),
            "allowNulls": True,
            "dim": dim_index,
            "domain": Domain(type="default", limit=limit)
        }])
    else:
        edge = wrap(edge)
        if (edge.domain and edge.domain.type != "default") or edge.allowNulls != None:
            Log.error("groupby does not accept complicated domains")
        if not edge.name and not is_text(edge.value):
            Log.error("You must name compound edges: {{edge}}", edge=edge)

        return wrap([{
            "name": coalesce(edge.name, edge.value),
            "value": jx_expression(edge.value, schema=schema),
            "allowNulls": True,
            "dim": dim_index,
            "domain": {"type": "default"}
        }])
def append_query(self, query_path, es_query):
    """
    Wrap es_query in one terms/missing aggregation pair per field.
    Only the first (innermost) pair is tagged with this decoder; the
    outer pairs carry no decoder.
    """
    owner = self
    for field in self.fields:
        matched = TermsAggs(
            "_match", {"field": field, "size": self.domain.limit}, owner
        ).add(es_query)
        unmatched = FilterAggs(
            "_missing", MissingOp(Variable(field)), owner
        ).add(es_query)
        es_query = Aggs().add(matched).add(unmatched)
        owner = None
    return es_query
def to_sql(self, schema, not_null=False, boolean=False):
    """
    Expand this leaves term into SQL selects; columns under the term keep
    only their relative suffix as the name, others keep their full name.
    """
    if not isinstance(self.term, Variable):
        Log.error("Can only handle Variable")
    base = self.term.var
    depth = len(split_field(base))

    result = []
    for _, columns in schema.map_to_sql(base).items():
        for c in columns:
            full = schema.get_column_name(c)
            name = (
                join_field(split_field(full)[depth:])
                if startswith_field(full, base)
                else full
            )
            result.append({
                "name": name,
                "sql": Variable(full).to_sql(schema)[0].sql,
            })
    return wrap(result)
def to_sql(self, schema, not_null=False, boolean=False):
    """
    Expand this leaves term into SQL selects for every schema column under
    the term, excluding structural columns at the wrong nesting depth.
    """
    if not is_op(self.term, Variable):
        Log.error("Can only handle Variable")
    term = self.term.var
    prefix_length = len(split_field(term))
    output = wrap([
        {
            "name": join_field(split_field(schema.get_column_name(c))[prefix_length:]),
            "sql": Variable(schema.get_column_name(c)).to_sql(schema)[0].sql
        }
        for c in schema.columns
        if startswith_field(c.name, term) and (
            # leaf values visible from the schema's nesting depth...
            (c.jx_type not in (EXISTS, OBJECT, NESTED) and startswith_field(
                schema.nested_path[0], c.nested_path[0]))
            # ...or NESTED columns only at exactly the same depth
            or (c.jx_type not in (EXISTS, OBJECT)
                and schema.nested_path[0] == c.nested_path[0]))
    ])
    return output
def es_query_proto(selects, op, wheres, schema):
    """
    RETURN AN ES QUERY

    :param selects: MAP FROM path TO ESSelect INSTANCE
    :param op: combining operator; op.zero is the identity query
    :param wheres: MAP FROM path TO LIST OF WHERE CONDITIONS
    :param schema: schema used to render the final ES form
    :return: es_query
    """
    all_paths = sorted(set(wheres.keys()) | set(selects.keys()), reverse=True)
    acc = op.zero
    # DEEPEST TO SHALLOW: each deeper query is wrapped by its parent NestedOp
    for path in all_paths:
        combined = op([acc, wheres.get(path, TRUE)])
        acc = NestedOp(path=Variable(path), query=combined, select=selects.get(path, Null))
    return acc.partial_eval().to_es(schema)
def _normalize_select(self, select):
    """
    Expand a "." select into one select per named column; other select
    forms are not implemented.
    """
    output = []
    if select.value == ".":
        for cname, cs in self.columns.items():
            for c in cs:
                if c.type in STRUCT:
                    continue
                new_select = select.copy()
                new_select.name = cname
                new_select.value = Variable(cname)
                output.append(new_select)
                # NOTE(review): break keeps only the FIRST non-struct column
                # per name — confirm this is intentional
                break
    elif select.value.endswith(".*"):
        Log.error("not done")
    else:
        Log.error("not done")
    return output
def to_es_script(self, schema, many=True):
    """
    Render this Variable as a Painless accessor for ES documents.

    :param many: when True use doc[...].values (multi-valued), else doc[...].value
    NOTE(review): both branches construct EsScript with many=True — confirm
    the else branch should not pass many=False.
    """
    if self.var == ".":
        # NOTE(review): returns a bare string, unlike the EsScript branches
        return "_source"
    else:
        if self.var == "_id":
            # _uid is "type#id"; strip the type prefix
            return EsScript(
                type=STRING,
                expr='doc["_uid"].value.substring(doc["_uid"].value.indexOf(\'#\')+1)',
                frum=self)

        columns = schema.values(self.var)
        acc = []
        for c in columns:
            varname = c.es_column
            frum = Variable(c.es_column)
            q = quote(varname)
            if many:
                acc.append(
                    EsScript(miss=frum.missing(),
                             type=c.jx_type,
                             # booleans are stored as "T"/missing
                             expr="doc[" + q + "].values"
                             if c.jx_type != BOOLEAN else
                             "doc[" + q + "].value==\"T\"",
                             frum=frum,
                             many=True))
            else:
                acc.append(
                    EsScript(miss=frum.missing(),
                             type=c.jx_type,
                             expr="doc[" + q + "].value"
                             if c.jx_type != BOOLEAN else
                             "doc[" + q + "].value==\"T\"",
                             frum=frum,
                             many=True))

        if len(acc) == 0:
            return NULL.to_es_script(schema)
        elif len(acc) == 1:
            return acc[0]
        else:
            # multiple candidate columns: first non-missing wins
            return CoalesceOp("coalesce", acc).to_es_script(schema)
def es_setop(es, query):
    """
    Execute a set-operation (non-aggregating) jx query against ES.

    Builds the per-path ES selects and "pull" functions for each requested
    column, runs the search, and formats the hits per query.format.
    """
    schema = query.frum.schema
    query_path = schema.query_path[0]
    split_select = {".": ESSelect('.')}

    def get_select(path):
        # lazily create one ESSelect per nested path
        es_select = split_select.get(path)
        if not es_select:
            es_select = split_select[path] = ESSelect(path)
        return es_select

    selects = wrap([unwrap(s.copy()) for s in listwrap(query.select)])
    new_select = FlatList()
    put_index = 0
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if is_op(select.value, LeavesOp) and is_op(select.value.term, Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                full_name = concat_field(
                    select.name, relative_field(untype_path(c.name), term.var))
                if c.jx_type == NESTED:
                    # nested arrays must come from _source
                    get_select('.').use_source = True
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {
                            "name": literal_field(full_name),
                            "index": put_index,
                            "child": "."
                        },
                        "pull": get_pull_source(c.es_column)
                    })
                    put_index += 1
                else:
                    get_select(c.nested_path[0]).fields.append(c.es_column)
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {
                            "name": literal_field(full_name),
                            "index": put_index,
                            "child": "."
                        }
                    })
                    put_index += 1
        elif is_op(select.value, Variable):
            s_column = select.value.var
            if s_column == ".":
                # PULL ALL SOURCE
                get_select('.').use_source = True
                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    },
                    "pull": get_pull_source(".")
                })
                continue

            leaves = schema.leaves(s_column)  # LEAVES OF OBJECT
            # nested_selects = {}
            if leaves:
                if any(c.jx_type == NESTED for c in leaves):
                    # PULL WHOLE NESTED ARRAYS
                    get_select('.').use_source = True
                    for c in leaves:
                        if len(
                                c.nested_path
                        ) == 1:  # NESTED PROPERTIES ARE IGNORED, CAPTURED BY THESE FIRST LEVEL PROPERTIES
                            pre_child = join_field(
                                decode_property(n) for n in split_field(c.name))
                            new_select.append({
                                "name": select.name,
                                "value": Variable(c.es_column),
                                "put": {
                                    "name": select.name,
                                    "index": put_index,
                                    "child": untype_path(
                                        relative_field(pre_child, s_column))
                                },
                                "pull": get_pull_source(c.es_column)
                            })
                else:
                    # PULL ONLY WHAT'S NEEDED
                    for c in leaves:
                        c_nested_path = c.nested_path[0]
                        if c_nested_path == ".":
                            if c.es_column == "_id":
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": "."
                                    },
                                    "pull": lambda row: row._id
                                })
                            elif c.jx_type == NESTED:
                                get_select('.').use_source = True
                                pre_child = join_field(
                                    decode_property(n)
                                    for n in split_field(c.name))
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": untype_path(
                                            relative_field(
                                                pre_child, s_column))
                                    },
                                    "pull": get_pull_source(c.es_column)
                                })
                            else:
                                get_select(c_nested_path).fields.append(
                                    c.es_column)
                                pre_child = join_field(
                                    decode_property(n)
                                    for n in split_field(c.name))
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": untype_path(
                                            relative_field(
                                                pre_child, s_column))
                                    }
                                })
                        else:
                            # deeper column: fetch via inner hits and re-nest
                            es_select = get_select(c_nested_path)
                            es_select.fields.append(c.es_column)
                            child = relative_field(
                                untype_path(
                                    relative_field(c.name, schema.query_path[0])),
                                s_column)
                            pull = accumulate_nested_doc(
                                c_nested_path,
                                Variable(
                                    relative_field(
                                        s_column, unnest_path(c_nested_path))))
                            new_select.append({
                                "name": select.name,
                                "value": select.value,
                                "put": {
                                    "name": select.name,
                                    "index": put_index,
                                    "child": child
                                },
                                "pull": pull
                            })
            else:
                # unknown column: emit a null placeholder
                new_select.append({
                    "name": select.name,
                    "value": Variable("$dummy"),
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    }
                })
            put_index += 1
        else:
            # computed select: compile to a Painless script per path
            split_scripts = split_expression_by_path(select.value, schema, lang=Painless)
            for p, script in split_scripts.items():
                es_select = get_select(p)
                es_select.scripts[select.name] = {
                    "script": text_type(Painless[first(
                        script)].partial_eval().to_es_script(schema))
                }
                new_select.append({
                    "name": select.name,
                    "pull": jx_expression_to_function("fields." +
                                                      literal_field(select.name)),
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    }
                })
                put_index += 1

    # assign default pull functions for plain variables
    for n in new_select:
        if n.pull:
            continue
        elif is_op(n.value, Variable):
            if get_select('.').use_source:
                n.pull = get_pull_source(n.value.var)
            elif n.value == "_id":
                n.pull = jx_expression_to_function("_id")
            else:
                n.pull = jx_expression_to_function(
                    concat_field("fields", literal_field(n.value.var)))
        else:
            Log.error("Do not know what to do")

    split_wheres = split_expression_by_path(query.where, schema, lang=ES52)
    es_query = es_query_proto(query_path, split_select, split_wheres, schema)
    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)
    es_query.sort = jx_sort_to_es_sort(query.sort, schema)

    with Timer("call to ES", silent=DEBUG) as call_timer:
        data = es_post(es, es_query, query.limit)

    T = data.hits.hits
    # Log.note("{{output}}", output=T)
    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        with Timer("formatter", silent=True):
            output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
def split_expression_by_path(expr, schema, lang=Language):
    """
    :param expr: EXPRESSION TO INSPECT
    :param schema: THE SCHEMA
    :param lang: target expression language
    :return: (OP, MAP) PAIR WHERE OP IS OPERATOR TO APPLY ON MAP ITEMS, AND MAP FROM PATH TO EXPRESSION
    """
    if is_op(expr, AndOp):
        if not expr.terms:
            return AndOp, {".": TRUE}
        elif len(expr.terms) == 1:
            return split_expression_by_path(expr.terms[0], schema, lang=lang)

        # recursively split each conjunct and merge per-path
        output = {}
        curr_op = AndOp
        for w in expr.terms:
            op, split = split_expression_by_path(w, schema, lang=lang)
            if op == AndOp:
                for v, es in split.items():
                    ae = output.get(v)
                    if not ae:
                        output[v] = ae = AndOp([])
                    ae.terms.append(es)
            elif len(output) == 1 and all(
                    c.jx_type == EXISTS for v in split["."].vars()
                    for c in schema.values(v.var)):
                # pure-existence terms at "." may be merged into deeper paths
                for v, es in split.items():
                    if v == ".":
                        continue
                    ae = output.get(v)
                    if not ae:
                        output[v] = ae = AndOp([])
                    ae.terms.append(es)
            else:
                Log.error("confused")
        return curr_op, output

    expr_vars = expr.vars()
    var_to_columns = {v.var: schema.values(v.var) for v in expr_vars}
    all_paths = set(c.nested_path[0] for v in expr_vars
                    for c in var_to_columns[v.var])

    def add(v, c):
        # register column c as a realization of variable v (deduplicated)
        cols = var_to_columns.get(v)
        if not cols:
            var_to_columns[v] = cols = []
        if c not in cols:
            cols.append(c)

    # all_paths MAY BE MISSING SHALLOW PATHS
    exprs = [expr]
    undo = {}
    for p in schema.query_path:
        # CALCULATE THE RESIDUAL EXPRESSION
        # REPLACE EACH DEEPER VAR WITH null
        # TODO: NOT ACCOUNTING FOR DEEP QUERIES ON SHALLOW TABLE
        # NOTE(review): string length is used as a proxy for nesting depth
        mapping = {
            v: c
            for v, cols in var_to_columns.items() for c in cols
            if len(c.nested_path[0]) > len(p)
        }
        if mapping:
            acc = []
            for v, col in mapping.items():
                nested_exists = join_field(
                    split_field(col.nested_path[0])[:-1] + [EXISTS_TYPE])
                e = schema.values(nested_exists)
                if not e:
                    Log.error("do not know how to handle")
                add(nested_exists, first(e))  # REGISTER THE EXISTENCE VARIABLE
                acc.append(MissingOp(Variable(nested_exists)))
            acc.append(expr.map({v: NULL for v in mapping.keys()}))
            with_nulls = AndOp(acc).partial_eval()
            if with_nulls is not FALSE:
                all_paths.add(p)
                exprs.append(with_nulls)

    if len(all_paths) == 0:
        return AndOp, {".": expr}  # CONSTANTS
    elif len(all_paths) == 1:
        return AndOp, {first(all_paths): expr}

    # EXPAND EXPRESSION TO ALL REALIZED COLUMNS
    for v, cols in list(var_to_columns.items()):
        for col in cols:
            add(col.es_column, col)
        if len(cols) <= 1:
            continue
        # variable maps to multiple columns: multiply the expressions
        more_expr = []
        for e in exprs:
            for col in cols:
                more_expr.append(e.map({v: col.es_column}))
        exprs = more_expr

    # bucket each expanded expression by its (single) nested path
    acc = {}
    for e in exprs:
        nestings = list(
            set(c.nested_path[0] for v in e.vars()
                for c in var_to_columns[v]))
        if not nestings:
            a = acc.get(".")
            if not a:
                acc["."] = a = OrOp([])
            a.terms.append(e)
        elif len(nestings) == 1:
            a = acc.get(nestings[0])
            if not a:
                acc[nestings[0]] = a = OrOp([])
            a.terms.append(e)
        else:
            Log.error("Expression is too complex")

    if undo:
        return OrOp, {k: v.map(undo) for k, v in acc.items()}
    else:
        return OrOp, acc
def query_to_outer_joins(query, all_paths, split_select, var_to_columns):
    """
    CONVERT FROM JSON QUERY EXPRESSION TO A NUMBER OF OUTER JOINS

    :param query: jx query (provides frum and where)
    :param all_paths: ordered nested paths to cover
    :param split_select: map from path to select for that depth
    :param var_to_columns: map from variable name to candidate columns
    :return: ConcatOp of OuterJoinOps (partially evaluated)
    """

    def split(expr):
        """
        :param expr: JSON EXPRESSION
        :return: ARRAY INDEX BY (CONCAT, OUTER JOIN, AND)
        """
        expr = expr.partial_eval()
        if is_op(expr, AndOp):
            # cross-product of the splits of each conjunct
            acc = [tuple([] for _ in all_paths)]
            for t in expr.terms:
                next = []
                for c in split(t):
                    for a in acc:
                        next.append(tuple(n + an for n, an in zip(c, a)))
                acc = next
            return acc
        elif is_op(expr, OrOp):
            # disjoint decomposition: each term excludes all previous terms
            output = []
            exclude = []
            for t in expr.terms:
                for c in split(AndOp([AndOp(exclude), t])):
                    output.append(c)
                exclude.append(NotOp(t))
            return output

        # simple expression: place it at its (single) nested path
        all_nests = list(
            set(c.nested_path[0] for v in expr.vars()
                for c in frum.schema.values(v.var)))
        if len(all_nests) > 1:
            Log.error("do not know how to handle")
        elif not all_nests:
            # constant expression belongs at the top level
            return [tuple([expr] if p == "." else [] for p in all_paths)]
        else:
            return [
                tuple([expr] if p == all_nests[0] else [] for p in all_paths)
            ]

    frum = query.frum
    where = query.where
    focal_path = frum.schema.query_path[0]

    # MAP TO es_columns, INCLUDE NESTED EXISTENCE IN EACH VARIABLE
    wheres = split_nested_inner_variables(where, focal_path, var_to_columns)
    concat_outer_and = split(wheres)

    # ATTACH SELECTS
    output = ConcatOp([])
    for concat in concat_outer_and:
        outer = OuterJoinOp(frum, [])
        for p, nest in zip(all_paths, concat):
            select = coalesce(split_select.get(p), NULL)
            outer.nests.append(
                NestedOp(Variable(p), select=select, where=AndOp(nest)))
        output.terms.append(outer)
    return output.partial_eval()
def get_selects(query):
    """
    Translate query.select clauses into result-set select descriptors plus
    per-path ES selects.

    :param query: jx query
    :return: (new_select, split_select) where new_select lists name/value/
             put/pull descriptors and split_select maps path -> ESSelect
    """
    schema = query.frum.schema
    split_select = {".": ESSelect(".")}

    def get_select(path):
        # lazily create one ESSelect per nested path
        es_select = split_select.get(path)
        if not es_select:
            es_select = split_select[path] = ESSelect(path)
        return es_select

    selects = wrap([unwrap(s.copy()) for s in listwrap(query.select)])
    new_select = FlatList()
    put_index = 0
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if is_op(select.value, LeavesOp) and is_op(select.value.term, Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                full_name = concat_field(
                    select.name, relative_field(untype_path(c.name), term.var)
                )
                if c.jx_type == NESTED:
                    # nested arrays must come from _source
                    get_select(".").set_op = True
                    new_select.append(
                        {
                            "name": full_name,
                            "value": Variable(c.es_column),
                            "put": {
                                "name": literal_field(full_name),
                                "index": put_index,
                                "child": ".",
                            },
                            "pull": get_pull_source(c.es_column),
                        }
                    )
                    put_index += 1
                else:
                    get_select(c.nested_path[0]).fields.append(c.es_column)
                    new_select.append(
                        {
                            "name": full_name,
                            "value": Variable(c.es_column),
                            "put": {
                                "name": literal_field(full_name),
                                "index": put_index,
                                "child": ".",
                            },
                        }
                    )
                    put_index += 1
        elif is_op(select.value, Variable):
            s_column = select.value.var
            if s_column == ".":
                # PULL ALL SOURCE
                get_select(".").set_op = True
                new_select.append(
                    {
                        "name": select.name,
                        "value": select.value,
                        "put": {"name": select.name, "index": put_index, "child": "."},
                        "pull": get_pull_source("."),
                    }
                )
                continue

            leaves = schema.leaves(s_column)  # LEAVES OF OBJECT
            # nested_selects = {}
            if leaves:
                if any(c.jx_type == NESTED for c in leaves):
                    # PULL WHOLE NESTED ARRAYS
                    get_select(".").set_op = True
                    for c in leaves:
                        if (
                            len(c.nested_path) == 1
                        ):  # NESTED PROPERTIES ARE IGNORED, CAPTURED BY THESE FIRST LEVEL PROPERTIES
                            pre_child = join_field(
                                decode_property(n) for n in split_field(c.name)
                            )
                            new_select.append(
                                {
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": untype_path(
                                            relative_field(pre_child, s_column)
                                        ),
                                    },
                                    "pull": get_pull_source(c.es_column),
                                }
                            )
                else:
                    # PULL ONLY WHAT'S NEEDED
                    for c in leaves:
                        c_nested_path = c.nested_path[0]
                        if c_nested_path == ".":
                            if c.es_column == "_id":
                                new_select.append(
                                    {
                                        "name": select.name,
                                        "value": Variable(c.es_column),
                                        "put": {
                                            "name": select.name,
                                            "index": put_index,
                                            "child": ".",
                                        },
                                        "pull": lambda row: row._id,
                                    }
                                )
                            elif c.jx_type == NESTED:
                                get_select(".").set_op = True
                                pre_child = join_field(
                                    decode_property(n) for n in split_field(c.name)
                                )
                                new_select.append(
                                    {
                                        "name": select.name,
                                        "value": Variable(c.es_column),
                                        "put": {
                                            "name": select.name,
                                            "index": put_index,
                                            "child": untype_path(
                                                relative_field(pre_child, s_column)
                                            ),
                                        },
                                        "pull": get_pull_source(c.es_column),
                                    }
                                )
                            else:
                                get_select(c_nested_path).fields.append(c.es_column)
                                pre_child = join_field(
                                    decode_property(n) for n in split_field(c.name)
                                )
                                new_select.append(
                                    {
                                        "name": select.name,
                                        "value": Variable(c.es_column),
                                        "put": {
                                            "name": select.name,
                                            "index": put_index,
                                            "child": untype_path(
                                                relative_field(pre_child, s_column)
                                            ),
                                        },
                                    }
                                )
                        else:
                            # deeper column: fetch via inner hits and re-nest
                            es_select = get_select(c_nested_path)
                            es_select.fields.append(c.es_column)
                            child = relative_field(
                                untype_path(
                                    relative_field(c.name, schema.query_path[0])
                                ),
                                s_column,
                            )
                            pull = accumulate_nested_doc(
                                c_nested_path,
                                Variable(
                                    relative_field(s_column, unnest_path(c_nested_path))
                                ),
                            )
                            new_select.append(
                                {
                                    "name": select.name,
                                    "value": select.value,
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": child,
                                    },
                                    "pull": pull,
                                }
                            )
            else:
                # unknown column: emit a null placeholder
                new_select.append(
                    {
                        "name": select.name,
                        "value": Variable("$dummy"),
                        "put": {"name": select.name, "index": put_index, "child": "."},
                    }
                )
            put_index += 1
        else:
            # computed select: compile to a Painless script per path
            split_scripts = split_expression_by_path(
                select.value, schema, lang=Painless
            )
            for p, script in split_scripts.items():
                es_select = get_select(p)
                es_select.scripts[select.name] = {
                    "script": text(
                        Painless[first(script)].partial_eval().to_es_script(schema)
                    )
                }
                new_select.append(
                    {
                        "name": select.name,
                        "pull": jx_expression_to_function(
                            "fields." + literal_field(select.name)
                        ),
                        "put": {"name": select.name, "index": put_index, "child": "."},
                    }
                )
                put_index += 1

    # assign default pull functions for plain variables
    for n in new_select:
        if n.pull:
            continue
        elif is_op(n.value, Variable):
            if get_select(".").set_op:
                n.pull = get_pull_source(n.value.var)
            elif n.value == "_id":
                n.pull = jx_expression_to_function("_id")
            else:
                n.pull = jx_expression_to_function(
                    concat_field("fields", literal_field(n.value.var))
                )
        else:
            Log.error("Do not know what to do")

    return new_select, split_select
def get_selects(query):
    """
    Translate the select clauses of a JX query into (1) per-nested-path ES
    select operations and (2) "pull" functions that extract each selected
    value from an ES result row.

    :param query: a JX query; reads query.frum.schema and query.select
    :return: tuple of
        new_select   - FlatList of {"name", "value", "put", "pull"} records,
                       one per output column
        split_select - OrderedDict mapping nested path -> ESSelectOp, the
                       ES-side select for each nesting depth
        inners(...)  - generator factory that walks ES hits/inner hits and
                       yields row dicts keyed by nesting level ("0", "1", ...)
    """
    schema = query.frum.schema
    query_level = len(schema.query_path)
    query_path = schema.query_path[0]
    # SPLIT select INTO ES_SELECT AND RESULTSET SELECT
    # one ESSelectOp per nested path known to the schema, in query-path order
    split_select = OrderedDict((p, ESSelectOp(p)) for p in schema.query_path)

    def expand_split_select(c_nested_path):
        # Return the ESSelectOp for c_nested_path, creating it (at the FRONT
        # of split_select, preserving the order of the existing entries) if
        # the schema's query_path did not already include it.
        es_select = split_select.get(c_nested_path)
        if not es_select:
            temp = [(k, v) for k, v in split_select.items()]
            split_select.clear()
            split_select.update({c_nested_path: ESSelectOp(c_nested_path)})
            split_select.update(temp)
        return split_select[c_nested_path]

    new_select = FlatList()
    post_expressions = {}
    # work on copies so the caller's query.select is not mutated
    selects = list_to_data([unwrap(s.copy()) for s in listwrap(query.select)])

    # WHAT PATH IS _source USED, IF ANY?
    # First pass: mark which nested paths must be pulled from _source
    # (NESTED-typed columns cannot be fetched as plain fields).
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if is_op(select.value, LeavesOp) and is_op(select.value.term, Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            if any(c.jx_type == NESTED for c in leaves):
                split_select["."].source_path = "."
        elif is_op(select.value, Variable):
            for selected_column in schema.values(select.value.var, exclude_type=(OBJECT, EXISTS)):
                if selected_column.jx_type == NESTED:
                    expand_split_select(selected_column.es_column).source_path = selected_column.es_column
                    continue
                leaves = schema.leaves(selected_column.es_column)
                for c in leaves:
                    if c.jx_type == NESTED:
                        split_select[c.es_column].source_path = c.es_column

    # IF WE GET THE SOURCE FOR PARENT, WE ASSUME WE GOT SOURCE FOR CHILD
    # Walk from deepest path to shallowest; once a source_path is seen,
    # propagate it to all deeper (already-passed) levels.
    source_path = None
    source_level = 0
    for level, es_select in enumerate(reversed(list(split_select.values()))):
        if source_path:
            es_select.source_path = source_path
        elif es_select.source_path:
            source_level = level + 1
            source_path = es_select.source_path

    def get_pull_source(c):
        # Build a function that extracts column c from a result row.
        # Rows are dicts keyed by nesting level as text ("1", "2", ...);
        # which accessor is used depends on whether the value arrives via
        # "fields", via "_source", or via inner_hits (deeper than the query).
        nested_path = c.nested_path
        nested_level = len(nested_path)
        pos = text(nested_level)
        if nested_level <= query_level:
            if not source_level or nested_level < source_level:
                # value is returned in the "fields" section of the hit
                field = join_field([pos, "fields", c.es_column])
                return jx_expression_to_function(field)
            elif nested_level == source_level:
                # value must be read out of the hit's _source document
                field = relative_field(c.es_column, nested_path[0])

                def pull_source(row):
                    return untyped(row.get(pos, Null)._source[field])

                return pull_source
            else:
                # deeper than the source level: the row entry IS the document
                field = relative_field(c.es_column, nested_path[0])

                def pull_property(row):
                    return untyped(row.get(pos, Null)[field])

                return pull_property
        else:
            # column is nested deeper than the query path: gather it from
            # inner_hits and reassemble into a list ordered by _nested.offset
            pos = text(query_level)
            if not source_level or nested_level < source_level:
                # PULL FIELDS AND THEN AGGREGATE THEM
                value = jx_expression_to_function(join_field(["fields", c.es_column]))
                name = literal_field(nested_path[0])
                index = jx_expression_to_function("_nested.offset")

                def pull_nested_field(doc):
                    hits = doc.get(pos, Null).inner_hits[name].hits.hits
                    if not hits:
                        return []
                    # place each value at its nested offset
                    temp = [(index(h), value(h)) for h in hits]
                    acc = [None] * len(temp)
                    for i, v in temp:
                        acc[i] = unwraplist(v)
                    return acc

                return pull_nested_field
            else:
                # PULL SOURCES
                value = jx_expression_to_function(concat_field("_source", relative_field(c.es_column, nested_path[0])))
                name = literal_field(nested_path[0])
                # offset path has one "_nested" hop per extra nesting level
                index = jx_expression_to_function(join_field(["_nested"] * (len(c.nested_path) - 1) + ["offset"]))

                def pull_nested_source(doc):
                    hits = doc.get(pos, Null).inner_hits[name].hits.hits
                    if not hits:
                        return []
                    temp = [(index(h), value(h)) for h in hits]
                    acc = [None] * len(temp)
                    for i, v in temp:
                        acc[i] = untyped(v)
                    return acc

                return pull_nested_source

    # Second pass: emit one new_select record per output column, register the
    # ES-side fields/scripts needed, and attach the pull function.
    put_index = 0
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if is_op(select.value, LeavesOp) and is_op(select.value.term, Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                c_nested_path = c.nested_path[0]
                simple_name = relative_field(c.es_column, query_path).lstrip(".")
                name = concat_field(select.name, untype_path(simple_name))
                put_name = concat_field(select.name, literal_field(untype_path(simple_name)))
                split_select[c_nested_path].fields.append(c.es_column)
                new_select.append({
                    "name": name,
                    "value": Variable(c.es_column),
                    "put": {
                        "name": put_name,
                        "index": put_index,
                        "child": ".",
                    },
                    "pull": get_pull_source(c),
                })
                # one output column per leaf
                put_index += 1
        elif is_op(select.value, Variable):
            if select.value.var == ".":
                # PULL ALL SOURCE
                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    },
                    "pull": get_pull_source(
                        Data(es_column=query_path, nested_path=schema.query_path)),
                })
                # NOTE(review): this `continue` skips `put_index += 1` for the
                # "." select — presumably intentional; confirm with callers
                continue
            for selected_column in schema.values(select.value.var, exclude_type=(EXISTS, OBJECT)):
                if selected_column.jx_type == NESTED:
                    # whole nested object: pull it from _source as one value
                    new_select.append({
                        "name": select.name,
                        "value": select.value,
                        "put": {
                            "name": select.name,
                            "index": put_index,
                            "child": "."
                        },
                        "pull": get_pull_source(
                            Data(
                                es_column=selected_column.es_column,
                                nested_path=(selected_column.es_column, ) + selected_column.nested_path,
                            )),
                    })
                    continue
                leaves = schema.leaves(
                    selected_column.es_column, exclude_type=INTERNAL)  # LEAVES OF OBJECT
                if leaves:
                    for c in leaves:
                        if c.es_column == "_id":
                            # _id is pulled directly from the hit, not fields
                            new_select.append({
                                "name": select.name,
                                "value": Variable(c.es_column),
                                "put": {
                                    "name": select.name,
                                    "index": put_index,
                                    "child": ".",
                                },
                                "pull": pull_id,
                            })
                            continue
                        c_nested_path = c.nested_path[0]
                        expand_split_select(c_nested_path).fields.append(
                            c.es_column)
                        child = untype_path(
                            relative_field(
                                c.es_column,
                                selected_column.es_column,
                            ))
                        new_select.append({
                            "name": select.name,
                            "value": Variable(c.es_column),
                            "put": {
                                "name": select.name,
                                "index": put_index,
                                "child": child,
                            },
                            "pull": get_pull_source(c),
                        })
                else:
                    # no leaves: column resolves to nothing; emit NULL
                    new_select.append({
                        "name": select.name,
                        "value": NULL,
                        "put": {
                            "name": select.name,
                            "index": put_index,
                            "child": "."
                        },
                    })
            put_index += 1
        else:
            # general expression: compile to Painless, split by nested path
            op, split_scripts = split_expression_by_path(select.value, schema, lang=Painless)
            for p, script in split_scripts.items():
                es_select = split_select[p]
                es_select.scripts[select.name] = {
                    "script": text(Painless[script].partial_eval().to_es_script(schema))
                }
                new_select.append({
                    "name": select.name,
                    "pull": jx_expression_to_function(
                        join_field([
                            text(p),
                            "fields",
                            select.name,
                        ])),
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    },
                })
                put_index += 1

    def inners(query_path, parent_pos):
        """
        :param query_path:
        :return: ITERATOR OVER TUPLES ROWS AS TUPLES, WHERE  row[len(nested_path)] HAS INNER HITS
                 AND row[0] HAS post_expressions
        """
        # NOTE(review): this parameter shadows the outer `query_path` string;
        # here it is the (tuple of) nested paths being recursed over.
        pos = text(int(parent_pos) + 1)
        if not query_path:

            def base_case(row):
                # innermost level: evaluate post_expressions into row["0"]
                extra = {}
                for k, e in post_expressions.items():
                    extra[k] = e(row)
                row["0"] = extra
                yield row

            return base_case

        if pos == "1":
            more = inners(query_path[:-1], "1")

            def first_case(results):
                # outermost level: seed one row per top-level hit
                for result in results:
                    for hit in result.hits.hits:
                        seed = {"0": None, pos: hit}
                        for row in more(seed):
                            yield row

            return first_case
        else:
            more = inners(query_path[:-1], pos)
            if source_path and source_path < query_path[-1]:
                # deeper levels are embedded in the parent's _source document
                rel_path = relative_field(query_path[-1], source_path)

                def source(acc):
                    for inner_row in acc[parent_pos][rel_path]:
                        acc[pos] = inner_row
                        for tt in more(acc):
                            yield tt

                return source
            else:
                # deeper levels arrive as inner_hits on the parent hit
                path = literal_field(query_path[-1])

                def recurse(acc):
                    hits = acc[parent_pos].inner_hits[path].hits.hits
                    if hits:
                        for inner_row in hits:
                            acc[pos] = inner_row
                            for tt in more(acc):
                                yield tt
                    else:
                        # no inner hits: still recurse so shallower levels emit
                        for tt in more(acc):
                            yield tt

                return recurse

    return new_select, split_select, inners(schema.query_path, "0")