def __getitem__(self, item):
    """
    SLICE THIS CUBE

    :param item: EITHER A dict OF {edge name: partition value}, WHICH FIXES
                 THOSE EDGES, OR THE NAME (text) OF ONE select COLUMN
    :return: FOR A dict: Null WHEN A REQUESTED PARTITION VALUE IS NOT FOUND,
             A Data OF VALUES WHEN NO EDGE IS LEFT FREE, OTHERWISE A Cube
             OVER THE REMAINING EDGES.
             FOR text: THE VALUE Cube HOLDING ONLY THAT COLUMN
    """
    # TODO: SOLVE FUNDAMENTAL QUESTION OF IF SELECTING A PART OF AN
    # EDGE REMOVES THAT EDGE FROM THIS RESULT, OR ADDS THE PART
    # AS A select {"name":edge.name, "value":edge.domain.partitions[coord]}
    # PROBABLY NOT, THE value IS IDENTICAL OVER THE REMAINING
    if is_data(item):
        coordinates = [None] * len(self.edges)

        # TRANSLATE EACH (edge name, partition value) PAIR TO A NUMERIC INDEX
        for name, wanted in item.items():
            candidates = list_to_data([
                (position, edge.domain.partitions)
                for position, edge in enumerate(self.edges)
                if edge.name == name
            ])
            ei, parts = candidates[0]
            if not parts:
                Log.error(
                    "Can not find {{name}}=={{value|quote}} in list of edges, maybe this feature is not implemented yet",
                    name=name,
                    value=wanted,
                )
            matching = list_to_data([p for p in parts if p.value == wanted])
            part = matching[0]
            if not part:
                return Null
            coordinates[ei] = part.dataIndex

        # EDGES WITHOUT A FIXED COORDINATE SURVIVE INTO THE RESULT
        free_edges = [
            edge for edge, coord in zip(self.edges, coordinates) if coord is None
        ]
        if free_edges:
            return Cube(
                select=self.select,
                edges=list_to_data(free_edges),
                data={
                    column: Matrix(values=matrix[coordinates])
                    for column, matrix in self.data.items()
                },
            )

        # ZERO DIMENSIONAL VALUE
        return dict_to_data({
            column: matrix[coordinates] for column, matrix in self.data.items()
        })

    if is_text(item):
        # RETURN A VALUE CUBE
        if self.is_value:
            if item != self.select.name:
                Log.error("{{name}} not found in cube", name=item)
            return self

        if item not in self.select.name:
            Log.error("{{name}} not found in cube", name=item)

        chosen = [s for s in self.select if s.name == item][0]
        return Cube(select=chosen, edges=self.edges, data={item: self.data[item]})

    Log.error("not implemented yet")
def __init__(self, select, edges, data, frum=None):
    """
    data IS EXPECTED TO BE A dict TO MATRICES, BUT OTHER COLLECTIONS ARE
    ALLOWED, USING THE select AND edges TO DESCRIBE THE data

    :param select: A SINGLE SELECT CLAUSE (MAKES THIS A VALUE CUBE) OR A LIST OF THEM
    :param edges: THE EDGES OF THE CUBE; WHEN FALSY THEY ARE DERIVED FROM data
    :param data: dict OF name -> Matrix; WHEN edges IS FALSY A list, A Matrix
                 OR A SINGLE VALUE IS ALSO ACCEPTED (ONLY WITH A SINGLE select)
    :param frum: NOT USED BY THIS CONSTRUCTOR
    """
    self.is_value = not is_list(select)
    self.select = select
    self.meta = Data(format="cube")  # PUT EXTRA MARKUP HERE

    # FIX: THE FLAG USED TO BE ASSIGNED TO A LOCAL NAMED is_none, SO
    # self.is_none WAS ALWAYS False.  ONLY A dict CAN BE INSPECTED WITH
    # .values(); list/Matrix/SCALAR data IS HANDLED FURTHER DOWN
    self.is_none = bool(is_data(data) and not all(data.values()))

    # ENSURE frum IS PROPER FORM
    if is_list(select):
        if edges and OR(not isinstance(v, Matrix) for v in data.values()):
            Log.error("Expecting data to be a dict with Matrix values")

    if not edges:
        if not data:
            if is_list(select):
                Log.error("not expecting a list of records")

            data = {select.name: Matrix.ZERO}
            self.edges = Null
        elif is_data(data):
            # EXPECTING NO MORE THAN ONE rownum EDGE IN THE DATA
            length = MAX([len(v) for v in data.values()])
            if length >= 1:
                self.edges = list_to_data([{
                    "name": "rownum",
                    "domain": {"type": "rownum"}
                }])
            else:
                self.edges = Null
        elif is_list(data):
            if is_list(select):
                Log.error("not expecting a list of records")

            # FIX: COUNT THE ROWS BEFORE data IS REBOUND TO A ONE-KEY dict,
            # OTHERWISE THE rownum DOMAIN ALWAYS GETS max == 1
            row_count = len(data)
            data = {select.name: Matrix.wrap(data)}
            self.edges = list_to_data([{
                "name": "rownum",
                "domain": {
                    "type": "rownum",
                    "min": 0,
                    "max": row_count,
                    "interval": 1
                }
            }])
        elif isinstance(data, Matrix):
            if is_list(select):
                Log.error("not expecting a list of records")

            # NOTE(review): self.edges IS NOT ASSIGNED ON THIS PATH - CONFIRM
            # WHETHER CALLERS ARE EXPECTED TO ALWAYS PASS edges WITH A Matrix
            data = {select.name: data}
        else:
            if is_list(select):
                Log.error("not expecting a list of records")

            data = {select.name: Matrix(value=data)}
            self.edges = Null
    else:
        self.edges = to_data(edges)

    self.data = data
def search(self, query):
    """
    RUN A FILTERED QUERY OVER self.data AND ANSWER IN AN
    ELASTICSEARCH-LIKE {"hits": {"total": ..., "hits": [...]}} SHAPE

    :param query: EXPECTED TO CARRY query.filtered.filter, AND OPTIONALLY fields
    :return: Data WITH THE MATCHING DOCUMENTS; WHEN fields IS GIVEN EACH HIT
             HAS "fields" (THE SELECTED VALUES), OTHERWISE "_source"
    """
    query = to_data(query)
    accept = jx.get(query.query.filtered.filter)

    matches = []
    for doc_id, doc in self.data.items():
        if accept(doc):
            matches.append({"_id": doc_id, "_source": doc})
    filtered = list_to_data(matches)

    if not query.fields:
        return dict_to_data({"hits": {"total": len(filtered), "hits": filtered}})

    hits = []
    for hit in filtered:
        selected = jx.select([unwrap(hit._source)], query.fields)[0]
        hits.append({"_id": hit._id, "fields": unwrap(selected)})
    return dict_to_data({"hits": {"total": len(filtered), "hits": hits}})
def __init__(self, **desc):
    """
    BUILD A "time" DOMAIN

    min AND max ARE CONVERTED TO Date, interval TO Duration.  WITHOUT
    EXPLICIT partitions, ONE PARTITION [v, v + interval) IS GENERATED FOR
    EVERY v IN Date.range(min, max, interval), KEYED ON "min".
    EXPLICIT partitions ARE REPORTED AS NOT IMPLEMENTED.
    """
    Domain.__init__(self, **desc)
    self.type = "time"
    self.NULL = Null
    self.min = Date(self.min)
    self.max = Date(self.max)
    self.interval = Duration(self.interval)
    self.sort = Null

    if not self.partitions:
        self.verify_attributes_not_null(["min", "max", "interval"])
        self.key = "min"
        generated = []
        for index, start in enumerate(Date.range(self.min, self.max, self.interval)):
            generated.append({
                "min": start,
                "max": start + self.interval,
                "dataIndex": index,
            })
        self.partitions = list_to_data(generated)
        return

    # EXPLICIT PARTITIONS: IGNORE THE min, max, interval
    if not self.key:
        Log.error("Must have a key value")

    Log.error("not implemented yet")

    # VERIFY PARTITIONS DO NOT OVERLAP
def filter(data, where):
    """
    KEEP ONLY THE RECORDS OF data ACCEPTED BY where

    where  - a function that accepts (record, rownum, rows) and returns boolean

    EMPTY data, OR A where OF None/TRUE, RETURNS data UNTOUCHED.
    A Container IS ASKED TO FILTER ITSELF.
    """
    nothing_to_filter = len(data) == 0 or where == None or where == TRUE
    if nothing_to_filter:
        return data

    if isinstance(data, Container):
        return data.filter(where)

    if is_container(data):
        predicate = get(where)
        all_rows = to_data(data)
        kept = []
        for rownum, record in enumerate(data):
            if predicate(to_data(record), rownum, all_rows):
                kept.append(unwrap(record))
        return list_to_data(kept)

    Log.error(
        "Do not know how to handle type {{type}}", type=data.__class__.__name__
    )

    # NOTE(review): ONLY REACHED IF Log.error() ABOVE RETURNS - PRESUMABLY IT
    # RAISES, WHICH WOULD MAKE THIS FALLBACK DEAD CODE; CONFIRM BEFORE REMOVING
    try:
        return drill_filter(where, data)
    except Exception as _:
        # WOW!  THIS IS INEFFICIENT!
        objects = [DataObject(d) for d in data]
        return to_data([unwrap(d) for d in drill_filter(where, objects)])
def __init__(self, **desc):
    """
    BUILD A "duration" DOMAIN

    min, max AND interval ARE CONVERTED TO Duration.  WITHOUT EXPLICIT
    partitions, ONE PARTITION [v, v + interval) IS GENERATED FOR EVERY v IN
    Duration.range(min, max, interval), KEYED ON "min".
    EXPLICIT partitions ARE REPORTED AS NOT IMPLEMENTED.
    """
    Domain.__init__(self, **desc)
    self.type = "duration"
    self.NULL = Null
    self.min = Duration(self.min)
    self.max = Duration(self.max)
    self.interval = Duration(self.interval)

    if self.partitions:
        # IGNORE THE min, max, interval
        if not self.key:
            Log.error("Must have a key value")

        Log.error("not implemented yet")

        # VERIFY PARTITIONS DO NOT OVERLAP
        return

    if not all([self.min, self.max, self.interval]):
        Log.error("Can not handle missing parameter")

    self.key = "min"
    generated = []
    for index, start in enumerate(Duration.range(self.min, self.max, self.interval)):
        generated.append({
            "min": start,
            "max": start + self.interval,
            "dataIndex": index,
        })
    self.partitions = list_to_data(generated)
def _groupby(self, edges):
    """
    RETURNS LIST OF (coord, values) TUPLES, WHERE
        coord IS THE INDEX INTO self CUBE (-1 INDEX FOR COORDINATES NOT GROUPED BY)
        values ALL VALUES THAT BELONG TO THE SLICE

    :param edges: THE EDGES TO GROUP BY; EACH IS EXPANDED WITH _normalize_edge()
    :return: (term, slice) PAIRS, WHERE slice IS A Cube OVER THE EDGES NOT
             GROUPED BY, OR THE BARE VALUE WHEN NO EDGE REMAINS (SINGLE select ONLY)
    """
    edges = FlatList([n for e in edges for n in _normalize_edge(e)])

    # CACHE SOME RESULTS
    keys = edges.name  # LOOKED UP ONCE, INSTEAD OF ONCE PER EDGE TESTED

    stacked = [e for e in self.edges if e.name in keys]
    remainder = [e for e in self.edges if e.name not in keys]
    selector = [1 if e.name in keys else 0 for e in self.edges]

    # NOTE(review): stacked AND remainder PARTITION self.edges, SO THIS TEST
    # CAN NOT FAIL AS WRITTEN - PRESUMABLY IT WAS MEANT TO COMPARE AGAINST
    # THE REQUESTED edges; CONFIRM INTENT BEFORE TIGHTENING
    if len(stacked) + len(remainder) != len(self.edges):
        Log.error("can not find some edges to group by")

    getKey = [e.domain.getKey for e in self.edges]
    lookup = [
        [
            getKey[i](p)
            for p in e.domain.partitions + ([None] if e.allowNulls else [])
        ]
        for i, e in enumerate(self.edges)
    ]

    def coord2term(coord):
        # TRANSLATE NUMERIC COORDINATES TO {edge name: partition key}
        return leaves_to_data({keys[i]: lookup[i][c] for i, c in enumerate(coord)})

    if is_list(self.select):
        selects = listwrap(self.select)
        index, v = transpose(*self.data[selects[0].name].groupby(selector))

        coord = list_to_data([coord2term(c) for c in index])

        values = [v]
        for s in selects[1::]:
            # FIX: WAS .group_by(selector); THE FIRST COLUMN (ABOVE) AND BOTH
            # SINGLE-select BRANCHES (BELOW) CALL .groupby() ON THE SAME KIND OF VALUE
            _, v = transpose(*self.data[s.name].groupby(selector))
            values.append(v)

        output = transpose(
            coord,
            [
                Cube(
                    self.select,
                    remainder,
                    {s.name: column_values[i] for i, s in enumerate(selects)},
                )
                for column_values in zip(*values)
            ],
        )
    elif not remainder:
        # v IS A VALUE, NO NEED TO WRAP IT IN A Cube
        output = (
            (coord2term(coord), v)
            for coord, v in self.data[self.select.name].groupby(selector)
        )
    else:
        output = (
            (coord2term(coord), Cube(self.select, remainder, v))
            for coord, v in self.data[self.select.name].groupby(selector)
        )

    return output
def get_columns(data, leaves=False):
    """
    LIST THE COLUMN NAMES FOUND IN data, AS [{"name": ...}, ...]

    :param data: ITERABLE OF RECORDS
    :param leaves: False -> USE THE TOP-LEVEL KEYS OF EACH RECORD
                   True  -> USE THE NAMES YIELDED BY EACH RECORD'S leaves()
    """
    # TODO Split this into two functions
    if leaves:
        leaf_names = {name for row in data for name, _ in row.leaves()}
        return to_data([{"name": name} for name in leaf_names])

    key_names = UNION(set(row.keys()) for row in data)
    return list_to_data([{"name": name} for name in key_names])
def done_count(self):
    """
    TURN THE DISTINCT TUPLES COLLECTED IN self.parts INTO A SimpleSetDomain

    EACH TUPLE BECOMES A RECORD KEYED BY POSITION ("0", "1", ...), THE
    RECORDS ARE SORTED ON THOSE COLUMNS, AND EACH GETS A dataIndex IN SORTED
    ORDER.  self.parts IS RELEASED (SET TO None) AND THE DOMAIN IS STORED ON
    BOTH self.edge.domain AND self.domain.
    """
    columns = [text(position) for position in range(len(self.fields))]

    records = []
    for part in set(self.parts):
        records.append({text(position): value for position, value in enumerate(part)})
    parts = list_to_data(records)
    self.parts = None

    ordered = jx.sort(parts, columns)
    partitions = []
    for index, record in enumerate(ordered):
        partitions.append({
            "value": tuple(record[column] for column in columns),
            "dataIndex": index,
        })

    domain = SimpleSetDomain(key="value", partitions=partitions)
    self.edge.domain = domain
    self.domain = domain
def es_query_template(path):
    """
    RETURN TEMPLATE AND PATH-TO-FILTER AS A 2-TUPLE

    THE RETURNED FILTERS ARE THE SAME (EMPTY) dict OBJECTS EMBEDDED IN THE
    QUERY, SO FILLING THEM IN FILLS IN THE QUERY.

    :param path: THE NESTED PATH (NOT INCLUDING TABLE NAME)
    :return: (es_query, es_filters) TUPLE
    """
    if not is_text(path):
        Log.error("expecting path to be a string")

    top_filter = {}
    if path == ".":
        # NOT NESTED: ONE FILTER ON THE DOCUMENT ITSELF
        clauses = [top_filter]
        filters = [top_filter]
    else:
        # NESTED: SECOND FILTER APPLIES INSIDE THE nested QUERY
        nested_filter = {}
        clauses = [
            top_filter,
            {
                "nested": {
                    "path": path,
                    "query": nested_filter,
                    "inner_hits": {"size": 100000},
                }
            },
        ]
        filters = [top_filter, nested_filter]

    output = dict_to_data({
        "query": es_and(clauses),
        "from": 0,
        "size": 0,
        "sort": [],
    })
    return output, list_to_data(filters)
def groupby(self, edges):
    """
    SLICE THIS CUBE IN TO ONES WITH LESS DIMENSIONALITY
    simple==True WILL HAVE GROUPS BASED ON PARTITION VALUE, NOT PARTITION OBJECTS
    (NOTE(review): THERE IS NO simple PARAMETER IN THIS SIGNATURE - THE LINE
    ABOVE LOOKS STALE)

    :param edges: THE EDGES TO GROUP BY; EACH IS EXPANDED WITH _normalize_edge()
    :return: (term, slice) PAIRS, WHERE slice IS A Cube OVER THE EDGES NOT
             GROUPED BY, OR THE BARE VALUE WHEN NO EDGE REMAINS (SINGLE select ONLY)
    """
    edges = FlatList([n for e in edges for n in _normalize_edge(e)])

    # CACHE SOME RESULTS
    keys = edges.name  # LOOKED UP ONCE, INSTEAD OF ONCE PER EDGE TESTED

    stacked = [e for e in self.edges if e.name in keys]
    remainder = [e for e in self.edges if e.name not in keys]
    selector = [1 if e.name in keys else 0 for e in self.edges]

    # NOTE(review): stacked AND remainder PARTITION self.edges, SO THIS TEST
    # CAN NOT FAIL AS WRITTEN - PRESUMABLY IT WAS MEANT TO COMPARE AGAINST
    # THE REQUESTED edges; CONFIRM INTENT BEFORE TIGHTENING
    if len(stacked) + len(remainder) != len(self.edges):
        Log.error("can not find some edges to group by")

    getKey = [e.domain.getKey for e in self.edges]
    lookup = [
        [
            getKey[i](p)
            for p in e.domain.partitions + ([None] if e.allowNulls else [])
        ]
        for i, e in enumerate(self.edges)
    ]

    def coord2term(coord):
        # TRANSLATE NUMERIC COORDINATES TO {edge name: partition key}
        return leaves_to_data({keys[i]: lookup[i][c] for i, c in enumerate(coord)})

    if is_list(self.select):
        selects = listwrap(self.select)
        index, v = transpose(*self.data[selects[0].name].groupby(selector))

        coord = list_to_data([coord2term(c) for c in index])

        values = [v]
        for s in selects[1::]:
            # FIX: WAS .group_by(selector); THE FIRST COLUMN (ABOVE) AND BOTH
            # SINGLE-select BRANCHES (BELOW) CALL .groupby() ON THE SAME KIND OF VALUE
            _, v = zip(*self.data[s.name].groupby(selector))
            values.append(v)

        output = transpose(
            coord,
            [
                Cube(
                    self.select,
                    remainder,
                    {s.name: column_values[i] for i, s in enumerate(selects)},
                )
                for column_values in zip(*values)
            ],
        )
    elif not remainder:
        # v IS A VALUE, NO NEED TO WRAP IT IN A Cube
        output = (
            (coord2term(coord), v)
            for coord, v in self.data[self.select.name].groupby(selector)
        )
    else:
        output = (
            (coord2term(coord), Cube(self.select, remainder, v))
            for coord, v in self.data[self.select.name].groupby(selector)
        )

    return output
def _normalize_groupby(groupby, limit, schema=None):
    """
    NORMALIZE A groupby CLAUSE (ONE OR MANY) INTO A FLAT LIST OF GROUPS

    :param groupby: A SINGLE GROUP OR A LIST OF THEM; None IS PASSED THROUGH
    :param limit: HANDED TO _normalize_group() FOR EACH CLAUSE
    :param schema: HANDED TO _normalize_group() FOR CONTEXT
    :return: None WHEN groupby IS None, OTHERWISE THE NORMALIZED GROUPS, EACH
             WITH dim SET TO ITS POSITION IN THE LIST
    """
    if groupby == None:
        return None

    normalized = []
    for clause in listwrap(groupby):
        normalized.extend(_normalize_group(clause, None, limit, schema=schema))
    output = list_to_data(normalized)

    # DIMENSIONS ARE NUMBERED IN THE ORDER THE GROUPS APPEAR
    position = 0
    for group in output:
        group.dim = position
        position += 1

    for group in output:
        if group == None:
            Log.error("not expected")
            break
    return output
def map(self, map_):
    """
    RETURN A NEW QueryOp WITH map_ APPLIED TO EVERY EXPRESSION IT HOLDS:
    frum, select, edges (value, domain.where, partition where, range),
    groupby, window, where AND sort.  limit AND format ARE CARRIED OVER.

    :param map_: PASSED TO THE .map() OF EACH CONTAINED EXPRESSION
    """

    def remap_select(clause):
        # SAME CLAUSE, WITH ITS value REMAPPED
        return set_default({"value": clause.value.map(map_)}, clause)

    def remap_edge(original):
        remapped_parts = unwraplist([
            set_default({"where": part.where.map(map_)}, part)
            for part in original.domain.partitions
        ])

        new_domain = copy(original.domain)
        new_domain.where = original.domain.where.map(map_)
        new_domain.partitions = remapped_parts

        new_edge = copy(original)
        new_edge.value = original.value.map(map_)
        new_edge.domain = new_domain
        if original.range:
            # NOTE(review): IF copy() IS SHALLOW, range IS SHARED WITH THE
            # ORIGINAL EDGE AND IS UPDATED IN PLACE - CONFIRM THIS IS INTENDED
            new_edge.range.min = original.range.min.map(map_)
            new_edge.range.max = original.range.max.map(map_)
        return new_edge

    if is_list(self.select):
        select = list_to_data([remap_select(s) for s in self.select])
    else:
        select = remap_select(self.select)

    frum = self.frum.map(map_)
    edges = list_to_data([remap_edge(e) for e in self.edges])
    groupby = list_to_data([g.map(map_) for g in self.groupby])
    window = list_to_data([w.map(map_) for w in self.window])
    where = self.where.map(map_)
    sort = list_to_data([remap_select(s) for s in listwrap(self.sort)])

    return QueryOp(
        frum=frum,
        select=select,
        edges=edges,
        groupby=groupby,
        window=window,
        where=where,
        sort=sort,
        limit=self.limit,
        format=self.format,
    )
def get_meta(self, key, conforming=True):
    """
    RETURN METADATA ON FILE IN BUCKET

    ONLY ENTRIES WHOSE NAME CONTAINS ".json" ARE CONSIDERED.  AN ENTRY WHOSE
    NAME (LESS EXTENSION) EQUALS key IS PREFERRED OVER ONE THAT MERELY STARTS
    WITH key + "." OR key + ":".

    :param key:  KEY, OR PREFIX OF KEY
    :param conforming: TEST IF THE KEY CONFORMS TO REQUIRED PATTERN
    :return: METADATA, IF UNIQUE, ELSE ERROR
    """
    try:
        listing = list(self.bucket.list(prefix=str(key)))
        metas = list_to_data([m for m in listing if ".json" in text(m.name)])

        perfect = Null  # ENTRY MATCHING key EXACTLY
        favorite = Null  # LAST ENTRY SEEN THAT EXTENDS key
        too_many = False
        error = None
        extended_prefixes = (key + ".", key + ":")
        for meta in metas:
            try:
                simple = strip_extension(meta.key)
                if conforming:
                    self._verify_key_format(simple)
                if simple == key:
                    perfect = meta
                    too_many = False
                if simple.startswith(extended_prefixes):
                    if favorite and not perfect:
                        too_many = True
                    favorite = meta
            except Exception as cause:
                # REMEMBERED, AND ONLY REPORTED IF NO PERFECT MATCH IS FOUND
                error = cause

        if too_many:
            Log.error(
                "multiple keys in {{bucket}} with prefix={{prefix|quote}}: {{list}}",
                bucket=self.name,
                prefix=key,
                list=[m.name for m in metas],
            )
        if not perfect and error:
            Log.error("Problem with key request", error)
        return coalesce(perfect, favorite)
    except Exception as e:
        Log.error(
            READ_ERROR + " can not read {{key}} from {{bucket}}",
            key=key,
            bucket=self.bucket.name,
            cause=e,
        )
def __init__(self, **desc):
    """
    BUILD A "range" DOMAIN

    WITH EXPLICIT partitions: A key IS REQUIRED, min/max ARE WIDENED TO COVER
    EVERY PART, EACH PART GETS dataIndex EQUAL TO ITS POSITION (A CONFLICTING
    EXISTING dataIndex IS AN ERROR), AND OVERLAPPING PARTS ARE AN ERROR.
    WITHOUT partitions: min, max AND interval ARE ALL REQUIRED AND ONE
    PARTITION [v, v + interval) IS GENERATED PER v IN frange(min, max, interval).
    """
    Domain.__init__(self, **desc)
    self.type = "range"
    self.NULL = Null

    if self.partitions:
        # IGNORE THE min, max, interval
        if not self.key:
            Log.error("Must have a key value")

        parts = listwrap(self.partitions)
        for position, part in enumerate(parts):
            self.min = MIN([self.min, part.min])
            self.max = MAX([self.max, part.max])
            if part.dataIndex != None and part.dataIndex != position:
                Log.error(
                    "Expecting `dataIndex` to agree with the order of the parts"
                )
            if part[self.key] == None:
                Log.error(
                    "Expecting all parts to have {{key}} as a property",
                    key=self.key,
                )
            part.dataIndex = position

        # VERIFY PARTITIONS DO NOT OVERLAP, HOLES ARE FINE
        for left in parts:
            for right in parts:
                starts_inside = left.min <= right.min and right.min < left.max
                if starts_inside and unwrap(left) is not unwrap(right):
                    Log.error("partitions overlap!")

        self.partitions = to_data(parts)
        return

    if any([self.min == None, self.max == None, self.interval == None]):
        Log.error("Can not handle missing parameter")

    self.key = "min"
    generated = []
    for index, start in enumerate(frange(self.min, self.max, self.interval)):
        generated.append({
            "min": start,
            "max": start + self.interval,
            "dataIndex": index,
        })
    self.partitions = list_to_data(generated)
def drill_filter(esfilter, data):
    """
    PARTIAL EVALUATE THE FILTER BASED ON DATA GIVEN

    EACH RECORD IN data IS TESTED AGAINST esfilter.  WHATEVER PART OF THE
    FILTER CAN BE DECIDED FROM THE RECORD ITSELF IS DECIDED; THE PART THAT
    REFERS TO FIELDS INSIDE A NESTED LIST IS PASSED DOWN AND RE-EVALUATED
    AGAINST EACH CHILD.  ONLY ONE NESTED BRANCH IS SUPPORTED.

    :param esfilter: FILTER USING and/or/not/term/eq/equal/terms/range/
                     missing/prefix/exists CLAUSES
    :param data: ITERABLE OF dict-LIKE RECORDS (ANYTHING ELSE IS AN ERROR)
    :return: A PLAIN LIST OF THE MATCHING RECORDS WHEN NO NESTED COLUMN WAS
             INVOLVED, OTHERWISE A PartFlatList OF THE PATHS WALKED

    TODO:  FIX THIS MONUMENTALLY BAD IDEA
    """
    esfilter = unwrap(esfilter)
    # THE THREE LISTS BELOW ARE INDEXED BY DEPTH AND SHARED (MUTATED) BY ALL
    # THE NESTED FUNCTIONS
    primary_nested = []  # track if nested, changes if not
    primary_column = []  # only one path allowed
    primary_branch = (
        []
    )  # CONTAINS LISTS OF RECORDS TO ITERATE: constantly changing as we dfs the tree

    def parse_field(fieldname, data, depth):
        """
        RETURN (first, rest) OF fieldname

        WALKS fieldname STEP BY STEP THROUGH data.  IF A STEP LANDS ON A LIST
        (AND fieldname HAS MORE THAN ONE STEP) THAT STEP IS RECORDED AS THE
        NESTED BRANCH AND (step, remaining path) IS RETURNED; OTHERWISE
        (fieldname, None) IS RETURNED.  RECORDS WHAT IT FINDS IN
        primary_nested/primary_column/primary_branch.
        """
        col = split_field(fieldname)
        d = data
        for i, c in enumerate(col):
            try:
                d = d[c]
            except Exception as e:
                Log.error("{{name}} does not exist", name=fieldname)
            if is_list(d) and len(col) > 1:
                if len(primary_column) <= depth + i:
                    # FIRST TIME THIS DEPTH IS SEEN
                    primary_nested.append(True)
                    primary_column.append(c)
                    primary_branch.append(d)
                elif primary_nested[depth] and primary_column[depth + i] != c:
                    Log.error("only one branch of tree allowed")
                else:
                    primary_nested[depth + i] = True
                    primary_column[depth + i] = c
                    primary_branch[depth + i] = d

                return c, join_field(col[i + 1:])
            else:
                if len(primary_column) <= depth + i:
                    # NOT NESTED: THE "BRANCH" IS JUST THE ONE VALUE
                    primary_nested.append(False)
                    primary_column.append(c)
                    primary_branch.append([d])
        return fieldname, None

    def pe_filter(filter, data, depth):
        """
        PARTIAL EVALUATE THE filter BASED ON data GIVEN

        :return: True OR False WHEN THE FILTER IS FULLY DECIDED BY data,
                 OTHERWISE A REDUCED FILTER (dict) HOLDING ONLY THE CLAUSES
                 THAT REFER TO NESTED FIELDS
        """
        if filter is TRUE:
            return True
        if filter is FALSE:
            return False

        filter = to_data(filter)

        if filter["and"]:
            result = True
            output = FlatList()
            for a in filter["and"]:
                f = pe_filter(a, data, depth)
                if f is False:
                    result = False
                elif f is not True:
                    # UNDECIDED: KEEP FOR THE NEXT LEVEL DOWN
                    output.append(f)
            if result and output:
                return {"and": output}
            else:
                return result
        elif filter["or"]:
            output = FlatList()
            for o in filter["or"]:
                f = pe_filter(o, data, depth)
                if f is True:
                    return True
                elif f is not False:
                    output.append(f)
            if output:
                return {"or": output}
            else:
                return False
        elif filter["not"]:
            f = pe_filter(filter["not"], data, depth)
            if f is True:
                return False
            elif f is False:
                return True
            else:
                return {"not": f}
        elif filter.term or filter.eq:
            eq = coalesce(filter.term, filter.eq)
            result = True
            output = {}
            for col, val in eq.items():
                first, rest = parse_field(col, data, depth)
                d = data[first]
                if not rest:
                    # FIELD IS ON THIS RECORD: DECIDE NOW
                    if d != val:
                        result = False
                else:
                    # FIELD IS INSIDE THE NESTED BRANCH: DEFER
                    output[rest] = val

            if result and output:
                return {"term": output}
            else:
                return result
        elif filter.equal:
            # COMPARE TWO FIELDS WITH EACH OTHER
            a, b = filter["equal"]
            first_a, rest_a = parse_field(a, data, depth)
            first_b, rest_b = parse_field(b, data, depth)
            val_a = data[first_a]
            val_b = data[first_b]

            if not rest_a:
                if not rest_b:
                    if val_a != val_b:
                        return False
                    else:
                        return True
                else:
                    # ONLY a IS KNOWN: BECOMES A term ON THE NESTED b
                    return {"term": {rest_b: val_a}}
            else:
                if not rest_b:
                    return {"term": {rest_a: val_b}}
                else:
                    return {"equal": [rest_a, rest_b]}

        elif filter.terms:
            result = True
            output = {}
            for col, vals in filter["terms"].items():
                first, rest = parse_field(col, data, depth)
                d = data[first]
                if not rest:
                    if d not in vals:
                        result = False
                else:
                    output[rest] = vals
            if result and output:
                return {"terms": output}
            else:
                return result

        elif filter.range:
            result = True
            output = {}
            for col, ranges in filter["range"].items():
                first, rest = parse_field(col, data, depth)
                d = data[first]
                if not rest:
                    # EVERY BOUND MUST HOLD
                    for sign, val in ranges.items():
                        if sign in ("gt", ">") and d <= val:
                            result = False
                        if sign == "gte" and d < val:
                            result = False
                        if sign == "lte" and d > val:
                            result = False
                        if sign == "lt" and d >= val:
                            result = False
                else:
                    output[rest] = ranges
            if result and output:
                return {"range": output}
            else:
                return result
        elif filter.missing:
            # ACCEPTS BOTH {"missing": name} AND {"missing": {"field": name}}
            if is_text(filter.missing):
                field = filter["missing"]
            else:
                field = filter["missing"]["field"]

            first, rest = parse_field(field, data, depth)
            d = data[first]
            if not rest:
                if d == None:
                    return True
                return False
            else:
                return {"missing": rest}
        elif filter.prefix:
            result = True
            output = {}
            for col, val in filter["prefix"].items():
                first, rest = parse_field(col, data, depth)
                d = data[first]
                if not rest:
                    if d == None or not d.startswith(val):
                        result = False
                else:
                    output[rest] = val
            if result and output:
                return {"prefix": output}
            else:
                return result

        elif filter.exists:
            # ACCEPTS BOTH {"exists": name} AND {"exists": {"field": name}}
            if is_text(filter["exists"]):
                field = filter["exists"]
            else:
                field = filter["exists"]["field"]

            first, rest = parse_field(field, data, depth)
            d = data[first]
            if not rest:
                if d != None:
                    return True
                return False
            else:
                return {"exists": rest}
        else:
            Log.error("Can not interpret esfilter: {{esfilter}}", {"esfilter": filter})

    output = []  # A LIST OF OBJECTS MAKING THROUGH THE FILTER

    def main(sequence, esfilter, row, depth):
        """
        RETURN A SEQUENCE OF REFERENCES OF OBJECTS DOWN THE TREE
        SHORT SEQUENCES MEANS ALL NESTED OBJECTS ARE INCLUDED

        (NOTHING IS ACTUALLY RETURNED; ACCEPTED PATHS ARE APPENDED TO output)
        """
        new_filter = pe_filter(esfilter, row, depth)
        if new_filter is True:
            seq = list(sequence)
            seq.append(row)
            output.append(seq)
            return
        elif new_filter is False:
            return

        # UNDECIDED: TRY THE REDUCED FILTER ON EACH CHILD OF THE NESTED BRANCH
        seq = list(sequence)
        seq.append(row)
        for d in primary_branch[depth]:
            main(seq, new_filter, d, depth + 1)

    # OUTPUT
    for i, d in enumerate(data):
        if is_data(d):
            main([], esfilter, to_data(d), 0)
        else:
            Log.error("filter is expecting a dict, not {{type}}", type=d.__class__)

    # AT THIS POINT THE primary_column[] IS DETERMINED
    # USE IT TO EXPAND output TO ALL NESTED OBJECTS
    # NOTE: THE LOCAL NAME max SHADOWS THE BUILTIN FOR THE REST OF THIS FUNCTION
    max = 0  # EVEN THOUGH A ROW CAN HAVE MANY VALUES, WE ONLY NEED UP TO max
    for i, n in enumerate(primary_nested):
        if n:
            max = i + 1

    # OUTPUT IS A LIST OF ROWS,
    # WHERE EACH ROW IS A LIST OF VALUES SEEN DURING A WALK DOWN A PATH IN THE HIERARCHY
    uniform_output = FlatList()

    def recurse(row, depth):
        # EXTEND row WITH CHILDREN UNTIL IT IS max LONG, THEN EMIT IT
        if depth == max:
            uniform_output.append(row)
        else:
            nested = row[-1][primary_column[depth]]
            if not nested:
                # PASSED FILTER, BUT NO CHILDREN, SO ADD NULL CHILDREN
                for i in range(depth, max):
                    row.append(None)
                uniform_output.append(row)
            else:
                for d in nested:
                    r = list(row)
                    r.append(d)
                    recurse(r, depth + 1)

    for o in output:
        recurse(o, 0)

    if not max:
        # SIMPLE LIST AS RESULT
        return list_to_data([unwrap(u[0]) for u in uniform_output])

    return PartFlatList(primary_column[0:max], uniform_output)
def get_selects(query):
    """
    SPLIT query.select INTO WHAT MUST BE ASKED OF ES (PER NESTED PATH) AND
    HOW EACH RESULT COLUMN IS PULLED OUT OF THE RESPONSE

    :param query: USES query.frum.schema AND query.select
    :return: (new_select, split_select, row_iterator) WHERE
             new_select   - ONE ENTRY PER OUTPUT COLUMN, WITH name, value,
                            put {name, index, child} AND (USUALLY) pull
             split_select - ORDERED MAP OF NESTED PATH -> ESSelectOp
             row_iterator - FUNCTION BUILT BY inners() THAT WALKS THE HITS

    NOTE(review): INDENTATION WAS RECONSTRUCTED FROM A WHITESPACE-COLLAPSED
    SOURCE; CONFIRM THE PLACEMENT OF THE put_index INCREMENTS
    """
    schema = query.frum.schema
    query_level = len(schema.query_path)
    query_path = schema.query_path[0]
    # SPLIT select INTO ES_SELECT AND RESULTSET SELECT
    split_select = OrderedDict((p, ESSelectOp(p)) for p in schema.query_path)

    def expand_split_select(c_nested_path):
        # RETURN THE ESSelectOp FOR THE PATH; A NEW ONE IS INSERTED AT THE
        # FRONT OF split_select (BY REBUILDING IT) WHEN MISSING
        es_select = split_select.get(c_nested_path)
        if not es_select:
            temp = [(k, v) for k, v in split_select.items()]
            split_select.clear()
            split_select.update({c_nested_path: ESSelectOp(c_nested_path)})
            split_select.update(temp)
        return split_select[c_nested_path]

    new_select = FlatList()
    # NEVER ADDED TO IN THIS FUNCTION; ONLY READ BY base_case() BELOW
    post_expressions = {}

    selects = list_to_data([unwrap(s.copy()) for s in listwrap(query.select)])

    # WHAT PATH IS _source USED, IF ANY?
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if is_op(select.value, LeavesOp) and is_op(select.value.term, Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            if any(c.jx_type == NESTED for c in leaves):
                split_select["."].source_path = "."
        elif is_op(select.value, Variable):
            for selected_column in schema.values(select.value.var,
                                                 exclude_type=(OBJECT, EXISTS)):
                if selected_column.jx_type == NESTED:
                    expand_split_select(
                        selected_column.es_column
                    ).source_path = selected_column.es_column
                    continue
                leaves = schema.leaves(selected_column.es_column)
                for c in leaves:
                    if c.jx_type == NESTED:
                        split_select[c.es_column].source_path = c.es_column

    # IF WE GET THE SOURCE FOR PARENT, WE ASSUME WE GOT SOURCE FOR CHILD
    source_path = None
    source_level = 0
    for level, es_select in enumerate(reversed(list(split_select.values()))):
        if source_path:
            es_select.source_path = source_path
        elif es_select.source_path:
            source_level = level + 1
            source_path = es_select.source_path

    def get_pull_source(c):
        # BUILD THE FUNCTION THAT EXTRACTS COLUMN c FROM A RESULT ROW.
        # WHICH VARIANT IS BUILT DEPENDS ON HOW DEEP c IS (nested_level)
        # RELATIVE TO THE QUERY (query_level) AND TO WHERE _source WAS
        # REQUESTED (source_level)
        nested_path = c.nested_path
        nested_level = len(nested_path)
        pos = text(nested_level)

        if nested_level <= query_level:
            if not source_level or nested_level < source_level:
                field = join_field([pos, "fields", c.es_column])
                return jx_expression_to_function(field)
            elif nested_level == source_level:
                field = relative_field(c.es_column, nested_path[0])

                def pull_source(row):
                    return untyped(row.get(pos, Null)._source[field])

                return pull_source
            else:
                field = relative_field(c.es_column, nested_path[0])

                def pull_property(row):
                    return untyped(row.get(pos, Null)[field])

                return pull_property
        else:
            # COLUMN IS DEEPER THAN THE QUERY: COLLECT FROM inner_hits
            pos = text(query_level)

            if not source_level or nested_level < source_level:
                # PULL FIELDS AND THEN AGGREGATE THEM
                value = jx_expression_to_function(
                    join_field(["fields", c.es_column]))
                name = literal_field(nested_path[0])
                index = jx_expression_to_function("_nested.offset")

                def pull_nested_field(doc):
                    hits = doc.get(pos, Null).inner_hits[name].hits.hits
                    if not hits:
                        return []

                    # PLACE EACH VALUE AT THE OFFSET REPORTED FOR ITS HIT
                    temp = [(index(h), value(h)) for h in hits]
                    acc = [None] * len(temp)
                    for i, v in temp:
                        acc[i] = unwraplist(v)
                    return acc

                return pull_nested_field
            else:
                # PULL SOURCES
                value = jx_expression_to_function(
                    concat_field("_source",
                                 relative_field(c.es_column, nested_path[0])))
                name = literal_field(nested_path[0])
                index = jx_expression_to_function(
                    join_field(["_nested"] * (len(c.nested_path) - 1) +
                               ["offset"]))

                def pull_nested_source(doc):
                    hits = doc.get(pos, Null).inner_hits[name].hits.hits
                    if not hits:
                        return []

                    # PLACE EACH VALUE AT THE OFFSET REPORTED FOR ITS HIT
                    temp = [(index(h), value(h)) for h in hits]
                    acc = [None] * len(temp)
                    for i, v in temp:
                        acc[i] = untyped(v)
                    return acc

                return pull_nested_source

    put_index = 0
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if is_op(select.value, LeavesOp) and is_op(select.value.term, Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                c_nested_path = c.nested_path[0]
                simple_name = relative_field(c.es_column, query_path).lstrip(".")
                name = concat_field(select.name, untype_path(simple_name))
                put_name = concat_field(
                    select.name, literal_field(untype_path(simple_name)))
                split_select[c_nested_path].fields.append(c.es_column)
                new_select.append({
                    "name": name,
                    "value": Variable(c.es_column),
                    "put": {
                        "name": put_name,
                        "index": put_index,
                        "child": ".",
                    },
                    "pull": get_pull_source(c),
                })
                # EACH LEAF GETS ITS OWN OUTPUT COLUMN
                put_index += 1
        elif is_op(select.value, Variable):
            if select.value.var == ".":
                # PULL ALL SOURCE
                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    },
                    "pull": get_pull_source(
                        Data(es_column=query_path, nested_path=schema.query_path)),
                })
                # NOTE(review): THIS continue SKIPS THE put_index INCREMENT BELOW
                continue

            for selected_column in schema.values(select.value.var,
                                                 exclude_type=(EXISTS, OBJECT)):
                if selected_column.jx_type == NESTED:
                    new_select.append({
                        "name": select.name,
                        "value": select.value,
                        "put": {
                            "name": select.name,
                            "index": put_index,
                            "child": "."
                        },
                        "pull": get_pull_source(
                            Data(
                                es_column=selected_column.es_column,
                                nested_path=(selected_column.es_column, ) +
                                selected_column.nested_path,
                            )),
                    })
                    continue

                leaves = schema.leaves(
                    selected_column.es_column,
                    exclude_type=INTERNAL)  # LEAVES OF OBJECT
                if leaves:
                    # ALL LEAVES SHARE THIS select's put index; child TELLS THEM APART
                    for c in leaves:
                        if c.es_column == "_id":
                            new_select.append({
                                "name": select.name,
                                "value": Variable(c.es_column),
                                "put": {
                                    "name": select.name,
                                    "index": put_index,
                                    "child": ".",
                                },
                                "pull": pull_id,
                            })
                            continue
                        c_nested_path = c.nested_path[0]
                        expand_split_select(c_nested_path).fields.append(
                            c.es_column)
                        child = untype_path(
                            relative_field(
                                c.es_column,
                                selected_column.es_column,
                            ))
                        new_select.append({
                            "name": select.name,
                            "value": Variable(c.es_column),
                            "put": {
                                "name": select.name,
                                "index": put_index,
                                "child": child,
                            },
                            "pull": get_pull_source(c),
                        })

                else:
                    # NOTHING TO PULL: EMIT A NULL COLUMN (NO pull FUNCTION)
                    new_select.append({
                        "name": select.name,
                        "value": NULL,
                        "put": {
                            "name": select.name,
                            "index": put_index,
                            "child": "."
                        },
                    })
            put_index += 1
        else:
            # ANY OTHER EXPRESSION IS SENT TO ES AS A SCRIPT, ONE PER PATH
            op, split_scripts = split_expression_by_path(select.value,
                                                         schema,
                                                         lang=Painless)
            for p, script in split_scripts.items():
                es_select = split_select[p]
                es_select.scripts[select.name] = {
                    "script":
                    text(Painless[script].partial_eval().to_es_script(schema))
                }
                new_select.append({
                    "name": select.name,
                    "pull": jx_expression_to_function(
                        join_field([
                            text(p),
                            "fields",
                            select.name,
                        ])),
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    },
                })
                put_index += 1

    def inners(query_path, parent_pos):
        """
        BUILD (RECURSIVELY) THE GENERATOR FUNCTION THAT EXPANDS RESULTS INTO ROWS

        :param query_path: REMAINING NESTED PATHS; THE LAST ONE IS HANDLED
                           HERE, THE REST BY THE RECURSIVE CALL
        :param parent_pos: text OF THE ROW KEY HOLDING THE PARENT DOCUMENT
        :return:  ITERATOR OVER TUPLES ROWS AS TUPLES, WHERE  row[len(nested_path)] HAS INNER HITS
                  AND row[0] HAS post_expressions
        """
        pos = text(int(parent_pos) + 1)
        if not query_path:
            # END OF THE PATH: ATTACH post_expressions AND EMIT THE ROW

            def base_case(row):
                extra = {}
                for k, e in post_expressions.items():
                    extra[k] = e(row)
                row["0"] = extra
                yield row

            return base_case

        if pos == "1":
            # TOP LEVEL: ITERATE THE HITS OF EACH RESULT
            more = inners(query_path[:-1], "1")

            def first_case(results):
                for result in results:
                    for hit in result.hits.hits:
                        seed = {"0": None, pos: hit}
                        for row in more(seed):
                            yield row

            return first_case

        else:
            more = inners(query_path[:-1], pos)
            if source_path and source_path < query_path[-1]:
                # CHILDREN ARE READ OUT OF THE PARENT ROW AT rel_path
                rel_path = relative_field(query_path[-1], source_path)

                def source(acc):
                    for inner_row in acc[parent_pos][rel_path]:
                        acc[pos] = inner_row
                        for tt in more(acc):
                            yield tt

                return source
            else:
                # CHILDREN ARE READ FROM THE PARENT'S inner_hits
                path = literal_field(query_path[-1])

                def recurse(acc):
                    hits = acc[parent_pos].inner_hits[path].hits.hits
                    if hits:
                        for inner_row in hits:
                            acc[pos] = inner_row
                            for tt in more(acc):
                                yield tt
                    else:
                        # NO CHILDREN: STILL CONTINUE ONCE WITH acc AS IT IS
                        for tt in more(acc):
                            yield tt

                return recurse

    return new_select, split_select, inners(schema.query_path, "0")
def get_selects(query):
    """
    SPLIT query.select INTO WHAT MUST BE ASKED OF ES (PER NESTED PATH) AND
    HOW EACH RESULT COLUMN IS PULLED OUT OF THE RESPONSE

    :param query: USES query.frum.schema AND query.select
    :return: (new_select, split_select) WHERE
             new_select   - ONE ENTRY PER OUTPUT COLUMN, WITH name, value,
                            put {name, index, child} AND pull
             split_select - dict OF NESTED PATH -> ESSelectOp ("." ALWAYS PRESENT)

    NOTE(review): INDENTATION WAS RECONSTRUCTED FROM A WHITESPACE-COLLAPSED
    SOURCE; CONFIRM THE PLACEMENT OF THE put_index INCREMENTS
    """
    schema = query.frum.schema
    split_select = {".": ESSelectOp(".")}

    def get_select(path):
        # RETURN THE ESSelectOp FOR path, CREATING IT ON FIRST USE
        es_select = split_select.get(path)
        if not es_select:
            es_select = split_select[path] = ESSelectOp(path)
        return es_select

    selects = list_to_data([unwrap(s.copy()) for s in listwrap(query.select)])
    new_select = FlatList()
    put_index = 0
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if is_op(select.value, LeavesOp) and is_op(select.value.term, Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                full_name = concat_field(
                    select.name, relative_field(untype_path(c.name), term.var))
                if c.jx_type == NESTED:
                    # NESTED VALUES ARE READ FROM THE DOCUMENT SOURCE
                    get_select(".").get_source = True
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {
                            "name": literal_field(full_name),
                            "index": put_index,
                            "child": ".",
                        },
                        "pull": get_pull_source(c.es_column),
                    })
                    put_index += 1
                else:
                    # pull IS FILLED IN BY THE FINAL LOOP OF THIS FUNCTION
                    get_select(c.nested_path[0]).fields.append(c.es_column)
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {
                            "name": literal_field(full_name),
                            "index": put_index,
                            "child": ".",
                        },
                    })
                    put_index += 1
        elif is_op(select.value, Variable):
            s_column = select.value.var

            if s_column == ".":
                # PULL ALL SOURCE
                get_select(".").get_source = True

                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    },
                    "pull": get_pull_source("."),
                })
                # NOTE(review): THIS continue SKIPS THE put_index INCREMENT BELOW
                continue

            leaves = schema.leaves(s_column)  # LEAVES OF OBJECT
            # nested_selects = {}
            if leaves:
                if any(c.jx_type == NESTED for c in leaves):
                    # PULL WHOLE NESTED ARRAYS
                    get_select(".").get_source = True
                    for c in leaves:
                        if len(
                                c.nested_path
                        ) == 1:  # NESTED PROPERTIES ARE IGNORED, CAPTURED BY THESE FIRST LEVEL PROPERTIES
                            pre_child = join_field(
                                decode_property(n) for n in split_field(c.name))
                            new_select.append({
                                "name": select.name,
                                "value": Variable(c.es_column),
                                "put": {
                                    "name": select.name,
                                    "index": put_index,
                                    "child": untype_path(
                                        relative_field(pre_child, s_column)),
                                },
                                "pull": get_pull_source(c.es_column),
                            })
                else:
                    # PULL ONLY WHAT'S NEEDED
                    for c in leaves:
                        c_nested_path = c.nested_path[0]
                        if c_nested_path == ".":
                            if c.es_column == "_id":
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": ".",
                                    },
                                    "pull": lambda row: row._id,
                                })
                            elif c.jx_type == NESTED:
                                get_select(".").get_source = True
                                pre_child = join_field(
                                    decode_property(n)
                                    for n in split_field(c.name))
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": untype_path(
                                            relative_field(
                                                pre_child, s_column)),
                                    },
                                    "pull": get_pull_source(c.es_column),
                                })
                            else:
                                # pull IS FILLED IN BY THE FINAL LOOP OF THIS FUNCTION
                                get_select(c_nested_path).fields.append(
                                    c.es_column)
                                pre_child = join_field(
                                    decode_property(n)
                                    for n in split_field(c.name))
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": untype_path(
                                            relative_field(
                                                pre_child, s_column)),
                                    },
                                })
                        else:
                            # LEAF LIVES UNDER A NESTED PATH
                            es_select = get_select(c_nested_path)
                            es_select.fields.append(c.es_column)

                            child = relative_field(
                                untype_path(
                                    relative_field(c.name, schema.query_path[0])),
                                s_column,
                            )
                            pull = accumulate_nested_doc(
                                c_nested_path,
                                Variable(
                                    relative_field(
                                        s_column, unnest_path(c_nested_path))),
                            )
                            new_select.append({
                                "name": select.name,
                                "value": select.value,
                                "put": {
                                    "name": select.name,
                                    "index": put_index,
                                    "child": child,
                                },
                                "pull": pull,
                            })
            else:
                # NO LEAVES FOR THIS COLUMN: EMIT A PLACEHOLDER VARIABLE
                new_select.append({
                    "name": select.name,
                    "value": Variable("$dummy"),
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    },
                })
            put_index += 1
        else:
            # ANY OTHER EXPRESSION IS SENT TO ES AS A SCRIPT, ONE PER PATH
            op, split_scripts = split_expression_by_path(select.value,
                                                         schema,
                                                         lang=Painless)
            for p, script in split_scripts.items():
                es_select = get_select(p)
                es_select.scripts[select.name] = {
                    "script":
                    text(Painless[script].partial_eval().to_es_script(schema))
                }
                new_select.append({
                    "name": select.name,
                    "pull": jx_expression_to_function("fields." +
                                                      literal_field(select.name)),
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    },
                })
                put_index += 1

    # GIVE A pull FUNCTION TO EVERY COLUMN THAT DOES NOT HAVE ONE YET
    for n in new_select:
        if n.pull:
            continue
        elif is_op(n.value, Variable):
            if get_select(".").get_source:
                n.pull = get_pull_source(n.value.var)
            elif n.value == "_id":
                n.pull = jx_expression_to_function("_id")
            else:
                n.pull = jx_expression_to_function(
                    concat_field("fields", literal_field(n.value.var)))
        else:
            Log.error("Do not know what to do")

    return new_select, split_select
def _normalize_group(edge, dim_index, limit, schema=None):
    """
    :param edge: Not normalized groupby
    :param dim_index: Dimensions are ordered; this is this groupby's index into that order
    :param limit: used as the domain limit for a plain text groupby
    :param schema: for context
    :return: a normalized groupby, always as a list (a "prefix.*" groupby with
             a schema expands to one entry per leaf column)
    """
    if not is_text(edge):
        # STRUCTURED GROUPBY: ONLY THE DEFAULT DOMAIN IS ACCEPTED
        edge = to_data(edge)
        if edge.domain and edge.domain.type != "default":
            Log.error("groupby does not accept complicated domains")

        if not edge.name and not is_text(edge.value):
            Log.error("You must name compound edges: {{edge}}", edge=edge)

        return list_to_data([{
            "name": coalesce(edge.name, edge.value),
            "value": jx_expression(edge.value, schema=schema),
            "allowNulls": True,
            "dim": dim_index,
            "domain": {"type": "default"},
        }])

    if not edge.endswith(".*"):
        # PLAIN COLUMN NAME
        return list_to_data([{
            "name": edge,
            "value": jx_expression(edge, schema=schema),
            "allowNulls": True,
            "dim": dim_index,
            "domain": Domain(type="default", limit=limit),
        }])

    prefix = edge[:-2]
    if not schema:
        # NO SCHEMA TO EXPAND WITH: GROUP BY THE LEAVES AS ONE EXPRESSION
        return list_to_data([{
            "name": untype_path(prefix),
            "put": {"name": literal_field(untype_path(prefix))},
            "value": LeavesOp(Variable(prefix)),
            "allowNulls": True,
            "dim": dim_index,
            "domain": {"type": "default"},
        }])

    # BECASUE THIS IS A GROUPBY, EARLY SPLIT INTO LEAVES WORKS JUST FINE
    expanded = []
    for column in schema.leaves(prefix):
        expanded.append({
            "name": concat_field(
                prefix,
                literal_field(relative_field(untype_path(column.name), prefix)),
            ),
            "put": {"name": literal_field(untype_path(column.name))},
            "value": jx_expression(column.es_column, schema=schema),
            "allowNulls": True,
            "domain": {"type": "default"},
        })
    return list_to_data(expanded)
def _normalize_edges(edges, limit, schema=None):
    """
    Normalize every edge, in order, and return all results as one flat list

    :param edges: a single edge, or a list of edges (passed through listwrap)
    :param limit: handed to _normalize_edge for each edge
    :param schema: for context; handed to _normalize_edge for each edge
    :return: the concatenation of what _normalize_edge returns for each edge
    """
    normalized = []
    for position, raw_edge in enumerate(listwrap(edges)):
        # ONE RAW EDGE MAY PRODUCE SEVERAL NORMALIZED EDGES
        for item in _normalize_edge(raw_edge, position, limit=limit, schema=schema):
            normalized.append(item)
    return list_to_data(normalized)
def es_deepop(es, query):
    """
    Run `query` against the nested documents found at the schema's first
    query path, and return the formatted result

    THE where CLAUSE IS SPLIT BY DEPTH AND MERGED INTO THE ES QUERY TEMPLATE,
    EACH select IS TRANSLATED INTO ONE OR MORE ENTRIES OF new_select (WHICH
    RECORD HOW TO pull A VALUE FROM A HIT, AND WHERE TO put IT IN THE
    OUTPUT), THEN THE ROWS ARE HANDED TO THE FORMATTER CHOSEN BY query.format

    :param es: must provide search(); called once with the main query, and a
               second time when more_filter is set (see below)
    :param query: read for frum.schema, where, limit, sort, select and format
    :return: the formatter's output, with meta.timing.es, meta.content_type
             and meta.es_query filled in
    """
    schema = query.frum.schema
    query_path = schema.query_path[0]

    # TODO: FIX THE GREAT SADNESS CAUSED BY EXECUTING post_expressions
    # THE EXPRESSIONS SHOULD BE PUSHED TO THE CONTAINER: ES ALLOWS
    # {"inner_hit":{"script_fields":[{"script":""}...]}}, BUT THEN YOU
    # LOSE "_source" BUT GAIN "fields", FORCING ALL FIELDS TO BE EXPLICIT
    post_expressions = {}  # pull_name -> FUNCTION, APPLIED TO EACH ROW IN inners()
    es_query, es_filters = es_query_template(query_path)

    # SPLIT WHERE CLAUSE BY DEPTH
    wheres = split_expression_by_depth(query.where, schema)
    # NOTE(review): IF es_filters AND wheres DIFFER IN LENGTH, zip_longest
    # PRESUMABLY PADS THE SHORTER WITH None - CONFIRM BOTH CALLEES ACCEPT THAT
    for f, w in zip_longest(es_filters, wheres):
        script = ES52[AndOp(w).partial_eval()].to_es(schema)
        set_default(f, script)

    if not wheres[1]:
        # INCLUDE DOCS WITH NO NESTED DOCS
        # NOTE(review): UNLIKE THE LOOP ABOVE, THIS EXPRESSION IS NOT WRAPPED
        # IN ES52[...] BEFORE to_es() - CONFIRM THAT IS INTENDED
        more_filter = {
            "bool": {
                "filter": [AndOp(wheres[0]).partial_eval().to_es(schema)],
                "must_not": {
                    "nested": {
                        "path": query_path,
                        "query": MATCH_ALL
                    }
                }
            }
        }
    else:
        more_filter = None

    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)

    # es_query.sort = jx_sort_to_es_sort(query.sort)
    map_to_es_columns = schema.map_to_es()
    # {c.name: c.es_column for c in schema.leaves(".")}
    query_for_es = query.map(map_to_es_columns)
    es_query.sort = jx_sort_to_es_sort(query_for_es.sort, schema)

    es_query.stored_fields = []

    is_list = is_list_(query.select)
    selects = list_to_data([unwrap(s.copy()) for s in listwrap(query.select)])
    new_select = FlatList()

    # put_index IS THE OUTPUT COLUMN NUMBER RECORDED IN EACH new_select ENTRY
    put_index = 0
    for select in selects:
        if is_op(select.value, LeavesOp) and is_op(select.value.term, Variable):
            # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
            leaves = schema.leaves(select.value.term.var)
            col_names = set()
            for c in leaves:
                if c.nested_path[0] == ".":
                    if c.jx_type in INTERNAL:
                        continue
                    es_query.stored_fields += [c.es_column]
                c_name = untype_path(relative_field(c.name, query_path))
                col_names.add(c_name)
                # EACH LEAF GETS ITS OWN OUTPUT COLUMN
                new_select.append({
                    "name": concat_field(select.name, c_name),
                    "nested_path": c.nested_path[0],
                    "put": {
                        "name": concat_field(select.name, literal_field(c_name)),
                        "index": put_index,
                        "child": "."
                    },
                    "pull": get_pull_function(c)
                })
                put_index += 1

            # REMOVE DOTS IN PREFIX IF NAME NOT AMBIGUOUS
            # (THIS VISITS EVERY ENTRY ADDED SO FAR, NOT ONLY THIS select'S)
            for n in new_select:
                if n.name.startswith("..") and n.name.lstrip(".") not in col_names:
                    n.put.name = n.name = n.name.lstrip(".")
                    col_names.add(n.name)
        elif is_op(select.value, Variable):
            net_columns = schema.leaves(select.value.var)
            if not net_columns:
                # NO MATCHING COLUMNS: THE OUTPUT COLUMN IS PULLED FROM NULL
                new_select.append({
                    "name": select.name,
                    "nested_path": ".",
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    },
                    "pull": NULL
                })
            else:
                for n in net_columns:
                    if n.nested_path[0] == ".":
                        if n.jx_type == NESTED:
                            continue
                        es_query.stored_fields += [n.es_column]
                    if len(n.nested_path[0]) > len(query_path):
                        # SELECTING INNER PROPERTIES IS NOT ALLOWED
                        continue
                    # WE MUST FIGURE OUT WHICH NAMESPACE s.value.var IS USING SO WE CAN EXTRACT THE child
                    for np in n.nested_path:
                        c_name = untype_path(relative_field(n.name, np))
                        if startswith_field(c_name, select.value.var):
                            child = relative_field(c_name, select.value.var)
                            break
                    else:
                        # NO NESTED PATH MATCHED THE SELECTED VARIABLE
                        # NOTE(review): Log.error() PRESUMABLY RAISES ON ITS
                        # OWN, MAKING THE raise REDUNDANT - CONFIRM
                        raise Log.error("Not expected")

                    pull = get_pull_function(n)
                    new_select.append({
                        "name": select.name,
                        "pull": pull,
                        "nested_path": n.nested_path[0],
                        "put": {
                            "name": select.name,
                            "index": put_index,
                            "child": child
                        }
                    })
            # ALL LEAVES OF ONE VARIABLE SHARE AN OUTPUT COLUMN; ONLY child DIFFERS
            put_index += 1
        else:
            # GENERAL EXPRESSION: REQUEST THE TOP-LEVEL COLUMNS IT MENTIONS,
            # AND EVALUATE THE EXPRESSION ON EACH ROW AFTER THE ES CALL
            expr = select.value
            for v in expr.vars():
                for c in schema[v.var]:
                    if c.nested_path[0] == ".":
                        es_query.stored_fields += [c.es_column]
                    # else:
                    #     Log.error("deep field not expected")

            pull_name = EXPRESSION_PREFIX + select.name
            map_to_local = MapToLocal(schema)
            # pull READS THE PROPERTY THAT inners() WRITES FOR THIS EXPRESSION
            pull = jx_expression_to_function(pull_name)
            post_expressions[pull_name] = jx_expression_to_function(
                expr.map(map_to_local))

            new_select.append({
                "name": select.name if is_list else ".",
                "pull": pull,
                "value": expr.__data__(),
                "put": {
                    "name": select.name,
                    "index": put_index,
                    "child": "."
                }
            })
            put_index += 1

    # <COMPLICATED> ES needs two calls to get all documents
    more = []  # RECEIVES THE RESPONSE OF THE SECOND SEARCH, IF ANY

    def get_more(please_stop):
        more.append(
            es.search(
                Data(query=more_filter, stored_fields=es_query.stored_fields)))

    if more_filter:
        # PRESUMABLY RUNS CONCURRENTLY WITH THE MAIN SEARCH BELOW - CONFIRM
        # AGAINST Thread.run
        need_more = Thread.run("get more", target=get_more)

    with Timer("call to ES") as call_timer:
        data = es.search(es_query)

    # EACH HIT IS RETURNED MULTIPLE TIMES, ONCE FOR EACH INNER HIT, WITH INNER HIT INCLUDED
    def inners():
        for t in data.hits.hits:
            for i in t.inner_hits[literal_field(query_path)].hits.hits:
                # NOTE(review): THE SAME t IS MUTATED AND YIELDED FOR EVERY
                # INNER HIT; A CONSUMER THAT KEEPS ROWS WITHOUT COPYING
                # WOULD SEE ONLY THE LAST INNER HIT - CONFIRM THE FORMATTERS
                t._inner = i._source
                for k, e in post_expressions.items():
                    t[k] = e(t)
                yield t
        if more_filter:
            # THE SECOND SEARCH IS ONLY WAITED ON ONCE THE MAIN HITS ARE CONSUMED
            Thread.join(need_more)
            for t in more[0].hits.hits:
                yield t
    # </COMPLICATED>

    try:
        formatter, row_formatter, mime_type = set_formatters[query.format]

        output = formatter(inners(), new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
{"a": "x", "t": Date("today-2day").unix, "v": 3}, {"a": "x", "t": Date("today-3day").unix, "v": 5}, {"a": "x", "t": Date("today-4day").unix, "v": 7}, {"a": "x", "t": Date("today-5day").unix, "v": 11}, {"a": "x", "t": null, "v": 27}, {"a": "y", "t": Date("today-day").unix, "v": 13}, {"a": "y", "t": Date("today-2day").unix, "v": 17}, {"a": "y", "t": Date("today-4day").unix, "v": 19}, {"a": "y", "t": Date("today-5day").unix, "v": 23} ] expected_list_1 = list_to_data([ {"t": (TODAY - WEEK).unix, "v": NULL}, {"t": (TODAY - 6 * DAY).unix, "v": NULL}, {"t": (TODAY - 5 * DAY).unix, "v": 34}, {"t": (TODAY - 4 * DAY).unix, "v": 26}, {"t": (TODAY - 3 * DAY).unix, "v": 5}, {"t": (TODAY - 2 * DAY).unix, "v": 20}, {"t": (TODAY - 1 * DAY).unix, "v": 15}, {"v": 29} ]) expected2 = list_to_data([ {"a": "x", "t": (TODAY - WEEK).unix, "v": NULL}, {"a": "x", "t": (TODAY - 6 * DAY).unix, "v": NULL}, {"a": "x", "t": (TODAY - 5 * DAY).unix, "v": 11}, {"a": "x", "t": (TODAY - 4 * DAY).unix, "v": 7}, {"a": "x", "t": (TODAY - 3 * DAY).unix, "v": 5}, {"a": "x", "t": (TODAY - 2 * DAY).unix, "v": 3}, {"a": "x", "t": (TODAY - 1 * DAY).unix, "v": 2}, {"a": "x", "v": 29},
def table2list(column_names, rows):
    """
    Convert a table (column names plus rows of values) to a list of records

    :param column_names: tuple of columns names
    :param rows: list of tuples; each value is paired with the column name
                 at the same position
    :return: list_to_data() of one dict per row
    """
    records = []
    for row in rows:
        record = dict(zip(column_names, row))
        records.append(record)
    return list_to_data(records)
# # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this file, # You can obtain one at http://mozilla.org/MPL/2.0/. # # Contact: Kyle Lahnakoski ([email protected]) # from __future__ import absolute_import, division, unicode_literals from jx_base.expressions import NULL from mo_dots import list_to_data from tests.test_jx import BaseTestCase, TEST_TABLE lots_of_data = list_to_data([{"a": i} for i in range(30)]) class TestFilters(BaseTestCase): def test_where_expression(self): test = { "data": [ # PROPERTIES STARTING WITH _ ARE NESTED AUTOMATICALLY { "a": { "b": 0, "c": 0 } }, { "a": { "b": 0,