def format_cube(T, select, query=None):
    """
    Build a Cube with a single "rownum" edge from the output of format_table().

    Every header of the formatted table becomes one Matrix in the cube, keyed
    by that header. The rownum domain runs from 0 to the number of table rows.
    """
    with Timer("format table"):
        table = format_table(T, select, query)

    num_rows = len(table.data)

    def rownum_edge(upper):
        return {
            "name": "rownum",
            "domain": {"type": "rownum", "min": 0, "max": upper, "interval": 1},
        }

    if num_rows == 0:
        # NO ROWS: EVERY COLUMN IS AN EMPTY MATRIX
        empty = {}
        for header in table.header:
            empty[header] = Matrix(list=[])
        return Cube(select, edges=[rownum_edge(0)], data=empty)

    # PIVOT THE ROWS INTO COLUMNS, ONE PER HEADER
    columns = transpose(*unwrap(table.data))
    by_header = {}
    for index, header in enumerate(table.header):
        by_header[header] = Matrix(list=columns[index])
    return Cube(select, edges=[rownum_edge(num_rows)], data=by_header)
def es_fieldop(es, query):
    """
    Send a fields-only request (no aggregation) to ES and return a Cube.

    The request asks for the fields named by the select clause; the returned
    hits are turned into one Matrix per select column.
    """
    es_query = es09.util.build_es_query(query)
    selects = listwrap(query.select)

    es_filter = simplify_esfilter(jx_expression(query.where).to_esfilter())
    es_query.query = {
        "filtered": {
            "query": {"match_all": {}},
            "filter": es_filter
        }
    }
    es_query.size = coalesce(query.limit, 200000)

    # COLLECT THE FIELDS TO REQUEST; "*" CLEARS THE FIELD LIST
    es_query.fields = FlatList()
    for value in selects.value:
        if value == "*":
            es_query.fields = None
        elif isinstance(value, list):
            es_query.fields.extend(value)
        elif isinstance(value, Mapping):
            es_query.fields.extend(value.values())
        else:
            es_query.fields.append(value)

    es_query.sort = [
        {s.field: ("asc" if s.sort >= 0 else "desc")}
        for s in query.sort
    ]

    response = es09.util.post(es, es_query, query.limit)
    hits = response.hits.hits

    def field_of(hit, name):
        # MISSING FIELDS BECOME None
        return unwrap(hit.fields).get(name, None)

    columns = {}
    for s in selects:
        if s.value == "*":
            # WHOLE DOCUMENT
            columns[s.name] = Matrix.wrap([hit._source for hit in hits])
        elif isinstance(s.value, Mapping):
            # ONE dict PER HIT, KEYED LIKE THE SELECT VALUE
            columns[s.name] = Matrix.wrap([
                {k: field_of(hit, v) for k, v in s.value.items()}
                for hit in hits
            ])
        elif isinstance(s.value, list):
            # ONE tuple PER HIT
            columns[s.name] = Matrix.wrap([
                tuple(field_of(hit, name) for name in s.value)
                for hit in hits
            ])
        elif not s.value:
            columns[s.name] = Matrix.wrap([field_of(hit, s.value) for hit in hits])
        else:
            try:
                columns[s.name] = Matrix.wrap([field_of(hit, s.value) for hit in hits])
            except Exception as e:
                Log.error("", e)

    cube = Cube(query.select, query.edges, columns, frum=query)
    cube.frum = query
    return cube
def data():
    """
    Generate one Data record per cell of the aggregation result.

    Reads `new_edges`, `query`, `aggs`, `decoders` and `select` from the
    enclosing scope.

    * When the query is sorted (and is not a groupby), the coordinates coming
      from aggs_iterator() are compared against the full list of coordinates
      from Matrix._all_combos(); a record is emitted for every coordinate
      that was skipped, in place.
      NOTE(review): this presumes aggs_iterator() yields coordinates in the
      same order as _all_combos() - confirm.
    * Otherwise the cells seen are marked in `is_sent`, and (for non-groupby
      queries) the unseen cells are emitted after all the returned cells.

    A record for a missing cell holds the edge values, plus 0 for every
    "count" select; other selects are left unset.
    """
    dims = tuple(
        len(e.domain.partitions) + (0 if e.allowNulls is False else 1)
        for e in new_edges
    )
    is_sent = Matrix(dims=dims, zeros=0)

    def missing_record(coord):
        # RECORD FOR A CELL THAT WAS NOT RETURNED
        output = Data()
        for i, d in enumerate(decoders):
            output[query.edges[i].name] = d.get_value(coord[i])
        for s in select:
            if s.aggregate == "count":
                output[s.name] = 0
        return output

    def found_record(coord, agg):
        # RECORD FOR A CELL THAT WAS RETURNED
        output = Data()
        for e, c, d in zip(query.edges, coord, decoders):
            output[e.name] = d.get_value(c)
        for s in select:
            output[s.name] = s.pull(agg)
        return output

    if query.sort and not query.groupby:
        # TODO: USE THE format_table() TO PRODUCE THE NEEDED VALUES INSTEAD OF DUPLICATING LOGIC HERE
        all_coord = is_sent._all_combos()  # TRACK THE EXPECTED COMBINATIONS
        for _, coord, agg in aggs_iterator(aggs, decoders):
            # next() WORKS ON BOTH PYTHON 2 AND 3; generator.next() IS PYTHON 2 ONLY
            missing_coord = next(all_coord)
            while coord != missing_coord:
                # INSERT THE MISSING COORDINATE INTO THE GENERATION
                yield missing_record(missing_coord)
                missing_coord = next(all_coord)
            yield found_record(coord, agg)
    else:
        for _, coord, agg in aggs_iterator(aggs, decoders):
            is_sent[coord] = 1
            yield found_record(coord, agg)

        # EMIT THE MISSING CELLS IN THE CUBE
        if not query.groupby:
            for c, v in is_sent:
                if not v:
                    yield missing_record(c)
def __init__(self, select, edges, data, frum=None):
    """
    data IS EXPECTED TO BE A dict TO MATRICES, BUT OTHER COLLECTIONS ARE
    ALLOWED, USING THE select AND edges TO DESCRIBE THE data

    :param select: one select clause (a "value" cube) or a list of them
    :param edges: edge descriptions; when empty they are derived from data
    :param data: dict of name -> Matrix; a list, a Matrix or a plain value
                 is accepted when select is a single clause and there are
                 no edges
    :param frum: accepted, but not stored by this constructor
    """
    self.is_value = not isinstance(select, list)
    self.select = select
    self.meta = Data(format="cube")  # PUT EXTRA MARKUP HERE

    # ONLY A MAPPING HAS values(); THE OTHER ACCEPTED SHAPES (list, Matrix,
    # PLAIN VALUE) ARE HANDLED BELOW AND MUST NOT FAIL HERE
    self.is_none = False
    if isinstance(data, Mapping) and not all(data.values()):
        self.is_none = True

    # ENSURE frum IS PROPER FORM
    if isinstance(select, list):
        if edges and OR(not isinstance(v, Matrix) for v in data.values()):
            Log.error("Expecting data to be a dict with Matrix values")

    if not edges:
        if not data:
            if isinstance(select, list):
                Log.error("not expecting a list of records")

            data = {select.name: Matrix.ZERO}
            self.edges = FlatList.EMPTY
        elif isinstance(data, Mapping):
            # EXPECTING NO MORE THAN ONE rownum EDGE IN THE DATA
            length = MAX([len(v) for v in data.values()])
            if length >= 1:
                self.edges = wrap([{"name": "rownum", "domain": {"type": "rownum"}}])
            else:
                self.edges = FlatList.EMPTY
        elif isinstance(data, list):
            if isinstance(select, list):
                Log.error("not expecting a list of records")

            # TAKE THE ROW COUNT BEFORE data IS REBOUND TO A ONE-KEY dict
            num_rows = len(data)
            data = {select.name: Matrix.wrap(data)}
            self.edges = wrap([{
                "name": "rownum",
                "domain": {"type": "rownum", "min": 0, "max": num_rows, "interval": 1}
            }])
        elif isinstance(data, Matrix):
            if isinstance(select, list):
                Log.error("not expecting a list of records")

            # NOTE(review): self.edges IS NOT ASSIGNED ON THIS PATH - confirm intent
            data = {select.name: data}
        else:
            if isinstance(select, list):
                Log.error("not expecting a list of records")

            data = {select.name: Matrix(value=data)}
            self.edges = FlatList.EMPTY
    else:
        self.edges = wrap(edges)

    self.data = data
def data():
    """
    Generate one Data record per cell of the aggregation result.

    Reads `new_edges`, `query`, `aggs`, `decoders` and `select` from the
    enclosing scope.

    * Sorted, non-groupby query: coordinates from aggs_iterator() are walked
      in step with Matrix._all_combos(), and a record is emitted in place for
      every coordinate that was skipped.
      NOTE(review): this presumes both sequences share the same ordering -
      confirm.
    * Otherwise: seen cells are flagged in `is_sent`; for non-groupby queries
      the unseen cells are emitted after the returned ones.

    Records for missing cells carry the edge values and 0 for "count"
    selects; other selects are left unset.
    """
    dims = tuple(
        len(e.domain.partitions) + (0 if e.allowNulls is False else 1)
        for e in new_edges
    )
    is_sent = Matrix(dims=dims, zeros=0)

    def blank(coord):
        # RECORD FOR A CELL THAT WAS NOT RETURNED
        record = Data()
        for i, d in enumerate(decoders):
            record[query.edges[i].name] = d.get_value(coord[i])
        for s in select:
            if s.aggregate == "count":
                record[s.name] = 0
        return record

    def filled(coord, agg):
        # RECORD FOR A CELL THAT WAS RETURNED
        record = Data()
        for e, c, d in zip(query.edges, coord, decoders):
            record[e.name] = d.get_value(c)
        for s in select:
            record[s.name] = s.pull(agg)
        return record

    if query.sort and not query.groupby:
        # TODO: USE THE format_table() TO PRODUCE THE NEEDED VALUES INSTEAD OF DUPLICATING LOGIC HERE
        all_coord = is_sent._all_combos()  # TRACK THE EXPECTED COMBINATIONS
        for _, coord, agg in aggs_iterator(aggs, decoders):
            # BUILTIN next() REPLACES THE PYTHON-2-ONLY generator.next()
            missing_coord = next(all_coord)
            while coord != missing_coord:
                # INSERT THE MISSING COORDINATE INTO THE GENERATION
                yield blank(missing_coord)
                missing_coord = next(all_coord)
            yield filled(coord, agg)
    else:
        for _, coord, agg in aggs_iterator(aggs, decoders):
            is_sent[coord] = 1
            yield filled(coord, agg)

        # EMIT THE MISSING CELLS IN THE CUBE
        if not query.groupby:
            for c, v in is_sent:
                if not v:
                    yield blank(c)
def format_cube(aggs, es_query, query, decoders, all_selects):
    """
    Fill one Matrix per select from the aggregation result and return a Cube.

    When any select has a default different from the canonical default of its
    aggregate, the matrices are filled without defaults, untouched cells are
    tracked, and the defaults are written into them at the end.
    """
    new_edges = count_dim(aggs, es_query, decoders)

    def edge_size(edge):
        # TUPLE EDGES NEVER GET A NULL PART
        if isinstance(edge.value, TupleOp):
            edge.allowNulls = False
        null_slot = 0 if edge.allowNulls is False else 1
        return len(edge.domain.partitions) + null_slot

    dims = tuple(edge_size(e) for e in new_edges)

    has_irregular_default = any(
        s.default != canonical_aggregates[s.aggregate].default
        for s in all_selects
    )

    if has_irregular_default:
        # UNUSUAL DEFAULT VALUES MESS THE union() FUNCTION
        is_default = Matrix(dims=dims, zeros=True)
        matricies = {}
        for s in all_selects:
            matricies[s.name] = Matrix(dims=dims)

        for _, coord, agg, selects in aggs_iterator(aggs, es_query, decoders):
            for select in selects:
                target = matricies[select.name]
                value = select.pull(agg)
                if value == None:  # == ON PURPOSE, AS IN THE REST OF THIS FILE
                    continue
                is_default[coord] = False
                union(target, coord, value, select.aggregate)

        # FILL THE DEFAULT VALUES
        for c, untouched in is_default:
            if not untouched:
                continue
            for s in all_selects:
                matricies[s.name][c] = s.default
    else:
        matricies = {}
        for s in all_selects:
            matricies[s.name] = Matrix(dims=dims, zeros=s.default)

        for _, coord, agg, selects in aggs_iterator(aggs, es_query, decoders):
            for select in selects:
                target = matricies[select.name]
                value = select.pull(agg)
                union(target, coord, value, select.aggregate)

    # ENSURE EDGES ARE IN SAME ORDER AS QUERY
    ordered_edges = sort_using_key(new_edges, key=lambda e: e.dim)
    cube = Cube(query.select, ordered_edges, matricies)
    cube.frum = query
    return cube
def data():
    """
    Generate one row (a list) per cell of the aggregation result.

    Each row holds the decoded edge values followed by one value per select.
    Reads `new_edges`, `query`, `aggs`, `decoders` and `select` from the
    enclosing scope.

    * Sorted, non-groupby query: coordinates from aggs_iterator() are walked
      in step with Matrix._all_combos(), and a row is emitted in place for
      every coordinate that was skipped.
      NOTE(review): this presumes both sequences share the same ordering -
      confirm.
    * Otherwise: seen cells are flagged in `is_sent`; for non-groupby queries
      the unseen cells are emitted after the returned ones.

    Rows for missing cells use 0 for "count" selects and None for the rest.
    """
    dims = tuple(
        len(e.domain.partitions) + (0 if e.allowNulls is False else 1)
        for e in new_edges
    )
    is_sent = Matrix(dims=dims, zeros=0)

    def missing_row(coord):
        # ROW FOR A CELL THAT WAS NOT RETURNED
        record = [d.get_value(coord[i]) for i, d in enumerate(decoders)]
        for s in select:
            if s.aggregate == "count":
                record.append(0)
            else:
                record.append(None)
        return record

    def found_row(coord, agg):
        # ROW FOR A CELL THAT WAS RETURNED
        output = [d.get_value(c) for c, d in zip(coord, decoders)]
        for s in select:
            output.append(s.pull(agg))
        return output

    if query.sort and not query.groupby:
        all_coord = is_sent._all_combos()  # TRACK THE EXPECTED COMBINATIONS
        for _, coord, agg in aggs_iterator(aggs, decoders):
            # BUILTIN next() REPLACES THE PYTHON-2-ONLY generator.next()
            missing_coord = next(all_coord)
            while coord != missing_coord:
                yield missing_row(missing_coord)
                missing_coord = next(all_coord)
            yield found_row(coord, agg)
    else:
        for _, coord, agg in aggs_iterator(aggs, decoders):
            is_sent[coord] = 1
            yield found_row(coord, agg)

        # EMIT THE MISSING CELLS IN THE CUBE
        if not query.groupby:
            for c, v in is_sent:
                if not v:
                    yield missing_row(c)
def es_fieldop(es, query):
    """
    Issue a fields-only ES request for `query` and wrap the hits in a Cube.

    One Matrix is produced per select clause; its shape depends on whether
    the select value is "*", a mapping, a list, or a single field.
    """
    request = es09.util.build_es_query(query)
    selects = listwrap(query.select)

    where_filter = simplify_esfilter(jx_expression(query.where).to_esfilter())
    request.query = {
        "filtered": {
            "query": {"match_all": {}},
            "filter": where_filter
        }
    }
    request.size = coalesce(query.limit, 200000)

    # FIELDS TO REQUEST; "*" CLEARS THE FIELD LIST
    request.fields = FlatList()
    for wanted in selects.value:
        if wanted == "*":
            request.fields = None
        elif isinstance(wanted, list):
            request.fields.extend(wanted)
        elif isinstance(wanted, Mapping):
            request.fields.extend(wanted.values())
        else:
            request.fields.append(wanted)

    sort_clauses = []
    for s in query.sort:
        direction = "asc" if s.sort >= 0 else "desc"
        sort_clauses.append({s.field: direction})
    request.sort = sort_clauses

    result = es09.util.post(es, request, query.limit)
    hits = result.hits.hits

    def pick(hit, field_name):
        # MISSING FIELDS BECOME None
        return unwrap(hit.fields).get(field_name, None)

    matricies = {}
    for s in selects:
        if s.value == "*":
            matricies[s.name] = Matrix.wrap([h._source for h in hits])
        elif isinstance(s.value, Mapping):
            matricies[s.name] = Matrix.wrap([
                {k: pick(h, v) for k, v in s.value.items()}
                for h in hits
            ])
        elif isinstance(s.value, list):
            matricies[s.name] = Matrix.wrap([
                tuple(pick(h, ss) for ss in s.value)
                for h in hits
            ])
        elif not s.value:
            matricies[s.name] = Matrix.wrap([pick(h, s.value) for h in hits])
        else:
            try:
                matricies[s.name] = Matrix.wrap([pick(h, s.value) for h in hits])
            except Exception as e:
                Log.error("", e)

    cube = Cube(query.select, query.edges, matricies, frum=query)
    cube.frum = query
    return cube
def data():
    """
    Generate one Data record per distinct coordinate of a groupby result.

    Reads `query`, `new_edges`, `all_selects`, `aggs`, `es_query` and
    `decoders` from the enclosing scope. A record is yielded the first time
    its coordinate is seen and is then updated in place by union() for that
    and any later aggregation at the same coordinate.

    Selects whose default differs from the canonical default of their
    aggregate have the default moved to `s.finish` (and `s.default` set to
    None); those defaults are written at the end into records that still
    have no value.
    """
    groupby = query.groupby
    dims = tuple(
        len(e.domain.partitions) + (0 if e.allowNulls is False else 1)
        for e in new_edges
    )
    is_sent = Matrix(dims=dims)
    give_me_zeros = query.sort and not query.groupby

    # IRREGULAR DEFAULTS MESS WITH union(), SET THEM AT END, IF ANY
    finishes = []
    for s in all_selects:
        if s.default == canonical_aggregates[s.aggregate].default:
            continue
        s.finish = s.default
        s.default = None
        finishes.append(s)

    rows = aggs_iterator(aggs, es_query, decoders, give_me_zeros=give_me_zeros)
    for _, coord, agg, _selects in rows:
        output = is_sent[coord]
        if output == None:
            # FIRST TIME AT THIS COORDINATE: START THE RECORD AND HAND IT OUT
            output = Data()
            is_sent[coord] = output
            for g, d, c in zip(groupby, decoders, coord):
                output[g.put.name] = d.get_value(c)
            for s in all_selects:
                output[s.name] = s.default
            yield output

        # THIS IS A TRICK!  WE WILL UPDATE A ROW THAT WAS ALREADY YIELDED
        for s in _selects:
            union(output, s.name, s.pull(agg), s.aggregate)

    if not finishes:
        return

    # SET ANY DEFAULTS
    for _, record in is_sent:
        for s in finishes:
            if record[s.name] == None:
                record[s.name] = s.finish
def post(sql):
    """
    Run `sql` as a column query and shape the result into Matrix objects.

    Reads `self`, `edges`, `select` and `query` from the enclosing scope.
    The first len(edges) result columns are the edge values and the rest are
    the select values. Returns a FlatList of matrices when query.select is a
    list, otherwise the single matrix.
    """
    result = self.db.column_query(sql)
    num_edges = len(edges)

    # FIND OUT THE default DOMAIN SIZES
    for position, edge in enumerate(edges):
        domain = edge.domain
        if domain.type != "default":
            Log.error("Do not know what to do here, yet")
        else:
            domain.type = "set"
            distinct = set(result[position])
            partitions = []
            lookup = {}
            for index, value in enumerate(distinct):
                partitions.append({"index": index, "value": value})
                lookup[value] = index
            domain.partitions = partitions
            domain.map = lookup

    # FILL THE DATA CUBE
    maps = []
    for position, edge in enumerate(edges):
        maps.append((unwrap(edge.domain.map), result[position]))

    cubes = FlatList()
    for c, _ in enumerate(select):
        sizes = [
            len(e.domain.partitions) + (1 if e.allow_nulls else 0)
            for e in edges
        ]
        matrix = Matrix(*sizes)
        for rownum, value in enumerate(result[c + num_edges]):
            coord = [m[r[rownum]] for m, r in maps]
            matrix[coord] = value
        cubes.append(matrix)

    if isinstance(query.select, list):
        return cubes
    return cubes[0]
def es_countop(es, mvel, query):
    """
    RETURN SINGLE COUNT

    One terms facet is requested per select clause; the `total` of each
    facet becomes a single-value Matrix in the returned Cube.
    """
    selects = listwrap(query.select)
    es_query = build_es_query(query)

    for s in selects:
        if not is_keyword(s.value):
            # COMPLICATED value IS PROBABLY A SCRIPT, USE IT
            script = es09.expressions.compile_expression(s.value, query)
            es_query.facets[s.name] = {
                "terms": {
                    "script_field": script,
                    "size": 200000
                }
            }
        else:
            es_query.facets[s.name] = {
                "terms": {
                    "field": s.value,
                    "size": query.limit,
                },
                "facet_filter": {"exists": {"field": s.value}}
            }

    response = es09.util.post(es, es_query, query.limit)

    totals = {
        s.name: Matrix(value=response.hits.facets[s.name].total)
        for s in selects
    }

    cube = Cube(query.select, query.edges, totals)
    cube.frum = query
    return cube
def data():
    """
    Generate one row (a list) per cell of the aggregation result.

    Reads `new_edges`, `query`, `aggs`, `decoders` and `select` from the
    enclosing scope. Returned cells come first; for non-groupby queries the
    cells that were never returned follow, with 0 for "count" selects and
    None for the others.
    """
    dims = tuple(
        len(e.domain.partitions) + (0 if e.allowNulls is False else 1)
        for e in new_edges
    )
    is_sent = Matrix(dims=dims, zeros=0)

    for _, coord, agg in aggs_iterator(aggs, decoders):
        is_sent[coord] = 1

        row = [d.get_value(c) for c, d in zip(coord, decoders)]
        row.extend(_pull(s, agg) for s in select)
        yield row

    if query.groupby:
        return

    # EMIT THE MISSING CELLS IN THE CUBE
    for c, seen in is_sent:
        if seen:
            continue
        row = [d.get_value(c[i]) for i, d in enumerate(decoders)]
        row.extend(0 if s.aggregate == "count" else None for s in select)
        yield row
def format_cube(decoders, aggs, start, query, select):
    """
    Fill one Matrix per select from the aggregation result and return a Cube.

    Edges come from count_dim(); an edge whose value is a TupleOp is marked
    as not allowing nulls, and every other edge gets one extra slot unless
    allowNulls is False.
    """
    new_edges = count_dim(aggs, decoders)

    def edge_size(edge):
        if isinstance(edge.value, TupleOp):
            edge.allowNulls = False
        null_slot = 0 if edge.allowNulls is False else 1
        return len(edge.domain.partitions) + null_slot

    dims = tuple(edge_size(e) for e in new_edges)

    matricies = []
    for s in select:
        matricies.append((s, Matrix(dims=dims, zeros=s.default)))

    for _, coord, agg in aggs_iterator(aggs, decoders):
        for s, m in matricies:
            try:
                m[coord] = _pull(s, agg)
            except Exception as e:
                Log.error("", e)

    by_name = {s.name: m for s, m in matricies}
    cube = Cube(query.select, new_edges, by_name)
    cube.frum = query
    return cube
def format_cube(decoders, aggs, start, query, select):
    """
    Fill one Matrix per select from the aggregation result and return a Cube
    whose edges are ordered by their `dim`.

    An edge whose value is a TupleOp is marked as not allowing nulls; every
    other edge gets one extra slot unless allowNulls is False.
    """
    new_edges = count_dim(aggs, decoders)

    def edge_size(edge):
        if isinstance(edge.value, TupleOp):
            edge.allowNulls = False
        null_slot = 0 if edge.allowNulls is False else 1
        return len(edge.domain.partitions) + null_slot

    dims = tuple(edge_size(e) for e in new_edges)

    matricies = []
    for s in select:
        matricies.append((s, Matrix(dims=dims, zeros=s.default)))

    for _, coord, agg in aggs_iterator(aggs, decoders):
        for s, m in matricies:
            try:
                m[coord] = _pull(s, agg)
            except Exception as e:
                Log.error("", e)

    # ENSURE EDGES ARE IN SAME ORDER AS QUERY
    ordered_edges = sorted(new_edges, key=lambda e: e.dim)
    by_name = {s.name: m for s, m in matricies}
    cube = Cube(query.select, ordered_edges, by_name)
    cube.frum = query
    return cube
def window(self, window):
    """
    Add a computed column to this cube, in place, and return the cube.

    The accessor built from `window.value` is called once per coordinate with
    a Data row holding every existing column's value at that coordinate plus
    the partition of each edge.

    :param window: window clause; `edges` and `sort` are not supported
    :return: self, with `window.name` added to data and `window` appended to
             select
    :raises NotImplementedError: when the window has edges or a sort
    """
    if window.edges or window.sort:
        raise NotImplementedError()

    from jx_python import jx

    # SET OP
    # list() SO THIS WORKS WITH PYTHON 3 dict VIEWS, WHICH CAN NOT BE INDEXED
    canonical = list(self.data.values())[0]
    accessor = jx.get(window.value)
    # SNAPSHOT THE NAMES NOW: A LIVE PYTHON 3 VIEW WOULD INCLUDE THE NEW COLUMN
    cnames = list(self.data.keys())

    # ANNOTATE EXISTING CUBE WITH NEW COLUMN
    m = self.data[window.name] = Matrix(dims=canonical.dims)
    for coord in canonical._all_combos():
        # IT IS SAD WE MUST HAVE A Data(), THERE ARE {"script": expression} USING THE DOT NOTATION
        row = Data()
        for k in cnames:
            row[k] = self.data[k][coord]
        for c, e in zip(coord, self.edges):
            row[e.name] = e.domain.partitions[c]
        # DUMMY Null VALUES BECAUSE I DO NOT KNOW WHAT TO DO
        m[coord] = accessor(row, Null, Null)

    self.select.append(window)
    return self
def data():
    """
    Generate one Data record per cell of the aggregation result.

    Reads `new_edges`, `query`, `aggs`, `decoders` and `select` from the
    enclosing scope. Returned cells come first; for non-groupby queries the
    cells that were never returned follow, carrying the edge values and 0
    for "count" selects.
    """
    dims = tuple(
        len(e.domain.partitions) + (0 if e.allowNulls is False else 1)
        for e in new_edges
    )
    is_sent = Matrix(dims=dims, zeros=0)

    for _, coord, agg in aggs_iterator(aggs, decoders):
        is_sent[coord] = 1

        record = Data()
        for e, c, d in zip(query.edges, coord, decoders):
            record[e.name] = d.get_value(c)
        for s in select:
            record[s.name] = _pull(s, agg)
        yield record

    if query.groupby:
        return

    # EMIT THE MISSING CELLS IN THE CUBE
    for c, seen in is_sent:
        if seen:
            continue
        record = Data()
        for i, d in enumerate(decoders):
            record[query.edges[i].name] = d.get_value(c[i])
        for s in select:
            if s.aggregate == "count":
                record[s.name] = 0
        yield record
def format_cube(decoders, aggs, start, query, select):
    """
    Fill one Matrix per select from the aggregation result and return a Cube
    whose edges are ordered by their `dim`.

    A failure while pulling or storing a value is reported only when the
    aggregation's doc_count is not 0.
    """
    new_edges = count_dim(aggs, decoders)

    def edge_size(edge):
        # TUPLE EDGES NEVER GET A NULL PART
        if isinstance(edge.value, TupleOp):
            edge.allowNulls = False
        null_slot = 0 if edge.allowNulls is False else 1
        return len(edge.domain.partitions) + null_slot

    dims = tuple(edge_size(e) for e in new_edges)

    matricies = []
    for s in select:
        matricies.append((s, Matrix(dims=dims, zeros=s.default)))

    for _, coord, agg in aggs_iterator(aggs, decoders):
        for s, m in matricies:
            try:
                m[coord] = s.pull(agg)
            except Exception as e:
                # THIS HAPPENS WHEN ES RETURNS MORE TUPLE COMBINATIONS THAN DOCUMENTS
                if agg.get('doc_count') != 0:
                    Log.error("Programmer error", cause=e)

    # ENSURE EDGES ARE IN SAME ORDER AS QUERY
    ordered_edges = sort_using_key(new_edges, key=lambda e: e.dim)
    by_name = {s.name: m for s, m in matricies}
    cube = Cube(query.select, ordered_edges, by_name)
    cube.frum = query
    return cube
def __getitem__(self, item):
    """
    Slice the cube by edge values, or pick one select column by name.

    :param item: either a data object mapping edge name -> part value, or
                 the name of a select column
    :return: for a data object: Null when a value matches no part, a data
             object of plain values when every edge was fixed, otherwise a
             Cube over the remaining edges. For text: the value cube for
             that column (self, when this is already a value cube).
    """
    # TODO: SOLVE FUNDAMENTAL QUESTION OF IF SELECTING A PART OF AN
    # EDGE REMOVES THAT EDGE FROM THIS RESULT, OR ADDS THE PART
    # AS A select {"name":edge.name, "value":edge.domain.partitions[coord]}
    # PROBABLY NOT, THE value IS IDENTICAL OVER THE REMAINING
    if is_data(item):
        coordinates = [None] * len(self.edges)

        # MAP DICT TO NUMERIC INDICES
        for name, v in item.items():
            found = first(
                (i, e.domain.partitions)
                for i, e in enumerate(self.edges)
                if e.name == name
            )
            # CHECK BEFORE UNPACKING: NOTHING IS FOUND WHEN NO EDGE HAS THIS NAME
            parts = found[1] if found else None
            if not parts:
                Log.error(
                    "Can not find {{name}}=={{value|quote}} in list of edges, maybe this feature is not implemented yet",
                    name=name,
                    value=v)
            ei = found[0]
            part = first(p for p in parts if p.value == v)
            if not part:
                return Null
            else:
                coordinates[ei] = part.dataIndex

        # EDGES THAT WERE NOT FIXED BY item
        edges = [e for e, v in zip(self.edges, coordinates) if v is None]
        if not edges:
            # ZERO DIMENSIONAL VALUE
            return dict_to_data({
                k: v.__getitem__(coordinates)
                for k, v in self.data.items()
            })
        else:
            output = Cube(
                select=self.select,
                edges=list_to_data(edges),
                data={
                    k: Matrix(values=c.__getitem__(coordinates))
                    for k, c in self.data.items()
                })
            return output
    elif is_text(item):
        # RETURN A VALUE CUBE
        if self.is_value:
            if item != self.select.name:
                Log.error("{{name}} not found in cube", name=item)
            return self

        if item not in self.select.name:
            Log.error("{{name}} not found in cube", name=item)

        output = Cube(
            select=first(s for s in self.select if s.name == item),
            edges=self.edges,
            data={item: self.data[item]})
        return output
    else:
        Log.error("not implemented yet")
def format_cube_from_aggop(decoders, aggs, start, query, select):
    """
    Build a zero-dimensional Cube: one single-cell Matrix per select, filled
    with the value pulled from the drilled aggregation.
    """
    agg = drill(aggs)

    cells = {}
    for s in select:
        cell = Matrix(dims=[], zeros=s.default)
        cell[()] = _pull(s, agg)
        cells[s.name] = cell

    cube = Cube(query.select, [], cells)
    cube.frum = query
    return cube
def _select(self, select):
    """
    Return a new Cube holding only the requested select clauses.

    When any clause has an aggregate (other than "none"), every clause is
    aggregated to a single value and the result has no edges; otherwise the
    existing matrices are reused with this cube's edges.
    """
    selects = listwrap(select)
    is_aggregate = OR(
        s.aggregate != None and s.aggregate != "none"
        for s in selects
    )

    if not is_aggregate:
        columns = {}
        for s in selects:
            columns[s.name] = self.data[s.value]
        return Cube(select, self.edges, columns)

    totals = {}
    for s in selects:
        totals[s.name] = Matrix(value=self.data[s.value].aggregate(s.aggregate))
    return Cube(select, [], totals)
def data():
    """
    Generate one row (a list) per cell of the aggregation result.

    Each row holds the decoded edge values followed by one value per select.
    Reads `new_edges`, `query`, `aggs`, `decoders` and `select` from the
    enclosing scope.

    * Sorted, non-groupby query: coordinates from aggs_iterator() are walked
      in step with Matrix._all_combos(), and a row is emitted in place for
      every coordinate that was skipped.
      NOTE(review): this presumes both sequences share the same ordering -
      confirm.
    * Otherwise: seen cells are flagged in `is_sent`; for non-groupby queries
      the unseen cells are emitted after the returned ones.

    Rows for missing cells use 0 for "count" selects and None for the rest.
    """
    dims = tuple(
        len(e.domain.partitions) + (0 if e.allowNulls is False else 1)
        for e in new_edges
    )
    is_sent = Matrix(dims=dims, zeros=0)

    def blank(coord):
        # ROW FOR A CELL THAT WAS NOT RETURNED
        record = [d.get_value(coord[i]) for i, d in enumerate(decoders)]
        for s in select:
            if s.aggregate == "count":
                record.append(0)
            else:
                record.append(None)
        return record

    def filled(coord, agg):
        # ROW FOR A CELL THAT WAS RETURNED
        output = [d.get_value(c) for c, d in zip(coord, decoders)]
        for s in select:
            output.append(s.pull(agg))
        return output

    if query.sort and not query.groupby:
        all_coord = is_sent._all_combos()  # TRACK THE EXPECTED COMBINATIONS
        for _, coord, agg in aggs_iterator(aggs, decoders):
            # next() WORKS ON BOTH PYTHON 2 AND 3; generator.next() IS PYTHON 2 ONLY
            missing_coord = next(all_coord)
            while coord != missing_coord:
                yield blank(missing_coord)
                missing_coord = next(all_coord)
            yield filled(coord, agg)
    else:
        for _, coord, agg in aggs_iterator(aggs, decoders):
            is_sent[coord] = 1
            yield filled(coord, agg)

        # EMIT THE MISSING CELLS IN THE CUBE
        if not query.groupby:
            for c, v in is_sent:
                if not v:
                    yield blank(c)
def _es_terms2(es, mvel, query):
    """
    WE ASSUME THERE ARE JUST TWO EDGES, AND EACH HAS A SIMPLE value

    The values of the first edge are fetched with es_terms(); then one terms
    facet on the second edge is requested per (select, first-edge value)
    pair. The facet results are merged into one 2-D Matrix per select.

    Side effect: the partitions of query.edges[1].domain are replaced with
    the sorted union of the terms that were returned.
    """
    # REQUEST VALUES IN FIRST DIMENSION
    q1 = query.copy()
    q1.edges = query.edges[0:1:]
    values1 = es_terms(es, mvel, q1).edges[0].domain.partitions.value

    select = listwrap(query.select)
    FromES = build_es_query(query)
    for s in select:
        for i, v in enumerate(values1):
            # FACET NAME IS "<select name>,<index into values1>"
            FromES.facets[s.name + "," + str(i)] = {
                "terms": {
                    "field": query.edges[1].value,
                    "size": coalesce(query.limit, 200000)
                },
                "facet_filter": simplify_esfilter({"and": [
                    query.where,
                    {"term": {query.edges[0].value: v}}
                ]})
            }

    data = es_post(es, FromES, query.limit)

    # UNION ALL TERMS FROM SECOND DIMENSION
    values2 = set()
    for _, f in data.facets.items():
        values2.update(f.terms.term)
    values2 = jx.sort(values2)
    term2index = {v: i for i, v in enumerate(values2)}
    query.edges[1].domain.partitions = FlatList([{"name": v, "value": v} for v in values2])

    # MAKE CUBE
    output = {}
    dims = [len(values1), len(values2)]
    for s in select:
        output[s.name] = Matrix(*dims)

    # FILL CUBE
    # EXPECTING ONLY SELECT CLAUSE FACETS
    for facetName, facet in data.facets.items():
        # SPLIT ON THE LAST COMMA ONLY, SO A SELECT NAME MAY ITSELF CONTAIN COMMAS
        coord = facetName.rsplit(",", 1)
        s = [s for s in select if s.name == coord[0]][0]
        i1 = int(coord[1])
        for term in facet.terms:
            i2 = term2index[term.term]
            output[s.name][(i1, i2)] = term[aggregates[s.aggregate]]

    cube = Cube(query.select, query.edges, output)
    cube.query = query
    return cube
def __init__(self, select, edges, data, frum=None):
    """
    data IS EXPECTED TO BE A dict TO MATRICES, BUT OTHER COLLECTIONS ARE
    ALLOWED, USING THE select AND edges TO DESCRIBE THE data

    :param select: one select clause (a "value" cube) or a list of them
    :param edges: edge descriptions; when empty they are derived from data
    :param data: dict of name -> Matrix; a list, a Matrix or a plain value
                 is accepted when select is a single clause and there are
                 no edges
    :param frum: accepted, but not stored by this constructor
    """
    self.is_value = not is_list(select)
    self.select = select
    self.meta = Data(format="cube")  # PUT EXTRA MARKUP HERE

    # ONLY A DATA OBJECT HAS values(); THE OTHER ACCEPTED SHAPES (list,
    # Matrix, PLAIN VALUE) ARE HANDLED BELOW AND MUST NOT FAIL HERE
    self.is_none = False
    if is_data(data) and not all(data.values()):
        self.is_none = True

    # ENSURE frum IS PROPER FORM
    if is_list(select):
        if edges and OR(not isinstance(v, Matrix) for v in data.values()):
            Log.error("Expecting data to be a dict with Matrix values")

    if not edges:
        if not data:
            if is_list(select):
                Log.error("not expecting a list of records")

            data = {select.name: Matrix.ZERO}
            self.edges = FlatList.EMPTY
        elif is_data(data):
            # EXPECTING NO MORE THAN ONE rownum EDGE IN THE DATA
            length = MAX([len(v) for v in data.values()])
            if length >= 1:
                self.edges = wrap([{"name": "rownum", "domain": {"type": "rownum"}}])
            else:
                self.edges = FlatList.EMPTY
        elif is_list(data):
            if is_list(select):
                Log.error("not expecting a list of records")

            # TAKE THE ROW COUNT BEFORE data IS REBOUND TO A ONE-KEY dict
            num_rows = len(data)
            data = {select.name: Matrix.wrap(data)}
            self.edges = wrap([{
                "name": "rownum",
                "domain": {"type": "rownum", "min": 0, "max": num_rows, "interval": 1}
            }])
        elif isinstance(data, Matrix):
            if is_list(select):
                Log.error("not expecting a list of records")

            # NOTE(review): self.edges IS NOT ASSIGNED ON THIS PATH - confirm intent
            data = {select.name: data}
        else:
            if is_list(select):
                Log.error("not expecting a list of records")

            data = {select.name: Matrix(value=data)}
            self.edges = FlatList.EMPTY
    else:
        self.edges = wrap(edges)

    self.data = data
def es_deepop(es, mvel, query):
    """
    Run `query` through a single scripted terms facet and return a Cube.

    The edges of the query are used as the select of a temporary query that
    is compiled to a script with mvel.code(). The terms that come back are
    unpacked into rows; the last element of each row is summed into the
    output Matrix at the coordinate given by the row's edge values.

    Side effects on query.edges: parts are registered on each edge's domain
    through getPartByKey(), `index` is set on each edge, and `dataIndex` is
    set on every partition and on the domain's NULL part.
    """
    FromES = es09.util.build_es_query(query)

    select = query.edges

    temp_query = query.copy()
    temp_query.select = select
    temp_query.edges = FlatList()
    FromES.facets.mvel = {
        "terms": {
            "script_field": mvel.code(temp_query),
            "size": query.limit
        },
        "facet_filter": simplify_esfilter(jx_expression(query.where).to_esfilter())
    }

    data = es09.util.post(es, FromES, query.limit)

    rows = unpack_terms(data.facets.mvel, query.edges)
    # list() SO THE COLUMNS CAN BE INDEXED ON PYTHON 3, WHERE zip() IS LAZY
    terms = list(zip(*rows))

    # NUMBER ALL EDGES FOR JSON EXPRESSION INDEXING
    edges = query.edges
    for f, e in enumerate(edges):
        # NO ROWS MEANS NO COLUMNS: THERE IS NOTHING TO REGISTER
        for r in (terms[f] if terms else ()):
            e.domain.getPartByKey(r)

        e.index = f
        for p, part in enumerate(e.domain.partitions):
            part.dataIndex = p
        e.domain.NULL.dataIndex = len(e.domain.partitions)

    # MAKE CUBE
    dims = [len(e.domain.partitions) for e in query.edges]
    output = Matrix(*dims)

    # FILL CUBE
    for r in rows:
        term_coord = [
            e.domain.getPartByKey(r[i]).dataIndex
            for i, e in enumerate(edges)
        ]
        output[term_coord] = SUM(output[term_coord], r[-1])

    cube = Cube(query.select, query.edges, {query.select.name: output})
    cube.frum = query
    return cube
def es_aggop(es, mvel, query):
    """
    Compute edge-less aggregates with ES statistical facets; return a Cube.

    When every select is a count the work is handed to es_countop().
    Otherwise one statistical facet is requested per distinct select value,
    and each select reads its aggregate from the facet that covers its value.
    """
    select = listwrap(query.select)
    FromES = build_es_query(query)

    isSimple = AND(aggregates[s.aggregate] == "count" for s in select)
    if isSimple:
        # SIMPLE, USE TERMS FACET INSTEAD
        # es_countop() TAKES (es, mvel, query)
        return es_countop(es, mvel, query)

    value2facet = dict()  # ONLY ONE FACET NEEDED PER
    name2facet = dict()   # MAP name TO FACET WITH STATS

    for s in select:
        if s.value not in value2facet:
            if isinstance(s.value, Variable):
                unwrap(FromES.facets)[s.name] = {
                    "statistical": {
                        "field": s.value.var
                    },
                    "facet_filter": simplify_esfilter(query.where.to_esfilter())
                }
            else:
                # NOTE(review): THIS BRANCH PASSES query.where WITHOUT to_esfilter(),
                # UNLIKE THE BRANCH ABOVE - confirm which form is expected
                unwrap(FromES.facets)[s.name] = {
                    "statistical": {
                        "script": jx_expression_to_function(s.value)
                    },
                    "facet_filter": simplify_esfilter(query.where)
                }
            value2facet[s.value] = s.name
        name2facet[s.name] = value2facet[s.value]

    data = es09.util.post(es, FromES, query.limit)

    # A SELECT THAT SHARES ITS value WITH AN EARLIER SELECT HAS NO FACET OF
    # ITS OWN; READ THE STATS FROM THE FACET REGISTERED FOR THAT value
    matricies = {}
    for s in select:
        stats = fix_es_stats(data.facets[literal_field(name2facet[s.name])])
        matricies[s.name] = Matrix(value=stats[aggregates[s.aggregate]])

    cube = Cube(query.select, [], matricies)
    cube.frum = query
    return cube
def data():
    """
    Generate one Data record per distinct coordinate of a groupby result.

    Reads `query`, `new_edges`, `all_selects`, `aggs`, `es_query` and
    `decoders` from the enclosing scope. A record is yielded the first time
    its coordinate is seen; union() then updates that same record for this
    and any later aggregation at the coordinate.
    """
    groupby = query.groupby
    dims = tuple(
        len(e.domain.partitions) + (0 if e.allowNulls is False else 1)
        for e in new_edges
    )
    is_sent = Matrix(dims=dims)

    rows = aggs_iterator(
        aggs,
        es_query,
        decoders,
        give_me_zeros=(query.sort and not query.groupby)
    )
    for _, coord, agg, _selects in rows:
        record = is_sent[coord]
        if record == None:
            # FIRST TIME AT THIS COORDINATE: START THE RECORD AND HAND IT OUT
            record = Data()
            is_sent[coord] = record
            for g, d, c in zip(groupby, decoders, coord):
                record[g.put.name] = d.get_value(c)
            for s in all_selects:
                record[s.name] = None
            yield record

        # THIS IS A TRICK!  WE WILL UPDATE A ROW THAT WAS ALREADY YIELDED
        for s in _selects:
            union(record, s.name, s.pull(agg), s.aggregate)
def es_terms_stats(esq, mvel, query):
    """
    Answer an aggregate query with ES terms_stats facets and return a Cube.

    Edges with a known domain type become "facet edges": one facet is issued
    per combination of their parts (and per select). The remaining edge, the
    "special" edge, is computed as the key of the terms_stats facet. When no
    edge is open-ended, the largest suitable facet edge is used as the
    special edge instead.

    :param esq: query runner; provides query() and the `es` connection
    :param mvel: compiler used to build conditions and scripts
    :param query: the query; dataIndex is assigned to the parts of its edges
    :return: Cube with one Matrix per select
    """
    select = listwrap(query.select)
    facetEdges = []  # EDGES THAT WILL REQUIRE A FACET FOR EACH PART
    termsEdges = FlatList()
    specialEdge = None
    special_index = -1

    # A SPECIAL EDGE IS ONE THAT HAS AN UNDEFINED NUMBER OF PARTITIONS AT QUERY TIME
    # FIND THE specialEdge, IF ONE
    for f, tedge in enumerate(query.edges):
        if tedge.domain.type in domains.KNOWN:
            for p, part in enumerate(tedge.domain.partitions):
                part.dataIndex = p

            # FACETS ARE ONLY REQUIRED IF SQL JOIN ON DOMAIN IS REQUIRED (RANGE QUERY)
            # OR IF WE ARE NOT SIMPLY COUNTING
            # OR IF NO SCRIPTING IS ALLOWED (SOME OTHER CODE IS RESPONSIBLE FOR SETTING isFacet)
            # OR IF WE JUST WANT TO FORCE IT :)
            # OF COURSE THE default EDGE IS NOT EXPLICIT, SO MUST BE A TERM
            facetEdges.append(tedge)
        else:
            if specialEdge:
                Log.error("There is more than one open-ended edge: self can not be handled")
            specialEdge = tedge
            special_index = f
            termsEdges.append(tedge)

    if not specialEdge:
        # WE SERIOUSLY WANT A SPECIAL EDGE, OTHERWISE WE WILL HAVE TOO MANY FACETS
        # THE BIGGEST EDGE MAY BE COLLAPSED TO A TERM, MAYBE?
        num_parts = 0
        special_index = -1
        for i, e in enumerate(facetEdges):
            l = len(e.domain.partitions)
            if ((e.value and is_keyword(e.value)) or len(e.domain.dimension.fields) == 1) and l > num_parts:
                num_parts = l
                specialEdge = e
                special_index = i

        # NOTE(review): WHEN NO EDGE QUALIFIES, special_index IS STILL -1, SO THE
        # LAST FACET EDGE IS POPPED AND None IS APPENDED - confirm intent
        facetEdges.pop(special_index)
        termsEdges.append(specialEdge)

    total_facets = PRODUCT(len(f.domain.partitions) for f in facetEdges)*len(select)
    if total_facets > 100:
        # WE GOT A PROBLEM, LETS COUNT THE SIZE OF REALITY:
        counts = esq.query({
            "from": query.frum,
            "select": {"aggregate": "count"},
            "edges": facetEdges,
            "where": query.where,
            "limit": query.limit
        })

        esFacets = []

        def add_facet(value, parts, cube):
            # KEEP ONLY THE COMBINATIONS THAT HAVE A NON-ZERO COUNT
            if value:
                esFacets.append(parts)

        counts["count"].forall(add_facet)

        Log.note(
            "{{theory_count}} theoretical combinations, {{real_count}} actual combos found",
            real_count=len(esFacets),
            theory_count=total_facets
        )

        if not esFacets:
            # MAKE EMPTY CUBE
            matricies = {}
            dims = [len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges]
            for s in select:
                matricies[s.name] = Matrix(*dims)
            cube = Cube(query.select, query.edges, matricies)
            cube.frum = query
            return cube
    else:
        # GENERATE ALL COMBOS
        esFacets = getAllEdges(facetEdges)

    calcTerm = compileEdges2Term(mvel, termsEdges, FlatList())
    term2parts = calcTerm.term2parts

    if len(esFacets) * len(select) > 1000:
        # WE HAVE SOME SERIOUS PERMUTATIONS, WE MUST ISSUE MULTIPLE QUERIES
        Log.error("not implemented yet")

    FromES = build_es_query(query)

    for s in select:
        for parts in esFacets:
            condition = FlatList()
            # FACET NAME IS "<select name>,<dataIndex>,<dataIndex>,..."
            name = [literal_field(s.name)]
            for f, fedge in enumerate(facetEdges):
                name.append(str(parts[f].dataIndex))
                condition.append(buildCondition(mvel, fedge, parts[f]))
            condition.append(query.where)
            name = ",".join(name)

            FromES.facets[name] = {
                "terms_stats": {
                    "key_field": calcTerm.field,
                    "value_field": s.value if is_keyword(s.value) else None,
                    "value_script": mvel.compile_expression(s.value) if not is_keyword(s.value) else None,
                    "size": coalesce(query.limit, 200000)
                }
            }
            if condition:
                FromES.facets[name].facet_filter = simplify_esfilter({"and": condition})

    data = es09.util.post(esq.es, FromES, query.limit)

    if specialEdge.domain.type not in domains.KNOWN:
        # WE BUILD THE PARTS BASED ON THE RESULTS WE RECEIVED
        partitions = FlatList()
        part_by_term = {}
        for facetName, parts in data.facets.items():
            for stats in parts.terms:
                # .get(): A PLAIN dict RAISES KeyError ON THE FIRST LOOKUP OF A NEW TERM
                # NOTE(review): THE WHOLE stats ENTRY IS USED AS KEY AND VALUE HERE,
                # WHILE THE FILL LOOP BELOW USES stats.term - confirm
                if not part_by_term.get(stats):
                    part = {"value": stats, "name": stats}
                    partitions.append(part)
                    part_by_term[stats] = part

        partitions.sort(specialEdge.domain.compare)
        for p, part in enumerate(partitions):
            part.dataIndex = p

        specialEdge.domain.map = part_by_term
        specialEdge.domain.partitions = partitions

    # MAKE CUBE
    matricies = {}
    dims = [len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges]
    for s in select:
        matricies[s.name] = Matrix(*dims)

    name2agg = {s.name: aggregates[s.aggregate] for s in select}

    # FILL CUBE
    for edgeName, parts in data.facets.items():
        temp = edgeName.split(",")
        pre_coord = tuple(int(c) for c in temp[1:])
        sname = temp[0]

        for stats in parts.terms:
            if specialEdge:
                # SPLICE THE SPECIAL EDGE'S INDEX INTO THE FACET COORDINATES
                special = term2parts(stats.term)[0]
                coord = pre_coord[:special_index]+(special.dataIndex, )+pre_coord[special_index:]
            else:
                coord = pre_coord
            matricies[sname][coord] = stats[name2agg[sname]]

    cube = Cube(query.select, query.edges, matricies)
    cube.frum = query
    return cube
def query(self, query):
    """
    Run a JSON query expression against this table and shape the result.

    :param query:  JSON Query Expression, SET `format="container"` TO MAKE NEW TABLE OF RESULT
    :return: depends on the query:
        * no groupby, no edges and no aggregates: whatever `self._set_op()` returns
        * format "container": a QueryTable over the newly created table
        * format "cube" (or no format, with edges): Data with meta.format == "cube"
        * format "table" (or no format, with groupby): Data with header and row tuples
        * format "list" (or no edges and no groupby): Data with meta.format "list",
          or "value" when the select is aggregated
    An unrecognized format is reported through Log.error().
    """
    if not startswith_field(query['from'], self.sf.fact):
        Log.error("Expecting table, or some nested table")
    frum, query['from'] = query['from'], self
    table = self.sf.tables[relative_field(frum, self.sf.fact)]
    schema = table.schema
    query = QueryOp.wrap(query, table=table, schema=schema)
    new_table = "temp_" + unique_name()

    if query.format == "container":
        create_table = "CREATE TABLE " + quote_column(new_table) + " AS "
    else:
        create_table = ""

    if query.groupby and query.format != "cube":
        op, index_to_columns = self._groupby_op(query, frum)
        command = create_table + op
    elif query.groupby:
        # A CUBE WAS REQUESTED: TEMPORARILY SWAP groupby AND edges SO _edges_op() HANDLES THE GROUPBY
        query.edges, query.groupby = query.groupby, query.edges
        op, index_to_columns = self._edges_op(query, frum)
        command = create_table + op
        query.edges, query.groupby = query.groupby, query.edges
    elif query.edges or any(a != "none" for a in listwrap(query.select).aggregate):
        op, index_to_columns = self._edges_op(query, frum)
        command = create_table + op
    else:
        op = self._set_op(query, frum)
        return op

    result = self.db.query(command)

    if query.format == "container":
        output = QueryTable(new_table, db=self.db, uid=self.uid, exists=True)
    elif query.format == "cube" or (not query.format and query.edges):
        column_names = [None] * (max(c.push_column for c in index_to_columns.values()) + 1)
        for c in index_to_columns.values():
            column_names[c.push_column] = c.push_column_name

        if len(query.edges) == 0 and len(query.groupby) == 0:
            # NO DIMENSIONS: THE FIRST RESULT ROW HOLDS ALL THE VALUES
            data = {n: Data() for n in column_names}
            for s in index_to_columns.values():
                data[s.push_name][s.push_child] = unwrap(s.pull(result.data[0]))
            if isinstance(query.select, list):
                select = [{"name": s.name} for s in query.select]
            else:
                select = {"name": query.select.name}

            return Data(
                data=unwrap(data),
                select=select,
                meta={"format": "cube"}
            )

        if not result.data:
            # NO ROWS: BUILD THE EDGES FROM THE DECLARED DOMAINS ONLY
            edges = []
            dims = []
            for i, e in enumerate(query.edges + query.groupby):
                allowNulls = coalesce(e.allowNulls, True)

                if e.domain.type == "set" and e.domain.partitions:
                    domain = SimpleSetDomain(partitions=e.domain.partitions.name)
                elif e.domain.type == "range":
                    domain = e.domain
                elif isinstance(e.value, TupleOp):
                    pulls = jx.sort([c for c in index_to_columns.values() if c.push_name == e.name], "push_child").pull
                    parts = [tuple(p(d) for p in pulls) for d in result.data]
                    domain = SimpleSetDomain(partitions=jx.sort(set(parts)))
                else:
                    domain = SimpleSetDomain(partitions=[])

                dims.append(1 if allowNulls else 0)
                edges.append(Data(
                    name=e.name,
                    allowNulls=allowNulls,
                    domain=domain
                ))

            data = {}
            for s in listwrap(query.select):
                if s.aggregate == "count":
                    data[s.name] = Matrix(dims=dims, zeros=0)
                else:
                    data[s.name] = Matrix(dims=dims)

            if isinstance(query.select, list):
                select = [{"name": s.name} for s in query.select]
            else:
                select = {"name": query.select.name}

            return Data(
                meta={"format": "cube"},
                edges=edges,
                select=select,
                data={k: v.cube for k, v in data.items()}
            )

        columns = None  # RESULT ROWS TRANSPOSED INTO COLUMNS, BUILT ONLY WHEN NEEDED

        edges = []
        dims = []
        for g in query.groupby:
            g.is_groupby = True

        for i, e in enumerate(query.edges + query.groupby):
            allowNulls = coalesce(e.allowNulls, True)

            if e.domain.type == "set" and e.domain.partitions:
                domain = SimpleSetDomain(partitions=e.domain.partitions.name)
            elif e.domain.type == "range":
                domain = e.domain
            elif e.domain.type == "time":
                domain = wrap(mo_json.scrub(e.domain))
            elif e.domain.type == "duration":
                domain = wrap(mo_json.scrub(e.domain))
            elif isinstance(e.value, TupleOp):
                pulls = jx.sort([c for c in index_to_columns.values() if c.push_name == e.name], "push_child").pull
                parts = [tuple(p(d) for p in pulls) for d in result.data]
                domain = SimpleSetDomain(partitions=jx.sort(set(parts)))
            else:
                if columns is None:
                    # MATERIALIZE: zip() MAY RETURN AN ITERATOR, WHICH CAN NOT BE INDEXED
                    columns = list(zip(*result.data))
                parts = set(columns[i])
                if e.is_groupby and None in parts:
                    allowNulls = True
                parts -= {None}

                if query.sort[i].sort == -1:
                    domain = SimpleSetDomain(partitions=wrap(sorted(parts, reverse=True)))
                else:
                    domain = SimpleSetDomain(partitions=jx.sort(parts))

            dims.append(len(domain.partitions) + (1 if allowNulls else 0))
            edges.append(Data(
                name=e.name,
                allowNulls=allowNulls,
                domain=domain
            ))

        data_cubes = {}
        for s in listwrap(query.select):
            if s.aggregate == "count":
                data_cubes[s.name] = Matrix(dims=dims, zeros=0)
            else:
                data_cubes[s.name] = Matrix(dims=dims)

        r2c = index_to_coordinate(dims)  # WORKS BECAUSE THE DATABASE SORTED THE EDGES TO CONFORM
        for rownum, row in enumerate(result.data):
            coord = r2c(rownum)

            for s in index_to_columns.values():
                if s.is_edge:
                    continue
                if s.push_child == ".":
                    data_cubes[s.push_name][coord] = s.pull(row)
                else:
                    data_cubes[s.push_name][coord][s.push_child] = s.pull(row)

        if query.select == None:
            select = Null
        elif isinstance(query.select, list):
            select = [{"name": s.name} for s in query.select]
        else:
            select = {"name": query.select.name}

        return Data(
            meta={"format": "cube"},
            edges=edges,
            select=select,
            data={k: v.cube for k, v in data_cubes.items()}
        )
    elif query.format == "table" or (not query.format and query.groupby):
        column_names = [None] * (max(c.push_column for c in index_to_columns.values()) + 1)
        for c in index_to_columns.values():
            column_names[c.push_column] = c.push_column_name
        data = []
        for d in result.data:
            row = [None for _ in column_names]
            for s in index_to_columns.values():
                if s.push_child == ".":
                    row[s.push_column] = s.pull(d)
                elif s.num_push_columns:
                    # TUPLE VALUE: push_child IS THE POSITION INSIDE THE TUPLE
                    tuple_value = row[s.push_column]
                    if tuple_value == None:
                        tuple_value = row[s.push_column] = [None] * s.num_push_columns
                    tuple_value[s.push_child] = s.pull(d)
                elif row[s.push_column] == None:
                    row[s.push_column] = Data()
                    row[s.push_column][s.push_child] = s.pull(d)
                else:
                    row[s.push_column][s.push_child] = s.pull(d)
            data.append(tuple(unwrap(r) for r in row))

        output = Data(
            meta={"format": "table"},
            header=column_names,
            data=data
        )
    elif query.format == "list" or (not query.edges and not query.groupby):
        if not query.edges and not query.groupby and any(listwrap(query.select).aggregate):
            # AGGREGATE WITHOUT DIMENSIONS: A SINGLE "value", TAKEN FROM THE FIRST ROW
            if isinstance(query.select, list):
                data = Data()
                for c in index_to_columns.values():
                    if c.push_child == ".":
                        if data[c.push_name] == None:
                            data[c.push_name] = c.pull(result.data[0])
                        elif isinstance(data[c.push_name], list):
                            data[c.push_name].append(c.pull(result.data[0]))
                        else:
                            # SECOND VALUE FOR THE SAME NAME: PROMOTE TO A LIST
                            data[c.push_name] = [data[c.push_name], c.pull(result.data[0])]
                    else:
                        data[c.push_name][c.push_child] = c.pull(result.data[0])

                output = Data(
                    meta={"format": "value"},
                    data=data
                )
            else:
                data = Data()
                for s in index_to_columns.values():
                    if not data[s.push_child]:
                        data[s.push_child] = s.pull(result.data[0])
                    else:
                        data[s.push_child] += [s.pull(result.data[0])]
                output = Data(
                    meta={"format": "value"},
                    data=unwrap(data)
                )
        else:
            data = []
            for record in result.data:
                row = Data()
                for c in index_to_columns.values():
                    if c.push_child == ".":
                        row[c.push_name] = c.pull(record)
                    elif c.num_push_columns:
                        tuple_value = row[c.push_name]
                        if not tuple_value:
                            tuple_value = row[c.push_name] = [None] * c.num_push_columns
                        tuple_value[c.push_child] = c.pull(record)
                    else:
                        row[c.push_name][c.push_child] = c.pull(record)

                data.append(row)

            output = Data(
                meta={"format": "list"},
                data=data
            )
    else:
        Log.error("unknown format {{format}}", format=query.format)

    return output
def data():
    """
    Generate the result records for the aggregation.

    Each record is a list: one decoded value per decoder (from `d.get_value()`),
    followed by one slot per entry of `all_selects`. Slots are filled by `union()`
    and fall back to the select's `default`.

    When the query is sorted and has no groupby, every coordinate of the cube is
    emitted in the order given by `is_sent._all_combos()`, with default-only records
    for the coordinates that have no aggregate. Otherwise records are emitted as
    they arrive, and (when there is no groupby) the cells never seen are emitted last.

    Reads `dims`, `query`, `aggs`, `es_query`, `decoders`, `all_selects`, `name2index`
    and `rank` from the enclosing scope.
    """
    is_sent = Matrix(dims=dims)
    give_me_zeros = query.sort and not query.groupby
    if give_me_zeros:
        # WE REQUIRE THE ZEROS FOR SORTING
        all_coord = is_sent._all_combos()  # TRACK THE EXPECTED COMBINATIONS

        def next_expected():
            # NEXT EXPECTED COORDINATE (REVERSED), OR None WHEN THERE ARE NO MORE
            combo = next(all_coord, None)
            return None if combo is None else combo[::-1]

        ordered_coord = next_expected()
        if ordered_coord is None:
            # NO COMBINATIONS AT ALL; END EXPLICITLY RATHER THAN LEAK StopIteration
            return
        output = None
        for row, coord, agg, ss in aggs_iterator(aggs, es_query, decoders):
            if coord != ordered_coord:
                # output HAS BEEN YIELDED, BUT SET THE DEFAULT VALUES
                if output is not None:
                    for s in all_selects:
                        i = name2index[s.name]
                        if output[i] is None:
                            output[i] = s.default
                    # WE CAN GET THE SAME coord MANY TIMES, SO ONLY ADVANCE WHEN NOT
                    ordered_coord = next_expected()
                    if ordered_coord is None:
                        return

                while coord != ordered_coord:
                    # HAPPENS WHEN THE coord IS AHEAD OF ordered_coord
                    record = [
                        d.get_value(ordered_coord[i])
                        for i, d in enumerate(decoders)
                    ] + [s.default for s in all_selects]
                    yield record
                    ordered_coord = next_expected()
                    if ordered_coord is None:
                        return
            # coord == ordered_coord
            output = [
                d.get_value(c) for c, d in zip(coord, decoders)
            ] + [None for s in all_selects]
            for select in ss:
                v = select.pull(agg)
                if v != None:
                    union(output, name2index[select.name], v, select.aggregate)
            yield output
    else:
        last_coord = None  # HANG ONTO THE output FOR A BIT WHILE WE FILL THE ELEMENTS
        output = None
        for row, coord, agg, ss in aggs_iterator(aggs, es_query, decoders):
            if coord != last_coord:
                if output:
                    # SET DEFAULTS
                    for i, s in enumerate(all_selects):
                        v = output[rank + i]
                        if v == None:
                            output[rank + i] = s.default
                    yield output
                output = is_sent[coord]
                if output == None:
                    output = is_sent[coord] = [
                        d.get_value(c) for c, d in zip(coord, decoders)
                    ] + [None for _ in all_selects]
                last_coord = coord
            # THIS IS A TRICK!  WE WILL UPDATE A ROW THAT WAS ALREADY YIELDED
            for select in ss:
                v = select.pull(agg)
                if v != None:
                    union(output, name2index[select.name], v, select.aggregate)

        if output:
            # SET DEFAULTS ON LAST ROW
            for i, s in enumerate(all_selects):
                v = output[rank + i]
                if v == None:
                    output[rank + i] = s.default
            yield output

        # EMIT THE MISSING CELLS IN THE CUBE
        if not query.groupby:
            for coord, output in is_sent:
                if output == None:
                    record = [
                        d.get_value(c) for c, d in zip(coord, decoders)
                    ] + [s.default for s in all_selects]
                    yield record
def es_setop(es, mvel, query):
    """
    Run a set operation (no edges) against ES and return the selected values.

    :param es: passed through to `es09.util.post()`
    :param mvel: used to generate the facet script (`mvel.code()`) when a script is required
    :param query: the query; `select`, `where`, `sort`, `limit` and `frum.name` are read
    :return: Data with `meta.esquery` (the request that was sent) and `data`
             (the hits' `_source`, the hits' `fields`, or a Cube of unpacked facet terms)
    """
    FromES = es09.util.build_es_query(query)
    select = listwrap(query.select)

    isDeep = len(split_field(query.frum.name)) > 1  # LOOKING INTO NESTED WILL REQUIRE A SCRIPT
    isComplex = OR([s.value == None and s.aggregate not in ("count", "none") for s in select])  # CONVERTING esfilter DEFINED PARTS WILL REQUIRE SCRIPT

    # NOTE(review): `and` BINDS TIGHTER THAN `or`, SO A LEADING "*" SELECT MATCHES EVEN WHEN
    # THERE ARE SEVERAL SELECTS.  THE PARENTHESES ONLY SPELL OUT THE EXISTING PRECEDENCE - CONFIRM INTENT
    whole_document = (len(select) == 1 and not select[0].value) or select[0].value == "*"

    if not isDeep and not isComplex:
        if whole_document:
            FromES = wrap({
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": simplify_esfilter(jx_expression(query.where).to_esfilter())
                }},
                "sort": query.sort,
                "size": 1
            })
        elif all(isinstance(v, Variable) for v in select.value):
            FromES = wrap({
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": simplify_esfilter(query.where.to_esfilter())
                }},
                "fields": select.value,
                "sort": query.sort,
                "size": coalesce(query.limit, 200000)
            })
    elif not isDeep:
        simple_query = query.copy()
        simple_query.where = TRUE_FILTER  # THE FACET FILTER IS FASTER
        FromES.facets.mvel = {
            "terms": {
                "script_field": mvel.code(simple_query),
                "size": coalesce(simple_query.limit, 200000)
            },
            "facet_filter": simplify_esfilter(jx_expression(query.where).to_esfilter())
        }
    else:
        FromES.facets.mvel = {
            "terms": {
                "script_field": mvel.code(query),
                "size": coalesce(query.limit, 200000)
            },
            "facet_filter": simplify_esfilter(jx_expression(query.where).to_esfilter())
        }

    data = es09.util.post(es, FromES, query.limit)

    if whole_document:
        # SPECIAL CASE FOR SINGLE COUNT
        cube = wrap(data).hits.hits._source
    elif isinstance(select[0].value, Variable):
        # SPECIAL CASE FOR SINGLE TERM
        cube = wrap(data).hits.hits.fields
    else:
        data_list = unpack_terms(data.facets.mvel, select)
        if not data_list:
            cube = Cube(select, [], {s.name: Matrix.wrap([]) for s in select})
        else:
            # MATERIALIZE: zip() MAY RETURN AN ITERATOR, WHICH CAN NOT BE INDEXED BY output[i]
            output = list(zip(*data_list))
            cube = Cube(select, [], {s.name: Matrix(list=output[i]) for i, s in enumerate(select)})

    return Data(
        meta={"esquery": FromES},
        data=cube
    )
def query(self, query):
    """
    Run a JSON query expression against this table and shape the result.

    :param query:  JSON Query Expression, SET `format="container"` TO MAKE NEW TABLE OF RESULT
    :return: depends on the query:
        * no groupby, no edges and no aggregates: whatever `self._set_op()` returns
        * format "container": a QueryTable over the newly created table
        * format "cube" (or no format, with edges): Data with meta.format == "cube"
        * format "table" (or no format, with groupby): Data with header and row tuples
        * format "list" (or no edges and no groupby): Data with meta.format "list",
          or "value" when the select is aggregated
    An unrecognized format is reported through Log.error().
    """
    if not startswith_field(query['from'], self.name):
        Log.error("Expecting table, or some nested table")
    frum, query['from'] = query['from'], self
    query = QueryOp.wrap(query, self.columns)

    # TYPE CONFLICTS MUST NOW BE RESOLVED DURING
    # TYPE-SPECIFIC QUERY NORMALIZATION

    new_table = "temp_" + unique_name()

    if query.format == "container":
        create_table = "CREATE TABLE " + quote_table(new_table) + " AS "
    else:
        create_table = ""

    if query.groupby:
        op, index_to_columns = self._groupby_op(query, frum)
        command = create_table + op
    elif query.edges or any(a != "none" for a in listwrap(query.select).aggregate):
        op, index_to_columns = self._edges_op(query, frum)
        command = create_table + op
    else:
        op = self._set_op(query, frum)
        return op

    if query.sort:
        # EACH SORT TERM IS EMITTED AS `(<sql>) IS NULL` FOLLOWED BY `<sql>`,
        # ONCE FOR EVERY KEY IN "bns" THAT HAS SQL
        command += "\nORDER BY " + ",\n".join(
            "(" + sql[t] + ") IS NULL" + (" DESC" if s.sort == -1 else "") + ",\n" +
            sql[t] + (" DESC" if s.sort == -1 else "")
            for s, sql in [(s, s.value.to_sql(self)[0].sql) for s in query.sort]
            for t in "bns" if sql[t]
        )

    result = self.db.query(command)

    column_names = query.edges.name + query.groupby.name + listwrap(query.select).name
    if query.format == "container":
        output = QueryTable(new_table, db=self.db, uid=self.uid, exists=True)
    elif query.format == "cube" or (not query.format and query.edges):
        if len(query.edges) == 0 and len(query.groupby) == 0:
            # NO DIMENSIONS: THE FIRST RESULT ROW HOLDS ALL THE VALUES
            data = {n: Data() for n in column_names}
            for s in index_to_columns.values():
                data[s.push_name][s.push_child] = unwrap(s.pull(result.data[0]))
            return Data(
                data=unwrap(data),
                meta={"format": "cube"}
            )

        if not result.data:
            # NO ROWS: BUILD THE EDGES FROM THE DECLARED DOMAINS ONLY
            edges = []
            dims = []
            for i, e in enumerate(query.edges + query.groupby):
                allowNulls = coalesce(e.allowNulls, True)

                if e.domain.type == "set" and e.domain.partitions:
                    domain = SimpleSetDomain(partitions=e.domain.partitions.name)
                elif e.domain.type == "range":
                    domain = e.domain
                elif isinstance(e.value, TupleOp):
                    pulls = jx.sort([c for c in index_to_columns.values() if c.push_name == e.name], "push_child").pull
                    parts = [tuple(p(d) for p in pulls) for d in result.data]
                    domain = SimpleSetDomain(partitions=jx.sort(set(parts)))
                else:
                    domain = SimpleSetDomain(partitions=[])

                dims.append(1 if allowNulls else 0)
                edges.append(Data(
                    name=e.name,
                    allowNulls=allowNulls,
                    domain=domain
                ))

            zeros = [
                0 if s.aggregate == "count" and index_to_columns[si].push_child == "." else Data
                for si, s in enumerate(listwrap(query.select))
            ]
            data = {
                s.name: Matrix(dims=dims, zeros=zeros[si])
                for si, s in enumerate(listwrap(query.select))
            }

            if isinstance(query.select, list):
                select = [{"name": s.name} for s in query.select]
            else:
                select = {"name": query.select.name}

            return Data(
                meta={"format": "cube"},
                edges=edges,
                select=select,
                data={k: v.cube for k, v in data.items()}
            )

        columns = None  # RESULT ROWS TRANSPOSED INTO COLUMNS, BUILT ONLY WHEN NEEDED

        edges = []
        dims = []
        for g in query.groupby:
            g.is_groupby = True

        for i, e in enumerate(query.edges + query.groupby):
            allowNulls = coalesce(e.allowNulls, True)

            if e.domain.type == "set" and e.domain.partitions:
                domain = SimpleSetDomain(partitions=e.domain.partitions.name)
            elif e.domain.type == "range":
                domain = e.domain
            elif e.domain.type == "time":
                domain = wrap(mo_json.scrub(e.domain))
            elif e.domain.type == "duration":
                domain = wrap(mo_json.scrub(e.domain))
            elif isinstance(e.value, TupleOp):
                pulls = jx.sort([c for c in index_to_columns.values() if c.push_name == e.name], "push_child").pull
                parts = [tuple(p(d) for p in pulls) for d in result.data]
                domain = SimpleSetDomain(partitions=jx.sort(set(parts)))
            else:
                if columns is None:
                    # MATERIALIZE: zip() MAY RETURN AN ITERATOR, WHICH CAN NOT BE INDEXED
                    columns = list(zip(*result.data))
                parts = set(columns[i])
                if e.is_groupby and None in parts:
                    allowNulls = True
                parts -= {None}
                domain = SimpleSetDomain(partitions=jx.sort(parts))

            dims.append(len(domain.partitions) + (1 if allowNulls else 0))
            edges.append(Data(
                name=e.name,
                allowNulls=allowNulls,
                domain=domain
            ))

        zeros = [
            0 if s.aggregate == "count" and index_to_columns[si].push_child == "." else Data
            for si, s in enumerate(listwrap(query.select))
        ]
        data_cubes = {
            s.name: Matrix(dims=dims, zeros=zeros[si])
            for si, s in enumerate(listwrap(query.select))
        }

        r2c = index_to_coordinate(dims)  # WORKS BECAUSE THE DATABASE SORTED THE EDGES TO CONFORM
        for rownum, row in enumerate(result.data):
            coord = r2c(rownum)

            for s in index_to_columns.values():
                if s.is_edge:
                    continue
                if s.push_child == ".":
                    data_cubes[s.push_name][coord] = s.pull(row)
                else:
                    data_cubes[s.push_name][coord][s.push_child] = s.pull(row)

        if isinstance(query.select, list):
            select = [{"name": s.name} for s in query.select]
        else:
            select = {"name": query.select.name}

        return Data(
            meta={"format": "cube"},
            edges=edges,
            select=select,
            data={k: v.cube for k, v in data_cubes.items()}
        )
    elif query.format == "table" or (not query.format and query.groupby):
        data = []
        for d in result.data:
            row = [None for _ in column_names]
            for s in index_to_columns.values():
                if s.push_child == ".":
                    row[s.push_column] = s.pull(d)
                elif s.num_push_columns:
                    # TUPLE VALUE: push_child IS THE POSITION INSIDE THE TUPLE
                    tuple_value = row[s.push_column]
                    if tuple_value == None:
                        tuple_value = row[s.push_column] = [None] * s.num_push_columns
                    tuple_value[s.push_child] = s.pull(d)
                elif row[s.push_column] == None:
                    row[s.push_column] = Data()
                    row[s.push_column][s.push_child] = s.pull(d)
                else:
                    row[s.push_column][s.push_child] = s.pull(d)
            data.append(tuple(unwrap(r) for r in row))

        output = Data(
            meta={"format": "table"},
            header=column_names,
            data=data
        )
    elif query.format == "list" or (not query.edges and not query.groupby):
        if not query.edges and not query.groupby and any(listwrap(query.select).aggregate):
            # AGGREGATE WITHOUT DIMENSIONS: A SINGLE "value", TAKEN FROM THE FIRST ROW
            if isinstance(query.select, list):
                data = Data()
                for c in index_to_columns.values():
                    if c.push_child == ".":
                        data[c.push_name] = c.pull(result.data[0])
                    else:
                        data[c.push_name][c.push_child] = c.pull(result.data[0])

                output = Data(
                    meta={"format": "value"},
                    data=data
                )
            else:
                data = Data()
                for s in index_to_columns.values():
                    data[s.push_child] = s.pull(result.data[0])

                output = Data(
                    meta={"format": "value"},
                    data=unwrap(data)
                )
        else:
            data = []
            for record in result.data:
                row = Data()
                for c in index_to_columns.values():
                    if c.push_child == ".":
                        row[c.push_name] = c.pull(record)
                    elif c.num_push_columns:
                        tuple_value = row[c.push_name]
                        if not tuple_value:
                            tuple_value = row[c.push_name] = [None] * c.num_push_columns
                        tuple_value[c.push_child] = c.pull(record)
                    else:
                        row[c.push_name][c.push_child] = c.pull(record)

                data.append(row)

            output = Data(
                meta={"format": "list"},
                data=data
            )
    else:
        Log.error("unknown format {{format}}", format=query.format)

    return output
def es_terms(es, mvel, query):
    """
    BUILD ONE TERMS FACET PER SELECT, POST IT, AND RETURN THE RESULT AS A Cube

    EVERY FACET IS NAMED AFTER ITS SELECT; ALL EDGES ARE PACKED INTO A SINGLE
    TERM (SEE compileEdges2Term) TO MINIMIZE THE CROSS-PRODUCT EXPLOSION.
    QUERIES WITH EXACTLY TWO EDGES ARE DELEGATED TO _es_terms2()

    :param es: passed through to es_post()
    :param mvel: passed through to compileEdges2Term()
    :param query: the query; select, edges, where and limit are read
    :return: Cube of the select values over query.edges, with .query set
    """
    if len(query.edges) == 2:
        return _es_terms2(es, mvel, query)

    select = listwrap(query.select)
    FromES = build_es_query(query)
    packed_term = compileEdges2Term(mvel, query.edges, wrap([]))
    for sel in select:
        terms_spec = {
            "field": packed_term.field,
            "script_field": packed_term.expression,
            "size": coalesce(query.limit, 200000)
        }
        FromES.facets[sel.name] = {
            "terms": terms_spec,
            "facet_filter": simplify_esfilter(query.where)
        }

    term2Parts = packed_term.term2parts

    data = es_post(es, FromES, query.limit)

    # GETTING ALL PARTS WILL EXPAND THE EDGES' DOMAINS
    # BUT HOW TO UNPACK IT FROM THE term FASTER IS UNKNOWN
    for _, found in data.facets.items():
        for entry in found.terms:
            term2Parts(entry.term)

    # NUMBER ALL EDGES FOR jx INDEXING
    for position, edge in enumerate(query.edges):
        edge.index = position
        if edge.domain.type not in ("uid", "default"):
            continue
        for p, part in enumerate(edge.domain.partitions):
            part.dataIndex = p
        edge.domain.NULL.dataIndex = len(edge.domain.partitions)

    # MAKE CUBE
    dims = [
        len(edge.domain.partitions) + (1 if edge.allowNulls else 0)
        for edge in query.edges
    ]
    output = {}
    for sel in select:
        output[sel.name] = Matrix(*dims)

    # FILL CUBE
    # EXPECTING ONLY SELECT CLAUSE FACETS
    for _, facet in data.facets.items():
        for term in facet.terms:
            term_coord = term2Parts(term.term).dataIndex
            for sel in select:
                try:
                    output[sel.name][term_coord] = term[aggregates[sel.aggregate]]
                except Exception:
                    # USUALLY CAUSED BY output[s.name] NOT BEING BIG ENOUGH TO HANDLE NULL COUNTS
                    pass

    cube = Cube(query.select, query.edges, output)
    cube.query = query
    return cube
def data():
    """
    Generate the result records for the aggregation.

    Each record is a list: one decoded value per decoder (from `d.get_value()`),
    followed by one slot per entry of `all_selects`. Slots are filled by `union()`
    and fall back to the select's `default`.

    When the query is sorted and has no groupby, every coordinate of the cube is
    emitted in the order given by `is_sent._all_combos()`, with default-only records
    for the coordinates that have no aggregate. Otherwise records are emitted as
    they arrive, and (when there is no groupby) the cells never seen are emitted last.

    Reads `dims`, `query`, `aggs`, `es_query`, `decoders`, `all_selects`, `name2index`
    and `rank` from the enclosing scope.
    """
    is_sent = Matrix(dims=dims)
    give_me_zeros = query.sort and not query.groupby
    if give_me_zeros:
        # WE REQUIRE THE ZEROS FOR SORTING
        all_coord = is_sent._all_combos()  # TRACK THE EXPECTED COMBINATIONS

        def next_expected():
            # NEXT EXPECTED COORDINATE (REVERSED), OR None WHEN THERE ARE NO MORE
            combo = next(all_coord, None)
            return None if combo is None else combo[::-1]

        ordered_coord = next_expected()
        if ordered_coord is None:
            # NO COMBINATIONS AT ALL; END EXPLICITLY RATHER THAN LEAK StopIteration
            return
        output = None
        for row, coord, agg, ss in aggs_iterator(aggs, es_query, decoders):
            if coord != ordered_coord:
                # output HAS BEEN YIELDED, BUT SET THE DEFAULT VALUES
                if output is not None:
                    for s in all_selects:
                        i = name2index[s.name]
                        if output[i] is None:
                            output[i] = s.default
                    # WE CAN GET THE SAME coord MANY TIMES, SO ONLY ADVANCE WHEN NOT
                    ordered_coord = next_expected()
                    if ordered_coord is None:
                        return

                while coord != ordered_coord:
                    # HAPPENS WHEN THE coord IS AHEAD OF ordered_coord
                    record = [d.get_value(ordered_coord[i]) for i, d in enumerate(decoders)] + [s.default for s in all_selects]
                    yield record
                    ordered_coord = next_expected()
                    if ordered_coord is None:
                        return
            # coord == ordered_coord
            output = [d.get_value(c) for c, d in zip(coord, decoders)] + [None for s in all_selects]
            for select in ss:
                v = select.pull(agg)
                if v != None:
                    union(output, name2index[select.name], v, select.aggregate)
            yield output
    else:
        last_coord = None  # HANG ONTO THE output FOR A BIT WHILE WE FILL THE ELEMENTS
        output = None
        for row, coord, agg, ss in aggs_iterator(aggs, es_query, decoders):
            if coord != last_coord:
                if output:
                    # SET DEFAULTS
                    for i, s in enumerate(all_selects):
                        v = output[rank+i]
                        if v == None:
                            output[rank+i] = s.default
                    yield output
                output = is_sent[coord]
                if output == None:
                    output = is_sent[coord] = [d.get_value(c) for c, d in zip(coord, decoders)] + [None for _ in all_selects]
                last_coord = coord
            # THIS IS A TRICK!  WE WILL UPDATE A ROW THAT WAS ALREADY YIELDED
            for select in ss:
                v = select.pull(agg)
                if v != None:
                    union(output, name2index[select.name], v, select.aggregate)

        if output:
            # SET DEFAULTS ON LAST ROW
            for i, s in enumerate(all_selects):
                v = output[rank+i]
                if v == None:
                    output[rank+i] = s.default
            yield output

        # EMIT THE MISSING CELLS IN THE CUBE
        if not query.groupby:
            for coord, output in is_sent:
                if output == None:
                    record = [d.get_value(c) for c, d in zip(coord, decoders)] + [s.default for s in all_selects]
                    yield record
def list_aggs(frum, query):
    """
    AGGREGATE A LIST OF RECORDS OVER THE QUERY'S EDGES

    :param frum: the records; wrapped before use
    :param query: the query; select, edges and where are read, and any edge
                  with a DefaultDomain gets its domain replaced in place
    :return: Cube of the finished accumulator values (`.end()`) over query.edges
    """
    frum = wrap(frum)
    select = listwrap(query.select)

    # REPLACE EACH DefaultDomain WITH THE SET OF VALUES ACTUALLY FOUND IN frum
    for edge in query.edges:
        if not isinstance(edge.domain, DefaultDomain):
            continue
        get_value = jx_expression_to_function(edge.value)
        found = set(map(get_value, frum))
        if None in found:
            edge.allowNulls = coalesce(edge.allowNulls, True)
            found.discard(None)
        edge.domain = SimpleSetDomain(partitions=sorted(found))

    s_accessors = [
        (ss.name, compile_expression(ss.value.to_python()))
        for ss in select
    ]

    # ONE MATRIX OF ACCUMULATORS PER SELECT
    result = {
        s.name: Matrix(
            dims=[
                len(e.domain.partitions) + (1 if e.allowNulls else 0)
                for e in query.edges
            ],
            zeros=lambda: windows.name2accumulator.get(s.aggregate)(**s)
        )
        for s in select
    }
    where = jx_expression_to_function(query.where)
    coord = [None] * len(query.edges)
    edge_accessor = [(i, make_accessor(e)) for i, e in enumerate(query.edges)]

    net_new_edge_names = set(wrap(query.edges).name) - UNION(
        e.value.vars() for e in query.edges
    )
    # WHEN TRUE, s_accessor NEEDS THESE EDGES, SO WE PASS THEM ANYWAY (THE OTHER PATH IS FASTER)
    pass_edges = bool(net_new_edge_names & UNION(ss.value.vars() for ss in select))

    for d in filter(where, frum):
        if pass_edges:
            d = d.copy()
        for c, get_matches in edge_accessor:
            coord[c] = get_matches(d)

        for s_name, s_accessor in s_accessors:
            mat = result[s_name]
            for c in itertools.product(*coord):
                acc = mat[c]
                if pass_edges:
                    for edge, cc in zip(query.edges, c):
                        d[edge.name] = edge.domain.partitions[cc]
                acc.add(s_accessor(d, c, frum))

    # REPLACE EVERY ACCUMULATOR WITH ITS FINAL VALUE
    for s in select:
        m = result[s.name]
        for c, var in m.items():
            if var != None:
                m[c] = var.end()

    from jx_python.containers.cube import Cube

    return Cube(select, query.edges, result)
def es_setop(es, mvel, query):
    """
    Run a set operation (no edges) against ES and return the selected values.

    :param es: passed through to `es_post()`
    :param mvel: used to generate the facet script (`mvel.code()`) when a script is required
    :param query: the query; `select`, `where`, `sort`, `limit` and `frum.name` are read
    :return: Data with `meta.esquery` (the request that was sent) and `data`
             (the hits' `_source`, the hits' `fields`, or a Cube of unpacked facet terms)
    """
    FromES = es09.util.build_es_query(query)
    select = listwrap(query.select)

    # LOOKING INTO NESTED WILL REQUIRE A SCRIPT
    isDeep = len(split_field(query.frum.name)) > 1
    # CONVERTING esfilter DEFINED PARTS WILL REQUIRE SCRIPT
    isComplex = OR([
        s.value == None and s.aggregate not in ("count", "none")
        for s in select
    ])

    def only_leaves():
        # TRUE WHEN THE SINGLE SELECT IS A LeavesOp
        return len(select) == 1 and isinstance(select[0].value, LeavesOp)

    if isDeep or isComplex:
        # A SCRIPTED TERMS FACET IS REQUIRED
        if isDeep:
            script_query = query
        else:
            script_query = query.copy()
            script_query.where = TRUE  # THE FACET FILTER IS FASTER
        FromES.facets.mvel = {
            "terms": {
                "script_field": mvel.code(script_query),
                "size": coalesce(script_query.limit, 200000)
            },
            "facet_filter": jx_expression(query.where).to_esfilter()
        }
    elif only_leaves():
        bool_clause = {
            "query": {"match_all": {}},
            "filter": query.where.to_esfilter()
        }
        FromES = wrap({
            "query": {"bool": bool_clause},
            "sort": query.sort,
            "size": 0
        })
    elif all(isinstance(v, Variable) for v in select.value):
        bool_clause = {
            "query": {"match_all": {}},
            "filter": query.where.to_esfilter()
        }
        FromES = wrap({
            "query": {"bool": bool_clause},
            "fields": select.value,
            "sort": query.sort,
            "size": coalesce(query.limit, 200000)
        })

    data = es_post(es, FromES, query.limit)

    if only_leaves():
        # SPECIAL CASE FOR SINGLE COUNT
        cube = wrap(data).hits.hits._source
    elif isinstance(select[0].value, Variable):
        # SPECIAL CASE FOR SINGLE TERM
        cube = wrap(data).hits.hits.fields
    else:
        data_list = unpack_terms(data.facets.mvel, select)
        if data_list:
            output = transpose(*data_list)
            matrices = {
                s.name: Matrix(list=output[i])
                for i, s in enumerate(select)
            }
        else:
            matrices = {s.name: Matrix.wrap([]) for s in select}
        cube = Cube(select, [], matrices)

    return Data(meta={"esquery": FromES}, data=cube)
def cube_aggs(frum, query):
    """
    AGGREGATE THE CELLS OF A CUBE OVER THE QUERY'S EDGES

    :param frum: the source cube; its select, edges and values() are read
    :param query: the query; select, edges and where are read.  Edges that name
                  one of frum's edges are modified in place (value, and for a
                  DefaultDomain also the domain)
    :return: Cube of the aggregated values over query.edges
    """
    select = listwrap(query.select)

    # MATCH EDGES IN QUERY TO ONES IN frum
    for e in query.edges:
        for fs in frum.select:
            if fs.name == e.value:
                Log.error("Not implemented yet")
        if isinstance(e.domain, DefaultDomain):
            # DEFAULT DOMAINS CAN EASILY BE LOOKED UP FROM frum
            for fe in frum.edges:
                if fe.name == e.value:
                    e.domain = SimpleSetDomain(**fe.domain.__data__())
                    e.value = e.value + "." + fe.domain.key
                    break
        else:
            for fe in frum.edges:
                if fe.name == e.value:
                    e.value = e.value + "." + fe.domain.key
                    break

    result = {
        s.name: Matrix(
            dims=[
                len(e.domain.partitions) + (1 if e.allowNulls else 0)
                for e in query.edges
            ],
            zeros=s.default
        )
        for s in select
    }
    where = jx_expression_to_function(query.where)

    # COMPILE EACH SELECT EXPRESSION ONCE, NOT ONCE PER ROW
    compiled = [(s, jx_expression_to_function(s.value)) for s in select]

    for d in filter(where, frum.values()):
        coord = []  # LIST OF MATCHING COORDINATE FAMILIES, USUALLY ONLY ONE PER FAMILY BUT JOINS WITH EDGES CAN CAUSE MORE
        for e in query.edges:
            matches = get_matches(e, d)
            coord.append(matches)
            if len(matches) == 1 and d[e.name] == None:
                d[e.name] = e.domain.partitions[matches[0]]

        # EVERY CELL THIS ROW CONTRIBUTES TO; THE SAME FOR ALL SELECTS
        cells = list(itertools.product(*coord))

        for s, expr in compiled:
            mat = result[s.name]
            agg = s.aggregate
            var = s.value
            val = expr(d)
            if agg == "count":
                # COUNT EVERY ROW WHEN NO SPECIFIC VALUE IS SELECTED, OTHERWISE ONLY NON-NULL VALUES
                if var == "." or var == None or val != None:
                    for c in cells:
                        mat[c] += 1
            else:
                for c in cells:
                    acc = mat[c]
                    if acc == None:
                        # FIRST VALUE FOR THIS CELL: MAKE ITS ACCUMULATOR
                        acc = windows.name2accumulator.get(agg)
                        if acc == None:
                            Log.error("select aggregate {{agg}} is not recognized", agg=agg)
                        acc = acc(**s)
                        mat[c] = acc
                    acc.add(val)

    # REPLACE EVERY ACCUMULATOR WITH ITS FINAL VALUE (COUNTS ARE ALREADY NUMBERS)
    for s in select:
        if s.aggregate == "count":
            continue
        m = result[s.name]
        for c, var in m.items():
            if var != None:
                m[c] = var.end()

    from jx_python.containers.cube import Cube

    return Cube(select, query.edges, result)