def format_cube(T, select, query=None):
    """
    RETURN A Cube WITH ONE rownum EDGE AND ONE Matrix PER HEADER COLUMN OF
    THE TABLE MADE BY format_table(T, select, query)
    """
    table = format_table(T, select, query)

    def rownum_edges(upper):
        # THE SINGLE EDGE THAT NUMBERS THE ROWS FROM 0 TO upper
        return [{
            "name": "rownum",
            "domain": {
                "type": "rownum",
                "min": 0,
                "max": upper,
                "interval": 1
            }
        }]

    if len(table.data) == 0:
        # NO ROWS: EVERY COLUMN IS AN EMPTY Matrix
        empty_columns = {}
        for header in table.header:
            empty_columns[header] = Matrix(list=[])
        return Cube(select, edges=rownum_edges(0), data=empty_columns)

    # TRANSPOSE THE ROWS INTO COLUMNS
    cols = zip(*unwrap(table.data))
    columns = {}
    for index, header in enumerate(table.header):
        columns[header] = Matrix(list=cols[index])
    return Cube(select, edges=rownum_edges(len(table.data)), data=columns)
def es_fieldop(es, query):
    """
    REQUEST THE SELECTED FIELDS FROM ES, THEN COLLECT ONE Matrix PER SELECT
    CLAUSE (KEYED BY THE CLAUSE name) FROM THE RETURNED HITS

    NOTE: THE CODE VISIBLE HERE ENDS ONCE matricies IS FILLED
    """
    FromES = es09.util.build_es_query(query)
    select = listwrap(query.select)
    es_filter = simplify_esfilter(jx_expression(query.where).to_esfilter())
    FromES.query = {
        "filtered": {
            "query": {"match_all": {}},
            "filter": es_filter
        }
    }
    FromES.size = coalesce(query.limit, 200000)

    # GATHER THE FIELD NAMES TO REQUEST; "*" SETS fields TO None
    FromES.fields = DictList()
    for value in select.value:
        if value == "*":
            FromES.fields = None
        elif isinstance(value, list):
            FromES.fields.extend(value)
        elif isinstance(value, Mapping):
            FromES.fields.extend(value.values())
        else:
            FromES.fields.append(value)

    ordering = []
    for clause in query.sort:
        ordering.append({clause.field: "asc" if clause.sort >= 0 else "desc"})
    FromES.sort = ordering

    data = es09.util.post(es, FromES, query.limit)
    hits = data.hits.hits

    def field_of(hit, name):
        # ONE REQUESTED FIELD OF A HIT, None WHEN IT IS MISSING
        return unwrap(hit.fields).get(name, None)

    matricies = {}
    for s in select:
        if s.value == "*":
            matricies[s.name] = Matrix.wrap([hit._source for hit in hits])
        elif isinstance(s.value, Mapping):
            matricies[s.name] = Matrix.wrap([
                {k: field_of(hit, v) for k, v in s.value.items()}
                for hit in hits
            ])
        elif isinstance(s.value, list):
            matricies[s.name] = Matrix.wrap([
                tuple(field_of(hit, ss) for ss in s.value)
                for hit in hits
            ])
        elif not s.value:
            matricies[s.name] = Matrix.wrap([field_of(hit, s.value) for hit in hits])
        else:
            try:
                matricies[s.name] = Matrix.wrap([field_of(hit, s.value) for hit in hits])
            except Exception as e:
                Log.error("", e)
def __init__(self, select, edges, data, frum=None):
    """
    data IS EXPECTED TO BE A dict TO MATRICES, BUT OTHER COLLECTIONS ARE
    ALLOWED, USING THE select AND edges TO DESCRIBE THE data

    :param select: A SINGLE SELECT CLAUSE (MAKES A VALUE CUBE) OR A list OF THEM
    :param edges: EDGE DESCRIPTIONS; WHEN FALSY THE EDGES ARE DERIVED FROM data
    :param data: dict OF Matrix, OR A list, A Matrix, OR A SINGLE VALUE
    :param frum: NOT READ BY THIS CONSTRUCTOR
    """
    self.is_value = not isinstance(select, list)
    self.select = select
    self.meta = Dict(format="cube")  # PUT EXTRA MARKUP HERE
    self.is_none = False

    # ONLY A Mapping HAS values(); list/Matrix/SCALAR data ARE HANDLED BELOW
    if isinstance(data, Mapping) and not all(data.values()):
        # THIS USED TO ASSIGN A LOCAL NAME, SO THE FLAG NEVER REACHED THE INSTANCE
        self.is_none = True

    # ENSURE frum IS PROPER FORM
    if isinstance(select, list):
        if edges and OR(not isinstance(v, Matrix) for v in data.values()):
            Log.error("Expecting data to be a dict with Matrix values")

    if not edges:
        if not data:
            if isinstance(select, list):
                Log.error("not expecting a list of records")

            data = {select.name: Matrix.ZERO}
            self.edges = DictList.EMPTY
        elif isinstance(data, Mapping):
            # EXPECTING NO MORE THAN ONE rownum EDGE IN THE DATA
            length = MAX([len(v) for v in data.values()])
            if length >= 1:
                self.edges = wrap([{"name": "rownum", "domain": {"type": "rownum"}}])
            else:
                self.edges = DictList.EMPTY
        elif isinstance(data, list):
            if isinstance(select, list):
                Log.error("not expecting a list of records")

            # COUNT THE ROWS BEFORE data IS REBOUND TO A ONE-KEY dict,
            # OTHERWISE "max" WOULD ALWAYS BE 1
            num_rows = len(data)
            data = {select.name: Matrix.wrap(data)}
            self.edges = wrap([{"name": "rownum", "domain": {"type": "rownum", "min": 0, "max": num_rows, "interval": 1}}])
        elif isinstance(data, Matrix):
            if isinstance(select, list):
                Log.error("not expecting a list of records")

            # NOTE(review): self.edges IS NOT ASSIGNED ON THIS PATH - confirm what the edges of a bare Matrix should be
            data = {select.name: data}
        else:
            if isinstance(select, list):
                Log.error("not expecting a list of records")

            data = {select.name: Matrix(value=data)}
            self.edges = DictList.EMPTY
    else:
        self.edges = wrap(edges)

    self.data = data
def window(self, window):
    """
    ADD A COLUMN NAMED window.name TO THIS CUBE; EACH CELL IS THE RESULT OF
    THE window.value ACCESSOR APPLIED TO A ROW BUILT FROM THE EXISTING
    COLUMNS AND THE EDGE PARTS AT THAT COORDINATE

    window.edges AND window.sort ARE NOT IMPLEMENTED.  RETURNS self
    """
    if window.edges or window.sort:
        Log.error("not implemented")

    from pyLibrary.queries import jx

    # SET OP
    canonical = self.data.values()[0]
    accessor = jx.get(window.value)
    column_names = self.data.keys()

    # ANNOTATE EXISTING CUBE WITH NEW COLUMN
    new_column = Matrix(dims=canonical.dims)
    self.data[window.name] = new_column
    for coord in canonical._all_combos():
        # IT IS SAD WE MUST HAVE A Dict(), THERE ARE {"script": expression} USING THE DOT NOTATION
        row = Dict()
        for name in column_names:
            row[name] = self.data[name][coord]
        for index, edge in zip(coord, self.edges):
            row[edge.name] = edge.domain.partitions[index]
        # DUMMY Null VALUES BECAUSE I DO NOT KNOW WHAT TO DO
        new_column[coord] = accessor(row, Null, Null)

    self.select.append(window)
    return self
def data():
    """
    GENERATOR OF ROWS (LISTS): THE DECODED EDGE VALUES FOLLOWED BY ONE
    PULLED VALUE PER SELECT.  WHEN THE QUERY HAS NO groupby, THE CELLS THAT
    WERE NEVER SEEN ARE EMITTED AFTERWARD (0 FOR count, None OTHERWISE)
    """
    sizes = []
    for e in new_edges:
        extra = 0 if e.allowNulls is False else 1
        sizes.append(len(e.domain.partitions) + extra)
    is_sent = Matrix(dims=tuple(sizes), zeros=0)

    for row, coord, agg in aggs_iterator(aggs, decoders):
        is_sent[coord] = 1
        output = [d.get_value(c) for c, d in zip(coord, decoders)]
        output.extend(_pull(s, agg) for s in select)
        yield output

    # EMIT THE MISSING CELLS IN THE CUBE
    if query.groupby:
        return
    for c, v in is_sent:
        if v:
            continue
        record = [d.get_value(c[i]) for i, d in enumerate(decoders)]
        for s in select:
            record.append(0 if s.aggregate == "count" else None)
        yield record
def post(sql):
    """
    RUN sql AS A COLUMN QUERY, TURN EVERY default DOMAIN INTO A set DOMAIN
    BUILT FROM THE VALUES RETURNED FOR THAT EDGE, THEN FILL ONE Matrix PER
    SELECT.  RETURNS THE LIST OF MATRICES WHEN query.select IS A list,
    OTHERWISE JUST THE FIRST ONE
    """
    # FIND OUT THE default DOMAIN SIZES
    result = self.db.column_query(sql)
    num_edges = len(edges)
    for column, edge in enumerate(edges):
        domain = edge.domain
        if domain.type == "default":
            domain.type = "set"
            parts = set(result[column])
            partitions = []
            index_of = {}
            for i, p in enumerate(parts):
                partitions.append({"index": i, "value": p})
                index_of[p] = i
            domain.partitions = partitions
            domain.map = index_of
        else:
            Log.error("Do not know what to do here, yet")

    # FILL THE DATA CUBE
    maps = [(unwrap(e.domain.map), result[i]) for i, e in enumerate(edges)]
    cubes = DictList()
    for c, s in enumerate(select):
        dims = [
            len(e.domain.partitions) + (1 if e.allow_nulls else 0)
            for e in edges
        ]
        data = Matrix(*dims)
        for rownum, value in enumerate(result[c + num_edges]):
            # TRANSLATE THE EDGE VALUES OF THIS ROW INTO PARTITION INDEXES
            data[[m[r[rownum]] for m, r in maps]] = value
        cubes.append(data)

    return cubes if isinstance(query.select, list) else cubes[0]
def data():
    """
    GENERATOR OF RECORDS (Dict): EDGE NAME -> DECODED VALUE, PLUS SELECT
    NAME -> PULLED VALUE.  WHEN THE QUERY HAS NO groupby, THE CELLS THAT
    WERE NEVER SEEN ARE EMITTED AFTERWARD, WITH count SELECTS SET TO 0
    """
    sizes = []
    for e in new_edges:
        extra = 0 if e.allowNulls is False else 1
        sizes.append(len(e.domain.partitions) + extra)
    is_sent = Matrix(dims=tuple(sizes), zeros=0)

    for row, coord, agg in aggs_iterator(aggs, decoders):
        is_sent[coord] = 1
        record = Dict()
        for edge, c, decoder in zip(query.edges, coord, decoders):
            record[edge.name] = decoder.get_value(c)
        for s in select:
            record[s.name] = _pull(s, agg)
        yield record

    # EMIT THE MISSING CELLS IN THE CUBE
    if query.groupby:
        return
    for c, v in is_sent:
        if v:
            continue
        record = Dict()
        for i, decoder in enumerate(decoders):
            record[query.edges[i].name] = decoder.get_value(c[i])
        for s in select:
            # ONLY count HAS A VALUE FOR A MISSING CELL
            if s.aggregate == "count":
                record[s.name] = 0
        yield record
def format_cube_from_aggop(decoders, aggs, start, query, select):
    """
    RETURN A ZERO-DIMENSIONAL Cube: ONE SINGLE-CELL Matrix PER SELECT,
    FILLED WITH THE VALUE PULLED FROM THE DRILLED-DOWN AGGREGATE
    """
    agg = drill(aggs)

    columns = {}
    for s in select:
        cell = Matrix(dims=[], zeros=s.default)
        cell[tuple()] = _pull(s, agg)
        columns[s.name] = cell

    cube = Cube(query.select, [], columns)
    cube.frum = query
    return cube
def __getitem__(self, item):
    """
    SLICE THE CUBE

    :param item: A Mapping OF edge name -> part value SELECTS ALONG THOSE
                 EDGES; A STRING SELECTS ONE COLUMN AS A VALUE CUBE
    :return: Null WHEN A REQUESTED PART VALUE IS NOT IN THE EDGE; A WRAPPED
             dict OF VALUES WHEN EVERY EDGE IS FIXED; OTHERWISE A Cube
    """
    # TODO: SOLVE FUNDAMENTAL QUESTION OF IF SELECTING A PART OF AN
    # EDGE REMOVES THAT EDGE FROM THIS RESULT, OR ADDS THE PART
    # AS A select {"name":edge.name, "value":edge.domain.partitions[coord]}
    # PROBABLY NOT, THE value IS IDENTICAL OVER THE REMAINING
    if isinstance(item, Mapping):
        coordinates = [None] * len(self.edges)

        # MAP DICT TO NUMERIC INDICES
        for name, v in item.items():
            matches = [(i, e.domain.partitions) for i, e in enumerate(self.edges) if e.name == name]
            # AN UNKNOWN EDGE NAME HAS NOTHING TO UNPACK; ROUTE IT TO THE
            # ERROR BELOW RATHER THAN FAILING ON THE TUPLE ASSIGNMENT
            if matches:
                ei, parts = wrap(matches)[0]
            else:
                ei, parts = None, None
            if not parts:
                Log.error(
                    "Can not find {{name}}=={{value|quote}} in list of edges, maybe this feature is not implemented yet",
                    name=name,
                    value=v
                )
            part = wrap([p for p in parts if p.value == v])[0]
            if not part:
                return Null
            else:
                coordinates[ei] = part.dataIndex

        # THE EDGES NOT FIXED BY item
        remaining = [e for e, v in zip(self.edges, coordinates) if v is None]
        if not remaining:
            # ZERO DIMENSIONAL VALUE
            return wrap({
                k: v.__getitem__(coordinates)
                for k, v in self.data.items()
            })
        else:
            output = Cube(
                select=self.select,
                edges=wrap(remaining),
                data={
                    k: Matrix(values=c.__getitem__(coordinates))
                    for k, c in self.data.items()
                }
            )
            return output
    elif isinstance(item, basestring):
        # RETURN A VALUE CUBE
        if self.is_value:
            if item != self.select.name:
                Log.error("{{name}} not found in cube", name=item)
            return self

        if item not in self.select.name:
            Log.error("{{name}} not found in cube", name=item)

        output = Cube(
            select=[s for s in self.select if s.name == item][0],
            edges=self.edges,
            data={item: self.data[item]}
        )
        return output
    else:
        Log.error("not implemented yet")
def _select(self, select):
    """
    RETURN A NEW Cube HOLDING THE SELECTED COLUMNS.  IF ANY CLAUSE HAS AN
    aggregate (OTHER THAN "none"), EVERY COLUMN IS AGGREGATED TO A SINGLE
    VALUE AND THE RESULT HAS NO EDGES
    """
    selects = listwrap(select)
    # COMPARISONS AGAINST None ARE KEPT EXACTLY AS WRITTEN
    is_aggregate = OR(s.aggregate != None and s.aggregate != "none" for s in selects)

    if not is_aggregate:
        columns = {}
        for s in selects:
            columns[s.name] = self.data[s.value]
        return Cube(select, self.edges, columns)

    totals = {}
    for s in selects:
        totals[s.name] = Matrix(value=self.data[s.value].aggregate(s.aggregate))
    return Cube(select, [], totals)
def format_cube(T, select, source):
    """
    BUILD ONE Matrix PER SELECT CLAUSE (KEYED BY THE CLAUSE name) FROM THE
    source PROPERTY OF EACH RECORD IN T

    NOTE: THE CODE VISIBLE HERE ENDS ONCE matricies IS FILLED
    """
    matricies = {}
    for s in select:
        try:
            if s.value == ".":
                column = T.select(source)
            elif isinstance(s.value, list):
                column = [
                    tuple(unwraplist(t[source][ss]) for ss in s.value)
                    for t in T
                ]
            elif source == "_source":
                column = [unwraplist(t[source][s.value]) for t in T]
            elif isinstance(s.value, basestring):  # fields
                column = [unwraplist(t[source].get(s.value)) for t in T]
            else:
                column = [unwraplist(t[source].get(s.name)) for t in T]
            matricies[s.name] = Matrix.wrap(column)
        except Exception as e:
            Log.error("", e)
def es_fieldop(es, query):
    """
    REQUEST THE SELECTED FIELDS FROM ES (FILTERED BY query.where), THEN
    COLLECT ONE Matrix PER SELECT CLAUSE FROM THE RETURNED HITS

    NOTE: THE CODE VISIBLE HERE ENDS ONCE matricies IS FILLED
    """
    FromES = es09.util.build_es_query(query)
    select = listwrap(query.select)
    FromES.query = {
        "filtered": {
            "query": {"match_all": {}},
            "filter": simplify_esfilter(query.where)
        }
    }
    FromES.size = coalesce(query.limit, 200000)

    # GATHER THE FIELD NAMES TO REQUEST; "*" SETS fields TO None
    FromES.fields = DictList()
    for value in select.value:
        if value == "*":
            FromES.fields = None
        elif isinstance(value, list):
            FromES.fields.extend(value)
        elif isinstance(value, Mapping):
            FromES.fields.extend(value.values())
        else:
            FromES.fields.append(value)

    ordering = []
    for clause in query.sort:
        direction = "asc" if clause.sort >= 0 else "desc"
        ordering.append({clause.field: direction})
    FromES.sort = ordering

    data = es09.util.post(es, FromES, query.limit)
    hits = data.hits.hits

    def lookup(hit, name):
        # ONE REQUESTED FIELD OF A HIT, None WHEN IT IS MISSING
        return unwrap(hit.fields).get(name, None)

    matricies = {}
    for s in select:
        if s.value == "*":
            matricies[s.name] = Matrix.wrap([hit._source for hit in hits])
        elif isinstance(s.value, Mapping):
            matricies[s.name] = Matrix.wrap([
                {k: lookup(hit, v) for k, v in s.value.items()}
                for hit in hits
            ])
        elif isinstance(s.value, list):
            matricies[s.name] = Matrix.wrap([
                tuple(lookup(hit, ss) for ss in s.value)
                for hit in hits
            ])
        elif not s.value:
            matricies[s.name] = Matrix.wrap([lookup(hit, s.value) for hit in hits])
        else:
            try:
                matricies[s.name] = Matrix.wrap([lookup(hit, s.value) for hit in hits])
            except Exception as e:
                Log.error("", e)
def __init__(self, select, edges, data, frum=None):
    """
    data IS EXPECTED TO BE A dict TO MATRICES, BUT OTHER COLLECTIONS ARE
    ALLOWED, USING THE select AND edges TO DESCRIBE THE data

    :param select: A SINGLE SELECT CLAUSE (MAKES A VALUE CUBE) OR A list OF THEM
    :param edges: EDGE DESCRIPTIONS; WHEN FALSY THE EDGES ARE DERIVED FROM data
    :param data: dict OF Matrix, OR A list, A Matrix, OR A SINGLE VALUE
    :param frum: NOT READ BY THIS CONSTRUCTOR
    """
    self.is_value = not isinstance(select, list)
    self.select = select
    self.meta = Dict(format="cube")  # PUT EXTRA MARKUP HERE
    self.is_none = False

    # ONLY A Mapping HAS values(); list/Matrix/SCALAR data ARE HANDLED BELOW
    if isinstance(data, Mapping) and not all(data.values()):
        # THIS USED TO ASSIGN A LOCAL NAME, SO THE FLAG NEVER REACHED THE INSTANCE
        self.is_none = True

    # ENSURE frum IS PROPER FORM
    if isinstance(select, list):
        if edges and OR(not isinstance(v, Matrix) for v in data.values()):
            Log.error("Expecting data to be a dict with Matrix values")

    if not edges:
        if not data:
            if isinstance(select, list):
                Log.error("not expecting a list of records")

            data = {select.name: Matrix.ZERO}
            self.edges = DictList.EMPTY
        elif isinstance(data, Mapping):
            # EXPECTING NO MORE THAN ONE rownum EDGE IN THE DATA
            length = MAX([len(v) for v in data.values()])
            if length >= 1:
                self.edges = wrap([{"name": "rownum", "domain": {"type": "rownum"}}])
            else:
                self.edges = DictList.EMPTY
        elif isinstance(data, list):
            if isinstance(select, list):
                Log.error("not expecting a list of records")

            # COUNT THE ROWS BEFORE data IS REBOUND TO A ONE-KEY dict,
            # OTHERWISE "max" WOULD ALWAYS BE 1
            num_rows = len(data)
            data = {select.name: Matrix.wrap(data)}
            self.edges = wrap(
                [{"name": "rownum", "domain": {"type": "rownum", "min": 0, "max": num_rows, "interval": 1}}]
            )
        elif isinstance(data, Matrix):
            if isinstance(select, list):
                Log.error("not expecting a list of records")

            # NOTE(review): self.edges IS NOT ASSIGNED ON THIS PATH - confirm what the edges of a bare Matrix should be
            data = {select.name: data}
        else:
            if isinstance(select, list):
                Log.error("not expecting a list of records")

            data = {select.name: Matrix(value=data)}
            self.edges = DictList.EMPTY
    else:
        self.edges = wrap(edges)

    self.data = data
def data():
    """
    GENERATOR OF ROWS (LISTS): THE DECODED EDGE VALUES FOLLOWED BY ONE
    PULLED VALUE PER SELECT, FOR EVERY AGGREGATE YIELDED BY aggs_iterator
    """
    sizes = []
    for e in new_edges:
        extra = 0 if e.allowNulls is False else 1
        sizes.append(len(e.domain.partitions) + extra)
    # RECORDS WHICH COORDINATES HAVE BEEN EMITTED
    is_sent = Matrix(dims=tuple(sizes), zeros=0)

    for row, coord, agg in aggs_iterator(aggs, decoders):
        is_sent[coord] = 1
        output = [d.get_value(c) for c, d in zip(coord, decoders)]
        output.extend(_pull(s, agg) for s in select)
        yield output
def format_cube(decoders, aggs, start, query, select):
    """
    FILL ONE Matrix PER SELECT (SIZED BY THE EDGES FROM count_dim) WITH THE
    VALUE PULLED FROM EACH AGGREGATE AT ITS COORDINATE

    NOTE: THE CODE VISIBLE HERE ENDS ONCE THE MATRICES ARE FILLED
    """
    new_edges = count_dim(aggs, decoders)

    sizes = []
    for edge in new_edges:
        # ONE EXTRA SLOT UNLESS allowNulls IS EXPLICITLY False
        extra = 0 if edge.allowNulls is False else 1
        sizes.append(len(edge.domain.partitions) + extra)
    dims = tuple(sizes)

    matricies = [(s, Matrix(dims=dims, zeros=s.default)) for s in select]
    for row, coord, agg in aggs_iterator(aggs, decoders):
        for s, m in matricies:
            try:
                m[coord] = _pull(s, agg)
            except Exception as e:
                Log.error("", e)
def format_cube_from_aggop(decoders, aggs, start, query, select):
    """
    RETURN A ZERO-DIMENSIONAL Cube: ONE SINGLE-CELL Matrix PER SELECT,
    READ FROM THE INNERMOST _filter/_nested AGGREGATE USING s.pull
    """
    # DESCEND THROUGH THE _filter/_nested WRAPPERS
    agg = aggs
    while True:
        inner = coalesce(agg._filter, agg._nested)
        if not inner:
            break
        agg = inner

    columns = {}
    for s in select:
        cell = Matrix(dims=[], zeros=(s.aggregate == "count"))
        cell[tuple()] = agg[s.pull]
        columns[s.name] = cell

    cube = Cube(query.select, [], columns)
    cube.frum = query
    return cube
def format_cube(T, select, source):
    """
    BUILD ONE Matrix PER SELECT CLAUSE (KEYED BY THE CLAUSE name) FROM THE
    source PROPERTY OF EACH RECORD IN T

    NOTE: THE CODE VISIBLE HERE ENDS ONCE matricies IS FILLED
    """
    matricies = {}
    for s in select:
        try:
            if s.value == ".":
                values = T.select(source)
            elif isinstance(s.value, list):
                values = [
                    tuple(unwraplist(record[source][ss]) for ss in s.value)
                    for record in T
                ]
            elif source == "_source":
                values = [unwraplist(record[source][s.value]) for record in T]
            elif isinstance(s.value, basestring):  # fields
                values = [unwraplist(record[source].get(s.value)) for record in T]
            else:
                values = [unwraplist(record[source].get(s.name)) for record in T]
            matricies[s.name] = Matrix.wrap(values)
        except Exception as e:
            Log.error("", e)
def data():
    """
    GENERATOR OF RECORDS (Dict): EDGE NAME -> DECODED VALUE, PLUS SELECT
    NAME -> PULLED VALUE, FOR EVERY AGGREGATE YIELDED BY aggs_iterator
    """
    sizes = []
    for e in new_edges:
        extra = 0 if e.allowNulls is False else 1
        sizes.append(len(e.domain.partitions) + extra)
    # RECORDS WHICH COORDINATES HAVE BEEN EMITTED
    is_sent = Matrix(dims=tuple(sizes), zeros=0)

    for row, coord, agg in aggs_iterator(aggs, decoders):
        is_sent[coord] = 1
        record = Dict()
        for edge, c, decoder in zip(query.edges, coord, decoders):
            record[edge.name] = decoder.get_value(c)
        for s in select:
            record[s.name] = _pull(s, agg)
        yield record
def es_deepop(es, mvel, query):
    """
    RUN query AS A SINGLE SCRIPTED TERMS FACET (THE EDGES ARE PACKED INTO
    THE TERM BY mvel.code) AND UNPACK THE RESULT INTO A Cube

    :param es: PASSED TO es09.util.post
    :param mvel: PROVIDES code() TO COMPILE THE TERM SCRIPT
    :param query: THE QUERY; ITS EDGES ARE ANNOTATED (index, dataIndex) IN PLACE
    :return: Cube WITH ONE Matrix NAMED query.select.name; cube.frum IS query
    """
    FromES = es09.util.build_es_query(query)

    # THE EDGES BECOME THE SELECT OF THE SCRIPT THAT PACKS THE TERM
    select = query.edges

    temp_query = query.copy()
    temp_query.select = select
    temp_query.edges = DictList()
    FromES.facets.mvel = {
        "terms": {
            "script_field": mvel.code(temp_query),
            "size": query.limit
        },
        "facet_filter": simplify_esfilter(jx_expression(query.where).to_esfilter())
    }

    data = es09.util.post(es, FromES, query.limit)

    rows = unpack_terms(data.facets.mvel, query.edges)

    # NUMBER ALL EDGES FOR JSON EXPRESSION INDEXING
    edges = query.edges
    for f, e in enumerate(edges):
        # REGISTER EVERY VALUE SEEN FOR THIS EDGE.  READING r[f] FROM EACH
        # ROW (NOT INDEXING zip(*rows)) ALSO WORKS WHEN THERE ARE NO ROWS
        for r in rows:
            e.domain.getPartByKey(r[f])

        e.index = f
        for p, part in enumerate(e.domain.partitions):
            part.dataIndex = p
        e.domain.NULL.dataIndex = len(e.domain.partitions)

    # MAKE CUBE
    dims = [len(e.domain.partitions) for e in query.edges]
    output = Matrix(*dims)

    # FILL CUBE
    for r in rows:
        term_coord = [
            e.domain.getPartByKey(r[i]).dataIndex
            for i, e in enumerate(edges)
        ]
        # THE LAST ELEMENT OF A ROW IS ADDED TO WHAT IS ALREADY IN THE CELL
        output[term_coord] = SUM(output[term_coord], r[-1])

    cube = Cube(query.select, query.edges, {query.select.name: output})
    cube.frum = query
    return cube
def format_cube(decoders, aggs, start, query, select):
    """
    FILL ONE Matrix PER SELECT (SIZED BY THE EDGES FROM count_dim) WITH
    agg[s.pull] AT THE COORDINATE THE DECODERS GIVE FOR EACH ROW

    NOTE: THE CODE VISIBLE HERE ENDS ONCE THE MATRICES ARE FILLED
    """
    new_edges = count_dim(aggs, decoders)

    sizes = []
    for edge in new_edges:
        # ONE EXTRA SLOT UNLESS allowNulls IS EXPLICITLY False
        extra = 0 if edge.allowNulls is False else 1
        sizes.append(len(edge.domain.partitions) + extra)
    dims = tuple(sizes)

    matricies = [
        (s, Matrix(dims=dims, zeros=(s.aggregate == "count")))
        for s in select
    ]
    for row, agg in aggs_iterator(aggs, decoders):
        coord = tuple(decoder.get_index(row) for decoder in decoders)
        for s, m in matricies:
            try:
                # A CELL IS EXPECTED TO BE WRITTEN ONLY ONCE
                if m[coord]:
                    Log.error("Not expected")
                m[coord] = agg[s.pull]
            except Exception as e:
                # THE COORDINATE IS RECOMPUTED (AND DISCARDED) AS IN THE ORIGINAL
                tuple(decoder.get_index(row) for decoder in decoders)
                Log.error("", e)
def es_aggop(es, mvel, query):
    """
    AGGREGATE WITHOUT EDGES: ONE statistical FACET PER DISTINCT SELECT
    VALUE, THEN ONE SINGLE-VALUE Matrix PER SELECT

    :param es: PASSED TO es09.util.post
    :param mvel: ONLY FORWARDED TO es_countop
    :param query: THE QUERY (select, where, limit ARE READ)
    :return: Cube WITH NO EDGES; cube.frum IS query
    """
    select = listwrap(query.select)
    FromES = build_es_query(query)

    isSimple = AND(aggregates[s.aggregate] == "count" for s in select)
    if isSimple:
        # SIMPLE, USE TERMS FACET INSTEAD
        # es_countop IS DECLARED AS (es, mvel, query); mvel WAS MISSING FROM THIS CALL
        return es_countop(es, mvel, query)

    value2facet = dict()  # ONLY ONE FACET NEEDED PER
    name2facet = dict()  # MAP name TO FACET WITH STATS

    for s in select:
        if s.value not in value2facet:
            if is_keyword(s.value):
                unwrap(FromES.facets)[s.name] = {
                    "statistical": {
                        "field": s.value
                    },
                    "facet_filter": simplify_esfilter(query.where)
                }
            else:
                unwrap(FromES.facets)[s.name] = {
                    "statistical": {
                        "script": es09.expressions.compile_expression(s.value, query)
                    },
                    "facet_filter": simplify_esfilter(query.where)
                }
            value2facet[s.value] = s.name
        name2facet[s.name] = value2facet[s.value]

    data = es09.util.post(es, FromES, query.limit)

    # READ EACH SELECT FROM THE FACET THAT WAS REQUESTED FOR ITS VALUE; A
    # SELECT SHARING A VALUE WITH AN EARLIER ONE HAS NO FACET UNDER ITS OWN NAME
    matricies = {
        s.name: Matrix(
            value=fix_es_stats(data.facets[literal_field(name2facet[s.name])])[aggregates[s.aggregate]]
        )
        for s in select
    }
    cube = Cube(query.select, [], matricies)
    cube.frum = query
    return cube
def es_countop(es, mvel, query):
    """
    RETURN SINGLE COUNT

    ONE terms FACET IS REQUESTED PER SELECT; THE total REPORTED FOR EACH
    BECOMES A SINGLE-VALUE Matrix IN THE RETURNED Cube
    """
    select = listwrap(query.select)
    FromES = build_es_query(query)
    for s in select:
        if is_keyword(s.value):
            facet = {
                "terms": {
                    "field": s.value,
                    "size": query.limit,
                },
                "facet_filter": {"exists": {"field": s.value}}
            }
        else:
            # COMPLICATED value IS PROBABLY A SCRIPT, USE IT
            facet = {
                "terms": {
                    "script_field": es09.expressions.compile_expression(s.value, query),
                    "size": 200000
                }
            }
        FromES.facets[s.name] = facet

    data = es09.util.post(es, FromES, query.limit)

    matricies = {
        s.name: Matrix(value=data.hits.facets[s.name].total)
        for s in select
    }

    cube = Cube(query.select, query.edges, matricies)
    cube.frum = query
    return cube
def format_cube(decoders, aggs, start, query, select):
    """
    FILL ONE Matrix PER SELECT (SIZED BY THE EDGES FROM count_dim) WITH THE
    VALUE PULLED FROM EACH AGGREGATE AT ITS COORDINATE.  EDGES WHOSE value
    IS A TupleOp HAVE allowNulls FORCED TO False

    NOTE: THE CODE VISIBLE HERE ENDS ONCE THE MATRICES ARE FILLED
    """
    new_edges = count_dim(aggs, decoders)

    def size_of(edge):
        # NUMBER OF SLOTS ALONG THIS EDGE (MARKS TupleOp EDGES AS NOT ALLOWING NULLS)
        if isinstance(edge.value, TupleOp):
            edge.allowNulls = False
        extra = 0 if edge.allowNulls is False else 1
        return len(edge.domain.partitions) + extra

    dims = tuple([size_of(edge) for edge in new_edges])

    matricies = [(s, Matrix(dims=dims, zeros=s.default)) for s in select]
    for row, coord, agg in aggs_iterator(aggs, decoders):
        for s, m in matricies:
            try:
                m[coord] = _pull(s, agg)
            except Exception as e:
                Log.error("", e)
def list_aggs(frum, query):
    """
    AGGREGATE THE RECORDS OF frum INTO A Cube, ONE Matrix PER SELECT

    :param frum: THE RECORDS; MUST SUPPORT select() AND ITERATION
    :param query: THE QUERY (select, edges, where ARE READ); DefaultDomain
                  EDGES ARE REPLACED IN PLACE, AND EACH SELECT GETS AN "exec"
    :return: Cube(select, query.edges, result)
    """
    select = listwrap(query.select)

    is_join = False  # True IF MANY TO MANY JOIN WITH AN EDGE  (NOT READ AGAIN IN THIS FUNCTION)
    for e in query.edges:
        if isinstance(e.domain, DefaultDomain):
            # BUILD THE DOMAIN FROM THE SORTED DISTINCT VALUES FOUND IN frum
            e.domain = SimpleSetDomain(
                partitions=list(sorted(set(frum.select(e.value)))))

    # COMPILE EACH SELECT EXPRESSION ONCE
    for s in listwrap(query.select):
        s["exec"] = qb_expression_to_function(s.value)

    # ONE MATRIX PER SELECT; AN EXTRA SLOT PER EDGE THAT ALLOWS NULLS
    result = {
        s.name: Matrix(dims=[
            len(e.domain.partitions) + (1 if e.allowNulls else 0)
            for e in query.edges
        ], zeros=s.aggregate == "count")
        for s in select
    }
    where = qb_expression_to_function(query.where)
    for d in filter(where, frum):
        # WORK ON A COPY BECAUSE EDGE VALUES ARE WRITTEN INTO THE RECORD BELOW
        d = d.copy()
        coord = [
        ]  # LIST OF MATCHING COORDINATE FAMILIES, USUALLY ONLY ONE PER FAMILY BUT JOINS WITH EDGES CAN CAUSE MORE
        for e in query.edges:
            coord.append(get_matches(e, d))

        for s in select:
            mat = result[s.name]
            agg = s.aggregate
            var = s.value
            if agg == "count":
                for c in itertools.product(*coord):
                    # COUNTING RECORDS: NO EXPRESSION NEEDS EVALUATING
                    if var == "." or var == None:
                        mat[c] += 1
                        continue

                    # NOTE(review): THE COORDINATE INDEX cc IS STORED HERE, WHILE THE
                    # NON-count BRANCH BELOW STORES e.domain.partitions[cc] - confirm which is intended
                    for e, cc in zip(query.edges, c):
                        d[e.name] = cc
                    val = s["exec"](d, c, frum)
                    if val != None:
                        mat[c] += 1
            else:
                for c in itertools.product(*coord):
                    acc = mat[c]
                    if acc == None:
                        # FIRST VISIT TO THIS CELL: MAKE ITS ACCUMULATOR
                        acc = windows.name2accumulator.get(agg)
                        if acc == None:
                            Log.error(
                                "select aggregate {{agg}} is not recognized",
                                agg=agg)
                        acc = acc(**s)
                        mat[c] = acc
                    for e, cc in zip(
                            query.edges, c
                    ):  # BECAUSE WE DO NOT KNOW IF s.exec NEEDS THESE EDGES, SO WE PASS THEM ANYWAY
                        d[e.name] = e.domain.partitions[cc]
                    val = s["exec"](d, c, frum)
                    acc.add(val)

    # REPLACE EACH ACCUMULATOR WITH ITS FINAL VALUE (count CELLS ALREADY HOLD NUMBERS)
    for s in select:
        if s.aggregate == "count":
            continue
        m = result[s.name]
        for c, var in m.items():
            if var != None:
                m[c] = var.end()

    output = Cube(select, query.edges, result)
    return output
def es_setop(es, mvel, query):
    """
    SET OPERATION (NO EDGES): BUILD THE ES REQUEST FOR THE SELECT CLAUSES,
    POST IT, AND RETURN THE RESULT AS A Cube WITH NO EDGES

    :param es: PASSED TO es09.util.post
    :param mvel: PROVIDES code() TO COMPILE A QUERY INTO A TERM SCRIPT
    :param query: THE QUERY (select, frum, where, sort, limit ARE READ)
    :return: Cube; cube.frum IS query

    NOTE(review): THE BRANCH NESTING BELOW WAS RECONSTRUCTED FROM SOURCE WITH
    NO INDENTATION - confirm against the original layout
    """
    FromES = es09.util.build_es_query(query)
    select = listwrap(query.select)

    isDeep = len(split_field(
        query.frum.name)) > 1  # LOOKING INTO NESTED WILL REQUIRE A SCRIPT
    isComplex = OR([
        s.value == None and s.aggregate not in ("count", "none")
        for s in select
    ])  # CONVERTING esfilter DEFINED PARTS WILL REQUIRE SCRIPT

    if not isDeep and not isComplex and len(select) == 1:
        if not select[0].value:
            # NO VALUE: ONLY THE HIT TOTAL IS READ FROM THE RESPONSE
            FromES.query = {
                "filtered": {
                    "query": {
                        "match_all": {}
                    },
                    "filter": simplify_esfilter(query.where)
                }
            }
            FromES.size = 1  # PREVENT QUERY CHECKER FROM THROWING ERROR
        elif isKeyword(select[0].value):
            # A PLAIN FIELD: A terms FACET ON THAT FIELD
            FromES.facets.mvel = {
                "terms": {
                    "field": select[0].value,
                    "size": coalesce(query.limit, 200000)
                },
                "facet_filter": simplify_esfilter(query.where)
            }
            if query.sort:
                s = query.sort
                if len(s) > 1:
                    Log.error("can not sort by more than one field")

                s0 = s[0]
                if s0.field != select[0].value:
                    Log.error(
                        "can not sort by anything other than count, or term")

                # NOTE(review): THE FACET ABOVE IS STORED AS facets.mvel, BUT THE ORDER IS
                # SET ON facets.terms - confirm this reaches the intended facet
                FromES.facets.terms.order = "term" if s0.sort >= 0 else "reverse_term"
    elif not isDeep:
        simple_query = query.copy()
        simple_query.where = TRUE_FILTER  # THE FACET FILTER IS FASTER
        FromES.facets.mvel = {
            "terms": {
                "script_field": mvel.code(simple_query),
                "size": coalesce(simple_query.limit, 200000)
            },
            "facet_filter": simplify_esfilter(query.where)
        }
    else:
        FromES.facets.mvel = {
            "terms": {
                "script_field": mvel.code(query),
                "size": coalesce(query.limit, 200000)
            },
            "facet_filter": simplify_esfilter(query.where)
        }

    data = es09.util.post(es, FromES, query.limit)

    # NOTE(review): AS LAID OUT HERE, A SINGLE SELECT WHOSE value IS NEITHER EMPTY
    # NOR A KEYWORD LEAVES cube UNASSIGNED - confirm
    if len(select) == 1:
        if not select[0].value:
            # SPECIAL CASE FOR SINGLE COUNT
            output = Matrix(value=data.hits.total)
            cube = Cube(query.select, [], {select[0].name: output})
        elif isKeyword(select[0].value):
            # SPECIAL CASE FOR SINGLE TERM
            # NOTE(review): READS data.facets.terms, WHILE THE REQUEST NAMED THE FACET mvel - confirm
            T = data.facets.terms
            output = Matrix.wrap([t.term for t in T])
            cube = Cube(query.select, [], {select[0].name: output})
    else:
        data_list = unpack_terms(data.facets.mvel, select)
        if not data_list:
            # NO ROWS: AN EMPTY Matrix PER SELECT
            cube = Cube(select, [], {s.name: Matrix.wrap([]) for s in select})
        else:
            # TRANSPOSE ROWS INTO ONE COLUMN PER SELECT
            output = zip(*data_list)
            cube = Cube(
                select, [],
                {s.name: Matrix(list=output[i]) for i, s in enumerate(select)})

    cube.frum = query
    return cube
def _es_terms2(es, mvel, query):
    """
    WE ASSUME THERE ARE JUST TWO EDGES, AND EACH HAS A SIMPLE value

    THE FIRST EDGE IS RESOLVED WITH es_terms; THEN ONE terms FACET IS
    REQUESTED PER (select, FIRST-EDGE VALUE) OVER THE SECOND EDGE.  THE
    SECOND EDGE'S PARTITIONS ARE REPLACED WITH THE SORTED TERMS FOUND
    """
    # REQUEST VALUES IN FIRST DIMENSION
    first_query = query.copy()
    first_query.edges = query.edges[0:1:]
    values1 = es_terms(es, mvel, first_query).edges[0].domain.partitions.value

    select = listwrap(query.select)
    FromES = build_es_query(query)
    for s in select:
        for i, v in enumerate(values1):
            # FACET NAME IS "<select name>,<INDEX OF FIRST-EDGE VALUE>"
            facet_name = s.name + "," + str(i)
            FromES.facets[facet_name] = {
                "terms": {
                    "field": query.edges[1].value,
                    "size": coalesce(query.limit, 200000)
                },
                "facet_filter": simplify_esfilter({
                    "and": [
                        query.where,
                        {"term": {query.edges[0].value: v}}
                    ]
                })
            }

    data = es09.util.post(es, FromES, query.limit)

    # UNION ALL TERMS FROM SECOND DIMENSION
    values2 = set()
    for k, f in data.facets.items():
        values2.update(f.terms.term)
    values2 = qb.sort(values2)
    term2index = {}
    for i, v in enumerate(values2):
        term2index[v] = i
    query.edges[1].domain.partitions = DictList(
        [{"name": v, "value": v} for v in values2]
    )

    # MAKE CUBE
    dims = [len(values1), len(values2)]
    output = {}
    for s in select:
        output[s.name] = Matrix(*dims)

    # FILL CUBE
    # EXPECTING ONLY SELECT CLAUSE FACETS
    for facetName, facet in data.facets.items():
        coord = facetName.split(",")
        s = [candidate for candidate in select if candidate.name == coord[0]][0]
        i1 = int(coord[1])
        agg_name = aggregates[s.aggregate]
        for term in facet.terms:
            i2 = term2index[term.term]
            output[s.name][(i1, i2)] = term[agg_name]

    cube = Cube(query.select, query.edges, output)
    cube.query = query
    return cube
def es_setop(es, mvel, query):
    """
    SET OPERATION (NO EDGES): BUILD THE ES REQUEST FOR THE SELECT CLAUSES,
    POST IT, AND RETURN Dict(meta={"esquery": ...}, data=...)

    :param es: PASSED TO es09.util.post
    :param mvel: PROVIDES code() TO COMPILE A QUERY INTO A TERM SCRIPT
    :param query: THE QUERY (select, frum, where, sort, limit ARE READ)
    :return: Dict WITH THE REQUEST UNDER meta.esquery AND THE RESULT UNDER data

    NOTE(review): THE BRANCH NESTING BELOW WAS RECONSTRUCTED FROM SOURCE WITH
    NO INDENTATION - confirm against the original layout
    """
    FromES = es09.util.build_es_query(query)
    select = listwrap(query.select)

    isDeep = len(split_field(
        query.frum.name)) > 1  # LOOKING INTO NESTED WILL REQUIRE A SCRIPT
    isComplex = OR([
        s.value == None and s.aggregate not in ("count", "none")
        for s in select
    ])  # CONVERTING esfilter DEFINED PARTS WILL REQUIRE SCRIPT

    if not isDeep and not isComplex:
        # "and" BINDS TIGHTER THAN "or": (ONE SELECT WITH NO value) OR (FIRST value IS "*")
        if len(select) == 1 and not select[0].value or select[0].value == "*":
            FromES = wrap({
                "query": {
                    "filtered": {
                        "query": {
                            "match_all": {}
                        },
                        "filter": simplify_esfilter(
                            jx_expression(query.where).to_esfilter())
                    }
                },
                "sort": query.sort,
                "size": 1
            })
        elif all(isinstance(v, Variable) for v in select.value):
            # EVERY SELECT IS A PLAIN VARIABLE: REQUEST THEM AS fields
            # NOTE(review): THIS BRANCH CALLS query.where.to_esfilter() DIRECTLY, THE OTHERS
            # WRAP query.where IN jx_expression() FIRST - confirm
            FromES = wrap({
                "query": {
                    "filtered": {
                        "query": {
                            "match_all": {}
                        },
                        "filter": simplify_esfilter(query.where.to_esfilter())
                    }
                },
                "fields": select.value,
                "sort": query.sort,
                "size": coalesce(query.limit, 200000)
            })
    elif not isDeep:
        simple_query = query.copy()
        simple_query.where = TRUE_FILTER  # THE FACET FILTER IS FASTER
        FromES.facets.mvel = {
            "terms": {
                "script_field": mvel.code(simple_query),
                "size": coalesce(simple_query.limit, 200000)
            },
            "facet_filter": simplify_esfilter(jx_expression(query.where).to_esfilter())
        }
    else:
        FromES.facets.mvel = {
            "terms": {
                "script_field": mvel.code(query),
                "size": coalesce(query.limit, 200000)
            },
            "facet_filter": simplify_esfilter(jx_expression(query.where).to_esfilter())
        }

    data = es09.util.post(es, FromES, query.limit)

    if len(select) == 1 and not select[0].value or select[0].value == "*":
        # SPECIAL CASE FOR SINGLE COUNT
        cube = wrap(data).hits.hits._source
    elif isinstance(select[0].value, Variable):
        # SPECIAL CASE FOR SINGLE TERM
        cube = wrap(data).hits.hits.fields
    else:
        data_list = unpack_terms(data.facets.mvel, select)
        if not data_list:
            # NO ROWS: AN EMPTY Matrix PER SELECT
            cube = Cube(select, [], {s.name: Matrix.wrap([]) for s in select})
        else:
            # TRANSPOSE ROWS INTO ONE COLUMN PER SELECT
            output = zip(*data_list)
            cube = Cube(
                select, [],
                {s.name: Matrix(list=output[i]) for i, s in enumerate(select)})

    return Dict(meta={"esquery": FromES},
                data=cube)
def es_terms(es, mvel, query):
    """
    RETURN LIST OF ALL EDGE QUERIES

    EVERY FACET IS NAMED <select.name>, <c1>, ... <cN> WHERE <ci> ARE THE ELEMENT COORDINATES
    WE TRY TO PACK DIMENSIONS INTO THE TERMS TO MINIMIZE THE CROSS-PRODUCT EXPLOSION

    QUERIES WITH EXACTLY TWO EDGES ARE DELEGATED TO _es_terms2.
    NOTE: THE CODE VISIBLE HERE ENDS ONCE THE MATRICES ARE FILLED
    """
    if len(query.edges) == 2:
        return _es_terms2(es, mvel, query)

    select = listwrap(query.select)
    FromES = build_es_query(query)
    # ALL EDGES ARE PACKED INTO ONE TERM
    packed_term = compileEdges2Term(mvel, query.edges, wrap([]))
    for s in select:
        FromES.facets[s.name] = {
            "terms": {
                "field": packed_term.field,
                "script_field": packed_term.expression,
                "size": coalesce(query.limit, 200000)
            },
            "facet_filter": simplify_esfilter(query.where)
        }

    term2Parts = packed_term.term2parts

    data = es09.util.post(es, FromES, query.limit)

    # GETTING ALL PARTS WILL EXPAND THE EDGES' DOMAINS
    # BUT HOW TO UNPACK IT FROM THE term FASTER IS UNKNOWN
    for k, f in data.facets.items():
        for t in f.terms:
            term2Parts(t.term)

    # NUMBER ALL EDGES FOR qb INDEXING
    for f, e in enumerate(query.edges):
        e.index = f
        if e.domain.type in ["uid", "default"]:
            # e.domain.partitions = qb.sort(e.domain.partitions, "value")
            for p, part in enumerate(e.domain.partitions):
                part.dataIndex = p
            e.domain.NULL.dataIndex = len(e.domain.partitions)

    # MAKE CUBE
    output = {}
    # AN EXTRA SLOT PER EDGE THAT ALLOWS NULLS
    dims = [
        len(e.domain.partitions) + (1 if e.allowNulls else 0)
        for e in query.edges
    ]
    for s in select:
        output[s.name] = Matrix(*dims)

    # FILL CUBE
    # EXPECTING ONLY SELECT CLAUSE FACETS
    for facetName, facet in data.facets.items():
        for term in facet.terms:
            term_coord = term2Parts(term.term).dataIndex
            for s in select:
                try:
                    output[s.name][term_coord] = term[aggregates[s.aggregate]]
                except Exception, e:
                    # USUALLY CAUSED BY output[s.name] NOT BEING BIG ENOUGH TO HANDLE NULL COUNTS
                    # THE FAILURE IS DELIBERATELY IGNORED: THE CELL IS LEFT AS IT WAS
                    pass
def es_terms_stats(esq, mvel, query):
    """
    ANSWER query WITH terms_stats FACETS: ONE FACET PER (select, COMBINATION
    OF PARTS OF THE "FACET" EDGES); THE REMAINING "SPECIAL" EDGE IS PACKED
    INTO THE FACET'S KEY TERM

    :param esq: PROVIDES query() FOR THE PRE-COUNT AND es FOR THE POST
    :param mvel: PASSED TO THE TERM/CONDITION COMPILERS
    :param query: THE QUERY; ITS EDGES' PARTS ARE NUMBERED (dataIndex) IN PLACE
    :return: Cube; cube.frum IS query
    """
    select = listwrap(query.select)
    facetEdges = []  # EDGES THAT WILL REQUIRE A FACET FOR EACH PART
    termsEdges = DictList()
    specialEdge = None
    special_index = -1

    # A SPECIAL EDGE IS ONE THAT HAS AN UNDEFINED NUMBER OF PARTITIONS AT QUERY TIME
    # FIND THE specialEdge, IF ONE
    for f, tedge in enumerate(query.edges):
        if tedge.domain.type in domains.KNOWN:
            for p, part in enumerate(tedge.domain.partitions):
                part.dataIndex = p

            # FACETS ARE ONLY REQUIRED IF SQL JOIN ON DOMAIN IS REQUIRED (RANGE QUERY)
            # OR IF WE ARE NOT SIMPLY COUNTING
            # OR IF NO SCRIPTING IS ALLOWED (SOME OTHER CODE IS RESPONSIBLE FOR SETTING isFacet)
            # OR IF WE JUST WANT TO FORCE IT :)
            # OF COURSE THE default EDGE IS NOT EXPLICIT, SO MUST BE A TERM

            facetEdges.append(tedge)
        else:
            if specialEdge:
                Log.error(
                    "There is more than one open-ended edge: self can not be handled"
                )
            specialEdge = tedge
            special_index = f
            termsEdges.append(tedge)

    if not specialEdge:
        # WE SERIOUSLY WANT A SPECIAL EDGE, OTHERWISE WE WILL HAVE TOO MANY FACETS
        # THE BIGGEST EDGE MAY BE COLLAPSED TO A TERM, MAYBE?
        num_parts = 0
        special_index = -1
        for i, e in enumerate(facetEdges):
            l = len(e.domain.partitions)
            if ((e.value and is_keyword(e.value)) or len(e.domain.dimension.fields) == 1) and l > num_parts:
                num_parts = l
                specialEdge = e
                special_index = i

        # NOTE(review): IF NO EDGE QUALIFIED ABOVE, special_index IS STILL -1, SO THE
        # LAST FACET EDGE IS POPPED AND specialEdge (None) IS APPENDED - confirm
        facetEdges.pop(special_index)
        termsEdges.append(specialEdge)

    total_facets = PRODUCT(len(f.domain.partitions) for f in facetEdges) * len(select)
    if total_facets > 100:
        # WE GOT A PROBLEM, LETS COUNT THE SIZE OF REALITY:
        counts = esq.query({
            "from": query.frum,
            "select": {
                "aggregate": "count"
            },
            "edges": facetEdges,
            "where": query.where,
            "limit": query.limit
        })

        esFacets = []

        def add_facet(value, parts, cube):
            # KEEP ONLY THE COMBINATIONS WITH A TRUTHY COUNT
            if value:
                esFacets.append(parts)

        counts["count"].forall(add_facet)

        Log.note(
            "{{theory_count}} theoretical combinations, {{real_count}} actual combos found",
            real_count=len(esFacets),
            theory_count=total_facets)

        if not esFacets:
            # MAKE EMPTY CUBE
            matricies = {}
            dims = [
                len(e.domain.partitions) + (1 if e.allowNulls else 0)
                for e in query.edges
            ]
            for s in select:
                matricies[s.name] = Matrix(*dims)
            cube = Cube(query.select, query.edges, matricies)
            cube.frum = query
            return cube

    else:
        # GENERATE ALL COMBOS
        esFacets = getAllEdges(facetEdges)

    calcTerm = compileEdges2Term(mvel, termsEdges, DictList())
    term2parts = calcTerm.term2parts

    if len(esFacets) * len(select) > 1000:
        Log.error(
            "not implemented yet"
        )  # WE HAVE SOME SERIOUS PERMUTATIONS, WE MUST ISSUE MULTIPLE QUERIES
        pass

    FromES = build_es_query(query)

    for s in select:
        for parts in esFacets:
            condition = DictList()
            constants = DictList()
            # FACET NAME IS "<select name>,<dataIndex OF EACH FACET EDGE PART>"; IT IS PARSED AGAIN WHEN FILLING THE CUBE
            name = [literal_field(s.name)]
            for f, fedge in enumerate(facetEdges):
                name.append(str(parts[f].dataIndex))
                condition.append(buildCondition(mvel, fedge, parts[f]))
                constants.append({
                    "name": fedge.domain.name,
                    "value": parts[f]
                })
            condition.append(query.where)
            name = ",".join(name)

            FromES.facets[name] = {
                "terms_stats": {
                    "key_field": calcTerm.field,
                    "value_field": s.value if is_keyword(s.value) else None,
                    "value_script":
                    mvel.compile_expression(s.value) if not is_keyword(s.value) else None,
                    "size": coalesce(query.limit, 200000)
                }
            }
            if condition:
                FromES.facets[name].facet_filter = simplify_esfilter(
                    {"and": condition})

    data = es09.util.post(esq.es, FromES, query.limit)

    if specialEdge.domain.type not in domains.KNOWN:
        # WE BUILD THE PARTS BASED ON THE RESULTS WE RECEIVED
        partitions = DictList()
        map = {}
        for facetName, parts in data.facets.items():
            for stats in parts.terms:
                # NOTE(review): map IS A PLAIN dict, SO map[stats] ON A MISSING KEY WOULD RAISE
                # KeyError; ALSO THE WHOLE stats ENTRY (NOT stats.term) IS USED AS KEY/VALUE - confirm
                if not map[stats]:
                    part = {"value": stats, "name": stats}
                    partitions.append(part)
                    map[stats] = part

        partitions.sort(specialEdge.domain.compare)
        for p, part in enumerate(partitions):
            part.dataIndex = p

        specialEdge.domain.map = map
        specialEdge.domain.partitions = partitions

    # MAKE CUBE
    matricies = {}
    dims = [
        len(e.domain.partitions) + (1 if e.allowNulls else 0)
        for e in query.edges
    ]
    for s in select:
        matricies[s.name] = Matrix(*dims)

    name2agg = {s.name: aggregates[s.aggregate] for s in select}

    # FILL CUBE
    for edgeName, parts in data.facets.items():
        # SPLIT THE FACET NAME BACK INTO SELECT NAME AND FACET-EDGE COORDINATES
        temp = edgeName.split(",")
        pre_coord = tuple(int(c) for c in temp[1:])
        sname = temp[0]

        for stats in parts.terms:
            if specialEdge:
                # INSERT THE SPECIAL EDGE'S COORDINATE AT ITS POSITION
                special = term2parts(stats.term)[0]
                coord = pre_coord[:special_index] + (
                    special.dataIndex,
                ) + pre_coord[special_index:]
            else:
                coord = pre_coord
            matricies[sname][coord] = stats[name2agg[sname]]

    cube = Cube(query.select, query.edges, matricies)
    cube.frum = query
    return cube
def cube_aggs(frum, query): select = listwrap(query.select) #MATCH EDGES IN QUERY TO ONES IN frum for e in query.edges: for fs in frum.select: if fs.name == e.value: Log.error("Not implemented yet") if isinstance(e.domain, DefaultDomain): # DEFAULT DOMAINS CAN EASILY BE LOOKED UP FROM frum for fe in frum.edges: if fe.name == e.value: e.domain = SimpleSetDomain(**fe.domain.as_dict()) e.value = e.value + "." + fe.domain.key break else: for fe in frum.edges: if fe.name == e.value: e.value = e.value + "." + fe.domain.key break result = { s.name: Matrix(dims=[ len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges ], zeros=s.aggregate == "count") for s in select } where = qb_expression_to_function(query.where) for d in filter(where, frum.values()): coord = [ ] # LIST OF MATCHING COORDINATE FAMILIES, USUALLY ONLY ONE PER FAMILY BUT JOINS WITH EDGES CAN CAUSE MORE for e in query.edges: matches = get_matches(e, d) coord.append(matches) if len(matches) == 1 and d[e.name] == None: d[e.name] = e.domain.partitions[matches[0]] for s in select: mat = result[s.name] agg = s.aggregate var = s.value expr = qb_expression_to_function(var) val = expr(d) if agg == "count": if var == "." or var == None: for c in itertools.product(*coord): mat[c] += 1 continue if val != None: for c in itertools.product(*coord): mat[c] += 1 else: for c in itertools.product(*coord): acc = mat[c] if acc == None: acc = windows.name2accumulator.get(agg) if acc == None: Log.error( "select aggregate {{agg}} is not recognized", agg=agg) acc = acc(**s) mat[c] = acc acc.add(val) for s in select: if s.aggregate == "count": continue m = result[s.name] for c, var in m.items(): if var != None: m[c] = var.end() return Cube(select, query.edges, result)
def list_aggs(frum, query):
    """
    AGGREGATE A LIST OF RECORDS INTO A Cube WITH ONE CELL PER COMBINATION OF
    query.edges PARTS

    :param frum: ITERABLE OF RECORDS (PASSED THROUGH wrap())
    :param query: PROVIDES select, edges AND where.  AN EDGE WITH A
        DefaultDomain IS MODIFIED IN PLACE: ITS DOMAIN BECOMES A SimpleSetDomain
        OF THE SORTED DISTINCT VALUES FOUND IN frum, AND allowNulls DEFAULTS TO
        True WHEN None IS AMONG THOSE VALUES
    :return: Cube(select, query.edges, result) WHERE result HAS ONE Matrix PER
        SELECT CLAUSE NAME, EACH CELL HOLDING THE end() OF ITS ACCUMULATOR
    """
    frum = wrap(frum)
    select = listwrap(query.select)

    for e in query.edges:
        if isinstance(e.domain, DefaultDomain):
            accessor = jx_expression_to_function(e.value)
            unique_values = set(map(accessor, frum))
            if None in unique_values:
                e.allowNulls = coalesce(e.allowNulls, True)
                unique_values -= {None}
            e.domain = SimpleSetDomain(partitions=sorted(unique_values))

    s_accessors = [
        (ss.name, compile_expression(ss.value.to_python()))
        for ss in select
    ]

    def _accumulator_factory(clause):
        # BIND clause NOW.  A lambda THAT REFERS TO THE COMPREHENSION VARIABLE
        # DIRECTLY WOULD SEE THE LAST SELECT CLAUSE IF IT WERE CALLED AFTER THE
        # COMPREHENSION FINISHED
        return lambda: windows.name2accumulator.get(clause.aggregate)(**clause)

    result = {
        s.name: Matrix(
            # ONE EXTRA SLOT PER EDGE WHEN NULLS ARE ALLOWED
            dims=[
                len(e.domain.partitions) + (1 if e.allowNulls else 0)
                for e in query.edges
            ],
            zeros=_accumulator_factory(s)
        )
        for s in select
    }

    where = jx_expression_to_function(query.where)
    coord = [None] * len(query.edges)
    edge_accessor = [(i, make_accessor(e)) for i, e in enumerate(query.edges)]

    # EDGE NAMES THAT ARE NOT ALREADY VARIABLES OF SOME EDGE EXPRESSION
    net_new_edge_names = set(wrap(query.edges).name) - UNION(e.value.vars() for e in query.edges)
    if net_new_edge_names & UNION(ss.value.vars() for ss in select):
        # s_accessor NEEDS THESE EDGES, SO WE PASS THEM ANYWAY
        for d in filter(where, frum):
            d = d.copy()  # THE EDGE PARTS ARE WRITTEN INTO THE COPY, NOT THE SOURCE ROW
            for i, matcher in edge_accessor:
                coord[i] = matcher(d)

            for s_name, s_accessor in s_accessors:
                mat = result[s_name]
                for c in itertools.product(*coord):
                    acc = mat[c]
                    for e, cc in zip(query.edges, c):
                        d[e.name] = e.domain.partitions[cc]
                    val = s_accessor(d, c, frum)
                    acc.add(val)
    else:
        # FASTER: NO NEED TO COPY THE ROW OR INJECT EDGE PARTS
        for d in filter(where, frum):
            for i, matcher in edge_accessor:
                coord[i] = matcher(d)

            for s_name, s_accessor in s_accessors:
                mat = result[s_name]
                for c in itertools.product(*coord):
                    acc = mat[c]
                    val = s_accessor(d, c, frum)
                    acc.add(val)

    # REPLACE ACCUMULATORS WITH THEIR FINAL VALUES
    for s in select:
        m = result[s.name]
        for c, var in m.items():
            if var != None:
                m[c] = var.end()

    # IMPORTED HERE, AS IN THE ORIGINAL (PRESUMABLY TO AVOID AN IMPORT CYCLE)
    from pyLibrary.queries.containers.cube import Cube

    output = Cube(select, query.edges, result)
    return output
def es_setop(es, mvel, query):
    """
    RUN A SET OPERATION (A QUERY WITHOUT EDGES) AGAINST ES AND RETURN A Cube

    THREE REQUEST SHAPES ARE USED:
    * SINGLE SELECT WITH NO value: A FILTERED QUERY; THE ANSWER IS hits.total
    * SINGLE SELECT OF A KEYWORD:  A terms FACET ON THAT FIELD
    * ANYTHING ELSE:               A terms FACET OVER AN MVEL SCRIPT, DECODED
                                   WITH unpack_terms()
    THE FIRST TWO ARE ONLY REQUESTED WHEN THE QUERY IS NEITHER DEEP NOR COMPLEX.
    ALL FACETS ARE REGISTERED UNDER THE NAME "mvel", SO THE REQUEST AND THE
    RESPONSE ARE BOTH ADDRESSED AS facets.mvel.

    :param es: ES CONNECTION, HANDED TO es09.util.post()
    :param mvel: SCRIPT COMPILER; mvel.code(query) PROVIDES THE script_field
    :param query: PROVIDES select, where, sort, limit AND frum.name
    :return: Cube WITH ONE Matrix PER SELECT CLAUSE AND frum SET TO query
    """
    FromES = es09.util.build_es_query(query)
    select = listwrap(query.select)

    isDeep = len(split_field(query.frum.name)) > 1  # LOOKING INTO NESTED WILL REQUIRE A SCRIPT
    isComplex = OR([s.value == None and s.aggregate not in ("count", "none") for s in select])  # CONVERTING esfilter DEFINED PARTS WILL REQUIRE SCRIPT

    is_single = len(select) == 1
    is_count = is_single and not select[0].value
    is_term = is_single and not is_count and isKeyword(select[0].value)
    is_simple = not isDeep and not isComplex

    if is_simple and is_count:
        FromES.query = {"filtered": {
            "query": {"match_all": {}},
            "filter": simplify_esfilter(query.where)
        }}
        FromES.size = 1  # PREVENT QUERY CHECKER FROM THROWING ERROR
    elif is_simple and is_term:
        terms = {
            "field": select[0].value,
            "size": coalesce(query.limit, 200000)
        }
        if query.sort:
            s = query.sort
            if len(s) > 1:
                Log.error("can not sort by more than one field")

            s0 = s[0]
            if s0.field != select[0].value:
                Log.error("can not sort by anything other than count, or term")

            # THE ORDER BELONGS INSIDE THE terms OF THE "mvel" FACET
            terms["order"] = "term" if s0.sort >= 0 else "reverse_term"
        FromES.facets.mvel = {
            "terms": terms,
            "facet_filter": simplify_esfilter(query.where)
        }
    elif not isDeep:
        # ALSO TAKEN BY A SINGLE NON-KEYWORD SELECT, WHICH USED TO REQUEST NO FACET AT ALL
        simple_query = query.copy()
        simple_query.where = TRUE_FILTER  # THE FACET FILTER IS FASTER
        FromES.facets.mvel = {
            "terms": {
                "script_field": mvel.code(simple_query),
                "size": coalesce(simple_query.limit, 200000)
            },
            "facet_filter": simplify_esfilter(query.where)
        }
    else:
        FromES.facets.mvel = {
            "terms": {
                "script_field": mvel.code(query),
                "size": coalesce(query.limit, 200000)
            },
            "facet_filter": simplify_esfilter(query.where)
        }

    data = es09.util.post(es, FromES, query.limit)

    # NOTE(review): AS BEFORE, THE TWO SPECIAL CASES BELOW DO NOT LOOK AT
    # isDeep, SO A DEEP QUERY THAT SENT A SCRIPT FACET IS STILL READ THIS WAY
    # - CONFIRM THAT IS INTENDED
    if is_count:
        # SPECIAL CASE FOR SINGLE COUNT
        output = Matrix(value=data.hits.total)
        cube = Cube(query.select, [], {select[0].name: output})
    elif is_term:
        # SPECIAL CASE FOR SINGLE TERM; READ FROM THE FACET NAMED "mvel"
        T = data.facets.mvel.terms
        output = Matrix.wrap([t.term for t in T])
        cube = Cube(query.select, [], {select[0].name: output})
    else:
        # SCRIPT FACET: EVERY REMAINING CASE, SO cube IS ALWAYS BOUND
        data_list = unpack_terms(data.facets.mvel, select)
        if not data_list:
            cube = Cube(select, [], {s.name: Matrix.wrap([]) for s in select})
        else:
            # list() SO THE COLUMNS CAN BE INDEXED EVEN WHERE zip() IS LAZY
            output = list(zip(*data_list))
            cube = Cube(select, [], {s.name: Matrix(list=output[i]) for i, s in enumerate(select)})

    cube.frum = query
    return cube
def es_setop(es, mvel, query):
    """
    RUN A SET OPERATION (A QUERY WITHOUT EDGES) AGAINST ES

    THE REQUEST IS ONE OF:
    * A FILTERED QUERY OF size 1, WHEN THE SELECT IS A SINGLE CLAUSE WITH NO
      value, OR THE FIRST SELECT value IS "*"
    * A FILTERED QUERY ASKING FOR fields, WHEN EVERY SELECT value IS A Variable
    * A terms FACET NAMED "mvel" OVER A SCRIPT, WHEN THE QUERY IS DEEP OR COMPLEX
    WHEN THE QUERY IS NEITHER DEEP NOR COMPLEX AND NEITHER OF THE FIRST TWO
    SHAPES APPLIES, THE REQUEST FROM build_es_query() IS SENT UNCHANGED.

    :param es: ES CONNECTION, HANDED TO es09.util.post()
    :param mvel: SCRIPT COMPILER; mvel.code(query) PROVIDES THE script_field
    :param query: PROVIDES select, where, sort, limit AND frum.name
    :return: Data WITH meta.esquery (THE REQUEST SENT) AND data (THE _source
        OR fields OF THE HITS, OR A Cube DECODED FROM THE "mvel" FACET)
    """
    es_request = es09.util.build_es_query(query)
    select = listwrap(query.select)

    # LOOKING INTO NESTED WILL REQUIRE A SCRIPT
    isDeep = len(split_field(query.frum.name)) > 1
    # CONVERTING esfilter DEFINED PARTS WILL REQUIRE SCRIPT
    isComplex = OR([s.value == None and s.aggregate not in ("count", "none") for s in select])

    def wants_source():
        # (SINGLE CLAUSE WITHOUT value) OR (FIRST value IS "*"): `and` BINDS TIGHTER THAN `or`
        if len(select) == 1 and not select[0].value:
            return True
        return select[0].value == "*"

    def script_facet(script_query):
        # terms FACET OVER THE SCRIPT FOR script_query; THE FILTER ALWAYS COMES FROM THE REAL query
        return {
            "terms": {
                "script_field": mvel.code(script_query),
                "size": coalesce(script_query.limit, 200000)
            },
            "facet_filter": simplify_esfilter(jx_expression(query.where).to_esfilter())
        }

    if isDeep:
        es_request.facets.mvel = script_facet(query)
    elif isComplex:
        simple_query = query.copy()
        simple_query.where = TRUE_FILTER  # THE FACET FILTER IS FASTER
        es_request.facets.mvel = script_facet(simple_query)
    elif wants_source():
        es_request = wrap({
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": simplify_esfilter(jx_expression(query.where).to_esfilter())
            }},
            "sort": query.sort,
            "size": 1
        })
    elif all(isinstance(v, Variable) for v in select.value):
        # NOTE(review): THIS BRANCH CALLS query.where.to_esfilter() DIRECTLY, WHILE
        # THE OTHERS GO THROUGH jx_expression() FIRST - CONFIRM WHICH IS INTENDED
        es_request = wrap({
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": simplify_esfilter(query.where.to_esfilter())
            }},
            "fields": select.value,
            "sort": query.sort,
            "size": coalesce(query.limit, 200000)
        })

    response = es09.util.post(es, es_request, query.limit)

    if wants_source():
        # WHOLE DOCUMENTS: THE _source OF EVERY HIT
        cube = wrap(response).hits.hits._source
    elif isinstance(select[0].value, Variable):
        # PLAIN FIELDS: THE fields OF EVERY HIT
        cube = wrap(response).hits.hits.fields
    else:
        # SCRIPT FACET: DECODE THE TERMS INTO ONE COLUMN PER SELECT CLAUSE
        rows = unpack_terms(response.facets.mvel, select)
        if rows:
            columns = zip(*rows)
            cube = Cube(select, [], {s.name: Matrix(list=columns[i]) for i, s in enumerate(select)})
        else:
            cube = Cube(select, [], {s.name: Matrix.wrap([]) for s in select})

    return Data(meta={"esquery": es_request}, data=cube)