def done_count(self):
    """
    Freeze the collected parts into this decoder's final domain.

    Flags the domain as computed, then builds a `SimpleSetDomain` keyed on
    "value" with one partition per entry of `self.parts` (in iteration
    order, each tagged with its position as `dataIndex`).  The same domain
    object is stored on both `self.edge.domain` and `self.domain`.
    """
    self.computed_domain = True

    partitions = []
    for index, part in enumerate(self.parts):
        partitions.append({"value": part, "dataIndex": index})

    domain = SimpleSetDomain(key="value", partitions=partitions)
    # SAME ASSIGNMENT ORDER AS A CHAINED ASSIGNMENT: edge FIRST, THEN self
    self.edge.domain = domain
    self.domain = domain
def __new__(cls, e=None, query=None, *args, **kwargs):
    """
    Allocate the decoder subclass that matches edge `e`.

    The checks run in a fixed order and the first match wins:

    * default domain -> `DefaultDecoder`, unless the edge value is a
      `Variable` whose column has known partitions; then the domain is
      replaced by a `SimpleSetDomain` and the remaining checks apply
    * partitioned domain, or a domain backed by a `Dimension` -> `SetDecoder`
    * "time" -> `TimeDecoder`; `e.range` -> `GeneralRangeDecoder`
    * "duration" -> `DurationDecoder`; "range" -> `RangeDecoder`
    * no value, but the dimension lists fields -> `DimFieldListDecoder`
    * no value, and every partition has a `where` -> `GeneralSetDecoder`

    Anything else is reported through `Log.error`.

    :param e: the edge to decode; `allowNulls` (and possibly `domain`) are
        updated in place
    :param query: the query owning the edge; `groupby`, `limit` and `frum`
        are read
    :return: an uninitialized instance of the chosen decoder class
    """
    # GROUPBY ASSUMES WE IGNORE THE DOMAIN RANGE
    e.allowNulls = False if query.groupby else coalesce(e.allowNulls, True)

    if e.value and e.domain.type == "default":
        if query.groupby:
            return object.__new__(DefaultDecoder, e)

        if isinstance(e.value, basestring):
            Log.error("Expecting Variable or Expression, not plain string")

        if not isinstance(e.value, Variable):
            return object.__new__(DefaultDecoder, e)

        candidates = query.frum.get_columns()
        column = candidates.filter(
            lambda candidate: candidate.name == e.value.var
        )[0]
        if not column:
            return object.__new__(DefaultDecoder, e)
        limit = coalesce(e.domain.limit, query.limit, DEFAULT_LIMIT)

        if column.partitions != None:
            # KNOWN PARTITIONS: FALL THROUGH TO THE CHECKS BELOW
            e.domain = SimpleSetDomain(partitions=column.partitions[:limit:])
        else:
            e.domain = set_default(
                DefaultDomain(limit=limit),
                e.domain.as_dict()
            )
            return object.__new__(DefaultDecoder, e)

    if e.value and e.domain.type in PARTITION:
        return object.__new__(SetDecoder, e)

    if isinstance(e.domain.dimension, Dimension):
        e.domain = e.domain.dimension.getDomain()
        return object.__new__(SetDecoder, e)

    if e.value and e.domain.type == "time":
        return object.__new__(TimeDecoder, e)

    if e.range:
        return object.__new__(GeneralRangeDecoder, e)

    if e.value and e.domain.type == "duration":
        return object.__new__(DurationDecoder, e)

    if e.value and e.domain.type == "range":
        return object.__new__(RangeDecoder, e)

    if not e.value and e.domain.dimension.fields:
        # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields
        # JUST PULL THE FIELDS
        fields = e.domain.dimension.fields
        if not isinstance(fields, Mapping):
            return object.__new__(DimFieldListDecoder, e)
        Log.error("No longer allowed: All objects are expressions")
    elif not e.value and all(e.domain.partitions.where):
        return object.__new__(GeneralSetDecoder, e)
    else:
        Log.error(
            "domain type of {{type}} is not supported yet",
            type=e.domain.type
        )
def done_count(self):
    """
    Replace the edge's domain with a sorted, indexed set domain.

    The current partitions of `self.edge.domain` are sorted with `qb.sort`
    using the first element of each pair in `self.fields` as the sort keys;
    each sorted partition becomes `{"value": ..., "dataIndex": position}` in
    a new `SimpleSetDomain` keyed on "value".
    """
    current_parts = self.edge.domain.partitions
    sort_keys = [name for name, _ in self.fields]
    ordered = qb.sort(current_parts, sort_keys)

    indexed = []
    for position, part in enumerate(ordered):
        indexed.append({"value": part, "dataIndex": position})

    self.edge.domain = SimpleSetDomain(key="value", partitions=indexed)
def done_count(self):
    """
    Build the final domain from the collected multi-field parts.

    Each distinct entry of `self.parts` is turned into a record keyed by
    the string position of each component ("0", "1", ...), the records are
    sorted by those keys with `jx.sort`, and each is emitted as a tuple
    `value` with its sorted position as `dataIndex`.  `self.parts` is
    cleared, and the resulting `SimpleSetDomain` is stored on both
    `self.edge.domain` and `self.domain`.
    """
    # ONE STRING KEY PER FIELD POSITION; USED FOR SORTING AND FOR REBUILDING
    columns = [unicode(position) for position in range(len(self.fields))]

    records = []
    for part in set(self.parts):
        records.append(
            {unicode(position): value for position, value in enumerate(part)}
        )
    parts = wrap(records)
    self.parts = None

    sorted_parts = jx.sort(parts, columns)

    partitions = []
    for index, record in enumerate(sorted_parts):
        partitions.append({
            "value": tuple(record[column] for column in columns),
            "dataIndex": index
        })

    domain = SimpleSetDomain(key="value", partitions=partitions)
    self.edge.domain = domain
    self.domain = domain
def done_count(self):
    """
    Turn the collected parts into the final domain.

    The distinct values of `self.parts` are sorted with `jx.sort` and used
    as the partitions of a new `SimpleSetDomain`, stored on both
    `self.edge.domain` and `self.domain`.  Afterwards `self.parts` is
    released and the domain is flagged as computed.
    """
    distinct_parts = set(self.parts)
    domain = SimpleSetDomain(partitions=jx.sort(distinct_parts))
    self.edge.domain = domain
    self.domain = domain

    self.parts = None
    self.computed_domain = True
def __new__(cls, e=None, query=None, *args, **kwargs):
    """
    Allocate the decoder subclass that matches edge `e`.

    The checks run in a fixed order and the first match wins:

    * default domain:
        - `LeavesOp` value -> `ObjectDecoder`
        - `TupleOp` value (variables only) -> `DimFieldListDecoder`, with
          the domain replaced by one listing the tuple's terms as fields
        - `Variable` value -> `ObjectDecoder` for STRUCT columns,
          `DefaultDecoder` when the column is missing or has no known
          partitions; otherwise the domain becomes a `SimpleSetDomain`
          and the remaining checks apply
        - anything else -> `DefaultDecoder`
    * partitioned domain, or a domain backed by a `Dimension` -> `SetDecoder`
    * "time" -> `TimeDecoder`; `e.range` -> `GeneralRangeDecoder`
    * "duration" -> `DurationDecoder`; "range" -> `RangeDecoder`
    * no value, but the dimension lists fields -> `DimFieldListDecoder`
    * no value, and every partition has a `where` -> `GeneralSetDecoder`

    Anything else is reported through `Log.error`.

    :param e: the edge to decode; `allowNulls` (and possibly `domain`) are
        updated in place
    :param query: the query owning the edge; `limit` and `frum.schema` are
        read
    :return: an uninitialized instance of the chosen decoder class
    """
    e.allowNulls = coalesce(e.allowNulls, True)

    if e.value and e.domain.type == "default":
        # NOTE: groupby QUERIES ARE NOT SPECIAL-CASED HERE

        if isinstance(e.value, basestring):
            Log.error("Expecting Variable or Expression, not plain string")

        if isinstance(e.value, LeavesOp):
            return object.__new__(ObjectDecoder, e)

        if isinstance(e.value, TupleOp):
            # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields
            # JUST PULL THE FIELDS
            if not all(isinstance(term, Variable) for term in e.value.terms):
                Log.error("Can only handle variables in tuples")

            e.domain = Data(dimension={"fields": e.value.terms})
            return object.__new__(DimFieldListDecoder, e)

        if not isinstance(e.value, Variable):
            return object.__new__(DefaultDecoder, e)

        schema = query.frum.schema
        column = schema[e.value.var][0]
        if column.type in STRUCT:
            return object.__new__(ObjectDecoder, e)
        if not column:
            return object.__new__(DefaultDecoder, e)
        limit = coalesce(e.domain.limit, query.limit, DEFAULT_LIMIT)

        if column.partitions != None:
            # KNOWN PARTITIONS: FALL THROUGH TO THE CHECKS BELOW
            e.domain = SimpleSetDomain(partitions=column.partitions[:limit:])
        else:
            e.domain = set_default(
                DefaultDomain(limit=limit),
                e.domain.__data__()
            )
            return object.__new__(DefaultDecoder, e)

    if e.value and e.domain.type in PARTITION:
        return object.__new__(SetDecoder, e)

    if isinstance(e.domain.dimension, Dimension):
        e.domain = e.domain.dimension.getDomain()
        return object.__new__(SetDecoder, e)

    if e.value and e.domain.type == "time":
        return object.__new__(TimeDecoder, e)

    if e.range:
        return object.__new__(GeneralRangeDecoder, e)

    if e.value and e.domain.type == "duration":
        return object.__new__(DurationDecoder, e)

    if e.value and e.domain.type == "range":
        return object.__new__(RangeDecoder, e)

    if not e.value and e.domain.dimension.fields:
        # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields
        # JUST PULL THE FIELDS
        fields = e.domain.dimension.fields
        if not isinstance(fields, Mapping):
            return object.__new__(DimFieldListDecoder, e)
        Log.error("No longer allowed: All objects are expressions")
    elif not e.value and all(e.domain.partitions.where):
        return object.__new__(GeneralSetDecoder, e)
    else:
        Log.error(
            "domain type of {{type}} is not supported yet",
            type=e.domain.type
        )
def list_aggs(frum, query):
    """
    Aggregate the rows of `frum` into a cube, one matrix per select clause.

    Edges with a `DefaultDomain` first get a concrete `SimpleSetDomain`
    built from the sorted distinct values found in `frum` (a `None` value
    turns on `allowNulls` unless it was already set).  Each row passing
    `query.where` is then mapped to its matching coordinates on every edge,
    and every select's accumulator at each coordinate combination receives
    the select's value for that row.  Finally every accumulator is replaced
    by its `end()` result.

    :param frum: the rows to aggregate
    :param query: the query; `edges`, `select` and `where` are read, and
        edge domains may be replaced in place
    :return: a `Cube` over `query.edges`
    """
    frum = wrap(frum)
    select = listwrap(query.select)

    # GIVE DEFAULT DOMAINS THE DISTINCT VALUES ACTUALLY PRESENT
    for edge in query.edges:
        if not isinstance(edge.domain, DefaultDomain):
            continue
        accessor = jx_expression_to_function(edge.value)
        unique_values = set(map(accessor, frum))
        if None in unique_values:
            edge.allowNulls = coalesce(edge.allowNulls, True)
            unique_values.discard(None)
        edge.domain = SimpleSetDomain(partitions=sorted(unique_values))

    s_accessors = []
    for clause in select:
        s_accessors.append(
            (clause.name, compile_expression(clause.value.to_python()))
        )

    # ONE MATRIX OF FRESH ACCUMULATORS PER SELECT CLAUSE
    result = {
        clause.name: Matrix(
            dims=[
                len(edge.domain.partitions) + (1 if edge.allowNulls else 0)
                for edge in query.edges
            ],
            zeros=lambda: windows.name2accumulator.get(clause.aggregate)(**clause)
        )
        for clause in select
    }
    where = jx_expression_to_function(query.where)
    coord = [None] * len(query.edges)
    edge_accessor = [
        (position, make_accessor(edge))
        for position, edge in enumerate(query.edges)
    ]

    edge_value_vars = UNION(edge.value.vars() for edge in query.edges)
    net_new_edge_names = set(wrap(query.edges).name) - edge_value_vars
    select_vars = UNION(clause.value.vars() for clause in select)

    if net_new_edge_names & select_vars:
        # s_accessor NEEDS THESE EDGES, SO WE PASS THEM ANYWAY
        for row in filter(where, frum):
            row = row.copy()
            for position, get_matches in edge_accessor:
                coord[position] = get_matches(row)

            for s_name, s_accessor in s_accessors:
                mat = result[s_name]
                for cell in itertools.product(*coord):
                    acc = mat[cell]
                    for edge, part_index in zip(query.edges, cell):
                        row[edge.name] = edge.domain.partitions[part_index]
                    acc.add(s_accessor(row, cell, frum))
    else:
        # FASTER
        for row in filter(where, frum):
            for position, get_matches in edge_accessor:
                coord[position] = get_matches(row)

            for s_name, s_accessor in s_accessors:
                mat = result[s_name]
                for cell in itertools.product(*coord):
                    acc = mat[cell]
                    acc.add(s_accessor(row, cell, frum))

    # COLLAPSE EVERY ACCUMULATOR INTO ITS FINAL VALUE
    for clause in select:
        matrix = result[clause.name]
        for cell, accumulator in matrix.items():
            if accumulator != None:
                matrix[cell] = accumulator.end()

    from pyLibrary.queries.containers.cube import Cube

    output = Cube(select, query.edges, result)
    return output
def cube_aggs(frum, query):
    """
    Aggregate the values of cube `frum` into a new cube.

    Query edges are first matched by name against the edges of `frum`: a
    match rewrites the edge value to `<name>.<domain key>`, and an edge
    with a `DefaultDomain` also takes a copy of the matching domain.
    Grouping by one of `frum`'s select columns is reported through
    `Log.error` as not implemented.

    Each row passing `query.where` is then mapped to its coordinates on
    every edge.  "count" selects increment the matrix cell directly; all
    other aggregates feed an accumulator looked up in
    `windows.name2accumulator`, which is replaced by its `end()` value once
    all rows are processed.

    Select expressions are compiled once, before the row loop, instead of
    once per row.  As a consequence an invalid select expression is now
    reported even when no row passes the filter.

    :param frum: the source cube; `select`, `edges` and `values()` are read
    :param query: the query; `edges`, `select` and `where` are read, and
        edges may be updated in place
    :return: a `Cube` over `query.edges`
    """
    select = listwrap(query.select)

    # MATCH EDGES IN QUERY TO ONES IN frum
    for e in query.edges:
        for fs in frum.select:
            if fs.name == e.value:
                Log.error("Not implemented yet")

        # DEFAULT DOMAINS CAN EASILY BE LOOKED UP FROM frum
        is_default = isinstance(e.domain, DefaultDomain)
        for fe in frum.edges:
            if fe.name == e.value:
                if is_default:
                    e.domain = SimpleSetDomain(**fe.domain.as_dict())
                e.value = e.value + "." + fe.domain.key
                break

    result = {
        s.name: Matrix(
            dims=[
                len(e.domain.partitions) + (1 if e.allowNulls else 0)
                for e in query.edges
            ],
            zeros=s.aggregate == "count"
        )
        for s in select
    }
    where = qb_expression_to_function(query.where)

    # COMPILE EACH SELECT EXPRESSION ONCE; s.value DOES NOT CHANGE PER ROW
    compiled_select = [(s, qb_expression_to_function(s.value)) for s in select]

    for d in filter(where, frum.values()):
        # LIST OF MATCHING COORDINATE FAMILIES, USUALLY ONLY ONE PER FAMILY
        # BUT JOINS WITH EDGES CAN CAUSE MORE
        coord = []
        for e in query.edges:
            matches = get_matches(e, d)
            coord.append(matches)

            # FILL IN THE EDGE VALUE WHEN THE ROW DOES NOT CARRY IT
            if len(matches) == 1 and d[e.name] == None:
                d[e.name] = e.domain.partitions[matches[0]]

        for s, expr in compiled_select:
            mat = result[s.name]
            agg = s.aggregate
            var = s.value
            val = expr(d)

            if agg == "count":
                # COUNT EVERY ROW FOR "." OR NO VALUE, ELSE ONLY NON-NULLS
                if var == "." or var == None or val != None:
                    for c in itertools.product(*coord):
                        mat[c] += 1
                continue

            for c in itertools.product(*coord):
                acc = mat[c]
                if acc == None:
                    acc = windows.name2accumulator.get(agg)
                    if acc == None:
                        Log.error(
                            "select aggregate {{agg}} is not recognized",
                            agg=agg
                        )
                    acc = acc(**s)
                    mat[c] = acc
                acc.add(val)

    # COLLAPSE ACCUMULATORS; count MATRICES ALREADY HOLD PLAIN NUMBERS
    for s in select:
        if s.aggregate == "count":
            continue
        m = result[s.name]
        for c, var in m.items():
            if var != None:
                m[c] = var.end()

    return Cube(select, query.edges, result)
def query(self, query):
    """
    Run a JSON Query Expression against this table and format the result.

    The query is normalized with `QueryOp.wrap`, its variables are mapped
    to column names, and it is translated to one SQL command by
    `_edges_op`, `_groupby_op` or `_set_op` (plus an ORDER BY clause when
    the query is sorted).  The command is executed on `self.db`.

    :param query:  JSON Query Expression, SET `format="container"` TO MAKE NEW TABLE OF RESULT
    :return: depends on the format: a `Table_usingSQLite` over the new
        table for "container"; a `Dict` with `edges` and `data` for cube
        results (`Dict(data={})` when there are no rows); a `Dict` with
        `meta`, `header` and `data` for list results
    """
    # NOTE: THE CALLER'S query IS MODIFIED; ITS "from" IS POINTED AT THIS TABLE
    query["from"] = self
    query = QueryOp.wrap(query)

    # TYPE CONFLICTS MUST NOW BE RESOLVED DURING
    # TYPE-SPECIFIC QUERY NORMALIZATION
    vars_ = query.vars(exclude_select=True)
    # MAP A VARIABLE TO ITS es_column ONLY WHEN EXACTLY ONE NON-NESTED
    # COLUMN CARRIES THAT NAME
    type_map = {
        v: c.es_column
        for v in vars_
        if v in self.columns and len([c for c in self.columns[v] if c.type != "nested"]) == 1
        for c in self.columns[v]
        if c.type != "nested"
    }

    sql_query = query.map(type_map)

    new_table = "temp_" + unique_name()

    # "container" FORMAT MATERIALIZES THE RESULT AS A NEW TABLE
    if query.format == "container":
        create_table = "CREATE TABLE " + quote_table(new_table) + " AS "
    else:
        create_table = ""

    if sql_query.edges:
        command = create_table + self._edges_op(sql_query)
    elif sql_query.groupby:
        command = create_table + self._groupby_op(sql_query)
    else:
        command = create_table + self._set_op(sql_query)

    if sql_query.sort:
        # s.sort == -1 MEANS DESCENDING
        command += "\nORDER BY " + ",\n".join(
            s.value.to_sql() + (" DESC" if s.sort == -1 else "")
            for s in sql_query.sort)

    result = self.db.query(command)

    column_names = query.column_names
    if query.format == "container":
        output = Table_usingSQLite(new_table, db=self.db, uid=self.uid, exists=True)
    elif query.format == "cube" or query.edges:
        if len(query.edges) > 1:
            Log.error("Only support one dimension right now")

        if not result.data:
            return Dict(data={})

        # TRANSPOSE ROWS INTO COLUMNS
        # NOTE(review): columns IS ASSIGNED BY INDEX BELOW, WHICH RELIES ON
        # zip RETURNING A list (PYTHON 2) - confirm before porting
        columns = zip(*result.data)

        edges = []
        ci = []
        for i, e in enumerate(query.edges):
            if e.domain.type != "default":
                Log.error("Can only handle default domains")
            # NEGATIVE INDEX: EDGE COLUMNS ARE READ FROM THE END OF EACH ROW
            ci.append(i - len(query.edges))
            parts = columns[ci[i]]
            allowNulls = False
            if parts[0] == None:
                allowNulls = True
                # ONLY ONE EDGE, SO WE CAN DO THIS TO PUT NULL LAST
                for ii, c in enumerate(copy(columns)):
                    columns[ii] = list(c[1:]) + [c[0]]
                # parts STILL REFERS TO THE UN-ROTATED COLUMN; DROP THE LEADING NULL
                parts = parts[1:]

            edges.append(
                Dict(name=e.name, allowNulls=allowNulls, domain=SimpleSetDomain(partitions=parts)))

        # SELECT COLUMNS ARE READ BY POSITION FROM THE FRONT OF EACH ROW
        data = {s.name: columns[i] for i, s in enumerate(sql_query.select)}

        return Dict(edges=edges, data=data)
    elif query.format == "list" or (not query.edges and not query.groupby):
        # ONE DICT PER ROW, KEYED BY COLUMN NAME
        output = Dict(meta={"format": "list"}, header=column_names, data=[{c: v for c, v in zip(column_names, r)} for r in result.data])
    else:
        # NOTE(review): IF Log.error DOES NOT RAISE, output IS UNBOUND AT THE return BELOW
        Log.error("unknown format {{format}}", format=query.format)

    return output
def list_aggs(frum, query):
    """
    Aggregate the rows of `frum` into a cube, one matrix per select clause.

    Edges with a `DefaultDomain` first get a concrete `SimpleSetDomain`
    built from the sorted distinct values of the edge in `frum`.  Each row
    passing `query.where` is mapped to its matching coordinates on every
    edge.  "count" selects increment the matrix cell directly; all other
    aggregates feed an accumulator looked up in `windows.name2accumulator`,
    which is replaced by its `end()` value once all rows are processed.

    Before a select expression is evaluated for a cell, the row copy is
    given each edge's partition at that cell under the edge's name.  The
    count branch used to store the coordinate index there instead, which
    disagreed with every other aggregate.

    :param frum: the rows to aggregate; must support `select()`
    :param query: the query; `edges`, `select` and `where` are read, edge
        domains may be replaced, and each select clause gets an "exec"
        entry holding its compiled expression
    :return: a `Cube` over `query.edges`
    """
    select = listwrap(query.select)

    for e in query.edges:
        if isinstance(e.domain, DefaultDomain):
            e.domain = SimpleSetDomain(
                partitions=list(sorted(set(frum.select(e.value))))
            )

    # COMPILE EACH SELECT EXPRESSION ONCE AND KEEP IT ON THE CLAUSE
    for s in select:
        s["exec"] = qb_expression_to_function(s.value)

    result = {
        s.name: Matrix(
            dims=[
                len(e.domain.partitions) + (1 if e.allowNulls else 0)
                for e in query.edges
            ],
            zeros=s.aggregate == "count"
        )
        for s in select
    }
    where = qb_expression_to_function(query.where)

    for d in filter(where, frum):
        d = d.copy()  # EDGE VALUES ARE WRITTEN INTO THE ROW BELOW

        # LIST OF MATCHING COORDINATE FAMILIES, USUALLY ONLY ONE PER FAMILY
        # BUT JOINS WITH EDGES CAN CAUSE MORE
        coord = [get_matches(e, d) for e in query.edges]

        for s in select:
            mat = result[s.name]
            agg = s.aggregate
            var = s.value

            if agg == "count":
                for c in itertools.product(*coord):
                    if var == "." or var == None:
                        mat[c] += 1
                        continue

                    # BECAUSE WE DO NOT KNOW IF s.exec NEEDS THESE EDGES,
                    # SO WE PASS THEM ANYWAY (THE PARTITION, NOT ITS INDEX)
                    for e, cc in zip(query.edges, c):
                        d[e.name] = e.domain.partitions[cc]
                    val = s["exec"](d, c, frum)
                    if val != None:
                        mat[c] += 1
            else:
                for c in itertools.product(*coord):
                    acc = mat[c]
                    if acc == None:
                        acc = windows.name2accumulator.get(agg)
                        if acc == None:
                            Log.error(
                                "select aggregate {{agg}} is not recognized",
                                agg=agg
                            )
                        acc = acc(**s)
                        mat[c] = acc

                    # BECAUSE WE DO NOT KNOW IF s.exec NEEDS THESE EDGES,
                    # SO WE PASS THEM ANYWAY
                    for e, cc in zip(query.edges, c):
                        d[e.name] = e.domain.partitions[cc]

                    val = s["exec"](d, c, frum)
                    acc.add(val)

    # COLLAPSE ACCUMULATORS; count MATRICES ALREADY HOLD PLAIN NUMBERS
    for s in select:
        if s.aggregate == "count":
            continue
        m = result[s.name]
        for c, var in m.items():
            if var != None:
                m[c] = var.end()

    output = Cube(select, query.edges, result)
    return output
def done_count(self):
    """
    Replace the edge's domain with one whose partitions are sorted.

    The current partitions of `self.edge.domain` are ordered with `qb.sort`
    and wrapped in a new `SimpleSetDomain`.
    """
    ordered = qb.sort(self.edge.domain.partitions)
    self.edge.domain = SimpleSetDomain(partitions=ordered)
def query(self, query):
    """
    Run a JSON Query Expression against this table and format the result.

    The query must name this table (or a nested path under it) in `from`.
    It is normalized with `QueryOp.wrap` and translated to SQL by
    `_groupby_op` or `_edges_op`; a query with no groupby, no edges and no
    aggregates is handed to `_set_op`, whose result is returned as-is.
    Otherwise the SQL is executed on `self.db` and the rows are reshaped
    according to `query.format`.

    :param query:  JSON Query Expression, SET `format="container"` TO MAKE NEW TABLE OF RESULT
    :return: a `QueryTable` over the new table for "container"; otherwise a
        `Data` whose `meta.format` is "cube", "table", "list" or "value"
    """
    if not startswith_field(query['from'], self.name):
        Log.error("Expecting table, or some nested table")
    # KEEP THE ORIGINAL from PATH FOR THE SQL BUILDERS; POINT THE QUERY AT THIS TABLE
    frum, query['from'] = query['from'], self
    query = QueryOp.wrap(query, self.columns)

    # TYPE CONFLICTS MUST NOW BE RESOLVED DURING
    # TYPE-SPECIFIC QUERY NORMALIZATION
    # vars_ = query.vars(exclude_select=True)
    # type_map = {
    #     v: c.es_column
    #     for v in vars_
    #     if v in self.columns and len([c for c in self.columns[v] if c.type != "nested"]) == 1
    #     for c in self.columns[v]
    #     if c.type != "nested"
    # }
    #
    # sql_query = query.map(type_map)
    query = query  # NO-OP LEFT IN PLACE OF THE DISABLED MAPPING ABOVE

    new_table = "temp_" + unique_name()

    # "container" FORMAT MATERIALIZES THE RESULT AS A NEW TABLE
    if query.format == "container":
        create_table = "CREATE TABLE " + quote_table(new_table) + " AS "
    else:
        create_table = ""

    # index_to_columns DESCRIBES HOW TO PULL EACH OUTPUT VALUE FROM A RESULT ROW
    if query.groupby:
        op, index_to_columns = self._groupby_op(query, frum)
        command = create_table + op
    elif query.edges or any(a != "none" for a in listwrap(query.select).aggregate):
        op, index_to_columns = self._edges_op(query, frum)
        command = create_table + op
    else:
        # PLAIN SET OPERATION: _set_op RESULT IS RETURNED DIRECTLY
        op = self._set_op(query, frum)
        return op

    if query.sort:
        # FOR EVERY SORT TERM AND EVERY TYPE KEY ("b", "n", "s") THAT HAS SQL,
        # ORDER BY "IS NULL" FIRST AND THEN BY THE VALUE; s.sort == -1 MEANS DESC
        # NOTE(review): b/n/s presumably boolean/number/string - confirm
        command += "\nORDER BY " + ",\n".join(
            "(" + sql[t] + ") IS NULL" + (" DESC" if s.sort == -1 else "") + ",\n" + sql[t] + (" DESC" if s.sort == -1 else "")
            for s, sql in [(s, s.value.to_sql(self)[0].sql) for s in query.sort]
            for t in "bns" if sql[t])

    result = self.db.query(command)

    column_names = query.edges.name + query.groupby.name + listwrap(
        query.select).name
    if query.format == "container":
        output = QueryTable(new_table, db=self.db, uid=self.uid, exists=True)
    elif query.format == "cube" or (not query.format and query.edges):
        if len(query.edges) == 0 and len(query.groupby) == 0:
            # NO DIMENSIONS: A SINGLE ROW OF AGGREGATES
            data = {n: Data() for n in column_names}
            for s in index_to_columns.values():
                data[s.push_name][s.push_child] = unwrap(
                    s.pull(result.data[0]))
            return Data(data=unwrap(data), meta={"format": "cube"})

        if not result.data:
            # NO ROWS: BUILD AN EMPTY CUBE (ONLY THE NULL PART, WHEN ALLOWED, PER EDGE)
            edges = []
            dims = []
            for i, e in enumerate(query.edges + query.groupby):
                allowNulls = coalesce(e.allowNulls, True)

                if e.domain.type == "set" and e.domain.partitions:
                    domain = SimpleSetDomain(
                        partitions=e.domain.partitions.name)
                elif e.domain.type == "range":
                    domain = e.domain
                elif isinstance(e.value, TupleOp):
                    pulls = jx.sort([
                        c for c in index_to_columns.values() if c.push_name == e.name
                    ], "push_child").pull
                    parts = [
                        tuple(p(d) for p in pulls) for d in result.data
                    ]
                    domain = SimpleSetDomain(
                        partitions=jx.sort(set(parts)))
                else:
                    domain = SimpleSetDomain(partitions=[])

                dims.append(1 if allowNulls else 0)
                edges.append(
                    Data(name=e.name, allowNulls=allowNulls, domain=domain))

            # count OF THE WHOLE VALUE DEFAULTS TO 0; EVERYTHING ELSE TO Data
            zeros = [
                0 if s.aggregate == "count" and index_to_columns[si].push_child == "." else Data
                for si, s in enumerate(listwrap(query.select))
            ]
            data = {
                s.name: Matrix(dims=dims, zeros=zeros[si])
                for si, s in enumerate(listwrap(query.select))
            }

            # MIRROR THE SHAPE OF query.select (LIST OR SINGLE CLAUSE)
            if isinstance(query.select, list):
                select = [{"name": s.name} for s in query.select]
            else:
                select = {"name": query.select.name}

            return Data(meta={"format": "cube"}, edges=edges, select=select, data={k: v.cube for k, v in data.items()})

        # TRANSPOSED RESULT; COMPUTED LAZILY, ONLY IF SOME EDGE NEEDS IT
        columns = None

        edges = []
        dims = []
        # MARK groupby TERMS SO THEY CAN BE TOLD APART FROM EDGES BELOW
        for g in query.groupby:
            g.is_groupby = True

        for i, e in enumerate(query.edges + query.groupby):
            allowNulls = coalesce(e.allowNulls, True)

            if e.domain.type == "set" and e.domain.partitions:
                domain = SimpleSetDomain(
                    partitions=e.domain.partitions.name)
            elif e.domain.type == "range":
                domain = e.domain
            elif e.domain.type == "time":
                domain = wrap(mo_json.scrub(e.domain))
            elif e.domain.type == "duration":
                domain = wrap(mo_json.scrub(e.domain))
            elif isinstance(e.value, TupleOp):
                # TUPLE EDGE: PARTS ARE THE DISTINCT TUPLES FOUND IN THE RESULT
                pulls = jx.sort([
                    c for c in index_to_columns.values() if c.push_name == e.name
                ], "push_child").pull
                parts = [tuple(p(d) for p in pulls) for d in result.data]
                domain = SimpleSetDomain(partitions=jx.sort(set(parts)))
            else:
                # OTHERWISE THE PARTS ARE THE DISTINCT VALUES OF RESULT COLUMN i
                if not columns:
                    columns = zip(*result.data)
                parts = set(columns[i])
                if e.is_groupby and None in parts:
                    allowNulls = True
                parts -= {None}
                domain = SimpleSetDomain(partitions=jx.sort(parts))

            dims.append(len(domain.partitions) + (1 if allowNulls else 0))
            edges.append(
                Data(name=e.name, allowNulls=allowNulls, domain=domain))

        # count OF THE WHOLE VALUE DEFAULTS TO 0; EVERYTHING ELSE TO Data
        zeros = [
            0 if s.aggregate == "count" and index_to_columns[si].push_child == "." else Data
            for si, s in enumerate(listwrap(query.select))
        ]
        data_cubes = {
            s.name: Matrix(dims=dims, zeros=zeros[si])
            for si, s in enumerate(listwrap(query.select))
        }

        r2c = index_to_coordinate(
            dims)  # WORKS BECAUSE THE DATABASE SORTED THE EDGES TO CONFORM
        for rownum, row in enumerate(result.data):
            coord = r2c(rownum)

            for i, s in enumerate(index_to_columns.values()):
                # EDGE COLUMNS ARE COORDINATES, NOT CELL VALUES
                if s.is_edge:
                    continue
                if s.push_child == ".":
                    data_cubes[s.push_name][coord] = s.pull(row)
                else:
                    data_cubes[s.push_name][coord][s.push_child] = s.pull(
                        row)

        # MIRROR THE SHAPE OF query.select (LIST OR SINGLE CLAUSE)
        if isinstance(query.select, list):
            select = [{"name": s.name} for s in query.select]
        else:
            select = {"name": query.select.name}

        return Data(meta={"format": "cube"}, edges=edges, select=select, data={k: v.cube for k, v in data_cubes.items()})
    elif query.format == "table" or (not query.format and query.groupby):
        data = []
        for d in result.data:
            row = [None for _ in column_names]
            for s in index_to_columns.values():
                if s.push_child == ".":
                    # THE WHOLE CELL COMES FROM THIS COLUMN
                    row[s.push_column] = s.pull(d)
                elif s.num_push_columns:
                    # TUPLE CELL: push_child IS THE POSITION INSIDE THE TUPLE
                    tuple_value = row[s.push_column]
                    if tuple_value == None:
                        tuple_value = row[
                            s.push_column] = [None] * s.num_push_columns
                    tuple_value[s.push_child] = s.pull(d)
                elif row[s.push_column] == None:
                    # OBJECT CELL: push_child IS THE PROPERTY NAME
                    row[s.push_column] = Data()
                    row[s.push_column][s.push_child] = s.pull(d)
                else:
                    row[s.push_column][s.push_child] = s.pull(d)
            data.append(tuple(unwrap(r) for r in row))

        output = Data(meta={"format": "table"}, header=column_names, data=data)
    elif query.format == "list" or (not query.edges and not query.groupby):
        if not query.edges and not query.groupby and any(
                listwrap(query.select).aggregate):
            # AGGREGATES WITHOUT DIMENSIONS: A SINGLE "value" BUILT FROM THE FIRST ROW
            if isinstance(query.select, list):
                data = Data()
                for c in index_to_columns.values():
                    if c.push_child == ".":
                        data[c.push_name] = c.pull(result.data[0])
                    else:
                        data[c.push_name][c.push_child] = c.pull(
                            result.data[0])

                output = Data(meta={"format": "value"}, data=data)
            else:
                data = Data()
                for s in index_to_columns.values():
                    data[s.push_child] = s.pull(result.data[0])

                output = Data(meta={"format": "value"}, data=unwrap(data))
        else:
            data = []
            # NOTE: rownum HOLDS THE RESULT ROW ITSELF, NOT AN INDEX
            for rownum in result.data:
                row = Data()
                for c in index_to_columns.values():
                    if c.push_child == ".":
                        row[c.push_name] = c.pull(rownum)
                    elif c.num_push_columns:
                        # TUPLE VALUE: push_child IS THE POSITION INSIDE THE TUPLE
                        tuple_value = row[c.push_name]
                        if not tuple_value:
                            tuple_value = row[
                                c.push_name] = [None] * c.num_push_columns
                        tuple_value[c.push_child] = c.pull(rownum)
                    else:
                        row[c.push_name][c.push_child] = c.pull(rownum)

                data.append(row)

            output = Data(meta={"format": "list"}, data=data)
    else:
        # NOTE(review): IF Log.error DOES NOT RAISE, output IS UNBOUND AT THE return BELOW
        Log.error("unknown format {{format}}", format=query.format)

    return output