def _set_op(self, query, frum): # GET LIST OF COLUMNS frum_path = split_field(frum) primary_nested_path = join_field(frum_path[1:]) vars_ = UNION([s.value.vars() for s in listwrap(query.select)]) schema = self.sf.tables[primary_nested_path].schema nest_to_alias = { nested_path: "__" + unichr(ord('a') + i) + "__" for i, (nested_path, sub_table) in enumerate(self.sf.tables.items()) } active_columns = {".": []} for cname, cols in schema.items(): if any(startswith_field(cname, v) for v in vars_): for c in cols: if c.type in STRUCT: continue nest = c.nested_path[0] active = active_columns.get(nest) if not active: active = active_columns[nest] = [] active.append(c) for nested_path, s in self.sf.tables.items(): for cname, cols in s.schema.items(): if not any(startswith_field(cname, c.names[c.nested_path[0]]) for n, cc in active_columns.items() for c in cc): for c in cols: if c.type in STRUCT: continue nest = c.nested_path[0] active = active_columns.get(nest) if not active: active = active_columns[nest] = [] active.append(c) # ANY VARS MENTIONED WITH NO COLUMNS? for v in vars_: if not any(startswith_field(cname, v) for cname in schema.keys()): active_columns["."].append(Column( names={".": v}, type="null", es_column=".", es_index=".", nested_path=["."] )) # EVERY COLUMN, AND THE INDEX IT TAKES UP index_to_column = {} # MAP FROM INDEX TO COLUMN (OR SELECT CLAUSE) index_to_uid = {} # FROM NESTED PATH TO THE INDEX OF UID sql_selects = [] # EVERY SELECT CLAUSE (NOT TO BE USED ON ALL TABLES, OF COURSE) nest_to_alias = { nested_path: "__" + unichr(ord('a') + i) + "__" for i, (nested_path, sub_table) in enumerate(self.sf.tables.items()) } sorts = [] if query.sort: for s in query.sort: col = s.value.to_sql(schema)[0] for t, sql in col.sql.items(): json_type = sql_type_to_json_type[t] if json_type in STRUCT: continue column_number = len(sql_selects) # SQL HAS ABS TABLE REFERENCE column_alias = _make_column_name(column_number) sql_selects.append(sql + " AS " + column_alias) if s.sort == -1: sorts.append(column_alias + " IS NOT NULL") sorts.append(column_alias + " DESC") else: sorts.append(column_alias + " IS NULL") sorts.append(column_alias) selects = [] primary_doc_details = Data() # EVERY SELECT STATEMENT THAT WILL BE REQUIRED, NO MATTER THE DEPTH # WE WILL CREATE THEM ACCORDING TO THE DEPTH REQUIRED for nested_path, sub_table in self.sf.tables.items(): nested_doc_details = { "sub_table": sub_table, "children": [], "index_to_column": {}, "nested_path": [nested_path] # fake the real nested path, we only look at [0] anyway } # INSERT INTO TREE if not primary_doc_details: primary_doc_details = nested_doc_details else: def place(parent_doc_details): if startswith_field(nested_path, parent_doc_details['nested_path'][0]): for c in parent_doc_details['children']: if place(c): return True parent_doc_details['children'].append(nested_doc_details) place(primary_doc_details) alias = nested_doc_details['alias'] = nest_to_alias[nested_path] if nested_path=="." and quoted_GUID in vars_: column_number = index_to_uid[nested_path] = nested_doc_details['id_coord'] = len(sql_selects) sql_select = alias + "." + quoted_GUID sql_selects.append(sql_select + " AS " + _make_column_name(column_number)) index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = ColumnMapping( push_name="_id", push_column_name="_id", push_column=0, push_child=".", sql=sql_select, pull=get_column(column_number), type="string", column_alias=_make_column_name(column_number), nested_path=[nested_path] # fake the real nested path, we only look at [0] anyway ) query.select = [s for s in listwrap(query.select) if s.name!="_id"] # WE ALWAYS ADD THE UID AND ORDER column_number = index_to_uid[nested_path] = nested_doc_details['id_coord'] = len(sql_selects) sql_select = alias + "." + quoted_UID sql_selects.append(sql_select + " AS " + _make_column_name(column_number)) if nested_path !=".": index_to_column[column_number]=ColumnMapping( sql=sql_select, type="number", nested_path=[nested_path], # fake the real nested path, we only look at [0] anyway column_alias=_make_column_name(column_number) ) column_number = len(sql_selects) sql_select = alias + "." + quote_table(ORDER) sql_selects.append(sql_select + " AS " + _make_column_name(column_number)) index_to_column[column_number]=ColumnMapping( sql=sql_select, type="number", nested_path=[nested_path], # fake the real nested path, we only look at [0] anyway column_alias=_make_column_name(column_number) ) # WE DO NOT NEED DATA FROM TABLES WE REQUEST NOTHING FROM if nested_path not in active_columns: continue if len(active_columns[nested_path]) != 0: # ADD SQL SELECT COLUMNS FOR EACH jx SELECT CLAUSE si = 0 for s in listwrap(query.select): try: column_number = len(sql_selects) s.pull = get_column(column_number) db_columns = s.value.to_sql(schema) if isinstance(s.value, LeavesOp): for column in db_columns: if isinstance(column.nested_path, list): column.nested_path=column.nested_path[0] if column.nested_path and column.nested_path!=nested_path: continue for t, unsorted_sql in column.sql.items(): json_type = sql_type_to_json_type[t] if json_type in STRUCT: continue column_number = len(sql_selects) # SQL HAS ABS TABLE REFERENCE column_alias = _make_column_name(column_number) if concat_field(alias, unsorted_sql) in selects and len(unsorted_sql.split())==1: continue selects.append(concat_field(alias, unsorted_sql)) sql_selects.append(alias + "." + unsorted_sql + " AS " + column_alias) index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = ColumnMapping( push_name=literal_field(get_property_name(concat_field(s.name, column.name))), push_column_name=get_property_name(concat_field(s.name, column.name)), push_column=si, push_child=".", pull=get_column(column_number), sql=unsorted_sql, type=json_type, column_alias=column_alias, nested_path=[nested_path] # fake the real nested path, we only look at [0] anyway ) si += 1 else: for column in db_columns: if isinstance(column.nested_path, list): column.nested_path=column.nested_path[0] if column.nested_path and column.nested_path!=nested_path: continue for t, unsorted_sql in column.sql.items(): json_type = sql_type_to_json_type[t] if json_type in STRUCT: continue column_number = len(sql_selects) # SQL HAS ABS TABLE REFERENCE column_alias = _make_column_name(column_number) if concat_field(alias, unsorted_sql) in selects and len(unsorted_sql.split())==1: continue selects.append(concat_field(alias, unsorted_sql)) sql_selects.append(alias + "." + unsorted_sql + " AS " + column_alias) index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = ColumnMapping( push_name=s.name, push_column_name=s.name, push_column=si, push_child=column.name, pull=get_column(column_number), sql=unsorted_sql, type=json_type, column_alias=column_alias, nested_path=[nested_path] # fake the real nested path, we only look at [0] anyway ) finally: si += 1 elif startswith_field(nested_path, primary_nested_path): # ADD REQUIRED COLUMNS, FOR DEEP STUFF for ci, c in enumerate(active_columns[nested_path]): if c.type in STRUCT: continue column_number = len(sql_selects) nested_path = c.nested_path unsorted_sql = nest_to_alias[nested_path[0]] + "." + quote_table(c.es_column) column_alias = _make_column_name(column_number) if concat_field(alias, unsorted_sql) in selects and len(unsorted_sql.split())==1: continue selects.append(concat_field(alias, unsorted_sql)) sql_selects.append(alias + "." + unsorted_sql + " AS " + column_alias) index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = ColumnMapping( push_name=s.name, push_column_name=s.name, push_column=si, push_child=relative_field(c.names["."], s.name), pull=get_column(column_number), sql=unsorted_sql, type=c.type, column_alias=column_alias, nested_path=nested_path ) where_clause = query.where.to_sql(schema, boolean=True)[0].sql.b unsorted_sql = self._make_sql_for_one_nest_in_set_op( ".", sql_selects, where_clause, active_columns, index_to_column ) for n, _ in self.sf.tables.items(): sorts.append(COLUMN + text_type(index_to_uid[n])) ordered_sql = ( "SELECT * FROM (\n" + unsorted_sql + "\n)" + "\nORDER BY\n" + ",\n".join(sorts) + "\nLIMIT " + quote_value(query.limit) ) self.db.create_new_functions() #creating new functions: regexp result = self.db.query(ordered_sql) def _accumulate_nested(rows, row, nested_doc_details, parent_doc_id, parent_id_coord): """ :param rows: REVERSED STACK OF ROWS (WITH push() AND pop()) :param row: CURRENT ROW BEING EXTRACTED :param nested_doc_details: { "nested_path": wrap_nested_path(nested_path), "index_to_column": map from column number to column details "children": all possible direct decedents' nested_doc_details } :param parent_doc_id: the id of the parent doc (for detecting when to step out of loop) :param parent_id_coord: the column number for the parent id (so we ca extract from each row) :return: the nested property (usually an array) """ previous_doc_id = None doc = Null output = [] id_coord = nested_doc_details['id_coord'] while True: doc_id = row[id_coord] if doc_id == None or (parent_id_coord is not None and row[parent_id_coord] != parent_doc_id): rows.append(row) # UNDO PREVIOUS POP (RECORD IS NOT A NESTED RECORD OF parent_doc) return output if doc_id != previous_doc_id: previous_doc_id = doc_id doc = Null curr_nested_path = nested_doc_details['nested_path'][0] index_to_column = nested_doc_details['index_to_column'].items() if index_to_column: for i, c in index_to_column: value = row[i] if value == None: continue if value == '': continue if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp): # ASSIGN INNER PROPERTIES relative_path=join_field([c.push_name]+split_field(c.push_child)) else: # FACT IS EXPECTED TO BE A SINGLE VALUE, NOT AN OBJECT relative_path=c.push_child if relative_path == ".": doc = value elif doc is Null: doc = Data() doc[relative_path] = value else: doc[relative_path] = value for child_details in nested_doc_details['children']: # EACH NESTED TABLE MUST BE ASSEMBLED INTO A LIST OF OBJECTS child_id = row[child_details['id_coord']] if child_id is not None: nested_value = _accumulate_nested(rows, row, child_details, doc_id, id_coord) if nested_value: push_name = child_details['nested_path'][0] if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp): # ASSIGN INNER PROPERTIES relative_path=relative_field(push_name, curr_nested_path) else: # FACT IS EXPECTED TO BE A SINGLE VALUE, NOT AN OBJECT relative_path="." if relative_path == "." and doc is Null: doc = nested_value elif relative_path == ".": doc[push_name] = unwraplist([v[push_name] for v in nested_value]) elif doc is Null: doc = Data() doc[relative_path] = unwraplist(nested_value) else: doc[relative_path] = unwraplist(nested_value) output.append(doc) try: row = rows.pop() except IndexError: return output cols = tuple([i for i in index_to_column.values() if i.push_name != None]) rows = list(reversed(unwrap(result.data))) if rows: row = rows.pop() data = _accumulate_nested(rows, row, primary_doc_details, None, None) else: data = result.data if query.format == "cube": for f, _ in self.sf.tables.items(): if frum.endswith(f) or (test_dots(cols) and isinstance(query.select, list)): num_rows = len(result.data) num_cols = MAX([c.push_column for c in cols]) + 1 if len(cols) else 0 map_index_to_name = {c.push_column: c.push_column_name for c in cols} temp_data = [[None]*num_rows for _ in range(num_cols)] for rownum, d in enumerate(result.data): for c in cols: if c.push_child == ".": temp_data[c.push_column][rownum] = c.pull(d) else: column = temp_data[c.push_column][rownum] if column is None: column = temp_data[c.push_column][rownum] = Data() column[c.push_child] = c.pull(d) output = Data( meta={"format": "cube"}, data={n: temp_data[c] for c, n in map_index_to_name.items()}, edges=[{ "name": "rownum", "domain": { "type": "rownum", "min": 0, "max": num_rows, "interval": 1 } }] ) return output if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp): num_rows = len(data) map_index_to_name = {c.push_column: c.push_column_name for c in cols} temp_data = Data() for rownum, d in enumerate(data): for k, v in d.items(): if temp_data[k] == None: temp_data[k] = [None] * num_rows temp_data[k][rownum] = v return Data( meta={"format": "cube"}, data={n: temp_data[literal_field(n)] for c, n in map_index_to_name.items()}, edges=[{ "name": "rownum", "domain": { "type": "rownum", "min": 0, "max": num_rows, "interval": 1 } }] ) else: num_rows = len(data) map_index_to_name = {c.push_column: c.push_column_name for c in cols} temp_data = [data] return Data( meta={"format": "cube"}, data={n: temp_data[c] for c, n in map_index_to_name.items()}, edges=[{ "name": "rownum", "domain": { "type": "rownum", "min": 0, "max": num_rows, "interval": 1 } }] ) elif query.format == "table": for f, _ in self.sf.tables.items(): if frum.endswith(f): num_column = MAX([c.push_column for c in cols])+1 header = [None]*num_column for c in cols: header[c.push_column] = c.push_column_name output_data = [] for d in result.data: row = [None] * num_column for c in cols: set_column(row, c.push_column, c.push_child, c.pull(d)) output_data.append(row) return Data( meta={"format": "table"}, header=header, data=output_data ) if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp): num_rows = len(data) column_names= [None]*(max(c.push_column for c in cols) + 1) for c in cols: column_names[c.push_column] = c.push_column_name temp_data = [] for rownum, d in enumerate(data): row =[None] * len(column_names) for i, (k, v) in enumerate(sorted(d.items())): for c in cols: if k==c.push_name: row[c.push_column] = v temp_data.append(row) return Data( meta={"format": "table"}, header=column_names, data=temp_data ) else: column_names = listwrap(query.select).name return Data( meta={"format": "table"}, header=column_names, data=[[d] for d in data] ) else: for f, _ in self.sf.tables.items(): if frum.endswith(f) or (test_dots(cols) and isinstance(query.select, list)): data = [] for d in result.data: row = Data() for c in cols: if c.push_child == ".": row[c.push_name] = c.pull(d) elif c.num_push_columns: tuple_value = row[c.push_name] if not tuple_value: tuple_value = row[c.push_name] = [None] * c.num_push_columns tuple_value[c.push_child] = c.pull(d) elif not isinstance(query.select, list): # select is value type row[c.push_child]=c.pull(d) else: row[c.push_name][c.push_child] = c.pull(d) data.append(row) return Data( meta={"format": "list"}, data=data ) if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp): temp_data=[] for rownum, d in enumerate(data): row = {} for k, v in d.items(): for c in cols: if c.push_name==c.push_column_name==k: row[c.push_column_name] = v elif c.push_name==k and c.push_column_name!=k: row[c.push_column_name] = v temp_data.append(row) return Data( meta={"format": "list"}, data=temp_data ) else: return Data( meta={"format": "list"}, data=data )
def query(self, query): """ :param query: JSON Query Expression, SET `format="container"` TO MAKE NEW TABLE OF RESULT :return: """ if not startswith_field(query['from'], self.sf.fact): Log.error("Expecting table, or some nested table") frum, query['from'] = query['from'], self table = self.sf.tables[relative_field(frum, self.sf.fact)] schema = table.schema query = QueryOp.wrap(query, table=table, schema=schema) new_table = "temp_" + unique_name() if query.format == "container": create_table = "CREATE TABLE " + quote_column(new_table) + " AS " else: create_table = "" if query.groupby and query.format != "cube": op, index_to_columns = self._groupby_op(query, frum) command = create_table + op elif query.groupby: query.edges, query.groupby = query.groupby, query.edges op, index_to_columns = self._edges_op(query, frum) command = create_table + op query.edges, query.groupby = query.groupby, query.edges elif query.edges or any(a != "none" for a in listwrap(query.select).aggregate): op, index_to_columns = self._edges_op(query, frum) command = create_table + op else: op = self._set_op(query, frum) return op result = self.db.query(command) if query.format == "container": output = QueryTable(new_table, db=self.db, uid=self.uid, exists=True) elif query.format == "cube" or (not query.format and query.edges): column_names = [None] * (max(c.push_column for c in index_to_columns.values()) + 1) for c in index_to_columns.values(): column_names[c.push_column] = c.push_column_name if len(query.edges) == 0 and len(query.groupby) == 0: data = {n: Data() for n in column_names} for s in index_to_columns.values(): data[s.push_name][s.push_child] = unwrap(s.pull(result.data[0])) if isinstance(query.select, list): select = [{"name": s.name} for s in query.select] else: select = {"name": query.select.name} return Data( data=unwrap(data), select=select, meta={"format": "cube"} ) if not result.data: edges = [] dims = [] for i, e in enumerate(query.edges + query.groupby): allowNulls = coalesce(e.allowNulls, True) if e.domain.type == "set" and e.domain.partitions: domain = SimpleSetDomain(partitions=e.domain.partitions.name) elif e.domain.type == "range": domain = e.domain elif isinstance(e.value, TupleOp): pulls = jx.sort([c for c in index_to_columns.values() if c.push_name == e.name], "push_child").pull parts = [tuple(p(d) for p in pulls) for d in result.data] domain = SimpleSetDomain(partitions=jx.sort(set(parts))) else: domain = SimpleSetDomain(partitions=[]) dims.append(1 if allowNulls else 0) edges.append(Data( name=e.name, allowNulls=allowNulls, domain=domain )) data = {} for si, s in enumerate(listwrap(query.select)): if s.aggregate == "count": data[s.name] = Matrix(dims=dims, zeros=0) else: data[s.name] = Matrix(dims=dims) if isinstance(query.select, list): select = [{"name": s.name} for s in query.select] else: select = {"name": query.select.name} return Data( meta={"format": "cube"}, edges=edges, select=select, data={k: v.cube for k, v in data.items()} ) columns = None edges = [] dims = [] for g in query.groupby: g.is_groupby = True for i, e in enumerate(query.edges + query.groupby): allowNulls = coalesce(e.allowNulls, True) if e.domain.type == "set" and e.domain.partitions: domain = SimpleSetDomain(partitions=e.domain.partitions.name) elif e.domain.type == "range": domain = e.domain elif e.domain.type == "time": domain = wrap(mo_json.scrub(e.domain)) elif e.domain.type == "duration": domain = wrap(mo_json.scrub(e.domain)) elif isinstance(e.value, TupleOp): pulls = jx.sort([c for c in index_to_columns.values() if c.push_name == e.name], "push_child").pull parts = [tuple(p(d) for p in pulls) for d in result.data] domain = SimpleSetDomain(partitions=jx.sort(set(parts))) else: if not columns: columns = zip(*result.data) parts = set(columns[i]) if e.is_groupby and None in parts: allowNulls = True parts -= {None} if query.sort[i].sort == -1: domain = SimpleSetDomain(partitions=wrap(sorted(parts, reverse=True))) else: domain = SimpleSetDomain(partitions=jx.sort(parts)) dims.append(len(domain.partitions) + (1 if allowNulls else 0)) edges.append(Data( name=e.name, allowNulls=allowNulls, domain=domain )) data_cubes = {} for si, s in enumerate(listwrap(query.select)): if s.aggregate == "count": data_cubes[s.name] = Matrix(dims=dims, zeros=0) else: data_cubes[s.name] = Matrix(dims=dims) r2c = index_to_coordinate(dims) # WORKS BECAUSE THE DATABASE SORTED THE EDGES TO CONFORM for rownum, row in enumerate(result.data): coord = r2c(rownum) for i, s in enumerate(index_to_columns.values()): if s.is_edge: continue if s.push_child == ".": data_cubes[s.push_name][coord] = s.pull(row) else: data_cubes[s.push_name][coord][s.push_child] = s.pull(row) if query.select == None: select = Null elif isinstance(query.select, list): select = [{"name": s.name} for s in query.select] else: select = {"name": query.select.name} return Data( meta={"format": "cube"}, edges=edges, select=select, data={k: v.cube for k, v in data_cubes.items()} ) elif query.format == "table" or (not query.format and query.groupby): column_names = [None] * (max(c.push_column for c in index_to_columns.values()) + 1) for c in index_to_columns.values(): column_names[c.push_column] = c.push_column_name data = [] for d in result.data: row = [None for _ in column_names] for s in index_to_columns.values(): if s.push_child == ".": row[s.push_column] = s.pull(d) elif s.num_push_columns: tuple_value = row[s.push_column] if tuple_value == None: tuple_value = row[s.push_column] = [None] * s.num_push_columns tuple_value[s.push_child] = s.pull(d) elif row[s.push_column] == None: row[s.push_column] = Data() row[s.push_column][s.push_child] = s.pull(d) else: row[s.push_column][s.push_child] = s.pull(d) data.append(tuple(unwrap(r) for r in row)) output = Data( meta={"format": "table"}, header=column_names, data=data ) elif query.format == "list" or (not query.edges and not query.groupby): if not query.edges and not query.groupby and any(listwrap(query.select).aggregate): if isinstance(query.select, list): data = Data() for c in index_to_columns.values(): if c.push_child == ".": if data[c.push_name] == None: data[c.push_name] = c.pull(result.data[0]) elif isinstance(data[c.push_name], list): data[c.push_name].append(c.pull(result.data[0])) else: data[c.push_name] = [data[c.push_name], c.pull(result.data[0])] else: data[c.push_name][c.push_child] = c.pull(result.data[0]) output = Data( meta={"format": "value"}, data=data ) else: data = Data() for s in index_to_columns.values(): if not data[s.push_child]: data[s.push_child] = s.pull(result.data[0]) else: data[s.push_child] += [s.pull(result.data[0])] output = Data( meta={"format": "value"}, data=unwrap(data) ) else: data = [] for rownum in result.data: row = Data() for c in index_to_columns.values(): if c.push_child == ".": row[c.push_name] = c.pull(rownum) elif c.num_push_columns: tuple_value = row[c.push_name] if not tuple_value: tuple_value = row[c.push_name] = [None] * c.num_push_columns tuple_value[c.push_child] = c.pull(rownum) else: row[c.push_name][c.push_child] = c.pull(rownum) data.append(row) output = Data( meta={"format": "list"}, data=data ) else: Log.error("unknown format {{format}}", format=query.format) return output
def query(self, query): """ :param query: JSON Query Expression, SET `format="container"` TO MAKE NEW TABLE OF RESULT :return: """ if not startswith_field(query['from'], self.name): Log.error("Expecting table, or some nested table") frum, query['from'] = query['from'], self query = QueryOp.wrap(query, self.columns) # TYPE CONFLICTS MUST NOW BE RESOLVED DURING # TYPE-SPECIFIC QUERY NORMALIZATION # vars_ = query.vars(exclude_select=True) # type_map = { # v: c.es_column # for v in vars_ # if v in self.columns and len([c for c in self.columns[v] if c.type != "nested"]) == 1 # for c in self.columns[v] # if c.type != "nested" # } # # sql_query = query.map(type_map) query = query new_table = "temp_" + unique_name() if query.format == "container": create_table = "CREATE TABLE " + quote_table(new_table) + " AS " else: create_table = "" if query.groupby: op, index_to_columns = self._groupby_op(query, frum) command = create_table + op elif query.edges or any(a != "none" for a in listwrap(query.select).aggregate): op, index_to_columns = self._edges_op(query, frum) command = create_table + op else: op = self._set_op(query, frum) return op if query.sort: command += "\nORDER BY " + ",\n".join( "(" + sql[t] + ") IS NULL" + (" DESC" if s.sort == -1 else "") + ",\n" + sql[t] + (" DESC" if s.sort == -1 else "") for s, sql in [(s, s.value.to_sql(self)[0].sql) for s in query.sort] for t in "bns" if sql[t]) result = self.db.query(command) column_names = query.edges.name + query.groupby.name + listwrap( query.select).name if query.format == "container": output = QueryTable(new_table, db=self.db, uid=self.uid, exists=True) elif query.format == "cube" or (not query.format and query.edges): if len(query.edges) == 0 and len(query.groupby) == 0: data = {n: Data() for n in column_names} for s in index_to_columns.values(): data[s.push_name][s.push_child] = unwrap( s.pull(result.data[0])) return Data(data=unwrap(data), meta={"format": "cube"}) if not result.data: edges = [] dims = [] for i, e in enumerate(query.edges + query.groupby): allowNulls = coalesce(e.allowNulls, True) if e.domain.type == "set" and e.domain.partitions: domain = SimpleSetDomain( partitions=e.domain.partitions.name) elif e.domain.type == "range": domain = e.domain elif isinstance(e.value, TupleOp): pulls = jx.sort([ c for c in index_to_columns.values() if c.push_name == e.name ], "push_child").pull parts = [ tuple(p(d) for p in pulls) for d in result.data ] domain = SimpleSetDomain( partitions=jx.sort(set(parts))) else: domain = SimpleSetDomain(partitions=[]) dims.append(1 if allowNulls else 0) edges.append( Data(name=e.name, allowNulls=allowNulls, domain=domain)) zeros = [ 0 if s.aggregate == "count" and index_to_columns[si].push_child == "." else Data for si, s in enumerate(listwrap(query.select)) ] data = { s.name: Matrix(dims=dims, zeros=zeros[si]) for si, s in enumerate(listwrap(query.select)) } if isinstance(query.select, list): select = [{"name": s.name} for s in query.select] else: select = {"name": query.select.name} return Data(meta={"format": "cube"}, edges=edges, select=select, data={k: v.cube for k, v in data.items()}) columns = None edges = [] dims = [] for g in query.groupby: g.is_groupby = True for i, e in enumerate(query.edges + query.groupby): allowNulls = coalesce(e.allowNulls, True) if e.domain.type == "set" and e.domain.partitions: domain = SimpleSetDomain( partitions=e.domain.partitions.name) elif e.domain.type == "range": domain = e.domain elif e.domain.type == "time": domain = wrap(mo_json.scrub(e.domain)) elif e.domain.type == "duration": domain = wrap(mo_json.scrub(e.domain)) elif isinstance(e.value, TupleOp): pulls = jx.sort([ c for c in index_to_columns.values() if c.push_name == e.name ], "push_child").pull parts = [tuple(p(d) for p in pulls) for d in result.data] domain = SimpleSetDomain(partitions=jx.sort(set(parts))) else: if not columns: columns = zip(*result.data) parts = set(columns[i]) if e.is_groupby and None in parts: allowNulls = True parts -= {None} domain = SimpleSetDomain(partitions=jx.sort(parts)) dims.append(len(domain.partitions) + (1 if allowNulls else 0)) edges.append( Data(name=e.name, allowNulls=allowNulls, domain=domain)) zeros = [ 0 if s.aggregate == "count" and index_to_columns[si].push_child == "." else Data for si, s in enumerate(listwrap(query.select)) ] data_cubes = { s.name: Matrix(dims=dims, zeros=zeros[si]) for si, s in enumerate(listwrap(query.select)) } r2c = index_to_coordinate( dims) # WORKS BECAUSE THE DATABASE SORTED THE EDGES TO CONFORM for rownum, row in enumerate(result.data): coord = r2c(rownum) for i, s in enumerate(index_to_columns.values()): if s.is_edge: continue if s.push_child == ".": data_cubes[s.push_name][coord] = s.pull(row) else: data_cubes[s.push_name][coord][s.push_child] = s.pull( row) if isinstance(query.select, list): select = [{"name": s.name} for s in query.select] else: select = {"name": query.select.name} return Data(meta={"format": "cube"}, edges=edges, select=select, data={k: v.cube for k, v in data_cubes.items()}) elif query.format == "table" or (not query.format and query.groupby): data = [] for d in result.data: row = [None for _ in column_names] for s in index_to_columns.values(): if s.push_child == ".": row[s.push_column] = s.pull(d) elif s.num_push_columns: tuple_value = row[s.push_column] if tuple_value == None: tuple_value = row[ s.push_column] = [None] * s.num_push_columns tuple_value[s.push_child] = s.pull(d) elif row[s.push_column] == None: row[s.push_column] = Data() row[s.push_column][s.push_child] = s.pull(d) else: row[s.push_column][s.push_child] = s.pull(d) data.append(tuple(unwrap(r) for r in row)) output = Data(meta={"format": "table"}, header=column_names, data=data) elif query.format == "list" or (not query.edges and not query.groupby): if not query.edges and not query.groupby and any( listwrap(query.select).aggregate): if isinstance(query.select, list): data = Data() for c in index_to_columns.values(): if c.push_child == ".": data[c.push_name] = c.pull(result.data[0]) else: data[c.push_name][c.push_child] = c.pull( result.data[0]) output = Data(meta={"format": "value"}, data=data) else: data = Data() for s in index_to_columns.values(): data[s.push_child] = s.pull(result.data[0]) output = Data(meta={"format": "value"}, data=unwrap(data)) else: data = [] for rownum in result.data: row = Data() for c in index_to_columns.values(): if c.push_child == ".": row[c.push_name] = c.pull(rownum) elif c.num_push_columns: tuple_value = row[c.push_name] if not tuple_value: tuple_value = row[ c.push_name] = [None] * c.num_push_columns tuple_value[c.push_child] = c.pull(rownum) else: row[c.push_name][c.push_child] = c.pull(rownum) data.append(row) output = Data(meta={"format": "list"}, data=data) else: Log.error("unknown format {{format}}", format=query.format) return output