def where(self, filter):
    """
    WILL NOT PULL WHOLE OBJECT, JUST TOP-LEVEL PROPERTIES
    :param filter: jx_expression filter
    :return: list of objects that match
    """
    select = []
    column_names = []
    for cname, cs in self.columns.items():
        # ONLY TOP-LEVEL (nested_path LENGTH 1), NON-STRUCT COLUMNS QUALIFY
        cs = [c for c in cs if c.type not in STRUCT and len(c.nested_path) == 1]
        if len(cs) == 0:
            continue
        column_names.append(cname)
        if len(cs) == 1:
            # BUGFIX: use cs[0]; the old code read the leaked list-comprehension
            # variable `c`, which is a NameError on Python 3 (and only worked on
            # Python 2 by accident of comprehension-scope leakage)
            c = cs[0]
            select.append(quote_table(c.es_column) + " " + quote_table(c.name))
        else:
            # SEVERAL TYPED VARIANTS OF THE SAME PROPERTY: TAKE FIRST NON-NULL;
            # all variants share the same logical name, so cs[0].name is used
            select.append(
                "coalesce(" +
                ",".join(quote_table(c.es_column) for c in cs) +
                ") " + quote_table(cs[0].name)
            )

    result = self.db.query(
        " SELECT " + "\n,".join(select) +
        " FROM " + quote_table(self.sf.fact) +
        " WHERE " + jx_expression(filter).to_sql()
    )
    return wrap([{c: v for c, v in zip(column_names, r)} for r in result.data])
def _groupby_op(self, query, frum):
    """
    TRANSLATE A groupby QUERY TO A SINGLE SQL STATEMENT
    :param query: jx query with groupby/select/window/where clauses
    :param frum: source table name, used to resolve the SQL schema
    :return: (sql_command, index_to_column) where index_to_column maps
             result-column position to extraction metadata
    """
    columns = self._get_sql_schema(frum)
    index_to_column = {}
    nest_to_alias = {
        nested_path: "__" + unichr(ord('a') + i) + "__"
        for i, (nested_path, sub_table) in enumerate(self.nested_tables.items())
    }

    selects = []
    groupby = []
    for i, e in enumerate(query.groupby):
        column_number = len(selects)
        sql_type, sql = e.value.to_sql(self)[0].sql.items()[0]
        groupby.append(sql)
        selects.append(sql + " AS " + e.name)
        index_to_column[column_number] = Data(
            is_edge=True,
            push_name=e.name,
            push_column=column_number,
            push_child=".",
            pull=get_column(column_number),
            sql=sql,
            type=sql_type_to_json_type[sql_type]
        )

    for s in listwrap(query.select):
        column_number = len(selects)
        sql_type, sql = s.value.to_sql(self)[0].sql.items()[0]
        if s.value == "." and s.aggregate == "count":
            # COUNT RECORDS, NOT VALUES
            selects.append("COUNT(1) AS " + quote_table(s.name))
        else:
            selects.append(sql_aggs[s.aggregate] + "(" + sql + ") AS " + quote_table(s.name))
        index_to_column[column_number] = Data(
            push_name=s.name,
            push_column=column_number,
            push_child=".",
            pull=get_column(column_number),
            sql=sql,
            type=sql_type_to_json_type[sql_type]
        )

    for w in query.window:
        # BUGFIX: _window_op is a bound method; the old call passed `self`
        # explicitly as well (self._window_op(self, query, w)), which raises
        # TypeError because four arguments arrive for three parameters
        selects.append(self._window_op(query, w))

    where = query.where.to_sql(self)[0].sql.b

    command = "SELECT\n" + (",\n".join(selects)) + \
              "\nFROM\n" + quote_table(self.name) + " " + nest_to_alias["."] + \
              "\nWHERE\n" + where + \
              "\nGROUP BY\n" + ",\n".join(groupby)

    return command, index_to_column
def _window_op(self, query, window):
    """
    TRANSLATE ONE jx window CLAUSE INTO A SQL WINDOW-FUNCTION SELECT ITEM
    :param query: the enclosing jx query (unused here, kept for interface parity)
    :param window: window clause with name, value, edges, aggregate, range
    :return: SQL select-list fragment ending in "AS <window.name>"
    """
    # http://www2.sqlite.org/cvstrac/wiki?p=UnsupportedSqlAnalyticalFunctions
    if window.value == "rownum":
        # BUGFIX: the "-1" must be applied to the RESULT of the window
        # function; "ROW_NUMBER()-1 OVER (...)" is a SQL syntax error
        return "(ROW_NUMBER() OVER (" + \
               " PARTITION BY " + (", ".join(window.edges.values)) + \
               " ORDER BY " + (", ".join(window.edges.sort)) + \
               ")-1) AS " + quote_table(window.name)

    # UNBOUNDED WHEN NO EXPLICIT RANGE GIVEN
    range_min = text_type(coalesce(window.range.min, "UNBOUNDED"))
    range_max = text_type(coalesce(window.range.max, "UNBOUNDED"))

    return sql_aggs[window.aggregate] + "(" + window.value.to_sql() + ") OVER (" + \
           " PARTITION BY " + (", ".join(window.edges.values)) + \
           " ORDER BY " + (", ".join(window.edges.sort)) + \
           " ROWS BETWEEN " + range_min + " PRECEDING AND " + range_max + " FOLLOWING " + \
           ") AS " + quote_table(window.name)
def create_fact(self, uid=UID):
    """
    MAKE NEW TABLE WITH GIVEN guid
    :param uid: name, or list of names, for the GUID
    :return: None
    """
    self.add_table_to_schema(["."])

    new_columns = []
    for u in listwrap(uid):
        if u == UID:
            # THE DEFAULT UID COLUMN IS ALWAYS PRESENT; NOTHING TO ADD
            continue
        extra = Column(
            names={".": u},
            type="string",
            es_column=typed_column(u, "string"),
            es_index=self.fact
        )
        self.add_column_to_schema(extra)
        new_columns.append(extra)

    schema_columns = self.tables["."].schema.columns

    # COLUMN DEFINITIONS: GUID, UID, THEN EVERY SCHEMA COLUMN
    column_defs = [quoted_GUID + " TEXT "] + [quoted_UID + " INTEGER"]
    column_defs.extend(
        quote_column(c.es_column) + " " + sql_types[c.type]
        for c in schema_columns
    )

    # PRIMARY KEY COVERS GUID, UID, AND EVERY SCHEMA COLUMN
    key_parts = [quoted_GUID, quoted_UID]
    key_parts.extend(quote_column(c.es_column) for c in schema_columns)

    command = (
        "CREATE TABLE " + quote_table(self.fact) + "(" +
        ",".join(column_defs) +
        ", PRIMARY KEY (" +
        ", ".join(key_parts) +
        "))"
    )
    self.db.execute(command)
def _nest_column(self, column, new_path):
    """
    MOVE column (AND EVERY COLUMN UNDER IT) INTO A NEW CHILD TABLE AT new_path,
    THEN COPY THE EXISTING DATA INTO THAT CHILD TABLE.
    :param column: the column being promoted to "nested"
    :param new_path: relative path of the new nested table
    """
    destination_table = join_field([self.name] + split_field(new_path))
    existing_table = join_field([self.name] + split_field(column.nested_path[0]))

    # FIND THE INNER COLUMNS WE WILL BE MOVING
    new_columns = {}
    for cname, cols in self.columns.items():
        if startswith_field(cname, column.names[self.name]):
            new_columns[cname] = set()
            for col in cols:
                new_columns[cname].add(col)
                # DEEPEN EACH MOVED COLUMN'S nested_path BY THE NEW LEVEL
                col.nested_path = [new_path] + col.nested_path

    # TODO: IF THERE ARE CHILD TABLES, WE MUST UPDATE THEIR RELATIONS TOO?

    # DEFINE A NEW TABLE?
    # LOAD THE COLUMNS
    command = "PRAGMA table_info(" + quote_table(destination_table) + ")"
    details = self.db.query(command)
    if details.data:
        # THE DESTINATION MUST NOT ALREADY EXIST: THIS IS A BRAND-NEW NESTING
        raise Log.error("not expected, new nesting!")
    # LOCAL IMPORT AVOIDS A CIRCULAR IMPORT WITH query_table
    from jx_sqlite.query_table import QueryTable
    self.nested_tables[new_path] = sub_table = QueryTable(
        destination_table, self.db, exists=False)

    # EVERY CHILD TABLE CARRIES PARENT AND ORDER BOOKKEEPING COLUMNS
    self.db.execute("ALTER TABLE " + quote_table(sub_table.name) +
                    " ADD COLUMN " + quoted_PARENT + " INTEGER")
    self.db.execute("ALTER TABLE " + quote_table(sub_table.name) +
                    " ADD COLUMN " + quote_table(ORDER) + " INTEGER")
    for cname, cols in new_columns.items():
        for c in cols:
            sub_table.add_column(c)

    # TEST IF THERE IS ANY DATA IN THE NEW NESTED ARRAY
    all_cols = [c for _, cols in sub_table.columns.items() for c in cols]
    if not all_cols:
        has_nested_data = "0"
    elif len(all_cols) == 1:
        has_nested_data = _quote_column(all_cols[0]) + " is NOT NULL"
    else:
        # ANY NON-NULL VALUE IN ANY MOVED COLUMN COUNTS AS DATA
        has_nested_data = "COALESCE(" + \
                          ",".join(_quote_column(c) for c in all_cols) + \
                          ") IS NOT NULL"

    # FILL TABLE WITH EXISTING COLUMN DATA
    # NEW ROWS INHERIT THE PARENT'S UID AS BOTH UID AND PARENT; ORDER IS 0
    command = "INSERT INTO " + quote_table(destination_table) + "(\n" + \
              ",\n".join(
                  [quoted_UID, quoted_PARENT, quote_table(ORDER)] +
                  [_quote_column(c) for _, cols in sub_table.columns.items() for c in cols]
              ) + \
              "\n)\n" + \
              "\nSELECT\n" + ",".join(
                  [quoted_UID, quoted_UID, "0"] +
                  [_quote_column(c) for _, cols in sub_table.columns.items() for c in cols]
              ) + \
              "\nFROM\n" + quote_table(existing_table) + \
              "\nWHERE\n" + has_nested_data
    self.db.execute(command)
def __getitem__(self, item):
    """
    RETURN ALL STORED VALUES FOR THE GIVEN COLUMN NAME, ACROSS ALL
    TYPED VARIANTS; [Null] WHEN THE COLUMN IS UNKNOWN
    """
    matched = self.columns.get(item, None)
    if not matched:
        return [Null]

    # ONE SELECT PER TYPED VARIANT, UNIONED TOGETHER
    variant_selects = [
        "SELECT " + _quote_column(c) + " FROM " + quote_table(c.es_index)
        for c in matched
    ]
    result = self.db.query(" UNION ALL ".join(variant_selects))
    return [record[0] for record in result]
def _add_column(self, column):
    """
    ADD column TO ITS BACKING TABLE (NESTING FIRST WHEN THE COLUMN IS
    ITSELF NESTED), THEN REGISTER IT IN THE SCHEMA
    """
    cname = column.names["."]
    if column.type == "nested":
        # WE ARE ALSO NESTING
        self._nest_column(column, cname)

    backing_table = concat_field(self.fact, column.nested_path[0])
    alter = (
        "ALTER TABLE " + quote_table(backing_table) +
        " ADD COLUMN " + quote_column(column.es_column) +
        " " + sql_types[column.type]
    )
    self.db.execute(alter)

    self.add_column_to_schema(column)
def __iter__(self):
    """
    ITERATE OVER EVERY ROW OF THIS TABLE, YIELDING ONE Data PER ROW
    """
    # BUGFIX (idiom): the old comprehension used `c` for both the dict key
    # and the column element, shadowing the outer loop variable; the key is
    # unused, so name it `_`
    columns = [
        c
        for _, cs in self.columns.items()
        for c in cs
        if c.type not in STRUCT
    ]
    command = "SELECT " + \
              ",\n".join(_quote_column(c) for c in columns) + \
              " FROM " + quote_table(self.name)
    rows = self.db.query(command)
    for r in rows:
        output = Data()
        # NOTE(review): unpacking each column as (k, t) assumes the column
        # object is iterable as a (name, type) pair — confirm against the
        # Column definition
        for (k, t), v in zip(columns, r):
            output[k] = v
        yield output
def read_db(self): """ PULL SCHEMA FROM DATABASE, BUILD THE MODEL :return: None """ # FIND ALL TABLES result = self.db.query( "SELECT * FROM sqlite_master WHERE type='table' ORDER BY name") tables = wrap([{k: d[i] for i, k in enumerate(result.header)} for d in result.data]) tables_found = False for table in tables: if table.name.startswith("__"): continue tables_found = True nested_path = [ join_field(split_field(tab.name)[1:]) for tab in jx.reverse(tables) if startswith_field(table.name, tab.name) ] self.add_table_to_schema(nested_path) # LOAD THE COLUMNS command = "PRAGMA table_info(" + quote_table(table.name) + ")" details = self.db.query(command) for cid, name, dtype, notnull, dfft_value, pk in details.data: if name.startswith("__"): continue cname, ctype = untyped_column(name) column = Column(names={ np: relative_field(cname, np) for np in nested_path }, type=coalesce( ctype, { "TEXT": "string", "REAL": "number", "INTEGER": "integer" }.get(dtype)), nested_path=nested_path, es_column=name, es_index=table.name) self.add_column_to_schema(column) return tables_found
def add_column(self, column):
    """
    ADD COLUMN, IF IT DOES NOT EXIST ALREADY
    """
    if column.name not in self.columns:
        self.columns[column.name] = {column}
    elif column.type not in [c.type for c in self.columns[column.name]]:
        # SAME NAME, NEW TYPED VARIANT
        self.columns[column.name].add(column)

    if column.type == "nested":
        nested_table_name = concat_field(self.name, column.name)
        # MAKE THE TABLE (LOCAL IMPORT AVOIDS CIRCULAR IMPORT)
        from jx_sqlite.query_table import QueryTable
        table = QueryTable(nested_table_name, self.db, exists=False)
        self.nested_tables[column.name] = table
    else:
        # BUGFIX: map the JSON type to its SQLite type, as _add_column and
        # change_schema do; the old code emitted the raw JSON type name
        # (e.g. "string") as the SQL column type
        self.db.execute(
            "ALTER TABLE " + quote_table(self.name) +
            " ADD COLUMN " + _quote_column(column) + " " +
            sql_types[column.type]
        )
def change_schema(self, required_changes):
    """
    APPLY A LIST OF SCHEMA CHANGES
    :param required_changes: list of {"add": column} or
           {"nest": column, "new_path": path} instructions
    """
    required_changes = wrap(required_changes)
    for required_change in required_changes:
        if required_change.add:
            column = required_change.add
            if column.type == "nested":
                # WE ARE ALSO NESTING
                self._nest_column(column, column.names[self.name])

            table = join_field([self.name] + split_field(column.nested_path[0]))

            self.db.execute(
                "ALTER TABLE " + quote_table(table) +
                " ADD COLUMN " + _quote_column(column) + " " +
                sql_types[column.type]
            )

            # BUGFIX: self.columns is a map from name to set-of-columns;
            # dicts have no .add(), so the old `self.columns.add(column)`
            # raised AttributeError — register through the schema helper,
            # consistent with create_fact/read_db/_add_column
            self.add_column_to_schema(column)
        elif required_change.nest:
            column = required_change.nest
            new_path = required_change.new_path
            self._nest_column(column, new_path)
def _insert(self, collection):
    """
    WRITE FLATTENED DOCUMENTS TO THEIR BACKING TABLES
    :param collection: map from nested_path to {active_columns, rows}
    """
    for nested_path, details in collection.items():
        columns = wrap(list(details.active_columns))
        table_name = concat_field(self.sf.fact, nested_path)

        if table_name == self.sf.fact:
            # DO NOT REQUIRE PARENT OR ORDER COLUMNS
            meta_columns = [GUID, UID]
        else:
            meta_columns = [UID, PARENT, ORDER]

        all_columns = meta_columns + columns.es_column

        prefix = "INSERT INTO " + quote_table(table_name) + \
                 "(" + ",".join(map(quote_table, all_columns)) + ")"

        # BUILD THE RECORDS: ONE SELECT PER ROW, UNIONED TOGETHER
        record_selects = []
        for record in unwrap(details.rows):
            values = ",".join(quote_value(record.get(c)) for c in all_columns)
            record_selects.append("\nSELECT " + values)

        self.db.execute(prefix + " UNION ALL ".join(record_selects))
def update(self, command):
    """
    :param command: EXPECTING dict WITH {"set": s, "clear": c, "where": w} FORMAT
    """
    command = wrap(command)

    # REJECT DEEP UPDATES
    touched_columns = command.set.keys() | set(listwrap(command['clear']))
    for c in self.get_leaves():
        if c.name in touched_columns and c.nested_path and len(
                c.name) > len(c.nested_path[0]):
            Log.error("Deep update not supported")

    # ADD NEW COLUMNS
    where = jx_expression(command.where)
    _vars = where.vars()
    # MAP EACH MENTIONED VARIABLE TO ITS BACKING SQL COLUMN
    _map = {
        v: c.es_column
        for v in _vars
        for c in self.columns.get(v, Null)
        if c.type not in STRUCT
    }
    where_sql = where.map(_map).to_sql()
    new_columns = set(command.set.keys()) - set(self.columns.keys())
    for new_column_name in new_columns:
        nested_value = command.set[new_column_name]
        ctype = get_type(nested_value)
        column = Column(names={".": new_column_name},
                        type=ctype,
                        es_index=self.sf.fact,
                        es_column=typed_column(new_column_name, ctype))
        self.add_column(column)

    # UPDATE THE NESTED VALUES
    for nested_column_name, nested_value in command.set.items():
        if get_type(nested_value) == "nested":
            nested_table_name = concat_field(self.sf.fact, nested_column_name)
            # NOTE(review): `nested_tables` is unqualified here; elsewhere the
            # mapping lives on the instance (self.nested_tables) — confirm
            nested_table = nested_tables[nested_column_name]
            self_primary_key = ",".join(
                quote_table(c.es_column)
                for u in self.uid
                for c in self.columns[u])
            extra_key_name = UID_PREFIX + "id" + text_type(len(self.uid))
            extra_key = [e for e in nested_table.columns[extra_key_name]][0]

            # REMOVE THE OLD NESTED ROWS FOR EVERY RECORD MATCHING where
            sql_command = "DELETE FROM " + quote_table(nested_table.name) + \
                          "\nWHERE EXISTS (" + \
                          "\nSELECT 1 " + \
                          "\nFROM " + quote_table(nested_table.name) + " n" + \
                          "\nJOIN (" + \
                          "\nSELECT " + self_primary_key + \
                          "\nFROM " + quote_table(self.sf.fact) + \
                          "\nWHERE " + where_sql + \
                          "\n) t ON " + \
                          " AND ".join(
                              "t." + quote_table(c.es_column) + " = n." + quote_table(c.es_column)
                              for u in self.uid
                              for c in self.columns[u]
                          ) + \
                          ")"
            self.db.execute(sql_command)

            # INSERT NEW RECORDS
            if not nested_value:
                continue

            # FLATTEN THE REPLACEMENT DOCUMENTS INTO COLUMNS/ROWS
            doc_collection = {}
            for d in listwrap(nested_value):
                nested_table.flatten(d,
                                     Data(),
                                     doc_collection,
                                     path=nested_column_name)

            prefix = "INSERT INTO " + quote_table(nested_table.name) + \
                     "(" + \
                     self_primary_key + "," + \
                     quote_column(extra_key) + "," + \
                     ",".join(
                         quote_table(c.es_column)
                         for c in doc_collection.get(".", Null).active_columns
                     ) + ")"

            # BUILD THE PARENT TABLES
            parent = "\nSELECT " + \
                     self_primary_key + \
                     "\nFROM " + quote_table(self.sf.fact) + \
                     "\nWHERE " + jx_expression(command.where).to_sql()

            # BUILD THE RECORDS
            children = " UNION ALL ".join(
                "\nSELECT " +
                quote_value(i) + " " + quote_table(extra_key.es_column) + "," +
                ",".join(
                    quote_value(row[c.name]) + " " + quote_table(c.es_column)
                    for c in doc_collection.get(".", Null).active_columns)
                for i, row in enumerate(
                    doc_collection.get(".", Null).rows))

            # CROSS JOIN EVERY MATCHING PARENT WITH EVERY NEW CHILD ROW
            sql_command = prefix + \
                          "\nSELECT " + \
                          ",".join(
                              "p." + quote_table(c.es_column)
                              for u in self.uid
                              for c in self.columns[u]
                          ) + "," + \
                          "c." + quote_column(extra_key) + "," + \
                          ",".join(
                              "c." + quote_table(c.es_column)
                              for c in doc_collection.get(".", Null).active_columns
                          ) + \
                          "\nFROM (" + parent + ") p " + \
                          "\nJOIN (" + children + \
                          "\n) c on 1=1"

            self.db.execute(sql_command)

            # THE CHILD COLUMNS COULD HAVE EXPANDED
            # ADD COLUMNS TO SELF
            for n, cs in nested_table.columns.items():
                for c in cs:
                    column = Column(names={".": c.name},
                                    type=c.type,
                                    es_index=c.es_index,
                                    es_column=c.es_column,
                                    nested_path=[nested_column_name] +
                                    c.nested_path)
                    if c.name not in self.columns:
                        self.columns[column.name] = {column}
                    elif c.type not in [
                            c.type for c in self.columns[c.name]
                    ]:
                        self.columns[column.name].add(column)

    # FINALLY, APPLY THE FLAT set/clear ASSIGNMENTS TO THE FACT TABLE
    # (note: `command` is rebound here from the jx command to the SQL string)
    command = (
        "UPDATE " + quote_table(self.sf.fact) + " SET " +
        ",\n".join([
            quote_column(c) + "=" + quote_value(get_if_type(v, c.type))
            for k, v in command.set.items()
            if get_type(v) != "nested"
            for c in self.columns[k]
            if c.type != "nested" and len(c.nested_path) == 1
        ] + [
            quote_column(c) + "=NULL"
            for k in listwrap(command['clear'])
            if k in self.columns
            for c in self.columns[k]
            if c.type != "nested" and len(c.nested_path) == 1
        ]) + " WHERE " + where_sql)

    self.db.execute(command)
def _make_sql_for_one_nest_in_set_op(
        self,
        primary_nested_path,
        selects,  # EVERY SELECT CLAUSE (NOT TO BE USED ON ALL TABLES, OF COURSE
        where_clause,
        active_columns,
        index_to_sql_select  # MAP FROM INDEX TO COLUMN (OR SELECT CLAUSE)
):
    """
    FOR EACH NESTED LEVEL, WE MAKE A QUERY THAT PULLS THE VALUES/COLUMNS REQUIRED
    WE `UNION ALL` THEM WHEN DONE
    :param primary_nested_path: the nested level this statement is responsible for
    :param selects: every select clause (only some apply at this level)
    :param where_clause: SQL filter, tightened as we descend
    :param active_columns: map from nested path to the columns needed there
    :param index_to_sql_select: map from select index to column/select metadata
    :return: SQL FOR ONE NESTED LEVEL
    """
    parent_alias = "a"
    from_clause = ""
    select_clause = []
    children_sql = []
    done = []  # NESTED PATHS ALREADY HANDLED BY A RECURSIVE CALL

    # STATEMENT FOR EACH NESTED PATH
    for i, (nested_path, sub_table) in enumerate(self.nested_tables.items()):
        # SKIP PATHS ALREADY COVERED BY A CHILD RECURSION
        if any(startswith_field(nested_path, d) for d in done):
            continue

        alias = "__" + unichr(ord('a') + i) + "__"

        if primary_nested_path == nested_path:
            # THIS IS THE LEVEL WE EMIT VALUES FOR
            select_clause = []
            # ADD SELECT CLAUSE HERE
            for select_index, s in enumerate(selects):
                sql_select = index_to_sql_select.get(select_index)
                if not sql_select:
                    select_clause.append(selects[select_index])
                    continue

                if startswith_field(sql_select.nested_path[0], nested_path):
                    select_clause.append(sql_select.sql + " AS " +
                                         _make_column_name(select_index))
                else:
                    # DO NOT INCLUDE DEEP STUFF AT THIS LEVEL
                    select_clause.append("NULL AS " +
                                         _make_column_name(select_index))

            if nested_path == ".":
                from_clause += "\nFROM " + quote_table(
                    self.name) + " " + alias + "\n"
            else:
                from_clause += "\nLEFT JOIN " + quote_table(sub_table.name) + " " + alias + "\n" \
                               " ON " + alias + "." + quoted_PARENT + " = " + parent_alias + "." + quoted_UID + "\n"
                # ONLY REAL (ORDER > 0) CHILD ROWS QUALIFY
                where_clause = "(" + where_clause + ") AND " + alias + "." + quote_table(
                    ORDER) + " > 0\n"
        elif startswith_field(primary_nested_path, nested_path):
            # PARENT TABLE
            # NO NEED TO INCLUDE COLUMNS, BUT WILL INCLUDE ID AND ORDER
            if nested_path == ".":
                from_clause += "\nFROM " + quote_table(
                    self.name) + " " + alias + "\n"
            else:
                parent_alias = alias = unichr(ord('a') + i - 1)
                from_clause += "\nLEFT JOIN " + quote_table(sub_table.name) + " " + alias + \
                               " ON " + alias + "." + quoted_PARENT + " = " + parent_alias + "." + quoted_UID
                where_clause = "(" + where_clause + ") AND " + parent_alias + "." + quote_table(
                    ORDER) + " > 0\n"
        elif startswith_field(nested_path, primary_nested_path):
            # CHILD TABLE
            # GET FIRST ROW FOR EACH NESTED TABLE
            from_clause += "\nLEFT JOIN " + quote_table(sub_table.name) + " " + alias + \
                           " ON " + alias + "." + quoted_PARENT + " = " + parent_alias + "." + quoted_UID + \
                           " AND " + alias + "." + quote_table(ORDER) + " = 0\n"

            # IMMEDIATE CHILDREN ONLY
            done.append(nested_path)
            # NESTED TABLES WILL USE RECURSION
            children_sql.append(
                self._make_sql_for_one_nest_in_set_op(
                    nested_path,
                    selects,  # EVERY SELECT CLAUSE (NOT TO BE USED ON ALL TABLES, OF COURSE
                    where_clause,
                    active_columns,
                    index_to_sql_select  # MAP FROM INDEX TO COLUMN (OR SELECT CLAUSE)
                ))
        else:
            # SIBLING PATHS ARE IGNORED
            continue

        parent_alias = alias

    # THIS LEVEL'S STATEMENT, FOLLOWED BY EVERY CHILD LEVEL'S
    sql = "\nUNION ALL\n".join([
        "SELECT\n" + ",\n".join(select_clause) + from_clause + "\nWHERE\n" +
        where_clause
    ] + children_sql)

    return sql
def _set_op(self, query, frum):
    """
    EXECUTE A SIMPLE (NON-AGGREGATING) SELECT QUERY, POSSIBLY SPANNING
    NESTED TABLES, AND SHAPE THE RESULT AS cube / table / list FORMAT.
    :param query: jx query (select/where/sort/limit/format)
    :param frum: dotted path of the (possibly nested) source table
    :return: Data holding the formatted result
    """
    # GET LIST OF COLUMNS
    frum_path = split_field(frum)
    primary_nested_path = join_field(frum_path[1:])
    vars_ = UNION([s.value.vars() for s in listwrap(query.select)])
    schema = self.sf.tables[primary_nested_path].schema

    # ONE SHORT, UNIQUE ALIAS PER NESTED TABLE
    nest_to_alias = {
        nested_path: "__" + unichr(ord('a') + i) + "__"
        for i, (nested_path, sub_table) in enumerate(self.sf.tables.items())
    }

    # COLLECT THE COLUMNS EACH NESTED LEVEL MUST PROVIDE
    active_columns = {".": []}
    for cname, cols in schema.items():
        if any(startswith_field(cname, v) for v in vars_):
            for c in cols:
                if c.type in STRUCT:
                    continue
                nest = c.nested_path[0]
                active = active_columns.get(nest)
                if not active:
                    active = active_columns[nest] = []
                active.append(c)

    # ADD COLUMNS NOT ALREADY COVERED BY AN ACTIVE COLUMN'S PREFIX
    for nested_path, s in self.sf.tables.items():
        for cname, cols in s.schema.items():
            if not any(
                    startswith_field(cname, c.names[c.nested_path[0]])
                    for n, cc in active_columns.items() for c in cc):
                for c in cols:
                    if c.type in STRUCT:
                        continue
                    nest = c.nested_path[0]
                    active = active_columns.get(nest)
                    if not active:
                        active = active_columns[nest] = []
                    active.append(c)

    # ANY VARS MENTIONED WITH NO COLUMNS?
    for v in vars_:
        if not any(startswith_field(cname, v) for cname in schema.keys()):
            active_columns["."].append(Column(
                names={".": v},
                type="null",
                es_column=".",
                es_index=".",
                nested_path=["."]
            ))

    # EVERY COLUMN, AND THE INDEX IT TAKES UP
    index_to_column = {}  # MAP FROM INDEX TO COLUMN (OR SELECT CLAUSE)
    index_to_uid = {}  # FROM NESTED PATH TO THE INDEX OF UID
    sql_selects = []  # EVERY SELECT CLAUSE (NOT TO BE USED ON ALL TABLES, OF COURSE)
    nest_to_alias = {
        nested_path: "__" + unichr(ord('a') + i) + "__"
        for i, (nested_path, sub_table) in enumerate(self.sf.tables.items())
    }

    # SORT COLUMNS ARE SELECTED FIRST SO ORDER BY CAN REFERENCE THEM BY ALIAS
    sorts = []
    if query.sort:
        for s in query.sort:
            col = s.value.to_sql(schema)[0]
            for t, sql in col.sql.items():
                json_type = sql_type_to_json_type[t]
                if json_type in STRUCT:
                    continue
                column_number = len(sql_selects)
                # SQL HAS ABS TABLE REFERENCE
                column_alias = _make_column_name(column_number)
                sql_selects.append(sql + " AS " + column_alias)
                if s.sort == -1:
                    sorts.append(column_alias + " IS NOT NULL")
                    sorts.append(column_alias + " DESC")
                else:
                    sorts.append(column_alias + " IS NULL")
                    sorts.append(column_alias)

    selects = []
    primary_doc_details = Data()
    # EVERY SELECT STATEMENT THAT WILL BE REQUIRED, NO MATTER THE DEPTH
    # WE WILL CREATE THEM ACCORDING TO THE DEPTH REQUIRED
    for nested_path, sub_table in self.sf.tables.items():
        nested_doc_details = {
            "sub_table": sub_table,
            "children": [],
            "index_to_column": {},
            "nested_path": [nested_path]  # fake the real nested path, we only look at [0] anyway
        }

        # INSERT INTO TREE
        if not primary_doc_details:
            primary_doc_details = nested_doc_details
        else:
            def place(parent_doc_details):
                # DESCEND UNTIL NO CHILD ACCEPTS THE PATH, THEN ATTACH HERE
                if startswith_field(nested_path, parent_doc_details['nested_path'][0]):
                    for c in parent_doc_details['children']:
                        if place(c):
                            return True
                    parent_doc_details['children'].append(nested_doc_details)

            place(primary_doc_details)

        alias = nested_doc_details['alias'] = nest_to_alias[nested_path]

        # THE GUID (_id) IS ONLY EXPOSED AT THE FACT LEVEL, AND ONLY ON REQUEST
        if nested_path == "." and quoted_GUID in vars_:
            column_number = index_to_uid[nested_path] = nested_doc_details['id_coord'] = len(sql_selects)
            sql_select = alias + "." + quoted_GUID
            sql_selects.append(sql_select + " AS " + _make_column_name(column_number))
            index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = ColumnMapping(
                push_name="_id",
                push_column_name="_id",
                push_column=0,
                push_child=".",
                sql=sql_select,
                pull=get_column(column_number),
                type="string",
                column_alias=_make_column_name(column_number),
                nested_path=[nested_path]  # fake the real nested path, we only look at [0] anyway
            )
            query.select = [s for s in listwrap(query.select) if s.name != "_id"]

        # WE ALWAYS ADD THE UID AND ORDER
        column_number = index_to_uid[nested_path] = nested_doc_details['id_coord'] = len(sql_selects)
        sql_select = alias + "." + quoted_UID
        sql_selects.append(sql_select + " AS " + _make_column_name(column_number))
        if nested_path != ".":
            index_to_column[column_number] = ColumnMapping(
                sql=sql_select,
                type="number",
                nested_path=[nested_path],  # fake the real nested path, we only look at [0] anyway
                column_alias=_make_column_name(column_number)
            )
            column_number = len(sql_selects)
            sql_select = alias + "." + quote_table(ORDER)
            sql_selects.append(sql_select + " AS " + _make_column_name(column_number))
            index_to_column[column_number] = ColumnMapping(
                sql=sql_select,
                type="number",
                nested_path=[nested_path],  # fake the real nested path, we only look at [0] anyway
                column_alias=_make_column_name(column_number)
            )

        # WE DO NOT NEED DATA FROM TABLES WE REQUEST NOTHING FROM
        if nested_path not in active_columns:
            continue

        if len(active_columns[nested_path]) != 0:
            # ADD SQL SELECT COLUMNS FOR EACH jx SELECT CLAUSE
            si = 0
            for s in listwrap(query.select):
                try:
                    column_number = len(sql_selects)
                    s.pull = get_column(column_number)
                    db_columns = s.value.to_sql(schema)

                    if isinstance(s.value, LeavesOp):
                        # LEAVES EXPAND TO MANY PUSHED PROPERTIES
                        for column in db_columns:
                            if isinstance(column.nested_path, list):
                                column.nested_path = column.nested_path[0]
                            if column.nested_path and column.nested_path != nested_path:
                                continue
                            for t, unsorted_sql in column.sql.items():
                                json_type = sql_type_to_json_type[t]
                                if json_type in STRUCT:
                                    continue
                                column_number = len(sql_selects)
                                # SQL HAS ABS TABLE REFERENCE
                                column_alias = _make_column_name(column_number)
                                if concat_field(alias, unsorted_sql) in selects and len(unsorted_sql.split()) == 1:
                                    continue
                                selects.append(concat_field(alias, unsorted_sql))
                                sql_selects.append(alias + "." + unsorted_sql + " AS " + column_alias)
                                index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = ColumnMapping(
                                    push_name=literal_field(get_property_name(concat_field(s.name, column.name))),
                                    push_column_name=get_property_name(concat_field(s.name, column.name)),
                                    push_column=si,
                                    push_child=".",
                                    pull=get_column(column_number),
                                    sql=unsorted_sql,
                                    type=json_type,
                                    column_alias=column_alias,
                                    nested_path=[nested_path]  # fake the real nested path, we only look at [0] anyway
                                )
                                si += 1
                    else:
                        for column in db_columns:
                            if isinstance(column.nested_path, list):
                                column.nested_path = column.nested_path[0]
                            if column.nested_path and column.nested_path != nested_path:
                                continue
                            for t, unsorted_sql in column.sql.items():
                                json_type = sql_type_to_json_type[t]
                                if json_type in STRUCT:
                                    continue
                                column_number = len(sql_selects)
                                # SQL HAS ABS TABLE REFERENCE
                                column_alias = _make_column_name(column_number)
                                if concat_field(alias, unsorted_sql) in selects and len(unsorted_sql.split()) == 1:
                                    continue
                                selects.append(concat_field(alias, unsorted_sql))
                                sql_selects.append(alias + "." + unsorted_sql + " AS " + column_alias)
                                index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = ColumnMapping(
                                    push_name=s.name,
                                    push_column_name=s.name,
                                    push_column=si,
                                    push_child=column.name,
                                    pull=get_column(column_number),
                                    sql=unsorted_sql,
                                    type=json_type,
                                    column_alias=column_alias,
                                    nested_path=[nested_path]  # fake the real nested path, we only look at [0] anyway
                                )
                finally:
                    si += 1
        elif startswith_field(nested_path, primary_nested_path):
            # ADD REQUIRED COLUMNS, FOR DEEP STUFF
            # NOTE(review): `s` and `si` here are leftovers from the select
            # loop above (or a previous table iteration) — confirm intended
            for ci, c in enumerate(active_columns[nested_path]):
                if c.type in STRUCT:
                    continue
                column_number = len(sql_selects)
                nested_path = c.nested_path
                unsorted_sql = nest_to_alias[nested_path[0]] + "." + quote_table(c.es_column)
                column_alias = _make_column_name(column_number)
                if concat_field(alias, unsorted_sql) in selects and len(unsorted_sql.split()) == 1:
                    continue
                selects.append(concat_field(alias, unsorted_sql))
                sql_selects.append(alias + "." + unsorted_sql + " AS " + column_alias)
                index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = ColumnMapping(
                    push_name=s.name,
                    push_column_name=s.name,
                    push_column=si,
                    push_child=relative_field(c.names["."], s.name),
                    pull=get_column(column_number),
                    sql=unsorted_sql,
                    type=c.type,
                    column_alias=column_alias,
                    nested_path=nested_path
                )

    where_clause = query.where.to_sql(schema, boolean=True)[0].sql.b
    unsorted_sql = self._make_sql_for_one_nest_in_set_op(
        ".",
        sql_selects,
        where_clause,
        active_columns,
        index_to_column
    )

    # FINAL SORT: BY EACH LEVEL'S UID, SO NESTED ROWS ARRIVE GROUPED
    for n, _ in self.sf.tables.items():
        sorts.append(COLUMN + text_type(index_to_uid[n]))

    ordered_sql = (
        "SELECT * FROM (\n" + unsorted_sql + "\n)" +
        "\nORDER BY\n" + ",\n".join(sorts) +
        "\nLIMIT " + quote_value(query.limit)
    )
    self.db.create_new_functions()  # creating new functions: regexp
    result = self.db.query(ordered_sql)

    def _accumulate_nested(rows, row, nested_doc_details, parent_doc_id, parent_id_coord):
        """
        :param rows: REVERSED STACK OF ROWS (WITH push() AND pop())
        :param row: CURRENT ROW BEING EXTRACTED
        :param nested_doc_details: {
                "nested_path": wrap_nested_path(nested_path),
                "index_to_column": map from column number to column details
                "children": all possible direct decedents' nested_doc_details
             }
        :param parent_doc_id: the id of the parent doc (for detecting when to step out of loop)
        :param parent_id_coord: the column number for the parent id (so we ca extract from each row)
        :return: the nested property (usually an array)
        """
        previous_doc_id = None
        doc = Null
        output = []
        id_coord = nested_doc_details['id_coord']

        while True:
            doc_id = row[id_coord]

            if doc_id == None or (parent_id_coord is not None and row[parent_id_coord] != parent_doc_id):
                rows.append(row)  # UNDO PREVIOUS POP (RECORD IS NOT A NESTED RECORD OF parent_doc)
                return output

            if doc_id != previous_doc_id:
                # NEW DOCUMENT AT THIS LEVEL: PULL ITS SCALAR PROPERTIES
                previous_doc_id = doc_id
                doc = Null
                curr_nested_path = nested_doc_details['nested_path'][0]
                index_to_column = nested_doc_details['index_to_column'].items()
                if index_to_column:
                    for i, c in index_to_column:
                        value = row[i]
                        if value == None:
                            continue
                        if value == '':
                            continue
                        if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp):
                            # ASSIGN INNER PROPERTIES
                            relative_path = join_field([c.push_name] + split_field(c.push_child))
                        else:
                            # FACT IS EXPECTED TO BE A SINGLE VALUE, NOT AN OBJECT
                            relative_path = c.push_child

                        if relative_path == ".":
                            doc = value
                        elif doc is Null:
                            doc = Data()
                            doc[relative_path] = value
                        else:
                            doc[relative_path] = value

            for child_details in nested_doc_details['children']:
                # EACH NESTED TABLE MUST BE ASSEMBLED INTO A LIST OF OBJECTS
                child_id = row[child_details['id_coord']]
                if child_id is not None:
                    nested_value = _accumulate_nested(rows, row, child_details, doc_id, id_coord)
                    if nested_value:
                        push_name = child_details['nested_path'][0]
                        if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp):
                            # ASSIGN INNER PROPERTIES
                            relative_path = relative_field(push_name, curr_nested_path)
                        else:
                            # FACT IS EXPECTED TO BE A SINGLE VALUE, NOT AN OBJECT
                            relative_path = "."

                        if relative_path == "." and doc is Null:
                            doc = nested_value
                        elif relative_path == ".":
                            doc[push_name] = unwraplist([v[push_name] for v in nested_value])
                        elif doc is Null:
                            doc = Data()
                            doc[relative_path] = unwraplist(nested_value)
                        else:
                            doc[relative_path] = unwraplist(nested_value)

            output.append(doc)

            try:
                row = rows.pop()
            except IndexError:
                return output

    cols = tuple([i for i in index_to_column.values() if i.push_name != None])
    rows = list(reversed(unwrap(result.data)))
    if rows:
        row = rows.pop()
        data = _accumulate_nested(rows, row, primary_doc_details, None, None)
    else:
        data = result.data

    if query.format == "cube":
        for f, _ in self.sf.tables.items():
            if frum.endswith(f) or (test_dots(cols) and isinstance(query.select, list)):
                num_rows = len(result.data)
                num_cols = MAX([c.push_column for c in cols]) + 1 if len(cols) else 0
                map_index_to_name = {c.push_column: c.push_column_name for c in cols}
                temp_data = [[None] * num_rows for _ in range(num_cols)]
                for rownum, d in enumerate(result.data):
                    for c in cols:
                        if c.push_child == ".":
                            temp_data[c.push_column][rownum] = c.pull(d)
                        else:
                            column = temp_data[c.push_column][rownum]
                            if column is None:
                                column = temp_data[c.push_column][rownum] = Data()
                            column[c.push_child] = c.pull(d)
                output = Data(
                    meta={"format": "cube"},
                    data={n: temp_data[c] for c, n in map_index_to_name.items()},
                    edges=[{
                        "name": "rownum",
                        "domain": {
                            "type": "rownum",
                            "min": 0,
                            "max": num_rows,
                            "interval": 1
                        }
                    }]
                )
                return output

        if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp):
            num_rows = len(data)
            map_index_to_name = {c.push_column: c.push_column_name for c in cols}
            temp_data = Data()
            for rownum, d in enumerate(data):
                for k, v in d.items():
                    if temp_data[k] == None:
                        temp_data[k] = [None] * num_rows
                    temp_data[k][rownum] = v
            return Data(
                meta={"format": "cube"},
                data={n: temp_data[literal_field(n)] for c, n in map_index_to_name.items()},
                edges=[{
                    "name": "rownum",
                    "domain": {
                        "type": "rownum",
                        "min": 0,
                        "max": num_rows,
                        "interval": 1
                    }
                }]
            )
        else:
            num_rows = len(data)
            map_index_to_name = {c.push_column: c.push_column_name for c in cols}
            temp_data = [data]
            return Data(
                meta={"format": "cube"},
                data={n: temp_data[c] for c, n in map_index_to_name.items()},
                edges=[{
                    "name": "rownum",
                    "domain": {
                        "type": "rownum",
                        "min": 0,
                        "max": num_rows,
                        "interval": 1
                    }
                }]
            )
    elif query.format == "table":
        for f, _ in self.sf.tables.items():
            if frum.endswith(f):
                num_column = MAX([c.push_column for c in cols]) + 1
                header = [None] * num_column
                for c in cols:
                    header[c.push_column] = c.push_column_name
                output_data = []
                for d in result.data:
                    row = [None] * num_column
                    for c in cols:
                        set_column(row, c.push_column, c.push_child, c.pull(d))
                    output_data.append(row)
                return Data(
                    meta={"format": "table"},
                    header=header,
                    data=output_data
                )
        if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp):
            num_rows = len(data)
            column_names = [None] * (max(c.push_column for c in cols) + 1)
            for c in cols:
                column_names[c.push_column] = c.push_column_name
            temp_data = []
            for rownum, d in enumerate(data):
                row = [None] * len(column_names)
                for i, (k, v) in enumerate(sorted(d.items())):
                    for c in cols:
                        if k == c.push_name:
                            row[c.push_column] = v
                temp_data.append(row)
            return Data(
                meta={"format": "table"},
                header=column_names,
                data=temp_data
            )
        else:
            column_names = listwrap(query.select).name
            return Data(
                meta={"format": "table"},
                header=column_names,
                data=[[d] for d in data]
            )
    else:
        # DEFAULT: "list" FORMAT
        for f, _ in self.sf.tables.items():
            if frum.endswith(f) or (test_dots(cols) and isinstance(query.select, list)):
                data = []
                for d in result.data:
                    row = Data()
                    for c in cols:
                        if c.push_child == ".":
                            row[c.push_name] = c.pull(d)
                        elif c.num_push_columns:
                            tuple_value = row[c.push_name]
                            if not tuple_value:
                                tuple_value = row[c.push_name] = [None] * c.num_push_columns
                            tuple_value[c.push_child] = c.pull(d)
                        elif not isinstance(query.select, list):  # select is value type
                            row[c.push_child] = c.pull(d)
                        else:
                            row[c.push_name][c.push_child] = c.pull(d)
                    data.append(row)
                return Data(
                    meta={"format": "list"},
                    data=data
                )

        if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp):
            temp_data = []
            for rownum, d in enumerate(data):
                row = {}
                for k, v in d.items():
                    for c in cols:
                        if c.push_name == c.push_column_name == k:
                            row[c.push_column_name] = v
                        elif c.push_name == k and c.push_column_name != k:
                            row[c.push_column_name] = v
                temp_data.append(row)
            return Data(
                meta={"format": "list"},
                data=temp_data
            )
        else:
            return Data(
                meta={"format": "list"},
                data=data
            )
def _set_op(self, query, frum):
    """
    Execute a jx "set" query (no edges, no aggregates): build one SQL SELECT
    over the fact table and its nested sub-tables, run it, and re-assemble the
    flat rows back into (possibly nested) documents.

    :param query: normalized jx QueryOp (select/where/sort/limit/format)
    :param frum: dotted table path; everything after the first segment is the
                 nested path inside the fact table
    :return: Data with meta.format "cube", "table", or "list"
    """
    # GET LIST OF COLUMNS
    primary_nested_path = join_field(split_field(frum)[1:])
    vars_ = UNION([s.value.vars() for s in listwrap(query.select)])
    # ONE SHORT SQL ALIAS (__a__, __b__, ...) PER NESTED TABLE
    nest_to_alias = {
        nested_path: "__" + unichr(ord('a') + i) + "__"
        for i, (nested_path, sub_table) in enumerate(self.nested_tables.items())
    }

    # GROUP THE COLUMNS MENTIONED BY THE QUERY BY THE NESTED TABLE THEY LIVE IN
    active_columns = {".": []}
    for cname, cols in self.columns.items():
        if any(startswith_field(cname, v) for v in vars_):
            for c in cols:
                if c.type in STRUCT:
                    continue
                nest = c.nested_path[0]
                active = active_columns.get(nest)
                if not active:
                    active = active_columns[nest] = []
                active.append(c)

    # ANY VARS MENTIONED WITH NO COLUMNS?
    # REPRESENT THEM AS NULL COLUMNS SO SELECTS STILL LINE UP
    for v in vars_:
        if not any(startswith_field(cname, v) for cname in self.columns.keys()):
            active_columns["."].append(Column(
                names={self.name: v},
                type="null",
                es_column=".",
                es_index=".",
                nested_path=["."]
            ))

    # EVERY COLUMN, AND THE INDEX IT TAKES UP
    index_to_column = {}  # MAP FROM INDEX TO COLUMN (OR SELECT CLAUSE)
    index_to_uid = {}  # FROM NESTED PATH TO THE INDEX OF UID
    sql_selects = []  # EVERY SELECT CLAUSE (NOT TO BE USED ON ALL TABLES, OF COURSE)
    # NOTE(review): nest_to_alias is rebuilt identically here — redundant with the copy above
    nest_to_alias = {
        nested_path: "__" + unichr(ord('a') + i) + "__"
        for i, (nested_path, sub_table) in enumerate(self.nested_tables.items())
    }

    # SORT COLUMNS ARE SELECTED FIRST, SO THEY GET THE LOWEST COLUMN NUMBERS
    sorts = []
    if query.sort:
        for s in query.sort:
            col = s.value.to_sql(self)[0]
            for t, sql in col.sql.items():
                json_type = sql_type_to_json_type[t]
                if json_type in STRUCT:
                    continue
                column_number = len(sql_selects)
                # SQL HAS ABS TABLE REFERENCE
                column_alias = _make_column_name(column_number)
                sql_selects.append(sql + " AS " + column_alias)
                if s.sort == -1:
                    # DESCENDING: NULLS LAST
                    sorts.append(column_alias + " IS NOT NULL")
                    sorts.append(column_alias + " DESC")
                else:
                    # ASCENDING: NULLS FIRST
                    sorts.append(column_alias + " IS NULL")
                    sorts.append(column_alias)

    primary_doc_details = Data()
    # EVERY SELECT STATEMENT THAT WILL BE REQUIRED, NO MATTER THE DEPTH
    # WE WILL CREATE THEM ACCORDING TO THE DEPTH REQUIRED
    for nested_path, sub_table in self.nested_tables.items():
        nested_doc_details = {
            "sub_table": sub_table,
            "children": [],
            "index_to_column": {},
            "nested_path": [nested_path]  # fake the real nested path, we only look at [0] anyway
        }

        # INSERT INTO TREE (PARENT PATHS WERE EMITTED FIRST, SO THE PARENT EXISTS)
        if not primary_doc_details:
            primary_doc_details = nested_doc_details
        else:
            def place(parent_doc_details):
                # RECURSIVELY FIND THE DEEPEST ANCESTOR OF nested_path
                if startswith_field(nested_path, parent_doc_details['nested_path'][0]):
                    for c in parent_doc_details['children']:
                        if place(c):
                            return True
                    parent_doc_details['children'].append(nested_doc_details)

            place(primary_doc_details)

        alias = nested_doc_details['alias'] = nest_to_alias[nested_path]

        # WE ALWAYS ADD THE UID AND ORDER
        column_number = index_to_uid[nested_path] = nested_doc_details['id_coord'] = len(sql_selects)
        sql_select = alias + "." + quoted_UID
        sql_selects.append(sql_select + " AS " + _make_column_name(column_number))
        if nested_path != ".":
            # NOTE(review): the ORDER select reuses the UID's column_number alias — verify intended
            sql_select = alias + "." + quote_table(ORDER)
            sql_selects.append(sql_select + " AS " + _make_column_name(column_number))

        # WE DO NOT NEED DATA FROM TABLES WE REQUEST NOTHING FROM
        if nested_path not in active_columns:
            continue

        if primary_nested_path == nested_path:
            # ADD SQL SELECT COLUMNS FOR EACH jx SELECT CLAUSE
            si = 0
            for s in listwrap(query.select):
                try:
                    column_number = len(sql_selects)
                    s.pull = get_column(column_number)
                    db_columns = s.value.to_sql(self)

                    if isinstance(s.value, LeavesOp):
                        # EACH LEAF BECOMES ITS OWN PUSH COLUMN
                        for column in db_columns:
                            for t, unsorted_sql in column.sql.items():
                                json_type = sql_type_to_json_type[t]
                                if json_type in STRUCT:
                                    continue
                                column_number = len(sql_selects)
                                # SQL HAS ABS TABLE REFERENCE
                                column_alias = _make_column_name(column_number)
                                sql_selects.append(unsorted_sql + " AS " + column_alias)
                                index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = Data(
                                    push_name=concat_field(s.name, column.name),
                                    push_column=si,
                                    push_child=".",
                                    pull=get_column(column_number),
                                    sql=unsorted_sql,
                                    type=json_type,
                                    nested_path=[nested_path]  # fake the real nested path, we only look at [0] anyway
                                )
                                si += 1
                    else:
                        # ONE PUSH COLUMN, POSSIBLY WITH MANY TYPED SQL VARIANTS AS CHILDREN
                        for column in db_columns:
                            for t, unsorted_sql in column.sql.items():
                                json_type = sql_type_to_json_type[t]
                                if json_type in STRUCT:
                                    continue
                                column_number = len(sql_selects)
                                # SQL HAS ABS TABLE REFERENCE
                                column_alias = _make_column_name(column_number)
                                sql_selects.append(unsorted_sql + " AS " + column_alias)
                                index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = Data(
                                    push_name=s.name,
                                    push_column=si,
                                    push_child=column.name,
                                    pull=get_column(column_number),
                                    sql=unsorted_sql,
                                    type=json_type,
                                    nested_path=[nested_path]  # fake the real nested path, we only look at [0] anyway
                                )
                finally:
                    # NOTE(review): LeavesOp branch ALSO increments si inside its loop,
                    # so si advances once more here — confirm double-increment is intended
                    si += 1
        elif startswith_field(nested_path, primary_nested_path):
            # ADD REQUIRED COLUMNS, FOR DEEP STUFF
            # NOTE(review): `s` and `si` here are leftovers from the loop above — verify intended
            for ci, c in enumerate(active_columns[nested_path]):
                if c.type in STRUCT:
                    continue
                column_number = len(sql_selects)
                nested_path = c.nested_path  # rebinds the loop variable to the column's path list
                unsorted_sql = nest_to_alias[nested_path[0]] + "." + quote_table(c.es_column)
                column_alias = _make_column_name(column_number)
                sql_selects.append(unsorted_sql + " AS " + column_alias)
                index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = Data(
                    push_name=s.name,
                    push_column=si,
                    push_child=relative_field(c.name, s.name),
                    pull=get_column(column_number),
                    sql=unsorted_sql,
                    type=c.type,
                    nested_path=nested_path
                )

    where_clause = query.where.to_sql(self, boolean=True)[0].sql.b
    unsorted_sql = self._make_sql_for_one_nest_in_set_op(
        ".", sql_selects, where_clause, active_columns, index_to_column)

    # ALWAYS SORT BY UID OF EACH NESTED TABLE, SO PARENT/CHILD ROWS ARE ADJACENT
    for n, _ in self.nested_tables.items():
        sorts.append(COLUMN + unicode(index_to_uid[n]))

    ordered_sql = (
        "SELECT * FROM (\n" + unsorted_sql + "\n)" +
        "\nORDER BY\n" + ",\n".join(sorts) +
        "\nLIMIT " + quote_value(query.limit)
    )
    result = self.db.query(ordered_sql)

    def _accumulate_nested(rows, row, nested_doc_details, parent_doc_id, parent_id_coord):
        """
        :param rows: REVERSED STACK OF ROWS (WITH push() AND pop())
        :param row: CURRENT ROW BEING EXTRACTED
        :param nested_doc_details: {
            "nested_path": wrap_nested_path(nested_path),
            "index_to_column": map from column number to column details
            "children": all possible direct decedents' nested_doc_details
        }
        :param parent_doc_id: the id of the parent doc (for detecting when to step out of loop)
        :param parent_id_coord: the column number for the parent id (so we ca extract from each row)
        :return: the nested property (usually an array)
        """
        previous_doc_id = None
        doc = Data()
        output = []
        id_coord = nested_doc_details['id_coord']

        while True:
            doc_id = row[id_coord]

            if doc_id == None or (parent_id_coord is not None and row[parent_id_coord] != parent_doc_id):
                rows.append(row)  # UNDO: this row belongs to a different parent
                output = unwraplist(output)
                return output if output else None

            if doc_id != previous_doc_id:
                # FIRST ROW FOR A NEW DOCUMENT: POPULATE ITS SCALAR PROPERTIES
                previous_doc_id = doc_id
                doc = Data()
                curr_nested_path = nested_doc_details['nested_path'][0]
                if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp):
                    # ASSIGN INNER PROPERTIES
                    for i, c in nested_doc_details['index_to_column'].items():
                        value = row[i]
                        if value == None:
                            continue
                        if value == '':
                            continue
                        relative_path = relative_field(
                            concat_field(c.push_name, c.push_child),
                            curr_nested_path
                        )
                        if relative_path == ".":
                            doc = value
                        else:
                            doc[relative_path] = value
                else:
                    # ASSIGN INNER PROPERTIES
                    for i, c in nested_doc_details['index_to_column'].items():
                        value = row[i]
                        if value is not None:
                            relative_path = relative_field(c.push_child, curr_nested_path)
                            if relative_path == ".":
                                doc = value
                            else:
                                doc[relative_path] = value
                output.append(doc)

            # ASSIGN NESTED ARRAYS (RECURSE INTO CHILD TABLES ON THE SAME ROW)
            for child_details in nested_doc_details['children']:
                child_id = row[child_details['id_coord']]
                if child_id is not None:
                    nested_value = _accumulate_nested(rows, row, child_details, doc_id, id_coord)
                    if nested_value is not None:
                        path = child_details['nested_path'][0]
                        doc[path] = nested_value

            try:
                row = rows.pop()
            except IndexError:
                # NO MORE ROWS: FLUSH WHAT WE HAVE
                output = unwraplist(output)
                return output if output else None

    cols = tuple(index_to_column.values())

    if query.format == "cube":
        # ONE "rownum" EDGE; DATA IS COLUMN-MAJOR
        num_rows = len(result.data)
        num_cols = MAX([c.push_column for c in cols]) + 1 if len(cols) else 0
        map_index_to_name = {c.push_column: c.push_name for c in cols}
        temp_data = [[None] * num_rows for _ in range(num_cols)]
        for rownum, d in enumerate(result.data):
            for c in cols:
                if c.push_child == ".":
                    temp_data[c.push_column][rownum] = c.pull(d)
                else:
                    column = temp_data[c.push_column][rownum]
                    if column is None:
                        column = temp_data[c.push_column][rownum] = {}
                    column[c.push_child] = c.pull(d)
        output = Data(
            meta={"format": "cube"},
            data={n: temp_data[c] for c, n in map_index_to_name.items()},
            edges=[{
                "name": "rownum",
                "domain": {
                    "type": "rownum",
                    "min": 0,
                    "max": num_rows,
                    "interval": 1
                }
            }])
        return output
    elif query.format == "table":
        num_column = MAX([c.push_column for c in cols]) + 1
        header = [None] * num_column
        for c in cols:
            # header[c.push_column] = c.push_name
            sf = split_field(c.push_name)
            if len(sf) == 0:
                header[c.push_column] = "."
            elif len(sf) == 1:
                header[c.push_column] = sf[0]
            else:
                # TABLES ONLY USE THE FIRST-LEVEL PROPERTY NAMES
                # PUSH ALL DEEPER NAMES TO CHILD
                header[c.push_column] = sf[0]
                c.push_child = join_field(sf[1:] + split_field(c.push_child))

        output_data = []
        for d in result.data:
            row = [None] * num_column
            for c in cols:
                set_column(row, c.push_column, c.push_child, c.pull(d))
            output_data.append(row)
        return Data(meta={"format": "table"}, header=header, data=output_data)
    else:
        # "list" (DEFAULT): REBUILD NESTED DOCUMENTS FROM THE SORTED ROWS
        rows = list(reversed(unwrap(result.data)))
        row = rows.pop()
        output = Data(
            meta={"format": "list"},
            data=listwrap(_accumulate_nested(rows, row, primary_doc_details, None, None)))
        return output
def _groupby_op(self, query, frum):
    """
    Build (do not run) the SQL for a jx groupby query over the fact table and
    its nested sub-tables.

    :param query: normalized jx QueryOp with groupby/select/where/sort
    :param frum: dotted table path; segments after the first are the nested path
    :return: (sql_command, index_to_column) where index_to_column maps each
             result-column index to its ColumnMapping (push/pull instructions)
    """
    schema = self.sf.tables[join_field(split_field(frum)[1:])].schema
    index_to_column = {}
    # ONE SHORT SQL ALIAS (__a__, __b__, ...) PER NESTED TABLE
    nest_to_alias = {
        nested_path: "__" + unichr(ord('a') + i) + "__"
        for i, (nested_path, sub_table) in enumerate(self.sf.tables.items())
    }
    frum_path = split_field(frum)
    base_table = join_field(frum_path[0:1])
    path = join_field(frum_path[1:])

    # COLLECT THE CHAIN OF TABLES FROM FACT DOWN TO THE QUERIED NESTED PATH
    tables = []
    for n, a in nest_to_alias.items():
        if startswith_field(path, n):
            tables.append({"nest": n, "alias": a})
    tables = jx.sort(tables, {"value": {"length": "nest"}})  # shallowest first

    # LEFT JOIN EACH CHILD TABLE TO ITS PARENT VIA PARENT/UID
    from_sql = join_field([base_table] + split_field(tables[0].nest)) + " " + tables[0].alias
    previous = tables[0]
    for t in tables[1::]:
        from_sql += (
            "\nLEFT JOIN\n" + quote_table(concat_field(base_table, t.nest)) + " " + t.alias +
            " ON " + t.alias + "." + PARENT + " = " + previous.alias + "." + UID
        )

    selects = []
    groupby = []
    for i, e in enumerate(query.groupby):
        for s in e.value.to_sql(schema):
            column_number = len(selects)
            sql_type, sql = s.sql.items()[0]
            # 'NULL' SQL WITH NO MATCHING COLUMN MEANS THE VARIABLE DOES NOT EXIST
            if sql == 'NULL' and not e.value.var in schema.keys():
                Log.error("No such column {{var}}", var=e.value.var)

            column_alias = _make_column_name(column_number)
            groupby.append(sql)
            selects.append(sql + " AS " + column_alias)
            if s.nested_path == ".":
                select_name = s.name
            else:
                select_name = "."
            index_to_column[column_number] = ColumnMapping(
                is_edge=True,
                push_name=e.name,
                push_column_name=e.name.replace("\\.", "."),
                push_column=i,
                push_child=select_name,
                pull=get_column(column_number),
                sql=sql,
                column_alias=column_alias,
                type=sql_type_to_json_type[sql_type]
            )

    for i, s in enumerate(listwrap(query.select)):
        column_number = len(selects)
        sql_type, sql = s.value.to_sql(schema)[0].sql.items()[0]
        if sql == 'NULL' and not s.value.var in schema.keys():
            Log.error("No such column {{var}}", var=s.value.var)

        if s.value == "." and s.aggregate == "count":
            # count(.) IS A PLAIN ROW COUNT
            selects.append("COUNT(1) AS " + quote_table(s.name))
        else:
            selects.append(sql_aggs[s.aggregate] + "(" + sql + ") AS " + quote_table(s.name))
        index_to_column[column_number] = ColumnMapping(
            push_name=s.name,
            push_column_name=s.name,
            push_column=i + len(query.groupby),  # selects come after the groupby columns
            push_child=".",
            pull=get_column(column_number),
            sql=sql,
            column_alias=quote_table(s.name),
            type=sql_type_to_json_type[sql_type]
        )

    for w in query.window:
        selects.append(self._window_op(self, query, w))

    where = query.where.to_sql(schema)[0].sql.b

    command = "SELECT\n" + (",\n".join(selects)) + \
              "\nFROM\n" + from_sql + \
              "\nWHERE\n" + where + \
              "\nGROUP BY\n" + ",\n".join(groupby)

    if query.sort:
        # EACH SORT EMITS "<expr> IS NULL, <expr> [DESC]" FOR EVERY TYPED VARIANT (b, n, s)
        command += "\nORDER BY " + ",\n".join(
            "(" + sql[t] + ") IS NULL" + ",\n" + sql[t] + (" DESC" if s.sort == -1 else "")
            for s, sql in [(s, s.value.to_sql(schema)[0].sql) for s in query.sort]
            for t in "bns"
            if sql[t]
        )

    return command, index_to_column
def query(self, query):
    """
    Execute a JSON Query Expression against this table.

    :param query: JSON Query Expression, SET `format="container"` TO MAKE NEW TABLE OF RESULT
    :return: Data with meta.format "cube"/"table"/"list"/"value", or a new
             QueryTable when format=="container"
    """
    if not startswith_field(query['from'], self.sf.fact):
        Log.error("Expecting table, or some nested table")
    frum, query['from'] = query['from'], self
    schema = self.sf.tables["."].schema

    if not query.groupby:
        query = QueryOp.wrap(query, schema)

    # TYPE CONFLICTS MUST NOW BE RESOLVED DURING
    # TYPE-SPECIFIC QUERY NORMALIZATION
    # vars_ = query.vars(exclude_select=True)
    # type_map = {
    #     v: c.es_column
    #     for v in vars_
    #     if v in self.columns and len([c for c in self.columns[v] if c.type != "nested"]) == 1
    #     for c in self.columns[v]
    #     if c.type != "nested"
    # }
    #
    # sql_query = query.map(type_map)
    query = query  # no-op, leftover from the disabled mapping above

    new_table = "temp_" + unique_name()

    if query.format == "container":
        create_table = "CREATE TABLE " + quote_table(new_table) + " AS "
    else:
        create_table = ""

    # DISPATCH TO THE SQL BUILDER FOR THIS QUERY SHAPE
    if query.groupby and query.format != "cube":
        query = QueryOp.wrap(query, schema)
        op, index_to_columns = self._groupby_op(query, frum)
        command = create_table + op
    elif query.groupby:
        # groupby + cube: TREAT THE GROUPBY AS EDGES, THEN SWAP BACK
        query.edges, query.groupby = query.groupby, query.edges
        query = QueryOp.wrap(query, schema)
        op, index_to_columns = self._edges_op(query, frum)
        command = create_table + op
        query.edges, query.groupby = query.groupby, query.edges
    elif query.edges or any(a != "none" for a in listwrap(query.select).aggregate):
        op, index_to_columns = self._edges_op(query, frum)
        command = create_table + op
    else:
        # PLAIN SET OPERATION: _set_op RUNS THE SQL ITSELF AND RETURNS FORMATTED OUTPUT
        op = self._set_op(query, frum)
        return op

    result = self.db.query(command)

    if query.format == "container":
        output = QueryTable(new_table, db=self.db, uid=self.uid, exists=True)
    elif query.format == "cube" or (not query.format and query.edges):
        column_names = [None] * (max(c.push_column for c in index_to_columns.values()) + 1)
        for c in index_to_columns.values():
            column_names[c.push_column] = c.push_column_name

        if len(query.edges) == 0 and len(query.groupby) == 0:
            # ZERO-DIMENSIONAL CUBE: SINGLE ROW OF AGGREGATES
            data = {n: Data() for n in column_names}
            for s in index_to_columns.values():
                data[s.push_name][s.push_child] = unwrap(s.pull(result.data[0]))
            if isinstance(query.select, list):
                select = [{"name": s.name} for s in query.select]
            else:
                select = {"name": query.select.name}
            return Data(data=unwrap(data), select=select, meta={"format": "cube"})

        if not result.data:
            # EMPTY RESULT: BUILD EMPTY (OR SINGLE-NULL) CUBE WITH THE RIGHT SHAPE
            edges = []
            dims = []
            for i, e in enumerate(query.edges + query.groupby):
                allowNulls = coalesce(e.allowNulls, True)
                if e.domain.type == "set" and e.domain.partitions:
                    domain = SimpleSetDomain(partitions=e.domain.partitions.name)
                elif e.domain.type == "range":
                    domain = e.domain
                elif isinstance(e.value, TupleOp):
                    pulls = jx.sort([
                        c
                        for c in index_to_columns.values()
                        if c.push_name == e.name
                    ], "push_child").pull
                    parts = [tuple(p(d) for p in pulls) for d in result.data]
                    domain = SimpleSetDomain(partitions=jx.sort(set(parts)))
                else:
                    domain = SimpleSetDomain(partitions=[])

                dims.append(1 if allowNulls else 0)
                edges.append(Data(name=e.name, allowNulls=allowNulls, domain=domain))

            data = {}
            for si, s in enumerate(listwrap(query.select)):
                if s.aggregate == "count":
                    data[s.name] = Matrix(dims=dims, zeros=0)
                else:
                    data[s.name] = Matrix(dims=dims)

            if isinstance(query.select, list):
                select = [{"name": s.name} for s in query.select]
            else:
                select = {"name": query.select.name}

            return Data(meta={"format": "cube"},
                        edges=edges,
                        select=select,
                        data={k: v.cube for k, v in data.items()})

        columns = None  # lazily-built transpose of result.data

        edges = []
        dims = []
        for g in query.groupby:
            g.is_groupby = True

        # BUILD THE DOMAIN (DISTINCT PARTS) FOR EACH EDGE
        for i, e in enumerate(query.edges + query.groupby):
            allowNulls = coalesce(e.allowNulls, True)
            if e.domain.type == "set" and e.domain.partitions:
                domain = SimpleSetDomain(partitions=e.domain.partitions.name)
            elif e.domain.type == "range":
                domain = e.domain
            elif e.domain.type == "time":
                domain = wrap(mo_json.scrub(e.domain))
            elif e.domain.type == "duration":
                domain = wrap(mo_json.scrub(e.domain))
            elif isinstance(e.value, TupleOp):
                pulls = jx.sort([
                    c
                    for c in index_to_columns.values()
                    if c.push_name == e.name
                ], "push_child").pull
                parts = [tuple(p(d) for p in pulls) for d in result.data]
                domain = SimpleSetDomain(partitions=jx.sort(set(parts)))
            else:
                # DEFAULT DOMAIN: DISTINCT VALUES SEEN IN THE RESULT COLUMN
                if not columns:
                    columns = zip(*result.data)
                parts = set(columns[i])
                if e.is_groupby and None in parts:
                    allowNulls = True
                parts -= {None}
                if query.sort[i].sort == -1:
                    domain = SimpleSetDomain(partitions=wrap(sorted(parts, reverse=True)))
                else:
                    domain = SimpleSetDomain(partitions=jx.sort(parts))

            dims.append(len(domain.partitions) + (1 if allowNulls else 0))
            edges.append(Data(name=e.name, allowNulls=allowNulls, domain=domain))

        data_cubes = {}
        for si, s in enumerate(listwrap(query.select)):
            if s.aggregate == "count":
                data_cubes[s.name] = Matrix(dims=dims, zeros=0)
            else:
                data_cubes[s.name] = Matrix(dims=dims)

        r2c = index_to_coordinate(dims)  # WORKS BECAUSE THE DATABASE SORTED THE EDGES TO CONFORM
        for rownum, row in enumerate(result.data):
            coord = r2c(rownum)

            for i, s in enumerate(index_to_columns.values()):
                if s.is_edge:
                    continue
                if s.push_child == ".":
                    data_cubes[s.push_name][coord] = s.pull(row)
                else:
                    data_cubes[s.push_name][coord][s.push_child] = s.pull(row)

        if isinstance(query.select, list):
            select = [{"name": s.name} for s in query.select]
        else:
            select = {"name": query.select.name}

        return Data(meta={"format": "cube"},
                    edges=edges,
                    select=select,
                    data={k: v.cube for k, v in data_cubes.items()})
    elif query.format == "table" or (not query.format and query.groupby):
        column_names = [None] * (max(c.push_column for c in index_to_columns.values()) + 1)
        for c in index_to_columns.values():
            column_names[c.push_column] = c.push_column_name
        data = []
        for d in result.data:
            row = [None for _ in column_names]
            for s in index_to_columns.values():
                if s.push_child == ".":
                    row[s.push_column] = s.pull(d)
                elif s.num_push_columns:
                    # MULTI-PART (TUPLE) COLUMN: push_child IS THE TUPLE INDEX
                    tuple_value = row[s.push_column]
                    if tuple_value == None:
                        tuple_value = row[s.push_column] = [None] * s.num_push_columns
                    tuple_value[s.push_child] = s.pull(d)
                elif row[s.push_column] == None:
                    row[s.push_column] = Data()
                    row[s.push_column][s.push_child] = s.pull(d)
                else:
                    row[s.push_column][s.push_child] = s.pull(d)
            data.append(tuple(unwrap(r) for r in row))

        output = Data(meta={"format": "table"}, header=column_names, data=data)
    elif query.format == "list" or (not query.edges and not query.groupby):
        if not query.edges and not query.groupby and any(listwrap(query.select).aggregate):
            # PURE AGGREGATES, NO EDGES: RETURN A SINGLE VALUE/OBJECT
            if isinstance(query.select, list):
                data = Data()
                for c in index_to_columns.values():
                    if c.push_child == ".":
                        if data[c.push_name] == None:
                            data[c.push_name] = c.pull(result.data[0])
                        elif isinstance(data[c.push_name], list):
                            data[c.push_name].append(c.pull(result.data[0]))
                        else:
                            data[c.push_name] = [
                                data[c.push_name],
                                c.pull(result.data[0])
                            ]
                    else:
                        data[c.push_name][c.push_child] = c.pull(result.data[0])

                output = Data(meta={"format": "value"}, data=data)
            else:
                data = Data()
                for s in index_to_columns.values():
                    if data[s.push_child] == None:
                        data[s.push_child] = s.pull(result.data[0])
                    else:
                        data[s.push_child] += [s.pull(result.data[0])]
                output = Data(meta={"format": "value"}, data=unwrap(data))
        else:
            data = []
            for rownum in result.data:  # NOTE(review): rownum is actually a row, not an index
                row = Data()
                for c in index_to_columns.values():
                    if c.push_child == ".":
                        row[c.push_name] = c.pull(rownum)
                    elif c.num_push_columns:
                        tuple_value = row[c.push_name]
                        if not tuple_value:
                            tuple_value = row[c.push_name] = [None] * c.num_push_columns
                        tuple_value[c.push_child] = c.pull(rownum)
                    else:
                        row[c.push_name][c.push_child] = c.pull(rownum)
                data.append(row)

            output = Data(meta={"format": "list"}, data=data)
    else:
        Log.error("unknown format {{format}}", format=query.format)

    return output
def delete(self, where):
    """
    Delete rows from the fact table.

    :param where: jx expression; its to_sql() result is appended verbatim
                  as the SQL WHERE clause
    """
    # renamed local (was `filter`) to stop shadowing the builtin
    sql_filter = where.to_sql()
    self.db.execute(
        "DELETE FROM " + quote_table(self.sf.fact) +
        " WHERE " + sql_filter
    )
on_clause = edge_alias + "." + domain_name + " < " + edge_values[1][1] + " AND " + \ edge_values[0][1] + " < (" + edge_alias + "." + domain_name + " + " + text_type( d.interval) + ")" not_on_clause = None else: Log.error("do not know how to handle") # select_clause.extend(v[0] + " " + k for k, v in zip(domain_names, edge_values)) elif len(edge_names) > 1: domain_names = [ "d" + text_type(edge_index) + "c" + text_type(i) for i, _ in enumerate(edge_names) ] query_edge.allowNulls = False domain_columns = [ c for c in self.sf.columns if quote_table(c.es_column) in vals ] if not domain_columns: domain_nested_path = "." Log.note("expecting a known column") else: domain_nested_path = domain_columns[0].nested_path domain_table = quote_table( concat_field(self.sf.fact, domain_nested_path[0])) domain = ("\nSELECT " + ",\n".join(g + " AS " + n for n, g in zip(domain_names, vals)) + "\nFROM\n" + domain_table + " " + nest_to_alias["."] + "\nGROUP BY\n" + ",\n".join(vals)) limit = Math.min(query.limit, query_edge.domain.limit)
def _edges_op(self, query, frum): index_to_column = {} # MAP FROM INDEX TO COLUMN (OR SELECT CLAUSE) outer_selects = [ ] # EVERY SELECT CLAUSE (NOT TO BE USED ON ALL TABLES, OF COURSE) frum_path = split_field(frum) base_table = join_field(frum_path[0:1]) path = join_field(frum_path[1:]) nest_to_alias = { nested_path: "__" + unichr(ord('a') + i) + "__" for i, (nested_path, sub_table) in enumerate(self.sf.tables.items()) } schema = self.sf.tables[relative_field(frum, self.sf.fact)].schema tables = [] for n, a in nest_to_alias.items(): if startswith_field(path, n): tables.append({"nest": n, "alias": a}) tables = jx.sort(tables, {"value": {"length": "nest"}}) from_sql = join_field( [base_table] + split_field(tables[0].nest)) + " " + tables[0].alias previous = tables[0] for t in tables[1::]: from_sql += ("\nLEFT JOIN\n" + quote_table(concat_field(base_table, t.nest)) + " " + t.alias + " ON " + t.alias + "." + PARENT + " = " + previous.alias + "." + UID) # SHIFT THE COLUMN DEFINITIONS BASED ON THE NESTED QUERY DEPTH ons = [] join_types = [] wheres = [] not_ons = ["__exists__ IS NULL"] groupby = [] not_groupby = [] orderby = [] domains = [] select_clause = [ "1 __exists__" # USED TO DISTINGUISH BETWEEN NULL-BECAUSE-LEFT-JOIN OR NULL-BECAUSE-NULL-VALUE ] for edge_index, query_edge in enumerate(query.edges): edge_alias = "e" + text_type(edge_index) if query_edge.value: edge_values = [ p for c in query_edge.value.to_sql(schema).sql for p in c.items() ] elif not query_edge.value and any( query_edge.domain.partitions.where): case = "CASE " for pp, p in enumerate(query_edge.domain.partitions): w = p.where.to_sql(schema)[0].sql.b t = quote_value(pp) case += " WHEN " + w + " THEN " + t case += " ELSE NULL END " # quote value with length of partitions edge_values = [("n", case)] elif query_edge.range: edge_values = query_edge.range.min.to_sql(schema)[0].sql.items( ) + query_edge.range.max.to_sql(schema)[0].sql.items()
def __init__(self, name, db=None, uid=GUID, exists=False, kwargs=None):
    """
    :param name: NAME FOR THIS TABLE
    :param db: THE DB TO USE
    :param uid: THE UNIQUE INDEX FOR THIS TABLE
    :param exists: True TO LOAD AN EXISTING TABLE'S COLUMNS INSTEAD OF CREATING IT
    :param kwargs: unused here; presumably consumed by a decorator — TODO confirm
    :return: HANDLE FOR TABLE IN db
    """
    global _config
    Container.__init__(self, frum=None)
    if db:
        self.db = db
    else:
        # NO DB GIVEN: CREATE A PRIVATE IN-PROCESS SQLITE
        self.db = db = Sqlite()

    if not _config:
        # REGISTER THIS DB AS THE MODULE-WIDE DEFAULT CONTAINER CONFIG (ONCE)
        from pyLibrary.queries.containers import config as _config

        if not _config.default:
            _config.default = {"type": "sqlite", "settings": {"db": db}}

    self.name = name
    self.uid = listwrap(uid)
    self._next_uid = 1  # next unique row id to hand out
    self._make_digits_table()

    self.uid_accessor = jx.get(self.uid)
    self.nested_tables = OrderedDict()  # MAP FROM NESTED PATH TO Table OBJECT, PARENTS PROCEED CHILDREN
    self.nested_tables["."] = self
    self.columns = Index(keys=[join_field(["names", self.name])])  # MAP FROM DOCUMENT ABS PROPERTY NAME TO THE SET OF SQL COLUMNS IT REPRESENTS (ONE FOR EACH REALIZED DATATYPE)

    if not exists:
        # REGISTER A STRING COLUMN FOR EACH NON-GUID UID COMPONENT
        for u in self.uid:
            if u == GUID:
                pass
            else:
                c = Column(names={name: u},
                           type="string",
                           es_column=typed_column(u, "string"),
                           es_index=name)
                self.add_column_to_schema(self.nested_tables, c)

        # CREATE THE BACKING TABLE: UID PLUS ONE SQL COLUMN PER TYPED PROPERTY
        command = ("CREATE TABLE " + quote_table(name) + "(" +
                   (",".join([quoted_UID + " INTEGER"] + [
                       _quote_column(c) + " " + sql_types[c.type]
                       for u, cs in self.columns.items()
                       for c in cs
                   ])) + ", PRIMARY KEY (" + (", ".join([quoted_UID] + [
                       _quote_column(c)
                       for u in self.uid
                       for c in self.columns[u]
                   ])) + "))")

        self.db.execute(command)
    else:
        # LOAD THE COLUMNS
        command = "PRAGMA table_info(" + quote_table(name) + ")"
        details = self.db.query(command)

        # PRAGMA table_info ROWS: r[1] is column name, r[2] is declared type
        for r in details:
            cname = untyped_column(r[1])
            ctype = r[2].lower()
            column = Column(names={name: cname},
                            type=ctype,
                            nested_path=['.'],
                            es_column=typed_column(cname, ctype),
                            es_index=name)

            self.add_column_to_schema(self.columns, column)
def _nest_column(self, column, new_path):
    """
    Move a column into a (possibly new) nested child table.

    :param column: the Column being pushed down into the nested table;
                   NOTE: this name is rebound inside the cleanup loop below
    :param new_path: the nested path the column will live under
    """
    destination_table = concat_field(self.fact, new_path)
    existing_table = concat_field(self.fact, column.nested_path[0])

    # FIND THE INNER COLUMNS WE WILL BE MOVING
    moving_columns = []
    for c in self.columns:
        if destination_table != column.es_index and column.es_column == c.es_column:
            moving_columns.append(c)
            c.nested_path = [new_path] + c.nested_path

    # TODO: IF THERE ARE CHILD TABLES, WE MUST UPDATE THEIR RELATIONS TOO?

    # DEFINE A NEW TABLE?
    # LOAD THE COLUMNS
    command = "PRAGMA table_info(" + quote_table(destination_table) + ")"
    details = self.db.query(command)
    if not details.data:
        # DESTINATION DOES NOT EXIST: CREATE IT WITH UID/PARENT/ORDER BOOKKEEPING
        command = (
            "CREATE TABLE " + quote_table(destination_table) + "(" +
            (",".join([
                quoted_UID + " INTEGER",
                quoted_PARENT + " INTEGER",
                quoted_ORDER + " INTEGER"
            ])) +
            ", PRIMARY KEY (" + quoted_UID + ")" +
            ", FOREIGN KEY (" + quoted_PARENT + ") REFERENCES " +
            quote_table(existing_table) + "(" + quoted_UID + ")"
            ")"
        )
        self.db.execute(command)
        self.add_table_to_schema([new_path])

    # TEST IF THERE IS ANY DATA IN THE NEW NESTED ARRAY
    if not moving_columns:
        return

    column.es_index = destination_table
    self.db.execute(
        "ALTER TABLE " + quote_table(destination_table) +
        " ADD COLUMN " + quote_column(column.es_column) + " " + sql_types[column.type]
    )

    # Deleting parent columns
    # SQLITE CANNOT DROP A COLUMN: RENAME TABLE, COPY ALL OTHER COLUMNS, DROP OLD
    for col in moving_columns:
        column = col.es_column  # rebinds the parameter name to the es_column string
        tmp_table = "tmp_" + existing_table
        columns = self.db.query("select * from " + quote_table(existing_table) + " LIMIT 0").header
        self.db.execute("ALTER TABLE " + quote_table(existing_table) +
                        " RENAME TO " + quote_table(tmp_table))
        self.db.execute(
            "CREATE TABLE " + quote_table(existing_table) + " AS SELECT " +
            (", ".join([quote_table(c) for c in columns if c != column])) +
            " FROM " + quote_table(tmp_table)
        )
        self.db.execute("DROP TABLE " + quote_table(tmp_table))
def __del__(self):
    """Drop the backing SQLite table when this object is garbage-collected."""
    drop_command = "DROP TABLE " + quote_table(self.name)
    self.db.execute(drop_command)
def __len__(self):
    """Return the number of rows in the fact table."""
    count_sql = "SELECT COUNT(*) FROM " + quote_table(self.sf.fact)
    return self.db.query(count_sql)[0][0]
def __nonzero__(self):
    """Truth value (Python 2): True when the fact table holds any rows."""
    count_sql = "SELECT COUNT(*) FROM " + quote_table(self.sf.fact)
    row_count = self.db.query(count_sql)[0][0]
    return bool(row_count)
edge_values[0][1] + " < (" + edge_alias + "." + domain_name + " + " + unicode( d.interval) + ")" not_on_clause = None else: Log.error("do not know how to handle") # select_clause.extend(v[0] + " " + k for k, v in zip(domain_names, edge_values)) elif len(edge_names) > 1: domain_names = [ "d" + unicode(edge_index) + "c" + unicode(i) for i, _ in enumerate(edge_names) ] query_edge.allowNulls = False domain = ("\nSELECT " + ",\n".join(g + " AS " + n for n, g in zip(domain_names, vals)) + "\nFROM\n" + quote_table(self.name) + " " + nest_to_alias["."] + "\nGROUP BY\n" + ",\n".join(vals)) limit = Math.min(query.limit, query_edge.domain.limit) domain += ("\nORDER BY COUNT(1) DESC" + "\nLIMIT " + unicode(limit)) where = None join_type = "LEFT JOIN" if query_edge.allowNulls else "JOIN" on_clause = " AND ".join( "((" + edge_alias + "." + k + " IS NULL AND " + v + " IS NULL) OR " + edge_alias + "." + k + " = " + v + ")" for k, v in zip(domain_names, vals)) not_on_clause = None elif isinstance(query_edge.domain, DefaultDomain): domain_names = [ "d" + unicode(edge_index) + "c" + unicode(i)