def denormalized(self):
    """
    Provide the flat, one-record-per-column view of the column metadata.

    Internally the metadata is kept as a mapping of mappings of column
    collections (`self.data`); queries want a plain list of records.
    Columns whose `jx_type` is in STRUCT are left out.

    :return: ListContainer named `self.name` with one dict per column
    """
    records = []
    with self.locker:
        self._update_meta()
        for _table_name, columns_by_name in self.data.items():
            for _column_name, column_group in columns_by_name.items():
                for col in column_group:
                    if col.jx_type in STRUCT:
                        # DISABLED EXTRA FILTER: and col.es_column != "_id"
                        continue
                    records.append({
                        "table": col.es_index,
                        "name": untyped_column(col.name)[0],
                        "cardinality": col.cardinality,
                        "es_column": col.es_column,
                        "es_index": col.es_index,
                        "last_updated": col.last_updated,
                        "count": col.count,
                        "nested_path": [unnest_path(step) for step in col.nested_path],
                        "es_type": col.es_type,
                        "type": col.jx_type,
                    })

    # IMPORTED HERE, AT CALL TIME, RATHER THAN AT MODULE LOAD
    from jx_python.containers.list_usingPythonList import ListContainer

    metadata_schema = jx_base.Schema(META_COLUMNS_NAME, SIMPLE_METADATA_COLUMNS)
    return ListContainer(self.name, data=records, schema=metadata_schema)
def _nest_column(self, column):
    """
    Move a nested-typed column out of its current table into a child table.

    The child table is created when `self.namespace.db.about()` reports
    nothing for it. If any column of `self.columns` was selected for moving,
    the column is added to the child table and the existing table is rebuilt
    without it.

    :param column: the column to nest; the type part of its `es_column` must
                   be SQL_NESTED_TYPE, otherwise the problem is reported
                   through Log.error
    :return: None
    """
    new_path, type_ = untyped_column(column.es_column)
    if type_ != SQL_NESTED_TYPE:
        Log.error("only nested types can be nested")
    destination_table = concat_field(self.fact_name, new_path)
    existing_table = concat_field(self.fact_name, column.nested_path[0])

    # FIND THE INNER COLUMNS WE WILL BE MOVING
    # THE FIRST HALF OF THE TEST DOES NOT DEPEND ON c; IT IS THE SAME FOR EVERY ITERATION
    moving_columns = []
    for c in self.columns:
        if destination_table != column.es_index and column.es_column == c.es_column:
            moving_columns.append(c)
            # NOTE(review): nested_path IS INDEXED AND CONCATENATED LIKE A LIST ELSEWHERE IN
            # THIS FUNCTION, BUT A SINGLE PATH IS ASSIGNED HERE - CONFIRM THIS IS INTENDED
            c.nested_path = new_path

    # TODO: IF THERE ARE CHILD TABLES, WE MUST UPDATE THEIR RELATIONS TOO?

    # LOAD THE COLUMNS
    data = self.namespace.db.about(destination_table)
    if not data:
        # DEFINE A NEW TABLE
        # THREE INTEGER COLUMNS (UID, PARENT, ORDER), UID AS PRIMARY KEY,
        # PARENT AS FOREIGN KEY TO THE UID OF THE EXISTING TABLE
        command = (
            SQL_CREATE + quote_column(destination_table) + sql_iso(sql_list([
                quoted_UID + "INTEGER",
                quoted_PARENT + "INTEGER",
                quoted_ORDER + "INTEGER",
                "PRIMARY KEY " + sql_iso(quoted_UID),
                "FOREIGN KEY " + sql_iso(quoted_PARENT) + " REFERENCES " + quote_column(existing_table) + sql_iso(quoted_UID)
            ]))
        )
        with self.namespace.db.transaction() as t:
            t.execute(command)
            self.add_table([new_path]+column.nested_path)

    # TEST IF THERE IS ANY DATA IN THE NEW NESTED ARRAY
    if not moving_columns:
        return

    column.es_index = destination_table
    with self.namespace.db.transaction() as t:
        t.execute(
            "ALTER TABLE " + quote_column(destination_table) +
            " ADD COLUMN " + quote_column(column.es_column) + " " + column.es_type
        )

        # Deleting parent columns
        # THE EXISTING TABLE IS RENAMED, RE-CREATED FROM A SELECT THAT OMITS THE MOVED
        # COLUMN, AND THE RENAMED COPY IS THEN DROPPED
        # NOTE(review): A TABLE CREATED FROM A SELECT PRESUMABLY DOES NOT CARRY OVER KEYS
        # OR CONSTRAINTS OF THE ORIGINAL - CONFIRM THAT IS ACCEPTABLE
        for col in moving_columns:
            # FROM HERE ON `column` IS THE COLUMN NAME (A STRING), NOT THE PARAMETER OBJECT
            column = col.es_column
            tmp_table = "tmp_" + existing_table
            # ZERO-ROW SELECT, USED ONLY TO READ THE CURRENT COLUMN NAMES FROM THE HEADER
            columns = list(map(text, t.query(SQL_SELECT + SQL_STAR + SQL_FROM + quote_column(existing_table) + SQL_LIMIT + SQL_ZERO).header))
            t.execute(
                "ALTER TABLE " + quote_column(existing_table) +
                " RENAME TO " + quote_column(tmp_table)
            )
            t.execute(
                SQL_CREATE + quote_column(existing_table) + SQL_AS +
                SQL_SELECT + sql_list([quote_column(c) for c in columns if c != column]) +
                SQL_FROM + quote_column(tmp_table)
            )
            t.execute("DROP TABLE " + quote_column(tmp_table))
def __init__(self, db):
    """
    Read the schema out of the given database.

    Every table listed in sqlite_master (names starting with "__" excepted)
    is registered under its base table in `self._snowflakes`, and every
    column reported by PRAGMA table_info (names starting with "__" excepted)
    is added to `self._columns`.

    :param db: database handle; `db.query(sql)` must return an object with
               `header` and `data`
    """
    self.db = db
    self._snowflakes = {}  # MAP FROM BASE TABLE TO LIST OF NESTED TABLES
    self._columns = ColumnList()

    # SECOND CHOICE HANDED TO coalesce() WHEN PICKING A COLUMN'S jx_type
    declared_type_to_json = {"TEXT": "string", "REAL": "number", "INTEGER": "integer"}

    # FIND ALL TABLES
    listing = self.db.query(
        "SELECT * FROM sqlite_master WHERE type='table' ORDER BY name")
    records = []
    for values in listing.data:
        records.append({key: values[pos] for pos, key in enumerate(listing.header)})
    tables = wrap(records)

    previous_path = []
    for table in tables:
        if table.name.startswith("__"):
            continue
        base_table, nested_path = tail_field(table.name)

        # FIND COMMON NESTED PATH SUFFIX: KEEP THE PREVIOUS PATH FROM ITS FIRST
        # ENTRY THAT THIS TABLE'S PATH STARTS WITH; NOTHING WHEN THERE IS NONE
        ancestors = []
        for pos, candidate in enumerate(previous_path):
            if startswith_field(nested_path, candidate):
                ancestors = previous_path[pos:]
                break

        full_nested_path = [nested_path] + ancestors
        # setdefault() IS GIVEN ITS OWN LIST, SO THE append() BELOW NEVER
        # CHANGES full_nested_path
        table_list = self._snowflakes.setdefault(base_table, [nested_path] + ancestors)
        table_list.append(
            jx_base.TableDesc(name=table.name, nested_path=full_nested_path))

        # LOAD THE COLUMNS
        pragma = "PRAGMA table_info" + sql_iso(quote_column(table.name))
        column_rows = self.db.query(pragma).data
        for _cid, name, dtype, _notnull, _default, _pk in column_rows:
            if name.startswith("__"):
                continue
            cname, ctype = untyped_column(name)
            self._columns.add(Column(
                name=cname,  # I THINK COLUMNS HAVE THEIR FULL PATH
                jx_type=coalesce(ctype, declared_type_to_json.get(dtype)),
                nested_path=full_nested_path,
                es_type=dtype,
                es_column=name,
                es_index=table.name
            ))
        previous_path = full_nested_path
def read_db(self):
    """
    Pull the schema from the database and build the model.

    Tables and columns whose name starts with "__" are ignored.

    :return: True when at least one table that is not ignored was found,
             otherwise False
    """
    # SECOND CHOICE HANDED TO coalesce() WHEN PICKING A COLUMN'S type
    declared_type_to_json = {"TEXT": "string", "REAL": "number", "INTEGER": "integer"}

    # FIND ALL TABLES
    listing = self.db.query(
        "SELECT * FROM sqlite_master WHERE type='table' ORDER BY name")
    tables = wrap([
        {key: record[pos] for pos, key in enumerate(listing.header)}
        for record in listing.data
    ])

    tables_found = False
    for table in tables:
        if table.name.startswith("__"):
            continue
        tables_found = True

        # ONE ENTRY FOR EACH TABLE (IN REVERSE LISTING ORDER) WHOSE NAME
        # THIS TABLE'S NAME STARTS WITH
        nested_path = []
        for other in jx.reverse(tables):
            if startswith_field(table.name, other.name):
                nested_path.append(join_field(split_field(other.name)[1:]))
        self.add_table_to_schema(nested_path)

        # LOAD THE COLUMNS
        pragma = "PRAGMA table_info" + sql_iso(quote_column(table.name))
        column_rows = self.db.query(pragma).data
        for _cid, name, dtype, _notnull, _default, _pk in column_rows:
            if name.startswith("__"):
                continue
            cname, ctype = untyped_column(name)

            relative_names = {}
            for path in nested_path:
                relative_names[path] = relative_field(cname, path)

            self.add_column_to_schema(Column(
                names=relative_names,
                type=coalesce(ctype, declared_type_to_json.get(dtype)),
                nested_path=nested_path,
                es_column=name,
                es_index=table.name
            ))

    return tables_found
def _load_from_database(self):
    """
    Fill the snowflake paths and the column list from what the database holds.

    Tables and columns whose name starts with "__" are ignored.

    :return: None
    """
    # FIND ALL TABLES
    table_listing = self.db.query(sql_query({
        "from": "sqlite_master",
        "where": {"eq": {"type": "table"}},
        "orderby": "name"
    }))
    tables = wrap([
        dict(zip(table_listing.header, record))
        for record in table_listing.data
    ])

    previous_path = ["."]
    for table in tables:
        if table.name.startswith("__"):
            continue
        base_table, nested_path = tail_field(table.name)

        # FIND COMMON NESTED PATH SUFFIX
        # A TABLE AT "." NEVER INHERITS ANYTHING; OTHERWISE KEEP THE PREVIOUS PATH
        # FROM ITS FIRST ENTRY THAT THIS TABLE'S PATH STARTS WITH
        ancestors = []
        if nested_path != ".":
            for pos, candidate in enumerate(previous_path):
                if startswith_field(nested_path, candidate):
                    ancestors = previous_path[pos:]
                    break

        full_nested_path = [nested_path] + ancestors
        self._snowflakes[literal_field(base_table)] += [full_nested_path]

        # LOAD THE COLUMNS
        for _cid, name, dtype, _notnull, _default, _pk in self.db.about(table.name):
            if name.startswith("__"):
                continue
            cname, ctype = untyped_column(name)
            new_column = Column(
                name=cname,
                jx_type=coalesce(sql_type_to_json_type.get(ctype), IS_NULL),
                nested_path=full_nested_path,
                es_type=dtype,
                es_column=name,
                es_index=table.name,
                last_updated=Date.now()
            )
            self.add(new_column)
        previous_path = full_nested_path
def _load_from_database(self):
    """
    Fill the snowflake paths and the column list from what the database holds.

    Tables are read from sqlite_master and columns from PRAGMA table_info;
    any table or column whose name starts with "__" is ignored.

    :return: None
    """
    # FIND ALL TABLES
    listing = self.db.query(
        "SELECT * FROM sqlite_master WHERE type='table' ORDER BY name")
    header = listing.header
    tables = wrap([dict(zip(header, record)) for record in listing.data])

    previous_path = []
    for table in tables:
        if table.name.startswith("__"):
            continue
        base_table, nested_path = tail_field(table.name)

        # FIND COMMON NESTED PATH SUFFIX: KEEP THE PREVIOUS PATH FROM ITS FIRST
        # ENTRY THAT THIS TABLE'S PATH STARTS WITH; NOTHING WHEN THERE IS NONE
        ancestors = []
        for pos, candidate in enumerate(previous_path):
            if startswith_field(nested_path, candidate):
                ancestors = previous_path[pos:]
                break

        full_nested_path = [nested_path] + ancestors
        self._snowflakes[literal_field(base_table)] += [full_nested_path]

        # LOAD THE COLUMNS
        pragma = "PRAGMA table_info" + sql_iso(quote_column(table.name))
        column_rows = self.db.query(pragma).data
        for _cid, name, dtype, _notnull, _default, _pk in column_rows:
            if name.startswith("__"):
                continue
            cname, ctype = untyped_column(name)
            new_column = Column(
                name=cname,
                jx_type=coalesce(sql_type_to_json_type.get(ctype), IS_NULL),
                nested_path=full_nested_path,
                es_type=dtype,
                es_column=name,
                es_index=table.name,
                last_updated=Date.now()
            )
            self.add(new_column)
        previous_path = full_nested_path
outer_selects.append(sql) index_to_column[column_number] = ColumnMapping( push_name=s.name, push_column_name=s.name, push_column=si, push_child=".", pull=get_column(column_number), sql=sql, column_alias=quote_column(s.name), type=sql_type_to_json_type["n"]) elif s.aggregate == "count" and (not query.edges and not query.groupby): value = s.value.var columns = [ c.es_column for c in self.snowflake.columns if untyped_column(c.es_column)[0] == value ] sql = SQL("+").join( sql_count(quote_column(col)) for col in columns) column_number = len(outer_selects) outer_selects.append( sql_alias(sql, _make_column_name(column_number))) index_to_column[column_number] = ColumnMapping( push_name=s.name, push_column_name=s.name, push_column=si, push_child=".", pull=get_column(column_number), sql=sql, column_alias=_make_column_name(column_number), type=sql_type_to_json_type["n"])
def query_metadata(self, query):
    """
    Answer a simple metadata query about the tables and columns held here.

    Only one filter is accepted: "eq" on `table` or "eq" on `name`. Edges
    and groupby are reported through Log.error, as is any other filter.
    If the last column of `self.sf.columns` is not the GUID column, a GUID
    column is appended to that list before the rows are produced.

    :param query: the query; its 'from' entry is replaced with `self`
    :return: Data in the shape named by `query.format`: "cube", "table",
             or (for anything else) "list". Each row carries
             table, name, type, nested_path
    """
    frum = query['from']  # READ, BUT NOT USED AFTERWARDS
    query['from'] = self
    query = QueryOp.wrap(query, self.sf.tables["."].schema)
    columns = self.sf.columns
    where = query.where
    table_name = None
    column_name = None

    if query.edges or query.groupby:
        Log.error("Aggregates(groupby or edge) are not supported")

    if where.op == "eq" and where.lhs.var == "table":
        table_name = mo_json.json2value(where.rhs.json)
    elif where.op == "eq" and where.lhs.var == "name":
        column_name = mo_json.json2value(where.rhs.json)
    else:
        Log.error("Only simple filters are expected like: \"eq\" on table and column name")

    relative_paths = list(columns[0].names.keys())
    tables = [concat_field(self.sf.fact, path) for path in relative_paths]

    metadata = []
    if columns[-1].es_column != GUID:
        guid_column = Column(
            names={path: relative_field(GUID, path) for path in relative_paths},
            type="string",
            es_column=GUID,
            es_index=self.sf.fact,
            nested_path=["."]
        )
        columns.append(guid_column)

    # COMPARISONS AGAINST None USE != (NOT `is not`), EXACTLY AS ORIGINALLY WRITTEN
    for tname, table in zip(relative_paths, tables):
        if table_name != None and table_name != table:
            continue
        for col in columns:
            cname, _ctype = untyped_column(col.es_column)
            if column_name != None and column_name != cname:
                continue
            record = (table, col.names[tname], col.type, unwraplist(col.nested_path))
            metadata.append(record)

    header = ["table", "name", "type", "nested_path"]

    if query.format == "cube":
        rownum_edge = {
            "name": "rownum",
            "domain": {
                "type": "rownum",
                "min": 0,
                "max": len(metadata),
                "interval": 1
            }
        }
        # TRANSPOSE THE ROWS SO EACH HEADER NAME MAPS TO ITS COLUMN OF VALUES
        return Data(
            meta={"format": "cube"},
            data=dict(zip(header, zip(*metadata))),
            edges=[rownum_edge]
        )

    if query.format == "table":
        return Data(
            meta={"format": "table"},
            header=header,
            data=metadata
        )

    return Data(
        meta={"format": "list"},
        data=[dict(zip(header, record)) for record in metadata]
    )
column_number = len(outer_selects) outer_selects.append(sql) index_to_column[column_number] = ColumnMapping( push_name=s.name, push_column_name=s.name, push_column=si, push_child=".", pull=get_column(column_number), sql=sql, column_alias=quote_column(s.name), type=sql_type_to_json_type["n"] ) elif s.aggregate == "count" and (not query.edges and not query.groupby): value = s.value.var columns = [c.es_column for c in self.snowflake.columns if untyped_column(c.es_column)[0] == value] sql = SQL("+").join(sql_count(quote_column(col)) for col in columns) column_number = len(outer_selects) outer_selects.append(sql_alias(sql, _make_column_name(column_number))) index_to_column[column_number] = ColumnMapping( push_name=s.name, push_column_name=s.name, push_column=si, push_child=".", pull=get_column(column_number), sql=sql, column_alias=_make_column_name(column_number), type=sql_type_to_json_type["n"] ) elif s.aggregate == "percentile": if not isinstance(s.percentile, (int, float)):
def _flatten(data, uid, parent_id, order, full_path, nested_path, row=None, guid=None):
    """
    Pull one (sub)document apart into rows, recursing into objects and nested arrays.

    Rows are collected in `doc_collection` and schema changes in `required_changes`.
    Those two names, together with `self` and `snowflake`, are not defined in this
    function; they come from an outer scope.

    :param data: the data we are pulling apart
    :param uid: the uid we are giving this doc
    :param parent_id: the parent id of this (sub)doc
    :param order: the number of siblings before this one
    :param full_path: path to this (sub)doc
    :param nested_path: list of paths, deepest first
    :param row: we will be filling this; when falsy a new row is created and recorded
    :param guid: stored under GUID in the row created here (only used when a new row is created)
    :return: None
    """
    table = concat_field(self.name, nested_path[0])
    insertion = doc_collection[nested_path[0]]
    if not row:
        row = {GUID: guid, UID: uid, PARENT: parent_id, ORDER: order}
        insertion.rows.append(row)

    if isinstance(data, Mapping):
        items = ((concat_field(full_path, k), v) for k, v in wrap(data).leaves())
    else:
        # PRIMITIVE VALUES
        items = [(full_path, data)]

    for cname, v in items:
        value_type = get_type(v)
        if value_type is None:
            continue

        # LOOK FOR AN ALREADY-KNOWN COLUMN FOR THIS NAME AND TYPE
        if value_type == NESTED:
            # NOTE(review): HERE THE RESULT OF untyped_column() IS COMPARED DIRECTLY TO A
            # NAME; IF IT RETURNS A (NAME, TYPE) PAIR THIS NEVER MATCHES - CONFIRM
            c = unwraplist([
                cc
                for cc in snowflake.columns
                if cc.jx_type in STRUCT and untyped_column(cc.name) == cname
            ])
        else:
            c = unwraplist([
                cc
                for cc in snowflake.columns
                if cc.jx_type == value_type and cc.name == cname
            ])

        insertion = doc_collection[nested_path[0]]
        if not c:
            # WHAT IS THE NESTING LEVEL FOR THIS PATH?
            # NOTE(review): THE RESULT OF THIS SEARCH IS NOT READ AGAIN IN THIS BRANCH;
            # THE NEW COLUMN BELOW IS BUILT WITH nested_path
            deeper_nested_path = "."
            for path in snowflake.query_paths:
                if startswith_field(cname, path[0]) and len(deeper_nested_path) < len(path):
                    deeper_nested_path = path

            c = Column(
                name=cname,
                jx_type=value_type,
                es_type=json_type_to_sqlite_type.get(value_type, value_type),
                es_column=typed_column(cname, json_type_to_sql_type.get(value_type)),
                es_index=table,
                nested_path=nested_path,
                last_updated=Date.now()
            )
            if value_type == "nested":
                snowflake.query_paths.append(c.es_column)
                required_changes.append({'nest': (c, nested_path)})
            else:
                snowflake.columns.append(c)
                required_changes.append({"add": c})

            # INSIDE IF BLOCK BECAUSE WE DO NOT WANT IT TO ADD WHAT WE columns.get() ALREADY
            insertion.active_columns.add(c)
        elif c.jx_type == "nested" and value_type == "object":
            # KNOWN COLUMN IS NESTED BUT A SINGLE OBJECT ARRIVED: TREAT IT AS A ONE-ELEMENT ARRAY
            value_type = "nested"
            v = [v]
        elif len(c.nested_path) < len(nested_path):
            # KNOWN COLUMN HAS A SHORTER NESTED PATH THAN THE CURRENT ONE: REPLACE IT WITH A
            # COLUMN AT THE CURRENT PATH AND COPY ITS ALREADY-COLLECTED VALUES INTO NEW ROWS
            from_doc = doc_collection.get(c.nested_path[0], None)
            column = c.es_column
            from_doc.active_columns.remove(c)
            snowflake._remove_column(c)
            required_changes.append({"nest": (c, nested_path)})
            deep_c = Column(
                name=cname,
                jx_type=value_type,
                es_type=json_type_to_sqlite_type.get(value_type, value_type),
                es_column=typed_column(cname, json_type_to_sql_type.get(value_type)),
                es_index=table,
                nested_path=nested_path,
                last_updated=Date.now()
            )
            snowflake._add_column(deep_c)
            snowflake._drop_column(c)
            # NOTE(review): SECOND remove(c) ON THE SAME COLLECTION; IF active_columns IS A
            # PLAIN set (AS IT IS CREATED FURTHER DOWN) THIS WOULD RAISE KeyError - CONFIRM
            from_doc.active_columns.remove(c)

            for r in from_doc.rows:
                r1 = unwrap(r)
                if column in r1:
                    row1 = {
                        UID: self.container.next_uid(),
                        PARENT: r1["__id__"],
                        ORDER: 0,
                        column: r1[column]
                    }
                    insertion.rows.append(row1)

        elif len(c.nested_path) > len(nested_path):
            # KNOWN COLUMN HAS A LONGER NESTED PATH: START A NEW ROW IN THAT COLUMN'S COLLECTION
            insertion = doc_collection[c.nested_path[0]]
            row = {UID: self.container.next_uid(), PARENT: uid, ORDER: order}
            insertion.rows.append(row)

        # BE SURE TO NEST VALUES, IF NEEDED
        if value_type == "nested":
            row[c.es_column] = "."
            deeper_nested_path = [cname] + nested_path
            insertion = doc_collection.get(cname, None)
            if not insertion:
                insertion = doc_collection[cname] = Data(
                    active_columns=set(),
                    rows=[]
                )
            # EACH ELEMENT BECOMES A CHILD DOC WITH ITS OWN UID; ITS ORDER IS ITS INDEX
            for i, r in enumerate(v):
                child_uid = self.container.next_uid()
                _flatten(r, child_uid, uid, i, cname, deeper_nested_path)
        elif value_type == "object":
            # SUB-OBJECT: ITS VALUES ARE WRITTEN INTO THE SAME ROW
            row[c.es_column] = "."
            _flatten(v, uid, parent_id, order, cname, nested_path, row=row)
        elif c.jx_type:
            insertion.active_columns.add(c)
            row[c.es_column] = v
def __init__(self, name, db=None, uid=GUID, exists=False, kwargs=None):
    """
    Create the table in the database, or load the columns of one that already exists.

    :param name: NAME FOR THIS TABLE
    :param db: THE DB TO USE; WHEN FALSY A NEW Sqlite() IS CREATED
    :param uid: THE UNIQUE INDEX FOR THIS TABLE (PASSED THROUGH listwrap)
    :param exists: WHEN FALSY THE TABLE IS CREATED; OTHERWISE ITS COLUMNS ARE READ WITH PRAGMA table_info
    :param kwargs: NOT READ ANYWHERE IN THIS METHOD
    :return: HANDLE FOR TABLE IN db
    """
    global _config
    Container.__init__(self, frum=None)
    if db:
        self.db = db
    else:
        self.db = db = Sqlite()

    # BECAUSE OF THE global STATEMENT ABOVE, THIS IMPORT BINDS THE MODULE-LEVEL _config
    if not _config:
        from pyLibrary.queries.containers import config as _config
        if not _config.default:
            _config.default = {"type": "sqlite", "settings": {"db": db}}

    self.name = name
    self.uid = listwrap(uid)
    self._next_uid = 1
    self._make_digits_table()
    self.uid_accessor = jx.get(self.uid)
    self.nested_tables = OrderedDict()  # MAP FROM NESTED PATH TO Table OBJECT, PARENTS PRECEDE CHILDREN
    self.nested_tables["."] = self
    # MAP FROM DOCUMENT ABS PROPERTY NAME TO THE SET OF SQL COLUMNS IT REPRESENTS (ONE FOR EACH REALIZED DATATYPE)
    self.columns = Index(keys=[join_field(["names", self.name])])

    if not exists:
        # REGISTER A STRING COLUMN FOR EACH uid FIELD, EXCEPT GUID
        for u in self.uid:
            if u == GUID:
                pass
            else:
                c = Column(names={name: u}, type="string", es_column=typed_column(u, "string"), es_index=name)
                # NOTE(review): FIRST ARGUMENT IS self.nested_tables HERE BUT self.columns IN
                # THE exists BRANCH BELOW - CONFIRM WHICH IS INTENDED
                self.add_column_to_schema(self.nested_tables, c)

        # CREATE TABLE WITH THE UID COLUMN PLUS EVERY KNOWN COLUMN;
        # PRIMARY KEY IS THE UID COLUMN PLUS THE COLUMNS OF EACH uid FIELD
        command = (
            "CREATE TABLE " + quote_table(name) + "(" +
            (",".join(
                [quoted_UID + " INTEGER"] +
                [_quote_column(c) + " " + sql_types[c.type] for u, cs in self.columns.items() for c in cs]
            )) +
            ", PRIMARY KEY (" +
            (", ".join(
                [quoted_UID] +
                [_quote_column(c) for u in self.uid for c in self.columns[u]]
            )) +
            "))"
        )
        self.db.execute(command)
    else:
        # LOAD THE COLUMNS
        command = "PRAGMA table_info(" + quote_table(name) + ")"
        details = self.db.query(command)
        for r in details:
            # r[1] AND r[2] ARE PRESUMABLY THE COLUMN NAME AND DECLARED TYPE OF THE
            # table_info ROW - TODO CONFIRM AGAINST THE ROW SHAPE RETURNED BY db.query
            # THE RESULT OF untyped_column() IS USED WHOLE, AS THE COLUMN NAME
            cname = untyped_column(r[1])
            ctype = r[2].lower()
            column = Column(names={name: cname}, type=ctype, nested_path=['.'], es_column=typed_column(cname, ctype), es_index=name)
            self.add_column_to_schema(self.columns, column)
def column(self, prefix):
    """
    Find the columns stored for the given property name.

    The candidates are the columns that `self.snowflake.namespace.columns.find()`
    returns for the snowflake's fact name. A candidate matches when the name
    part of its (untyped) column name equals the name part of `prefix` taken
    relative to `self.nested_path`.

    :param prefix: property name, relative to `self.nested_path`
    :return: set of matching columns; the GUID column and columns whose
             `jx_type` is OBJECT or EXISTS are never included
    """
    # untyped_column() GIVES A (NAME, TYPE) PAIR, AS THE UNPACKING BELOW SHOWS.
    # ONLY THE NAME MAY BE COMPARED: COMPARING A NAME TO THE WHOLE PAIR NEVER MATCHES
    full_name = untyped_column(concat_field(self.nested_path, prefix))[0]
    candidates = self.snowflake.namespace.columns.find(self.snowflake.fact_name)
    return {
        c
        for c in candidates
        for k, _type in [untyped_column(c.name)]
        if k == full_name and k != GUID
        if c.jx_type not in [OBJECT, EXISTS]
    }