def doc_to_column(doc):
    """
    Convert a raw column document (as stored in ES) into a Column instance,
    repairing several known historical mistakes along the way.

    :param doc: raw (typed) column document
    :return: Column, or None when conversion fails (the column is then
             marked as deleted instead)
    """
    try:
        doc = wrap(untyped(doc))
        # I HAVE MANAGED TO MAKE MANY MISTAKES WRITING COLUMNS TO ES. HERE ARE THE FIXES

        # FIX: a missing last_updated defaults to one year ago
        if not doc.last_updated:
            doc.last_updated = Date.now() - YEAR

        # FIX: derive es_type for object columns that never got one
        if doc.es_type == None:
            if doc.jx_type == OBJECT:
                doc.es_type = "object"
            else:
                Log.warning("{{doc}} has no es_type", doc=doc)

        # FIX: nested columns are always multivalued
        doc.multi = 1001 if doc.es_type == "nested" else doc.multi

        # FIX: normalize nested_path to a tuple; force nested typing when the
        # es_column name ends with the NESTED_TYPE marker
        doc.nested_path = tuple(listwrap(doc.nested_path))
        if last(split_field(doc.es_column)) == NESTED_TYPE and doc.es_type != "nested":
            doc.es_type = "nested"
            doc.jx_type = NESTED
            doc.multi = 1001
            doc.last_updated = Date.now()

        # FIX: drop a redundant "." stored second-from-last in nested_path
        expected_nested_path = get_nested_path(doc.es_column)
        if len(doc.nested_path) > 1 and doc.nested_path[-2] == '.':
            doc.nested_path = doc.nested_path[:-1]

        # FIX: untyped column names are expected at the root nested path;
        # typed names must agree with the path derived from the es_column
        if untype_path(doc.es_column) == doc.es_column:
            if doc.nested_path != (".",):
                if doc.es_index in {"repo"}:
                    # the "repo" index is known to violate this; leave it alone
                    pass
                else:
                    Log.note("not expected")
                    doc.nested_path = expected_nested_path
        else:
            if doc.nested_path != expected_nested_path:
                doc.nested_path = expected_nested_path

        # FIX: existence markers are EXISTS columns
        if last(split_field(doc.es_column)) == EXISTS_TYPE:
            doc.jx_type = EXISTS

        return Column(**doc)
    except Exception:
        # conversion failed; record the column as deleted rather than propagate
        doc.nested_path = ["."]
        mark_as_deleted(Column(**doc))
        return None
def row_to_column(header, row):
    """
    Build a Column from a database row, JSON-decoding the "nested_path"
    and "partitions" fields (all other fields are used verbatim).

    :param header: sequence of column names, aligned with row
    :param row: sequence of raw values from the database
    :return: Column
    """
    params = {}
    for key, value in zip(header, row):
        if value is not None and key in ("nested_path", "partitions"):
            # these two fields are stored as JSON text
            value = json2value(value)
        params[key] = value
    return Column(**params)
def get_or_create_facts(self, fact_name, uid=UID):
    """
    FIND TABLE BY NAME, OR CREATE IT IF IT DOES NOT EXIST
    :param fact_name: NAME FOR THE CENTRAL INDEX
    :param uid: name, or list of names, for the GUID
    :return: Facts
    """
    about = self.db.about(fact_name)
    if not about:
        # table does not exist yet; create it
        if uid != UID:
            Log.error("do not know how to handle yet")
        self.ns.columns._snowflakes[fact_name] = ["."]
        # register the mandatory _id column for the new fact table
        self.ns.columns.add(Column(
            name="_id",
            es_column="_id",
            es_index=fact_name,
            es_type=json_type_to_sqlite_type[STRING],
            jx_type=STRING,
            nested_path=['.'],
            multi=1,
            last_updated=Date.now()
        ))

        command = sql_create(fact_name, {UID: "INTEGER PRIMARY KEY", GUID: "TEXT"}, unique=UID)
        with self.db.transaction() as t:
            t.execute(command)

    return QueryTable(fact_name, self)
def select(self, select):
    """
    Project this container onto the given select clause(s), returning a new
    ListContainer (or self, when the selection is the identity).

    :param select: single select clause, or list of clauses
    :return: ListContainer (or self)
    """
    selects = listwrap(select)

    if len(selects) == 1 and is_op(selects[0].value, Variable) and selects[0].value.var == ".":
        # selecting "." — identity projection; keep the current schema
        new_schema = self.schema
        if selects[0].name == ".":
            return self
    else:
        new_schema = None

    if is_list(select):
        if all(is_op(s.value, Variable) and s.name == s.value.var for s in select):
            # pure variable selection under original names: schema is a subset
            names = set(s.value.var for s in select)
            new_schema = Schema(".", [c for c in self.schema.columns if c.name in names])

        # (name, extractor) pairs for each clause
        push_and_pull = [(s.name, jx_expression_to_function(s.value)) for s in selects]

        def selector(d):
            output = Data()
            for n, p in push_and_pull:
                output[n] = unwraplist(p(to_data(d)))
            return unwrap(output)

        new_data = list(map(selector, self.data))
    else:
        # single select clause: each row maps to a bare value
        select_value = jx_expression_to_function(select.value)
        new_data = list(map(select_value, self.data))
        if is_op(select.value, Variable):
            # derive the single-column schema from the selected column
            column = dict(**first(c for c in self.schema.columns if c.name == select.value.var))
            column.update({"name": ".", "jx_type": NESTED, "es_type": "nested", "multi": 1001, "cardinality": 1})
            new_schema = Schema("from " + self.name, [Column(**column)])

    return ListContainer("from " + self.name, data=new_data, schema=new_schema)
def __init__(self, nested_path):
    """
    Build a minimal schema for the given nested path, seeded with the
    implicit "_source" object column and the GUID column.

    :param nested_path: list of paths, deepest first, ending with "."
    """
    if nested_path[-1] != ".":
        Log.error("Expecting full nested path")
    # NOTE(review): es_index is given the nested_path list here, while other
    # Column constructions in this codebase pass an index/table NAME — confirm
    source = Column(name=".", jx_type=OBJECT, es_type=OBJECT, es_column="_source", es_index=nested_path, nested_path=nested_path)
    guid = Column(name=GUID, jx_type=STRING, es_type='TEXT', es_column=GUID, es_index=nested_path, nested_path=nested_path)
    # map from name to the set of columns known under that name
    self.namespace = {".": {source}, GUID: {guid}}
    self._columns = [source, guid]
    self.nested_path = nested_path
def __init__(self, db):
    """
    Load the snowflake/column metadata for every table already present
    in the given SQLite database.

    :param db: database wrapper exposing query()
    """
    self.db = db
    self._snowflakes = {}  # MAP FROM BASE TABLE TO LIST OF NESTED TABLES
    self._columns = ColumnList()

    # FIND ALL TABLES
    result = self.db.query("SELECT * FROM sqlite_master WHERE type='table' ORDER BY name")
    tables = wrap([{k: d[i] for i, k in enumerate(result.header)} for d in result.data])
    last_nested_path = []
    for table in tables:
        # tables prefixed with "__" are internal bookkeeping; skip them
        if table.name.startswith("__"):
            continue
        base_table, nested_path = tail_field(table.name)

        # FIND COMMON NESTED PATH SUFFIX
        # (tables arrive sorted by name, so nested tables follow their parents)
        for i, p in enumerate(last_nested_path):
            if startswith_field(nested_path, p):
                last_nested_path = last_nested_path[i:]
                break
        else:
            last_nested_path = []
        full_nested_path = [nested_path] + last_nested_path

        nested_tables = self._snowflakes.setdefault(base_table, [nested_path] + last_nested_path)
        nested_tables.append(jx_base.TableDesc(name=table.name, nested_path=full_nested_path))

        # LOAD THE COLUMNS
        command = "PRAGMA table_info" + sql_iso(quote_column(table.name))
        details = self.db.query(command)
        for cid, name, dtype, notnull, dfft_value, pk in details.data:
            if name.startswith("__"):
                continue
            cname, ctype = untyped_column(name)
            self._columns.add(Column(
                name=cname,  # I THINK COLUMNS HAVE THIER FULL PATH
                jx_type=coalesce(ctype, {"TEXT": "string", "REAL": "number", "INTEGER": "integer"}.get(dtype)),
                nested_path=full_nested_path,
                es_type=dtype,
                es_column=name,
                es_index=table.name
            ))
        last_nested_path = full_nested_path
def _load_from_database(self):
    """
    Populate the column metadata from the tables already present in the
    SQLite database (uses db.about() for per-table column details).
    """
    # FIND ALL TABLES
    result = self.db.query(sql_query({
        "from": "sqlite_master",
        "where": {"eq": {"type": "table"}},
        "orderby": "name"
    }))
    tables = wrap([{k: d for k, d in zip(result.header, row)} for row in result.data])
    last_nested_path = ["."]
    for table in tables:
        # tables prefixed with "__" are internal bookkeeping; skip them
        if table.name.startswith("__"):
            continue
        base_table, nested_path = tail_field(table.name)

        # FIND COMMON NESTED PATH SUFFIX
        # (tables arrive sorted by name, so nested tables follow their parents)
        if nested_path == ".":
            last_nested_path = []
        else:
            for i, p in enumerate(last_nested_path):
                if startswith_field(nested_path, p):
                    last_nested_path = last_nested_path[i:]
                    break
            else:
                last_nested_path = []
        full_nested_path = [nested_path] + last_nested_path
        self._snowflakes[literal_field(base_table)] += [full_nested_path]

        # LOAD THE COLUMNS
        details = self.db.about(table.name)
        for cid, name, dtype, notnull, dfft_value, pk in details:
            if name.startswith("__"):
                continue
            cname, ctype = untyped_column(name)
            self.add(Column(
                name=cname,
                jx_type=coalesce(sql_type_to_json_type.get(ctype), IS_NULL),
                nested_path=full_nested_path,
                es_type=dtype,
                es_column=name,
                es_index=table.name,
                last_updated=Date.now()
            ))
        last_nested_path = full_nested_path
def doc_to_column(doc):
    """
    Convert a raw column document into a Column instance, repairing known
    historical inconsistencies (missing last_updated/es_type, nested typing,
    malformed nested_path).

    :param doc: raw (typed) column document
    :return: Column, or None on failure (the column is marked deleted)
    """
    try:
        doc = wrap(untyped(doc))
        # default missing last_updated to one year ago
        if not doc.last_updated:
            doc.last_updated = Date.now() - YEAR
        # derive es_type for object columns that never got one
        if doc.es_type == None:
            if doc.jx_type == OBJECT:
                doc.es_type = "object"
            else:
                Log.warning("{{doc}} has no es_type", doc=doc)
        # nested columns are always multivalued
        doc.multi = 1001 if doc.es_type == "nested" else doc.multi
        # normalize nested_path to a tuple; force nested typing when the
        # es_column name ends with the NESTED_TYPE marker
        doc.nested_path = tuple(listwrap(doc.nested_path))
        if last(split_field(doc.es_column)) == NESTED_TYPE and doc.es_type != "nested":
            doc.es_type = "nested"
            doc.jx_type = NESTED
            doc.multi = 1001
            doc.last_updated = Date.now()
        # drop a redundant "." stored second-from-last in nested_path
        expected_nested_path = get_nested_path(doc.es_column)
        if len(doc.nested_path) > 1 and doc.nested_path[-2] == '.':
            doc.nested_path = doc.nested_path[:-1]
        # untyped column names belong at the root nested path; typed names
        # must agree with the path derived from the es_column
        if untype_path(doc.es_column) == doc.es_column:
            if doc.nested_path != (".",):
                if doc.es_index in {"repo"}:
                    # the "repo" index is known to violate this; leave it alone
                    pass
                else:
                    Log.note("not expected")
                    doc.nested_path = expected_nested_path
        else:
            if doc.nested_path != expected_nested_path:
                doc.nested_path = expected_nested_path
        return Column(**doc)
    except Exception:
        # conversion failed; record the column as deleted rather than propagate
        doc.nested_path = ["."]
        mark_as_deleted(Column(**doc))
        return None
def _get_schema_from_list(frum, table_name, prefix_path, nested_path, columns):
    """
    :param frum: The list
    :param table_name: Name of the table this list holds records for
    :param prefix_path: parent path
    :param nested_path: each nested array, in reverse order
    :param columns: map from full name to column definition
    :return:
    """
    for d in frum:
        row_type = _type_to_name[d.__class__]
        if row_type != "object":
            # primitive row: record (or widen) the column at the prefix path itself
            full_name = join_field(prefix_path)
            column = columns[full_name]
            if not column:
                column = Column(names={table_name: full_name}, es_column=full_name, es_index=".", type="undefined", nested_path=nested_path)
                columns.add(column)
            column.type = _merge_type[column.type][row_type]
        else:
            # object row: walk each property
            for name, value in d.items():
                full_name = join_field(prefix_path + [name])
                column = columns[full_name]
                if not column:
                    column = Column(names={table_name: full_name}, es_column=full_name, es_index=".", type="undefined", nested_path=nested_path)
                    columns.add(column)
                if isinstance(value, list):
                    if len(value) == 0:
                        this_type = "undefined"
                    elif len(value) == 1:
                        # single element: typed as that element
                        this_type = _type_to_name[value[0].__class__]
                    else:
                        # multiple elements: a list of objects is "nested"
                        this_type = _type_to_name[value[0].__class__]
                        if this_type == "object":
                            this_type = "nested"
                else:
                    this_type = _type_to_name[value.__class__]
                # widen the column type to cover this value
                new_type = _merge_type[column.type][this_type]
                column.type = new_type

                if this_type == "object":
                    # recurse into the object at the same nested level
                    _get_schema_from_list([value], table_name, prefix_path + [name], nested_path, columns)
                elif this_type == "nested":
                    # recurse into the array at a deeper nested level
                    np = listwrap(nested_path)
                    newpath = unwraplist([join_field(split_field(np[0]) + [name])] + np)
                    _get_schema_from_list(value, table_name, prefix_path + [name], newpath, columns)
def test_column_constraints(self):
    """Column constructor enforces its invariants on multi/jx_type."""
    # valid: a nested column may have multi > 1
    multi = Column(
        name="name",
        es_column="es_column.~N~",
        es_index="es_index",
        es_type="nested",
        jx_type=NESTED,
        cardinality=1,
        multi=2,
        nested_path=".",
        last_updated=Date.now(),
    )
    # invalid: multi=1 contradicts the typed (~N~) es_column / INTEGER jx_type
    self.assertRaises(
        Exception,
        Column,
        name="name",
        es_column="es_column.~N~",
        es_index="es_index",
        es_type="es_type",
        jx_type=INTEGER,
        multi=1,
        nested_path=".",
        last_updated=Date.now(),
    )
    # invalid: multi=0 is never allowed
    self.assertRaises(
        Exception,
        Column,
        name="name",
        es_column="es_column.~N~",
        es_index="es_index",
        es_type="es_type",
        jx_type=INTEGER,
        multi=0,
        nested_path=".",
        last_updated=Date.now(),
    )
    # invalid: multi is required
    self.assertRaises(
        Exception,
        Column,
        name="name",
        es_column="es_column.~N~",
        es_index="es_index",
        es_type="es_type",
        jx_type=INTEGER,
        nested_path=".",
        last_updated=Date.now(),
    )
def test_change_column_property(self):
    """Assigning an invalid value to an existing Column property must raise."""
    column = Column(
        name="name",
        es_column="es_column.~N~",
        es_index="es_index",
        es_type="nested",
        jx_type=NESTED_TYPE,
        multi=1001,
        nested_path=["."],
        last_updated=Date.now(),
    )

    def assign_invalid_multi():
        # multi=None violates the Column constraint
        column.multi = None

    self.assertRaises(Exception, assign_invalid_multi)
def _load_from_database(self):
    """
    Populate the column metadata from the tables already present in the
    SQLite database (uses PRAGMA table_info for per-table column details).
    """
    # FIND ALL TABLES
    result = self.db.query("SELECT * FROM sqlite_master WHERE type='table' ORDER BY name")
    tables = wrap([{k: d for k, d in zip(result.header, row)} for row in result.data])
    last_nested_path = []
    for table in tables:
        # tables prefixed with "__" are internal bookkeeping; skip them
        if table.name.startswith("__"):
            continue
        base_table, nested_path = tail_field(table.name)

        # FIND COMMON NESTED PATH SUFFIX
        # (tables arrive sorted by name, so nested tables follow their parents)
        for i, p in enumerate(last_nested_path):
            if startswith_field(nested_path, p):
                last_nested_path = last_nested_path[i:]
                break
        else:
            last_nested_path = []
        full_nested_path = [nested_path] + last_nested_path
        self._snowflakes[literal_field(base_table)] += [full_nested_path]

        # LOAD THE COLUMNS
        command = "PRAGMA table_info" + sql_iso(quote_column(table.name))
        details = self.db.query(command)
        for cid, name, dtype, notnull, dfft_value, pk in details.data:
            if name.startswith("__"):
                continue
            cname, ctype = untyped_column(name)
            self.add(Column(
                name=cname,
                jx_type=coalesce(sql_type_to_json_type.get(ctype), IS_NULL),
                nested_path=full_nested_path,
                es_type=dtype,
                es_column=name,
                es_index=table.name,
                last_updated=Date.now()
            ))
        last_nested_path = full_nested_path
def create_snowflake(self, fact_name, uid=UID):
    """
    MAKE NEW TABLE WITH GIVEN guid
    :param fact_name: NAME FOR THE CENTRAL FACTS
    :param uid: name, or list of names, for the GUID
    :return: Facts
    """
    self.add_table_to_schema(["."])

    uid = listwrap(uid)
    new_columns = []
    for u in uid:
        if u == UID:
            # explicit UID column is always present; nothing to add
            pass
        else:
            # NOTE(review): sibling create_or_replace_facts also passes
            # es_type and last_updated here — confirm whether this Column
            # version requires them
            c = Column(
                name=u,
                jx_type=STRING,
                es_column=typed_column(u, "string"),
                es_index=fact_name
            )
            self.add_column_to_schema(c)
            new_columns.append(c)

    command = (
        "CREATE TABLE " + quote_column(fact_name) + sql_iso(sql_list(
            [quoted_GUID + " TEXT "] +
            [quoted_UID + " INTEGER"] +
            [
                quote_column(c.es_column) + " " + json_type_to_sqlite_type[c.jx_type]
                for c in self.tables["."].schema.columns
            ] +
            ["PRIMARY KEY " + sql_iso(sql_list(
                [quoted_GUID] +
                [quoted_UID] +
                [quote_column(c.es_column) for c in self.tables["."].schema.columns]
            ))]
        ))
    )

    self.db.execute(command)

    snowflake = Snowflake(fact_name, self)
    return Facts(self, snowflake)
def create_or_replace_facts(self, fact_name, uid=UID):
    """
    MAKE NEW TABLE WITH GIVEN guid
    :param fact_name: NAME FOR THE CENTRAL FACTS
    :param uid: name, or list of names, for the GUID
    :return: Facts
    """
    # drop any existing snowflake of the same name, then re-register it
    self.remove_snowflake(fact_name)
    self._snowflakes[fact_name] = ["."]
    uid = listwrap(uid)

    new_columns = []
    for u in uid:
        if u == UID:
            # implicit UID column is always present; nothing to add
            pass
        else:
            c = Column(
                name=u,
                jx_type=mo_json.STRING,
                es_column=typed_column(u, json_type_to_sql_type[mo_json.STRING]),
                es_type=json_type_to_sqlite_type[mo_json.STRING],
                es_index=fact_name,
                last_updated=Date.now()
            )
            self.add_column_to_schema(c)
            new_columns.append(c)

    command = (
        "CREATE TABLE " + quote_column(fact_name) + sql_iso(sql_list(
            [quoted_GUID + " TEXT "] +
            [quoted_UID + " INTEGER"] +
            [quote_column(c.es_column) + " " + c.es_type for c in new_columns] +
            ["PRIMARY KEY " + sql_iso(sql_list(
                [quoted_GUID] +
                [quoted_UID] +
                [quote_column(c.es_column) for c in new_columns]
            ))]
        ))
    )

    with self.db.transaction() as t:
        t.execute(command)

    snowflake = Snowflake(fact_name, self)
    return Facts(self, snowflake)
def __init__(
    self,
    host,
    index,
    type=None,
    name=None,
    port=9200,
    read_only=True,
    timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
    wait_for_active_shards=1,  # ES WRITE CONSISTENCY (https://www.elastic.co/guide/en/elasticsearch/reference/1.7/docs-index_.html#index-consistency)
    typed=None,
    kwargs=None
):
    """
    Container over an Elasticsearch index; detects (or verifies) typed
    storage and, for untyped indexes, registers synthetic EXISTS columns
    for every nested path.
    """
    Container.__init__(self)
    if not container.config.default:
        container.config.default = {"type": "elasticsearch", "settings": unwrap(kwargs)}
    self.settings = kwargs
    self.name = name = coalesce(name, index)
    if read_only:
        self.es = elasticsearch.Alias(alias=index, kwargs=kwargs)
    else:
        self.es = elasticsearch.Cluster(kwargs=kwargs).get_index(read_only=read_only, kwargs=kwargs)
    self._namespace = ElasticsearchMetadata(kwargs=kwargs)
    self.settings.type = self.es.settings.type
    self.edges = Data()
    self.worker = None

    columns = self.snowflake.columns  # ABSOLUTE COLUMNS
    is_typed = any(c.es_column == EXISTS_TYPE for c in columns)

    if typed == None:
        # SWITCH ON TYPED MODE
        self.typed = is_typed
    else:
        if is_typed != typed:
            Log.error("Expecting given typed {{typed}} to match {{is_typed}}", typed=typed, is_typed=is_typed)
        self.typed = typed

    if not typed:
        # ADD EXISTENCE COLUMNS
        all_paths = {".": None}  # MAP FROM path TO parent TO MAKE A TREE

        def nested_path_of(v):
            # walk the parent tree up to the root, collecting the path
            if not v:
                return []
            else:
                return [v] + nested_path_of(all_paths[v])

        # process shortest paths first so parents are registered before children
        all = sort_using_key(
            set(step for path in self.snowflake.query_paths for step in path),
            key=lambda p: len(split_field(p))
        )
        for step in sorted(all):
            if step in all_paths:
                continue
            else:
                # find the deepest already-known path that is a prefix of step
                best = '.'
                for candidate in all_paths.keys():
                    if startswith_field(step, candidate):
                        if startswith_field(candidate, best):
                            best = candidate
                all_paths[step] = best

        for p in all_paths.keys():
            nested_path = nested_path_of(all_paths[p])
            if not nested_path:
                nested_path = ['.']
            self.namespace.meta.columns.add(Column(
                name=p,
                es_column=p,
                es_index=self.name,
                es_type=OBJECT,
                jx_type=EXISTS,
                nested_path=nested_path,
                last_updated=Date.now()
            ))
def doc_to_column(doc):
    """
    Convert a raw column document into a Column instance, repairing known
    historical mistakes; each repair bumps last_updated so the corrected
    record is rewritten.

    :param doc: raw (typed) column document
    :return: Column, or None on failure (the column is marked deleted,
             best-effort)
    """
    now = Date.now()
    try:
        doc = to_data(untyped(doc))
        # I HAVE MANAGED TO MAKE MANY MISTAKES WRITING COLUMNS TO ES. HERE ARE THE FIXES

        # FIX: a missing last_updated defaults to one year ago
        if not doc.last_updated:
            doc.last_updated = Date.now() - YEAR

        # FIX: derive es_type for object columns that never got one
        if doc.es_type == None:
            if doc.jx_type == OBJECT:
                doc.es_type = "object"
            else:
                Log.warning("{{doc}} has no es_type", doc=doc)

        # FIX: nested columns are multivalued; everything else defaults to 1
        if doc.es_type == "nested":
            doc.multi = 1001
        if doc.multi == None:
            doc.multi = 1

        # FIX: columns named with the NESTED_TYPE suffix must be nested-typed
        if doc.es_column.endswith("." + NESTED_TYPE):
            if doc.jx_type == OBJECT:
                doc.jx_type = NESTED
                doc.last_updated = now
            if doc.es_type == "nested":
                doc.es_type = "nested"
                doc.last_updated = now

        # FIX: normalize nested_path to a tuple; force nested typing when the
        # es_column name ends with the NESTED_TYPE marker
        doc.nested_path = tuple(listwrap(doc.nested_path))
        if last(split_field(doc.es_column)) == NESTED_TYPE and doc.es_type != "nested":
            doc.es_type = "nested"
            doc.jx_type = NESTED
            doc.multi = 1001
            doc.last_updated = now

        # FIX: drop a redundant "." stored second-from-last in nested_path
        expected_nested_path = get_nested_path(doc.es_column)
        if len(doc.nested_path) > 1 and doc.nested_path[-2] == '.':
            doc.nested_path = doc.nested_path[:-1]
            doc.last_updated = now

        # FIX: untyped column names belong at the root nested path; typed
        # names must agree with the path derived from the es_column
        if untype_path(doc.es_column) == doc.es_column:
            if doc.nested_path != (".",):
                if doc.es_index in {"repo"}:
                    # the "repo" index is known to violate this; leave it alone
                    pass
                else:
                    Log.note("not expected")
                    doc.nested_path = expected_nested_path
                    doc.last_updated = now
        else:
            if doc.nested_path != expected_nested_path:
                doc.nested_path = expected_nested_path
                doc.last_updated = now

        # FIX: existence markers are EXISTS columns with cardinality 1
        if last(split_field(doc.es_column)) == EXISTS_TYPE:
            if doc.jx_type != EXISTS:
                doc.jx_type = EXISTS
                doc.last_updated = now
            if doc.cardinality == None:
                doc.cardinality = 1
                doc.last_updated = now

        # FIX: struct columns have boolean-ish cardinality
        if doc.jx_type in STRUCT:
            if doc.cardinality not in [0, 1]:
                doc.cardinality = 1  # DO NOT KNOW IF EXISTS OR NOT
                doc.last_updated = now

        return Column(**doc)
    except Exception as e:
        # conversion failed; best-effort mark the column as deleted
        try:
            mark_as_deleted(Column(**doc), now)
        except Exception:
            pass
        return None
def update(self, command):
    """
    :param command: EXPECTING dict WITH {"set": s, "clear": c, "where": w} FORMAT
    """
    command = wrap(command)
    clear_columns = set(listwrap(command['clear']))

    # REJECT DEEP UPDATES
    touched_columns = command.set.keys() | clear_columns
    for c in self.schema.columns:
        if c.name in touched_columns and len(c.nested_path) > 1:
            Log.error("Deep update not supported")

    # ADD NEW COLUMNS
    where = jx_expression(command.where) or TRUE
    _vars = where.vars()
    # map jx variable names to their concrete es_column names
    _map = {
        v: c.es_column
        for v in _vars
        for c in self.columns.get(v, Null)
        if c.jx_type not in STRUCT
    }
    where_sql = where.map(_map).to_sql(self.schema)[0].sql.b
    new_columns = set(command.set.keys()) - set(c.name for c in self.schema.columns)
    for new_column_name in new_columns:
        nested_value = command.set[new_column_name]
        ctype = get_jx_type(nested_value)
        column = Column(
            name=new_column_name,
            jx_type=ctype,
            es_index=self.name,
            es_type=json_type_to_sqlite_type(ctype),
            es_column=typed_column(new_column_name, ctype),
            last_updated=Date.now()
        )
        self.add_column(column)

    # UPDATE THE NESTED VALUES
    for nested_column_name, nested_value in command.set.items():
        if get_jx_type(nested_value) == "nested":
            nested_table_name = concat_field(self.name, nested_column_name)
            # NOTE(review): nested_tables, abs_schema and schema are not
            # defined in this method's visible scope — confirm they come from
            # the enclosing class/module
            nested_table = nested_tables[nested_column_name]
            self_primary_key = sql_list(
                quote_column(c.es_column)
                for u in self.uid
                for c in self.columns[u]
            )
            extra_key_name = UID + text(len(self.uid))
            extra_key = [e for e in nested_table.columns[extra_key_name]][0]

            # DELETE THE EXISTING NESTED RECORDS FOR THE MATCHED PARENTS
            sql_command = (
                SQL_DELETE + SQL_FROM + quote_column(nested_table.name) +
                SQL_WHERE + "EXISTS" +
                sql_iso(
                    SQL_SELECT + SQL_ONE + SQL_FROM +
                    sql_alias(quote_column(nested_table.name), "n") +
                    SQL_INNER_JOIN +
                    sql_iso(
                        SQL_SELECT + self_primary_key + SQL_FROM +
                        quote_column(abs_schema.fact) +
                        SQL_WHERE + where_sql
                    ) +
                    " t ON " +
                    SQL_AND.join(
                        quote_column("t", c.es_column) + SQL_EQ + quote_column("n", c.es_column)
                        for u in self.uid
                        for c in self.columns[u]
                    )
                )
            )
            self.db.execute(sql_command)

            # INSERT NEW RECORDS
            if not nested_value:
                continue

            doc_collection = {}
            for d in listwrap(nested_value):
                nested_table.flatten(d, Data(), doc_collection, path=nested_column_name)

            prefix = SQL_INSERT + quote_column(nested_table.name) + sql_iso(sql_list(
                [self_primary_key] +
                [quote_column(extra_key)] +
                [
                    quote_column(c.es_column)
                    for c in doc_collection.get(".", Null).active_columns
                ]
            ))

            # BUILD THE PARENT TABLES
            parent = (
                SQL_SELECT + self_primary_key + SQL_FROM + quote_column(abs_schema.fact) +
                SQL_WHERE + jx_expression(command.where).to_sql(schema)
            )

            # BUILD THE RECORDS
            children = SQL_UNION_ALL.join(
                SQL_SELECT +
                sql_alias(quote_value(i), extra_key.es_column) + SQL_COMMA +
                sql_list(
                    sql_alias(quote_value(row[c.name]), quote_column(c.es_column))
                    for c in doc_collection.get(".", Null).active_columns
                )
                for i, row in enumerate(doc_collection.get(".", Null).rows)
            )

            sql_command = (
                prefix + SQL_SELECT +
                sql_list(
                    [
                        quote_column("p", c.es_column)
                        for u in self.uid
                        for c in self.columns[u]
                    ] +
                    [quote_column("c", extra_key)] +
                    [
                        quote_column("c", c.es_column)
                        for c in doc_collection.get(".", Null).active_columns
                    ]
                ) +
                SQL_FROM + sql_iso(parent) + " p" +
                SQL_INNER_JOIN + sql_iso(children) + " c" + SQL_ON + SQL_TRUE
            )
            self.db.execute(sql_command)

            # THE CHILD COLUMNS COULD HAVE EXPANDED
            # ADD COLUMNS TO SELF
            for n, cs in nested_table.columns.items():
                for c in cs:
                    column = Column(
                        name=c.name,
                        jx_type=c.jx_type,
                        es_type=c.es_type,
                        es_index=c.es_index,
                        es_column=c.es_column,
                        nested_path=[nested_column_name] + c.nested_path,
                        last_updated=Date.now()
                    )
                    if c.name not in self.columns:
                        self.columns[column.name] = {column}
                    elif c.jx_type not in [c.jx_type for c in self.columns[c.name]]:
                        self.columns[column.name].add(column)

    # APPLY SHALLOW SET/CLEAR TO THE FACT TABLE ITSELF
    command = ConcatSQL(
        SQL_UPDATE,
        quote_column(self.name),
        SQL_SET,
        sql_list(
            [
                quote_column(c.es_column) + SQL_EQ + quote_value(get_if_type(v, c.jx_type))
                for c in self.schema.columns
                if c.jx_type != NESTED and len(c.nested_path) == 1
                for v in [command.set[c.name]]
                if v != None
            ] +
            [
                quote_column(c.es_column) + SQL_EQ + SQL_NULL
                for c in self.schema.columns
                if (
                    c.name in clear_columns and
                    command.set[c.name] != None and
                    c.jx_type != NESTED and
                    len(c.nested_path) == 1
                )
            ]
        ),
        SQL_WHERE,
        where_sql
    )
    with self.db.transaction() as t:
        t.execute(command)
def _flatten(data, uid, parent_id, order, full_path, nested_path, row=None, guid=None):
    """
    :param data: the data we are pulling apart
    :param uid: the uid we are giving this doc
    :param parent_id: the parent id of this (sub)doc
    :param order: the number of siblings before this one
    :param full_path: path to this (sub)doc
    :param nested_path: list of paths, deepest first
    :param row: we will be filling this
    :return:
    """
    # NOTE(review): this reads self, doc_collection, snowflake and
    # required_changes from an enclosing scope not visible here — presumably
    # a closure inside an insert/flatten method; confirm against the full file
    table = concat_field(self.name, nested_path[0])
    insertion = doc_collection[nested_path[0]]
    if not row:
        row = {GUID: guid, UID: uid, PARENT: parent_id, ORDER: order}
        insertion.rows.append(row)

    if is_data(data):
        # flatten the object into (absolute name, value) leaves
        items = [(concat_field(full_path, k), v) for k, v in wrap(data).leaves()]
    else:
        # PRIMITIVE VALUES
        items = [(full_path, data)]

    for cname, v in items:
        jx_type = get_jx_type(v)
        if jx_type is None:
            continue

        insertion = doc_collection[nested_path[0]]
        # find the existing column for this (name, type) pair
        if jx_type == NESTED:
            c = first(
                cc
                for cc in insertion.active_columns + snowflake.columns
                if cc.jx_type in STRUCT and untyped_column(cc.name)[0] == cname
            )
        else:
            c = first(
                cc
                for cc in insertion.active_columns + snowflake.columns
                if cc.jx_type == jx_type and cc.name == cname
            )

        if isinstance(c, list):
            Log.error("confused")

        if not c:
            # NEW COLUMN: WHAT IS THE NESTING LEVEL FOR THIS PATH?
            deeper_nested_path = "."
            for path in snowflake.query_paths:
                if startswith_field(cname, path[0]) and len(deeper_nested_path) < len(path):
                    deeper_nested_path = path
            c = Column(
                name=cname,
                jx_type=jx_type,
                es_type=json_type_to_sqlite_type.get(jx_type, jx_type),
                es_column=typed_column(cname, json_type_to_sql_type.get(jx_type)),
                es_index=table,
                cardinality=0,
                nested_path=nested_path,
                last_updated=Date.now()
            )
            if jx_type == NESTED:
                snowflake.query_paths.append(c.es_column)
                required_changes.append({'nest': c})
            else:
                insertion.active_columns.add(c)
                required_changes.append({"add": c})
        elif c.jx_type == NESTED and jx_type == OBJECT:
            # ALWAYS PROMOTE OBJECTS TO NESTED
            jx_type = NESTED
            v = [v]
        elif len(c.nested_path) < len(nested_path):
            # EXISTING COLUMN IS SHALLOWER THAN THIS VALUE: PUSH IT DEEPER,
            # MOVING ANY EXISTING ROW VALUES INTO THE DEEPER TABLE
            from_doc = doc_collection.get(c.nested_path[0], None)
            column = c.es_column
            from_doc.active_columns.remove(c)
            snowflake._remove_column(c)
            required_changes.append({"nest": c})
            deep_c = Column(
                name=cname,
                jx_type=jx_type,
                es_type=json_type_to_sqlite_type.get(jx_type, jx_type),
                es_column=typed_column(cname, json_type_to_sql_type.get(jx_type)),
                es_index=table,
                nested_path=nested_path,
                last_updated=Date.now()
            )
            snowflake._add_column(deep_c)
            snowflake._drop_column(c)
            from_doc.active_columns.remove(c)

            for r in from_doc.rows:
                r1 = unwrap(r)
                if column in r1:
                    row1 = {
                        UID: self.container.next_uid(),
                        PARENT: r1["__id__"],
                        ORDER: 0,
                        column: r1[column]
                    }
                    insertion.rows.append(row1)
        elif len(c.nested_path) > len(nested_path):
            # EXISTING COLUMN IS DEEPER: THIS VALUE GOES INTO THE DEEPER TABLE
            insertion = doc_collection[c.nested_path[0]]
            row = {UID: self.container.next_uid(), PARENT: uid, ORDER: order}
            insertion.rows.append(row)

        # BE SURE TO NEST VALUES, IF NEEDED
        if jx_type == NESTED:
            deeper_nested_path = [cname] + nested_path
            if not doc_collection.get(cname):
                doc_collection[cname] = Data(active_columns=Queue(), rows=[])
            for i, r in enumerate(v):
                child_uid = self.container.next_uid()
                _flatten(r, child_uid, uid, i, cname, deeper_nested_path)
        elif jx_type == OBJECT:
            _flatten(v, uid, parent_id, order, cname, nested_path, row=row)
        elif c.jx_type:
            row[c.es_column] = v
) elif this_type in {"list", "FlatList"}: np = listwrap(nested_path) newpath = unwraplist([join_field(split_field(np[0]) + [name])] + np) _get_schema_from_list( value, table_name, full_name, newpath, columns ) METADATA_COLUMNS = ( [ Column( name=c, es_index="meta.columns", es_column=c, es_type="keyword", jx_type=STRING, last_updated=Date.now(), nested_path=ROOT_PATH, ) for c in [ "name", "es_type", "jx_type", "nested_path", "es_column", "es_index", "partitions", ] ] + [
def _set_op(self, query):
    """
    Execute a set-op (row-listing) query over the snowflake of nested
    tables, then re-assemble the flat SQL result rows back into nested
    documents and format them as cube/table/list.
    """
    # GET LIST OF SELECTED COLUMNS
    vars_ = UNION([
        v.var
        for select in listwrap(query.select)
        for v in select.value.vars()
    ])
    schema = self.schema
    known_vars = schema.keys()

    # group required columns by the nested path (table) that holds them
    active_columns = {".": set()}
    for v in vars_:
        for c in schema.leaves(v):
            nest = c.nested_path[0]
            active_columns.setdefault(nest, set()).add(c)

    # ANY VARS MENTIONED WITH NO COLUMNS?
    for v in vars_:
        if not any(startswith_field(cname, v) for cname in known_vars):
            active_columns["."].add(Column(
                name=v,
                jx_type=IS_NULL,
                es_column=".",
                es_index=".",
                es_type='NULL',
                nested_path=["."],
                last_updated=Date.now()
            ))

    # EVERY COLUMN, AND THE INDEX IT TAKES UP
    index_to_column = {}  # MAP FROM INDEX TO COLUMN (OR SELECT CLAUSE)
    index_to_uid = {}  # FROM NESTED PATH TO THE INDEX OF UID
    sql_selects = []  # EVERY SELECT CLAUSE (NOT TO BE USED ON ALL TABLES, OF COURSE)
    nest_to_alias = {
        nested_path[0]: "__" + unichr(ord('a') + i) + "__"
        for i, nested_path in enumerate(self.snowflake.query_paths)
    }

    sorts = []
    if query.sort:
        for select in query.sort:
            col = SQLang[select.value].to_sql(schema)[0]
            for t, sql in col.sql.items():
                json_type = sql_type_to_json_type[t]
                if json_type in STRUCT:
                    continue
                column_number = len(sql_selects)
                # SQL HAS ABS TABLE REFERENCE
                column_alias = _make_column_name(column_number)
                sql_selects.append(sql_alias(sql, column_alias))
                if select.sort == -1:
                    sorts.append(quote_column(column_alias) + SQL_IS_NULL)
                    sorts.append(quote_column(column_alias) + " DESC")
                else:
                    sorts.append(quote_column(column_alias) + SQL_IS_NULL)
                    sorts.append(quote_column(column_alias))

    primary_doc_details = Data()
    # EVERY SELECT STATEMENT THAT WILL BE REQUIRED, NO MATTER THE DEPTH
    # WE WILL CREATE THEM ACCORDING TO THE DEPTH REQUIRED
    nested_path = []
    for step, sub_table in self.snowflake.tables:
        nested_path.insert(0, step)
        nested_doc_details = {
            "sub_table": sub_table,
            "children": [],
            "index_to_column": {},
            "nested_path": nested_path
        }

        # INSERT INTO TREE
        if not primary_doc_details:
            primary_doc_details = nested_doc_details
        else:
            def place(parent_doc_details):
                # attach nested_doc_details under its deepest matching parent
                if startswith_field(step, parent_doc_details['nested_path'][0]):
                    for c in parent_doc_details['children']:
                        if place(c):
                            return True
                    parent_doc_details['children'].append(nested_doc_details)

            place(primary_doc_details)

        alias = nested_doc_details['alias'] = nest_to_alias[step]

        # WE ALWAYS ADD THE UID
        column_number = index_to_uid[step] = nested_doc_details['id_coord'] = len(sql_selects)
        sql_select = quote_column(alias, UID)
        sql_selects.append(sql_alias(sql_select, _make_column_name(column_number)))
        if step != ".":
            # ID AND ORDER FOR CHILD TABLES
            index_to_column[column_number] = ColumnMapping(
                sql=sql_select,
                type="number",
                nested_path=nested_path,
                column_alias=_make_column_name(column_number)
            )
            column_number = len(sql_selects)
            sql_select = quote_column(alias, ORDER)
            sql_selects.append(sql_alias(sql_select, _make_column_name(column_number)))
            index_to_column[column_number] = ColumnMapping(
                sql=sql_select,
                type="number",
                nested_path=nested_path,
                column_alias=_make_column_name(column_number)
            )

        # WE DO NOT NEED DATA FROM TABLES WE REQUEST NOTHING FROM
        if step not in active_columns:
            continue

        # ADD SQL SELECT COLUMNS FOR EACH jx SELECT CLAUSE
        si = 0
        for select in listwrap(query.select):
            try:
                column_number = len(sql_selects)
                select.pull = get_column(column_number)
                db_columns = SQLang[select.value].partial_eval().to_sql(schema)

                for column in db_columns:
                    for t, unsorted_sql in column.sql.items():
                        json_type = sql_type_to_json_type[t]
                        if json_type in STRUCT:
                            continue
                        column_number = len(sql_selects)
                        column_alias = _make_column_name(column_number)
                        sql_selects.append(sql_alias(unsorted_sql, column_alias))
                        if startswith_field(schema.path, step) and is_op(select.value, LeavesOp):
                            # ONLY FLATTEN primary_nested_path AND PARENTS, NOT CHILDREN
                            index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = ColumnMapping(
                                push_name=literal_field(get_property_name(concat_field(select.name, column.name))),
                                push_child=".",
                                push_column_name=get_property_name(concat_field(select.name, column.name)),
                                push_column=si,
                                pull=get_column(column_number),
                                sql=unsorted_sql,
                                type=json_type,
                                column_alias=column_alias,
                                nested_path=nested_path
                            )
                            si += 1
                        else:
                            index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = ColumnMapping(
                                push_name=select.name,
                                push_child=column.name,
                                push_column_name=select.name,
                                push_column=si,
                                pull=get_column(column_number),
                                sql=unsorted_sql,
                                type=json_type,
                                column_alias=column_alias,
                                nested_path=nested_path
                            )
            finally:
                si += 1

    where_clause = BooleanOp(query.where).partial_eval().to_sql(schema, boolean=True)[0].sql.b
    unsorted_sql = self._make_sql_for_one_nest_in_set_op(".", sql_selects, where_clause, active_columns, index_to_column)

    # order by the UID of every table so parents precede their children
    for n, _ in self.snowflake.tables:
        sorts.append(quote_column(COLUMN + text(index_to_uid[n])))

    ordered_sql = ConcatSQL((
        SQL_SELECT, SQL_STAR,
        SQL_FROM, sql_iso(unsorted_sql),
        SQL_ORDERBY, sql_list(sorts),
        SQL_LIMIT, quote_value(query.limit)
    ))
    result = self.db.query(ordered_sql)

    def _accumulate_nested(rows, row, nested_doc_details, parent_doc_id, parent_id_coord):
        """
        :param rows: REVERSED STACK OF ROWS (WITH push() AND pop())
        :param row: CURRENT ROW BEING EXTRACTED
        :param nested_doc_details: {
            "nested_path": wrap_nested_path(nested_path),
            "index_to_column": map from column number to column details
            "children": all possible direct decedents' nested_doc_details
        }
        :param parent_doc_id: the id of the parent doc (for detecting when to step out of loop)
        :param parent_id_coord: the column number for the parent id (so we ca extract from each row)
        :return: the nested property (usually an array)
        """
        previous_doc_id = None
        doc = Null
        output = []
        id_coord = nested_doc_details['id_coord']

        while True:
            doc_id = row[id_coord]

            if doc_id == None or (parent_id_coord is not None and row[parent_id_coord] != parent_doc_id):
                # UNDO PREVIOUS POP (RECORD IS NOT A NESTED RECORD OF parent_doc)
                rows.append(row)
                return output

            if doc_id != previous_doc_id:
                previous_doc_id = doc_id
                doc = Null
                curr_nested_path = nested_doc_details['nested_path'][0]
                index_to_column = nested_doc_details['index_to_column'].items()
                for i, c in index_to_column:
                    value = row[i]
                    if is_list(query.select) or is_op(query.select.value, LeavesOp):
                        # ASSIGN INNER PROPERTIES
                        relative_field = concat_field(c.push_name, c.push_child)
                    else:
                        # FACT IS EXPECTED TO BE A SINGLE VALUE, NOT AN OBJECT
                        relative_field = c.push_child

                    if relative_field == ".":
                        if exists(value):
                            doc = value
                    elif exists(value):
                        if doc is Null:
                            doc = Data()
                        doc[relative_field] = value

            for child_details in nested_doc_details['children']:
                # EACH NESTED TABLE MUST BE ASSEMBLED INTO A LIST OF OBJECTS
                child_id = row[child_details['id_coord']]
                if child_id is not None:
                    nested_value = _accumulate_nested(rows, row, child_details, doc_id, id_coord)
                    if nested_value != None:
                        push_name = child_details['nested_path'][0]
                        if is_list(query.select) or is_op(query.select.value, LeavesOp):
                            # ASSIGN INNER PROPERTIES
                            # NOTE(review): relative_field is a local string
                            # here, shadowing any module-level relative_field()
                            # helper — calling it will raise TypeError; confirm
                            relative_field = relative_field(push_name, curr_nested_path)
                        else:
                            # FACT IS EXPECTED TO BE A SINGLE VALUE, NOT AN OBJECT
                            relative_field = "."

                        if relative_field == ".":
                            doc = unwraplist(nested_value)
                        else:
                            doc[relative_field] = unwraplist(nested_value)

            output.append(doc)

            try:
                row = rows.pop()
            except IndexError:
                return output

    cols = tuple([i for i in index_to_column.values() if i.push_name != None])
    rows = list(reversed(unwrap(result.data)))
    if rows:
        row = rows.pop()
        data = _accumulate_nested(rows, row, primary_doc_details, None, None)
    else:
        data = result.data

    if query.format == "cube":
        # (legacy commented-out cube assembly removed for clarity)
        if is_list(query.select) or is_op(query.select.value, LeavesOp):
            num_rows = len(data)
            temp_data = {c.push_column_name: [None] * num_rows for c in cols}
            for rownum, d in enumerate(data):
                for c in cols:
                    temp_data[c.push_column_name][rownum] = d[c.push_name]
            return Data(
                meta={"format": "cube"},
                data=temp_data,
                edges=[{
                    "name": "rownum",
                    "domain": {
                        "type": "rownum",
                        "min": 0,
                        "max": num_rows,
                        "interval": 1
                    }
                }]
            )
        else:
            num_rows = len(data)
            map_index_to_name = {c.push_column: c.push_column_name for c in cols}
            temp_data = [data]
            return Data(
                meta={"format": "cube"},
                data={n: temp_data[c] for c, n in map_index_to_name.items()},
                edges=[{
                    "name": "rownum",
                    "domain": {
                        "type": "rownum",
                        "min": 0,
                        "max": num_rows,
                        "interval": 1
                    }
                }]
            )
    elif query.format == "table":
        # (legacy commented-out table assembly removed for clarity)
        if is_list(query.select) or is_op(query.select.value, LeavesOp):
            column_names = [None] * (max(c.push_column for c in cols) + 1)
            for c in cols:
                column_names[c.push_column] = c.push_column_name
            temp_data = []
            for rownum, d in enumerate(data):
                row = [None] * len(column_names)
                for c in cols:
                    row[c.push_column] = d[c.push_name]
                temp_data.append(row)
            return Data(
                meta={"format": "table"},
                header=column_names,
                data=temp_data
            )
        else:
            column_names = listwrap(query.select).name
            return Data(
                meta={"format": "table"},
                header=column_names,
                data=[[d] for d in data]
            )
    else:
        # (legacy commented-out list assembly removed for clarity)
        if is_list(query.select) or is_op(query.select.value, LeavesOp):
            temp_data = []
            for rownum, d in enumerate(data):
                row = {}
                for c in cols:
                    row[c.push_column_name] = d[c.push_name]
                temp_data.append(row)
            return Data(meta={"format": "list"}, data=temp_data)
        else:
            return Data(meta={"format": "list"}, data=data)
def update(self, command):
    """
    UPDATE RECORDS IN THE FACT TABLE (AND ITS NESTED CHILD TABLES)

    :param command: EXPECTING dict WITH {"set": s, "clear": c, "where": w} FORMAT
    :return: None; executes SQL against self.db as a side effect

    NOTE(review): this method references `schema`, `nested_tables` and
    `abs_schema`, which are not defined in this scope — confirm they are
    module-level or were meant to be attributes of self.
    """
    command = wrap(command)

    # REJECT DEEP UPDATES
    # a column deeper than its own nested path would require updating a child
    # table row-by-row, which is not implemented
    touched_columns = command.set.keys() | set(listwrap(command['clear']))
    for c in self.get_leaves():
        if c.name in touched_columns and c.nested_path and len(
                c.name) > len(c.nested_path[0]):
            Log.error("Deep update not supported")

    # ADD NEW COLUMNS
    where = jx_expression(command.where)
    _vars = where.vars()
    # map variable names to their es columns (structs excluded: they have no storage)
    _map = {
        v: c.es_column
        for v in _vars
        for c in self.columns.get(v, Null)
        if c.jx_type not in STRUCT
    }
    where_sql = where.map(_map).to_sql(schema)
    new_columns = set(command.set.keys()) - set(self.columns.keys())
    for new_column_name in new_columns:
        nested_value = command.set[new_column_name]
        ctype = get_type(nested_value)
        column = Column(
            name=new_column_name,
            jx_type=ctype,
            es_index=self.facts.snowflake.fact_name,
            # NOTE(review): elsewhere in this file json_type_to_sqlite_type is
            # subscripted as a dict; here it is called — confirm which is correct
            es_type=json_type_to_sqlite_type(ctype),
            es_column=typed_column(new_column_name, ctype),
            last_updated=Date.now()
        )
        self.add_column(column)

    # UPDATE THE NESTED VALUES
    for nested_column_name, nested_value in command.set.items():
        if get_type(nested_value) == "nested":
            # REPLACE STRATEGY: DELETE ALL EXISTING CHILD ROWS FOR THE MATCHED
            # PARENTS, THEN INSERT THE NEW ONES
            nested_table_name = concat_field(
                self.facts.snowflake.fact_name, nested_column_name)
            nested_table = nested_tables[nested_column_name]
            self_primary_key = sql_list(
                quote_column(c.es_column)
                for u in self.uid
                for c in self.columns[u])
            extra_key_name = UID_PREFIX + "id" + text_type(len(self.uid))
            extra_key = [e for e in nested_table.columns[extra_key_name]][0]

            # DELETE CHILD ROWS WHOSE PARENT MATCHES THE where CLAUSE
            sql_command = (
                "DELETE" + SQL_FROM + quote_column(nested_table.name) +
                SQL_WHERE + "EXISTS (" +
                "\nSELECT 1 " + SQL_FROM + quote_column(nested_table.name) + " n" +
                SQL_INNER_JOIN + "(" +
                SQL_SELECT + self_primary_key +
                SQL_FROM + quote_column(abs_schema.fact) +
                SQL_WHERE + where_sql +
                "\n) t ON " +
                SQL_AND.join(
                    "t." + quote_column(c.es_column) + " = n." + quote_column(c.es_column)
                    for u in self.uid
                    for c in self.columns[u]
                ) + ")")
            self.db.execute(sql_command)

            # INSERT NEW RECORDS
            if not nested_value:
                continue

            # FLATTEN THE NEW DOCS INTO ROWS, ONE COLLECTION PER NESTED PATH
            doc_collection = {}
            for d in listwrap(nested_value):
                nested_table.flatten(d, Data(), doc_collection, path=nested_column_name)

            prefix = "INSERT INTO " + quote_column(
                nested_table.name
            ) + sql_iso(
                sql_list([self_primary_key] + [quote_column(extra_key)] + [
                    quote_column(c.es_column)
                    for c in doc_collection.get(".", Null).active_columns
                ]))

            # BUILD THE PARENT TABLES
            # selects the primary keys of all parents matching the where clause
            parent = (SQL_SELECT + self_primary_key + SQL_FROM +
                      quote_column(abs_schema.fact) + SQL_WHERE +
                      jx_expression(command.where).to_sql(schema))

            # BUILD THE RECORDS
            # one SELECT of literals per row, UNION ALL'd together
            children = SQL_UNION_ALL.join(
                SQL_SELECT + quote_value(i) + " " +
                quote_column(extra_key.es_column) + "," + sql_list(
                    quote_value(row[c.name]) + " " + quote_column(c.es_column)
                    for c in doc_collection.get(".", Null).active_columns)
                for i, row in enumerate(
                    doc_collection.get(".", Null).rows))

            # CROSS JOIN parents x children (ON TRUE) SO EVERY MATCHED PARENT
            # GETS A FULL COPY OF THE NEW CHILD ROWS
            sql_command = (prefix + SQL_SELECT + sql_list([
                join_column("p", c.es_column)
                for u in self.uid
                for c in self.columns[u]
            ] + [join_column("c", extra_key)] + [
                join_column("c", c.es_column)
                for c in doc_collection.get(".", Null).active_columns
            ]) + SQL_FROM + sql_iso(parent) + " p" + SQL_INNER_JOIN +
                           sql_iso(children) + " c" + " ON " + SQL_TRUE)

            self.db.execute(sql_command)

            # THE CHILD COLUMNS COULD HAVE EXPANDED
            # ADD COLUMNS TO SELF
            for n, cs in nested_table.columns.items():
                for c in cs:
                    column = Column(
                        name=c.name,
                        jx_type=c.jx_type,
                        es_type=c.es_type,
                        es_index=c.es_index,
                        es_column=c.es_column,
                        nested_path=[nested_column_name] + c.nested_path,
                        last_updated=Date.now())
                    if c.name not in self.columns:
                        self.columns[column.name] = {column}
                    elif c.jx_type not in [
                            c.jx_type for c in self.columns[c.name]
                    ]:
                        self.columns[column.name].add(column)

    # APPLY THE SIMPLE (NON-NESTED) set AND clear OPERATIONS IN ONE UPDATE
    command = (
        "UPDATE " + quote_column(abs_schema.fact) + " SET " + sql_list([
            quote_column(c) + "=" + quote_value(get_if_type(v, c.jx_type))
            for k, v in command.set.items()
            if get_type(v) != "nested"
            for c in self.columns[k]
            if c.jx_type != "nested" and len(c.nested_path) == 1
        ] + [
            quote_column(c) + "=" + SQL_NULL
            for k in listwrap(command['clear'])
            if k in self.columns
            for c in self.columns[k]
            if c.jx_type != "nested" and len(c.nested_path) == 1
        ]) + SQL_WHERE + where_sql)

    self.db.execute(command)
""" return column.es_index + "|" + column.es_column META_COLUMNS_DESC = TableDesc( name=META_COLUMNS_NAME, url=None, query_path=ROOT_PATH, last_updated=Date.now(), columns=to_data( [ Column( name=c, es_index=META_COLUMNS_NAME, es_column=c, es_type="keyword", jx_type=STRING, last_updated=Date.now(), nested_path=ROOT_PATH, multi=1, ) for c in [ "name", "es_type", "jx_type", "es_column", "es_index", "partitions", ] ] + [ Column(
def query_metadata(self, query):
    """
    ANSWER SIMPLE METADATA QUERIES AGAINST THIS SNOWFLAKE'S COLUMN CATALOG

    :param query: QueryOp-compatible query; the where clause may only be an
                  "eq" on "table" or on "name" (no edges/groupby)
    :return: Data in "cube", "table" or "list" format with rows of
             (table, name, type, nested_path)
    """
    frum, query['from'] = query['from'], self
    schema = self.sf.tables["."].schema
    query = QueryOp.wrap(query, schema)
    columns = self.sf.columns
    where = query.where
    table_name = None
    column_name = None

    if query.edges or query.groupby:
        Log.error("Aggregates(groupby or edge) are not supported")

    if where.op == "eq" and where.lhs.var == "table":
        table_name = mo_json.json2value(where.rhs.json)
    elif where.op == "eq" and where.lhs.var == "name":
        column_name = mo_json.json2value(where.rhs.json)
    else:
        Log.error("Only simple filters are expected like: \"eq\" on table and column name")

    tables = [concat_field(self.sf.fact_name, i) for i in self.tables.keys()]

    metadata = []
    # ENSURE THE GUID COLUMN IS REPORTED TOO
    if columns[-1].es_column != GUID:
        columns.append(Column(
            name=GUID,
            jx_type=STRING,
            es_column=GUID,
            es_index=self.sf.fact_name,
            nested_path=["."]
        ))

    # FIX: was `for tname, table in zip(t, tables)` — `t` is undefined and
    # raised NameError; iterate the table names directly
    for table in tables:
        if table_name != None and table_name != table:
            continue
        for col in columns:
            cname, ctype = untyped_column(col.es_column)
            if column_name != None and column_name != cname:
                continue
            # NOTE(review): `col.type` — confirm Column exposes `type`
            # (elsewhere in this file columns carry `jx_type`)
            metadata.append((table, relative_field(col.name, table), col.type,
                             unwraplist(col.nested_path)))

    if query.format == "cube":
        num_rows = len(metadata)
        header = ["table", "name", "type", "nested_path"]
        temp_data = dict(zip(header, zip(*metadata)))
        return Data(
            meta={"format": "cube"},
            data=temp_data,
            edges=[{
                "name": "rownum",
                "domain": {
                    "type": "rownum",
                    "min": 0,
                    "max": num_rows,
                    "interval": 1
                }
            }]
        )
    elif query.format == "table":
        header = ["table", "name", "type", "nested_path"]
        return Data(
            meta={"format": "table"},
            header=header,
            data=metadata
        )
    else:
        header = ["table", "name", "type", "nested_path"]
        return Data(
            meta={"format": "list"},
            data=[dict(zip(header, r)) for r in metadata]
        )
def doc_to_column(doc):
    """Build a Column from a stored document, stripping type decorations first."""
    plain = untyped(doc)
    return Column(**wrap(plain))
def _get_schema_from_list(frum, table_name, parent, nested_path, columns):
    """
    SCAN THE RECORDS IN frum AND UPDATE columns WITH THE IMPLIED SCHEMA,
    MERGING TYPES AS NEW VALUES ARE SEEN; RECURSES INTO NESTED OBJECTS
    AND ARRAYS

    :param frum: The list
    :param table_name: Name of the table this list holds records for
    :param parent: parent path
    :param nested_path: each nested array, in reverse order
    :param columns: map from full name to column definition
    :return: None; columns is mutated in place
    """
    for d in frum:
        row_type = python_type_to_json_type[d.__class__]
        if row_type != "object":
            # EXPECTING PRIMITIVE VALUE
            full_name = parent
            column = columns[full_name]
            if not column:
                column = Column(
                    name=concat_field(table_name, full_name),
                    es_column=full_name,
                    es_index=".",
                    es_type=d.__class__.__name__,
                    jx_type=None,  # WILL BE SET BELOW
                    last_updated=Date.now(),
                    nested_path=nested_path,
                )
                columns.add(column)
            # NOTE(review): here the class object is passed, while the nested
            # branch below passes the class NAME — confirm _merge_python_type
            # accepts both
            column.es_type = _merge_python_type(column.es_type, d.__class__)
            column.jx_type = python_type_to_json_type[column.es_type]
        else:
            for name, value in d.items():
                full_name = concat_field(parent, name)
                column = columns[full_name]
                if not column:
                    column = Column(
                        name=concat_field(table_name, full_name),
                        es_column=full_name,
                        es_index=".",
                        es_type=value.__class__.__name__,
                        jx_type=None,  # WILL BE SET BELOW
                        last_updated=Date.now(),
                        nested_path=nested_path,
                    )
                    columns.add(column)
                if is_container(value):  # GET TYPE OF MULTIVALUE
                    v = list(value)
                    if len(v) == 0:
                        this_type = none_type.__name__
                    elif len(v) == 1:
                        this_type = v[0].__class__.__name__
                    else:
                        # MERGE THE TYPES OF ALL MEMBERS INTO ONE
                        this_type = reduce(
                            _merge_python_type, (vi.__class__.__name__ for vi in value)
                        )
                else:
                    this_type = value.__class__.__name__
                column.es_type = _merge_python_type(column.es_type, this_type)
                column.jx_type = python_type_to_json_type[column.es_type]
                if this_type in {"object", "dict", "Mapping", "Data"}:
                    # RECURSE INTO A SINGLE NESTED OBJECT (SAME nested_path)
                    _get_schema_from_list(
                        [value], table_name, full_name, nested_path, columns
                    )
                elif this_type in {"list", "FlatList"}:
                    # RECURSE INTO AN ARRAY: EXTEND THE NESTED PATH, DEEPEST FIRST
                    np = listwrap(nested_path)
                    newpath = unwraplist([join_field(split_field(np[0]) + [name])] + np)
                    _get_schema_from_list(
                        value, table_name, full_name, newpath, columns
                    )
:param column: :return: Elasticsearch id for column """ return column.es_index + "|" + column.es_column META_COLUMNS_DESC = TableDesc(name=META_COLUMNS_NAME, url=None, query_path=ROOT_PATH, last_updated=Date.now(), columns=wrap([ Column( name=c, es_index=META_COLUMNS_NAME, es_column=c, es_type="keyword", jx_type=STRING, last_updated=Date.now(), nested_path=ROOT_PATH, ) for c in [ "name", "es_type", "jx_type", "nested_path", "es_column", "es_index", "partitions", ] ] + [ Column( name=c,
if this_type == "object": _get_schema_from_list([value], table_name, full_name, nested_path, columns) elif this_type == "nested": np = listwrap(nested_path) newpath = unwraplist( [join_field(split_field(np[0]) + [name])] + np) _get_schema_from_list(value, table_name, full_name, newpath, columns) METADATA_COLUMNS = ([ Column(names={".": c}, es_index="meta.columns", es_column=c, es_type="string", nested_path=ROOT_PATH) for c in ["es_type", "jx_type", "nested_path", "es_column", "es_index"] ] + [ Column(es_index="meta.columns", names={".": c}, es_column=c, es_type="object", nested_path=ROOT_PATH) for c in ["names", "partitions"] ] + [ Column(names={".": c}, es_index="meta.columns", es_column=c, es_type="long", nested_path=ROOT_PATH) for c in ["count", "cardinality", "multi"]
def _get_schema_from_list(
        frum,  # The list
        table_name,  # Name of the table this list holds records for
        parent,  # parent path
        nested_path,  # each nested array, in reverse order
        columns,  # map from full name to column definition
        native_type_to_json_type  # dict from storage type name to json type name
):
    """
    SCAN THE RECORDS IN frum AND UPDATE columns WITH THE IMPLIED SCHEMA,
    MERGING TYPES AS NEW VALUES ARE SEEN; RECURSES INTO NESTED OBJECTS
    AND ARRAYS

    :return: None; columns is mutated in place
    """
    for d in frum:
        row_type = python_type_to_json_type[d.__class__]
        if row_type != "object":
            # EXPECTING PRIMITIVE VALUE
            full_name = parent
            column = columns[full_name]
            if not column:
                column = Column(
                    name=concat_field(table_name, full_name),
                    es_column=full_name,
                    es_index=".",
                    es_type=d.__class__.__name__,
                    jx_type=None,  # WILL BE SET BELOW
                    last_updated=Date.now(),
                    nested_path=nested_path,
                )
                columns.add(column)
            column.es_type = _merge_python_type(column.es_type, d.__class__)
            column.jx_type = native_type_to_json_type[column.es_type]
        else:
            for name, value in d.items():
                full_name = concat_field(parent, name)
                column = columns[full_name]
                if not column:
                    column = Column(
                        name=concat_field(table_name, full_name),
                        es_column=full_name,
                        es_index=".",
                        es_type=value.__class__.__name__,
                        jx_type=None,  # WILL BE SET BELOW
                        last_updated=Date.now(),
                        nested_path=nested_path,
                    )
                    columns.add(column)
                if is_container(value):  # GET TYPE OF MULTIVALUE
                    v = list(value)
                    if len(v) == 0:
                        this_type = none_type.__name__
                    elif len(v) == 1:
                        this_type = v[0].__class__.__name__
                    else:
                        # MERGE THE TYPES OF ALL MEMBERS INTO ONE
                        this_type = reduce(_merge_python_type,
                                           (vi.__class__.__name__ for vi in value))
                else:
                    this_type = value.__class__.__name__
                column.es_type = _merge_python_type(column.es_type, this_type)
                # FIX: removed a try/except that only re-raised the same exception
                column.jx_type = native_type_to_json_type[column.es_type]
                if this_type in {"object", "dict", "Mapping", "Data"}:
                    _get_schema_from_list([value], table_name, full_name,
                                          nested_path, columns,
                                          native_type_to_json_type)
                elif this_type in {"list", "FlatList"}:
                    np = listwrap(nested_path)
                    newpath = unwraplist(
                        [join_field(split_field(np[0]) + [name])] + np)
                    # FIX: this recursive call was missing the required
                    # native_type_to_json_type argument, raising TypeError on
                    # any record containing a list
                    _get_schema_from_list(value, table_name, full_name, newpath,
                                          columns, native_type_to_json_type)
def __init__(
        self,
        host,
        index,  # THE NAME OF THE SNOWFLAKE (IF WRITING)
        alias=None,  # THE NAME OF THE SNOWFLAKE (FOR READING)
        type=None,
        name=None,  # THE FULL NAME OF THE TABLE (THE NESTED PATH INTO THE SNOWFLAKE)
        port=9200,
        read_only=True,
        timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
        wait_for_active_shards=1,  # ES WRITE CONSISTENCY (https://www.elastic.co/guide/en/elasticsearch/reference/1.7/docs-index_.html#index-consistency)
        typed=None,
        kwargs=None):
    """
    CONTAINER OVER AN ELASTICSEARCH INDEX (ALIAS FOR READING, INDEX FOR
    WRITING); DETECTS WHETHER THE INDEX USES TYPED COLUMNS AND, FOR
    UNTYPED INDEXES, REGISTERS SYNTHETIC EXISTENCE COLUMNS FOR EVERY
    QUERY PATH
    """
    Container.__init__(self)
    if not container.config.default:
        container.config.default = {
            "type": "elasticsearch",
            "settings": unwrap(kwargs)
        }
    self.edges = Data()  # SET EARLY, SO OTHER PROCESSES CAN REQUEST IT
    self.worker = None
    self.settings = kwargs
    self._namespace = ElasticsearchMetadata(kwargs=kwargs)
    # RESOLVE THE CANONICAL NAME FROM alias, THEN index, THEN name
    self.name = name = self._namespace._find_alias(
        coalesce(alias, index, name))
    if read_only:
        self.es = elasticsearch.Alias(alias=name, index=None, kwargs=kwargs)
    else:
        self.es = elasticsearch.Cluster(kwargs=kwargs).get_index(
            read_only=read_only, kwargs=kwargs)

    self._ensure_max_result_window_set(name)
    self.settings.type = self.es.settings.type
    self.stats = QueryStats(self.es.cluster)

    columns = self.snowflake.columns  # ABSOLUTE COLUMNS
    # AN EXISTS_TYPE COLUMN IS THE MARKER THAT THE INDEX IS TYPED
    is_typed = any(c.es_column == EXISTS_TYPE for c in columns)

    if typed == None:
        # SWITCH ON TYPED MODE
        self.typed = is_typed
    else:
        if is_typed != typed:
            Log.error(
                "Expecting given typed {{typed}} to match {{is_typed}}",
                typed=typed,
                is_typed=is_typed)
        self.typed = typed

    if not typed:
        # ADD EXISTENCE COLUMNS
        all_paths = {'.': None}  # MAP FROM path TO parent TO MAKE A TREE

        def nested_path_of(v):
            # WALK THE parent TREE, RETURNING THE FULL PATH AS A TUPLE,
            # DEEPEST FIRST, ENDING AT "."
            if v == '.':
                return ('.', )
            return (v, ) + nested_path_of(all_paths[v])

        # VISIT QUERY PATH STEPS SHALLOWEST-FIRST SO PARENTS EXIST BEFORE
        # THEIR CHILDREN
        query_paths = sort_using_key(set(
            step for path in self.snowflake.query_paths for step in path),
                                     key=lambda p: len(split_field(p)))
        for step in query_paths:
            if step in all_paths:
                continue
            else:
                # FIND THE DEEPEST ALREADY-KNOWN PREFIX OF step AS ITS PARENT
                best = '.'
                for candidate in all_paths.keys():
                    if startswith_field(step, candidate):
                        if startswith_field(candidate, best):
                            best = candidate
                all_paths[step] = best
        for p in all_paths.keys():
            if p == ".":
                nested_path = ('.', )
            else:
                nested_path = nested_path_of(p)[1:]
            jx_type = (OBJECT if p == "." else NESTED)
            self.namespace.meta.columns.add(
                Column(
                    name=p,
                    es_column=p,
                    es_index=self.name,
                    es_type=jx_type,
                    jx_type=jx_type,
                    cardinality=1,
                    # NOTE(review): `jx_type is NESTED` relies on identity of
                    # the NESTED constant; works because jx_type was assigned
                    # from that very constant above
                    multi=1001 if jx_type is NESTED else 1,
                    last_updated=Date.now()))
def doc_to_column(doc):
    """Build a Column from a stored document; default last_updated to one year ago."""
    params = untyped(doc)
    params = set_default(params, {"last_updated": Date.now() - YEAR})
    return Column(**wrap(params))
def _flatten(data, uid, parent_id, order, full_path, nested_path, row=None, guid=None):
    """
    RECURSIVELY PULL A DOCUMENT APART INTO FLAT ROWS, ONE ROW COLLECTION
    PER NESTED PATH, REGISTERING NEW COLUMNS (AND SCHEMA CHANGES) AS THEY
    ARE DISCOVERED

    :param data: the data we are pulling apart
    :param uid: the uid we are giving this doc
    :param parent_id: the parent id of this (sub)doc
    :param order: the number of siblings before this one
    :param full_path: path to this (sub)doc
    :param nested_path: list of paths, deepest first
    :param row: we will be filling this
    :return: None; doc_collection / snowflake / required_changes (from the
             enclosing scope) are mutated as side effects
    """
    table = concat_field(self.name, nested_path[0])
    insertion = doc_collection[nested_path[0]]
    if not row:
        row = {GUID: guid, UID: uid, PARENT: parent_id, ORDER: order}
        insertion.rows.append(row)

    if isinstance(data, Mapping):
        # EXPAND TO (full property path, leaf value) PAIRS
        items = ((concat_field(full_path, k), v)
                 for k, v in wrap(data).leaves())
    else:
        # PRIMITIVE VALUES
        items = [(full_path, data)]

    for cname, v in items:
        value_type = get_type(v)
        if value_type is None:
            continue

        # FIND AN EXISTING COLUMN THAT MATCHES THIS NAME AND TYPE
        if value_type == NESTED:
            c = unwraplist([
                cc for cc in snowflake.columns
                if cc.jx_type in STRUCT and untyped_column(cc.name) == cname
            ])
        else:
            c = unwraplist([
                cc for cc in snowflake.columns
                if cc.jx_type == value_type and cc.name == cname
            ])

        insertion = doc_collection[nested_path[0]]
        if not c:
            # WHAT IS THE NESTING LEVEL FOR THIS PATH?
            deeper_nested_path = "."
            for path in snowflake.query_paths:
                if startswith_field(cname, path[0]) and len(
                        deeper_nested_path) < len(path):
                    deeper_nested_path = path

            # NEW COLUMN: RECORD IT AND QUEUE THE SCHEMA CHANGE
            c = Column(name=cname,
                       jx_type=value_type,
                       es_type=json_type_to_sqlite_type.get(
                           value_type, value_type),
                       es_column=typed_column(
                           cname, json_type_to_sql_type.get(value_type)),
                       es_index=table,
                       nested_path=nested_path,
                       last_updated=Date.now())
            if value_type == "nested":
                snowflake.query_paths.append(c.es_column)
                required_changes.append({'nest': (c, nested_path)})
            else:
                snowflake.columns.append(c)
                required_changes.append({"add": c})
            # INSIDE IF BLOCK BECAUSE WE DO NOT WANT IT TO ADD WHAT WE columns.get() ALREADY
            insertion.active_columns.add(c)
        elif c.jx_type == "nested" and value_type == "object":
            # AN OBJECT WHERE A NESTED ARRAY IS EXPECTED: TREAT AS A
            # SINGLE-ELEMENT ARRAY
            value_type = "nested"
            v = [v]
        elif len(c.nested_path) < len(nested_path):
            # VALUE APPEARS DEEPER THAN THE COLUMN WAS KNOWN AT: MOVE THE
            # COLUMN (AND ITS EXISTING ROW VALUES) TO THE DEEPER TABLE
            from_doc = doc_collection.get(c.nested_path[0], None)
            column = c.es_column
            from_doc.active_columns.remove(c)
            snowflake._remove_column(c)
            required_changes.append({"nest": (c, nested_path)})
            deep_c = Column(name=cname,
                            jx_type=value_type,
                            es_type=json_type_to_sqlite_type.get(
                                value_type, value_type),
                            es_column=typed_column(
                                cname, json_type_to_sql_type.get(value_type)),
                            es_index=table,
                            nested_path=nested_path,
                            last_updated=Date.now())
            snowflake._add_column(deep_c)
            snowflake._drop_column(c)
            # NOTE(review): active_columns.remove(c) is called twice on
            # from_doc — confirm the second call is intentional
            from_doc.active_columns.remove(c)

            # COPY EXISTING VALUES INTO THE DEEPER TABLE AS CHILD ROWS
            for r in from_doc.rows:
                r1 = unwrap(r)
                if column in r1:
                    row1 = {
                        UID: self.container.next_uid(),
                        PARENT: r1["__id__"],
                        ORDER: 0,
                        column: r1[column]
                    }
                    insertion.rows.append(row1)
        elif len(c.nested_path) > len(nested_path):
            # COLUMN LIVES DEEPER THAN THE CURRENT DOC: START A CHILD ROW
            insertion = doc_collection[c.nested_path[0]]
            row = {
                UID: self.container.next_uid(),
                PARENT: uid,
                ORDER: order
            }
            insertion.rows.append(row)

        # BE SURE TO NEST VALUES, IF NEEDED
        if value_type == "nested":
            row[c.es_column] = "."
            deeper_nested_path = [cname] + nested_path
            insertion = doc_collection.get(cname, None)
            if not insertion:
                insertion = doc_collection[cname] = Data(
                    active_columns=set(), rows=[])
            for i, r in enumerate(v):
                child_uid = self.container.next_uid()
                _flatten(r, child_uid, uid, i, cname, deeper_nested_path)
        elif value_type == "object":
            # INLINE OBJECT: MARK PRESENCE AND FLATTEN INTO THE SAME ROW
            row[c.es_column] = "."
            _flatten(v, uid, parent_id, order, cname, nested_path, row=row)
        elif c.jx_type:
            insertion.active_columns.add(c)
            row[c.es_column] = v