def _make_range_domain(self, domain, column_name):
    """
    Build a SQL subquery that generates every value of a numeric range
    domain, by cross-joining as many copies of the helper table
    __digits__ (values 0-9) as are needed to cover the domain width.

    :param domain: range domain with min, max, interval attributes
    :param column_name: SQL fragment naming the output column
    :return: SQL for the generated domain rows
    """
    # number of values the domain must produce
    width = (domain.max - domain.min) / domain.interval
    # number of extra base-10 digit tables required (0 => "a" alone suffices)
    # NOTE(review): assumes width > 1 so log10(width - 1) is defined — confirm callers guarantee this
    digits = mo_math.floor(mo_math.log10(width - 1))
    if digits == 0:
        value = "a.value"
    else:
        # a.value + 10*b.value + 100*c.value + ... : each joined table is one decimal digit
        value = SQL("+").join(
            "1" + ("0" * j) + "*" + text_type(chr(ord(b'a') + j)) + ".value"
            for j in range(digits + 1)
        )

    if domain.interval == 1:
        if domain.min == 0:
            domain = (
                SQL_SELECT + value + column_name +
                SQL_FROM + "__digits__ a"
            )
        else:
            # shift by the domain minimum
            domain = (
                SQL_SELECT + sql_iso(value) + " + " + quote_value(domain.min) + column_name +
                SQL_FROM + "__digits__ a"
            )
    else:
        if domain.min == 0:
            # scale by the interval
            domain = (
                SQL_SELECT + value + " * " + quote_value(domain.interval) + column_name +
                SQL_FROM + "__digits__ a"
            )
        else:
            # scale by the interval, then shift by the minimum
            domain = (
                SQL_SELECT + sql_iso(value + " * " + quote_value(domain.interval)) + " + " + quote_value(domain.min) + column_name +
                SQL_FROM + "__digits__ a"
            )

    # cross-join one digit table per extra digit (ON TRUE == cartesian product)
    for j in range(digits):
        domain += SQL_INNER_JOIN + "__digits__" + text_type(
            chr(ord(b'a') + j + 1)) + " ON " + SQL_TRUE
    # the cartesian product overshoots; keep only values inside the width
    domain += SQL_WHERE + value + " < " + quote_value(width)
    return domain
def _make_range_domain(self, domain, column_name):
    """
    Legacy raw-string variant: build SQL that generates every value of a
    numeric range domain by cross-joining copies of __digits__ (0-9).

    :param domain: range domain with min, max, interval attributes
    :param column_name: name for the generated column
    :return: SQL string for the generated domain rows
    """
    # number of values the domain must produce
    width = (domain.max - domain.min) / domain.interval
    # number of extra digit tables needed; 0 means table "a" alone suffices
    # NOTE(review): assumes width > 1 so log10(width - 1) is defined — confirm
    digits = Math.floor(Math.log10(width - 1))
    if digits == 0:
        value = "a.value"
    else:
        # a.value + 10*b.value + 100*c.value + ... one joined table per digit
        value = "+".join("1" + ("0" * j) + "*" + text_type(chr(ord(b'a') + j)) + ".value" for j in range(digits + 1))

    if domain.interval == 1:
        if domain.min == 0:
            domain = "SELECT " + value + " " + column_name + \
                     "\nFROM __digits__ a"
        else:
            # shift by the domain minimum
            domain = "SELECT (" + value + ") + " + quote_value(domain.min) + " " + column_name + \
                     "\nFROM __digits__ a"
    else:
        if domain.min == 0:
            # scale by the interval
            domain = "SELECT " + value + " * " + quote_value(domain.interval) + " " + column_name + \
                     "\nFROM __digits__ a"
        else:
            # scale by the interval, then shift by the minimum
            domain = "SELECT (" + value + " * " + quote_value(domain.interval) + ") + " + quote_value(
                domain.min) + " " + column_name + \
                "\nFROM __digits__ a"

    # cross-join one digit table per extra digit (ON 1=1 == cartesian product)
    for j in range(digits):
        domain += "\nJOIN __digits__ " + text_type(
            chr(ord(b'a') + j + 1)) + " ON 1=1"
    # trim the cartesian overshoot
    domain += "\nWHERE " + value + " < " + quote_value(width)
    return domain
def _db_insert_column(self, column):
    """
    INSERT one Column record into the metadata table.

    nested_path and partitions are stored as JSON; all other fields are
    stored as plain quoted values.  On a uniqueness conflict the column
    is requeued as an UPDATE instead of raising.
    """
    try:
        self.db.execute(
            "INSERT INTO" + db_table_name + sql_iso(all_columns) + "VALUES" + sql_iso(
                sql_list(
                    [
                        quote_value(column[c.name])
                        if c.name not in ("nested_path", "partitions")
                        # list-valued fields are serialized to JSON text
                        else quote_value(value2json(column[c.name]))
                        for c in METADATA_COLUMNS
                    ]
                )
            )
        )
    except Exception as e:
        e = Except.wrap(e)
        # the two message forms cover different SQLite versions
        if "UNIQUE constraint failed" in e or " are not unique" in e:
            # THIS CAN HAPPEN BECAUSE todo HAS OLD COLUMN DATA
            self.todo.add((UPDATE, column), force=True)
        else:
            Log.error("do not know how to handle", cause=e)
def request(self, method, path, headers):
    """
    Serve a response for `path`, trying in order: the in-memory cache,
    the database cache, and finally a queued network request.  Blocks on
    a Signal until a response is available.

    Cache entries are tuples: (ready_signal, headers, response, timestamp).
    """
    now = Date.now()
    self.inbound_rate.add(now)
    ready = Signal(path)

    # TEST CACHE
    with self.cache_locker:
        pair = self.cache.get(path)
        if pair is None:
            # reserve the slot so concurrent requests wait on our signal
            self.cache[path] = (ready, None, None, now)

    if pair is not None:
        # REQUEST IS IN THE QUEUE ALREADY, WAIT
        ready, headers, response, then = pair
        if response is None:
            # another thread is filling this entry; wait for it
            ready.wait()
            with self.cache_locker:
                ready, headers, response, timestamp = self.cache.get(path)
                # NOTE(review): response could still be None here if the
                # filler failed — confirm callers tolerate that
        with self.db.transaction() as t:
            # touch the row's timestamp (monotonic: only move it forward)
            t.execute("UPDATE cache SET timestamp=" + quote_value(now) + " WHERE path=" + quote_value(path) + " AND timestamp<" + quote_value(now))
        return Response(
            response,
            status=200,
            headers=json.loads(headers)
        )

    # TEST DB
    db_response = self.db.query("SELECT headers, response FROM cache WHERE path=" + quote_value(path)).data
    if db_response:
        headers, response = db_response[0]
        with self.db.transaction() as t:
            t.execute("UPDATE cache SET timestamp=" + quote_value(now) + " WHERE path=" + quote_value(path) + " AND timestamp<" + quote_value(now))
        with self.cache_locker:
            # promote the DB row into the in-memory cache (stored as bytes)
            self.cache[path] = (ready, headers, response.encode('latin1'), now)
        ready.go()
        return Response(
            response,
            status=200,
            headers=json.loads(headers)
        )

    # MAKE A NETWORK REQUEST
    self.todo.add((ready, method, path, headers, now))
    ready.wait()
    with self.cache_locker:
        ready, headers, response, timestamp = self.cache[path]
    return Response(
        response,
        status=200,
        headers=json.loads(headers)
    )
def to_sql(self, schema, not_null=False, boolean=False):
    """
    Render this literal as SQL, tagged with its JSON type slot:
    "s" string, "n" number, "b" boolean, "j" raw JSON.
    """
    value = self.value
    if value == None:
        return wrap([{"name": "."}])
    elif value is True or value is False:
        # FIX: booleans must be tested BEFORE is_number(): bool is a
        # subclass of int, so is_number(True) is truthy and booleans were
        # previously emitted in the numeric "n" slot, making this branch
        # unreachable.  Identity checks (`is`) are used so the integers
        # 0/1 do not match here (0 == False, 1 == True).
        return wrap([{"name": ".", "sql": {"b": quote_value(value)}}])
    elif isinstance(value, text):
        return wrap([{"name": ".", "sql": {"s": quote_value(value)}}])
    elif is_number(value):
        return wrap([{"name": ".", "sql": {"n": quote_value(value)}}])
    else:
        # anything else (objects, lists) is carried as its JSON form
        return wrap([{"name": ".", "sql": {"j": quote_value(self.json)}}])
def to_sql(self, schema, not_null=False, boolean=False):
    """
    SQL for string concatenation: each term is coerced to text and
    prefixed with the separator, then the leading separator is removed
    with substr().  Falls back to the default expression when all terms
    are missing.
    """
    fallback = self.default.to_sql(schema)
    if len(self.terms) == 0:
        # nothing to concatenate; the default is the whole answer
        return fallback
    fallback = coalesce(fallback[0].sql, SQL_NULL)
    sep = self.separator.to_sql(schema)[0].sql.s

    pieces = []
    for t in self.terms:
        term_missing = t.missing().partial_eval()
        sql = t.to_sql(schema, not_null=True)[0].sql

        # coerce the term to a string-valued SQL expression
        if sql.s:
            text_sql = sql.s
        elif sql.n:
            text_sql = "cast(" + sql.n + " as text)"
        else:
            text_sql = (
                SQL_CASE + SQL_WHEN + sql.b +
                SQL_THEN + quote_value("true") +
                SQL_ELSE + quote_value("false") +
                SQL_END
            )

        if isinstance(term_missing, TrueOp):
            # term is always missing; contributes nothing
            pieces.append(SQL_EMPTY_STRING)
        elif term_missing:
            # guard: a missing term yields '' rather than sep+NULL
            pieces.append(
                SQL_CASE +
                SQL_WHEN + sql_iso(term_missing.to_sql(schema, boolean=True)[0].sql.b) +
                SQL_THEN + SQL_EMPTY_STRING +
                SQL_ELSE + sql_iso(sql_concat([sep, text_sql])) +
                SQL_END
            )
        else:
            pieces.append(sql_concat([sep, text_sql]))

    # every piece starts with the separator; substr() drops the first one
    expr_ = (
        "substr(" + sql_concat(pieces) + ", " +
        LengthOp(None, self.separator).to_sql(schema)[0].sql.n +
        "+1)"
    )

    missing = self.missing()
    if not missing:
        return wrap([{"name": ".", "sql": {"s": expr_}}])
    return wrap([{
        "name": ".",
        "sql": {
            "s": SQL_CASE +
                 SQL_WHEN + "(" + missing.to_sql(schema, boolean=True)[0].sql.b + ")" +
                 SQL_THEN + "(" + fallback + ")" +
                 SQL_ELSE + "(" + expr_ + ")" +
                 SQL_END
        },
    }])
def to_sql(self, schema, not_null=False, boolean=False):
    """
    SQL for the length of the term.  Literal strings and numbers are
    measured at compile time; everything else defers to SQLite LENGTH().
    """
    term = SQLang[self.term].partial_eval()

    if not is_literal(term):
        # general case: emit LENGTH(<expr>)
        value = term.to_sql(schema, not_null=not_null)[0].sql.s
        runtime_len = ConcatSQL((SQL("LENGTH"), sql_iso(value)))
        return wrap([{"name": ".", "sql": {"n": runtime_len}}])

    val = term.value
    if isinstance(val, text):
        # literal string: length is known now
        known_len = quote_value(len(val))
    elif isinstance(val, (float, int)):
        # literal number: length of its JSON rendering
        known_len = quote_value(len(convert.value2json(val)))
    else:
        return Null
    return wrap([{"name": ".", "sql": {"n": known_len}}])
def _insert(self, collection):
    """
    Write the flattened documents in `collection` (keyed by nested path)
    to their snowflake tables, one INSERT per table, each in its own
    transaction.
    """
    for nested_path, details in collection.items():
        table_name = concat_field(self.facts.snowflake.fact_name, nested_path)
        if table_name == self.facts.snowflake.fact_name:
            # fact table: DO NOT REQUIRE PARENT OR ORDER COLUMNS
            meta_columns = [GUID, UID]
        else:
            meta_columns = [UID, PARENT, ORDER]

        active_columns = wrap(list(details.active_columns))
        all_columns = meta_columns + active_columns.es_column

        insert_clause = (
            "INSERT INTO " + quote_column(table_name) +
            sql_iso(sql_list(map(quote_column, all_columns)))
        )
        # one SELECT per row, stitched together with UNION ALL
        records_clause = SQL_UNION_ALL.join(
            SQL_SELECT + sql_list(quote_value(r.get(c)) for c in all_columns)
            for r in unwrap(details.rows)
        )

        with self.db.transaction() as t:
            t.execute(insert_clause + records_clause)
def _insert(self, collection):
    """
    Write the flattened documents in `collection` (keyed by nested path)
    to their tables using a single multi-row INSERT ... VALUES per table.
    """
    for nested_path, details in collection.items():
        active_columns = wrap(list(details.active_columns))
        rows = details.rows
        # FIX: removed unused local `num_rows = len(rows)`
        table_name = concat_field(self.name, nested_path)
        if table_name == self.name:
            # DO NOT REQUIRE PARENT OR ORDER COLUMNS
            meta_columns = [GUID, UID]
        else:
            meta_columns = [UID, PARENT, ORDER]

        all_columns = meta_columns + active_columns.es_column  # ONLY THE PRIMITIVE VALUE COLUMNS
        command = ConcatSQL([
            SQL_INSERT,
            quote_column(table_name),
            sql_iso(sql_list(map(quote_column, all_columns))),
            SQL_VALUES,
            # one parenthesized tuple of quoted values per row
            sql_list(
                sql_iso(
                    sql_list(quote_value(row.get(c)) for c in all_columns))
                for row in unwrap(rows))
        ])

        with self.db.transaction() as t:
            t.execute(command)
def _make_digits_table(self):
    """Create and fill the helper table __digits__ (values 0-9) if absent."""
    existence = self.db.query("PRAGMA table_info(__digits__)")
    if existence.data:
        return  # table already built

    self.db.execute("CREATE TABLE __digits__(value INTEGER)")
    # SELECT 0 UNION ALL SELECT 1 ... SELECT 9
    digit_rows = SQL_UNION_ALL.join(
        SQL_SELECT + SQL(quote_value(digit)) for digit in range(10)
    )
    self.db.execute("INSERT INTO __digits__ " + digit_rows)
def _db_load(self):
    """
    Load all column metadata from the database into memory, creating the
    metadata table on first use.
    """
    self.last_load = Date.now()

    # does the metadata table exist yet?
    result = self._query(
        SQL_SELECT + "name" +
        SQL_FROM + "sqlite_master" +
        SQL_WHERE +
        # NOTE(review): db_table_name is concatenated as the comparison
        # value for `name=` — presumably it renders as a quoted string
        # here (unlike `type=`, which uses quote_value) — confirm
        SQL_AND.join(["name=" + db_table_name, "type=" + quote_value("table")])
    )
    if not result.data:
        self._db_create()
        return

    # load every stored column, in deterministic order
    result = self._query(
        SQL_SELECT + all_columns +
        SQL_FROM + db_table_name +
        SQL_ORDERBY + sql_list(map(quote_column, ["es_index", "name", "es_column"]))
    )
    with self.locker:
        for r in result.data:
            c = row_to_column(result.header, r)
            self._add(c)
def _edges_op(self, query, frum): query = query.copy() # WE WILL BE MARKING UP THE QUERY index_to_column = {} # MAP FROM INDEX TO COLUMN (OR SELECT CLAUSE) outer_selects = [] # EVERY SELECT CLAUSE (NOT TO BE USED ON ALL TABLES, OF COURSE) frum_path = split_field(frum) base_table = join_field(frum_path[0:1]) path = join_field(frum_path[1:]) nest_to_alias = { nested_path: quote_column("__" + unichr(ord('a') + i) + "__") for i, (nested_path, sub_table) in enumerate(self.sf.tables.items()) } schema = self.sf.tables[relative_field(frum, self.sf.fact)].schema tables = [] for n, a in nest_to_alias.items(): if startswith_field(path, n): tables.append({"nest": n, "alias": a}) tables = jx.sort(tables, {"value": {"length": "nest"}}) from_sql = quote_column(join_field([base_table] + split_field(tables[0].nest))) + tables[0].alias for previous, t in zip(tables, tables[1::]): from_sql += ( SQL_LEFT_JOIN + quote_column(concat_field(base_table, t.nest)) + t.alias + SQL_ON + join_column(t.alias, quoted_PARENT) + " = " + join_column(previous.alias, quoted_UID) ) main_filter = query.where.to_sql(schema, boolean=True)[0].sql.b # SHIFT THE COLUMN DEFINITIONS BASED ON THE NESTED QUERY DEPTH ons = [] join_types = [] wheres = [] null_ons = [EXISTS_COLUMN + SQL_IS_NULL] groupby = [] null_groupby = [] orderby = [] domains = [] select_clause = [SQL_ONE + EXISTS_COLUMN] + [quote_column(c.es_column) for c in self.sf.tables['.'].columns] for edge_index, query_edge in enumerate(query.edges): edge_alias = quote_column("e" + text_type(edge_index)) if query_edge.value: edge_values = [p for c in query_edge.value.to_sql(schema).sql for p in c.items()] elif not query_edge.value and any(query_edge.domain.partitions.where): case = SQL_CASE for pp, p in enumerate(query_edge.domain.partitions): w = p.where.to_sql(schema)[0].sql.b t = quote_value(pp) case += SQL_WHEN + w + SQL_THEN + t case += SQL_ELSE + SQL_NULL + SQL_END # quote value with length of partitions edge_values = [("n", case)] elif 
query_edge.range: edge_values = query_edge.range.min.to_sql(schema)[0].sql.items() + query_edge.range.max.to_sql(schema)[ 0].sql.items()
def to_sql(self, schema, not_null=False, boolean=False):
    """
    SQL for string concatenation (SQLScript variant): coerce each term
    to text, prefix with the separator, concatenate, then SUBSTR off the
    leading separator.  Returns the default expression when there are no
    terms.
    """
    default = self.default.to_sql(schema)
    if len(self.terms) == 0:
        return default
    default = coalesce(default[0].sql.s, SQL_NULL)
    sep = SQLang[self.separator].to_sql(schema)[0].sql.s

    acc = []
    for t in self.terms:
        t = SQLang[t]
        missing = t.missing().partial_eval()
        term = t.to_sql(schema, not_null=True)[0].sql
        # coerce the term to a string-valued SQL expression
        if term.s:
            term_sql = term.s
        elif term.n:
            term_sql = "cast(" + term.n + " as text)"
        else:
            # boolean: render as 'true'/'false'
            term_sql = (SQL_CASE + SQL_WHEN + term.b + SQL_THEN +
                        quote_value("true") + SQL_ELSE + quote_value("false") + SQL_END)

        if isinstance(missing, TrueOp):
            # term is always missing: contributes nothing
            acc.append(SQL_EMPTY_STRING)
        elif missing:
            # guard: a missing term yields '' rather than sep+NULL
            acc.append(
                SQL_CASE +
                SQL_WHEN + sql_iso(missing.to_sql(schema, boolean=True)[0].sql.b) +
                SQL_THEN + SQL_EMPTY_STRING +
                SQL_ELSE + sql_iso(sql_concat_text([sep, term_sql])) +
                SQL_END)
        else:
            acc.append(sql_concat_text([sep, term_sql]))

    # every piece starts with the separator; SUBSTR drops the first one
    expr_ = "SUBSTR" + sql_iso(
        sql_list([
            sql_concat_text(acc),
            LengthOp(self.separator).to_sql(schema)[0].sql.n + SQL("+1"),
        ]))

    return SQLScript(expr=expr_, data_type=STRING, frum=self, miss=self.missing(), many=False, schema=schema)
def to_sql(self, schema, not_null=False, boolean=False):
    """
    SQL for an IN test: TRUE when self.value equals any member of the
    literal superset.  Only literal supersets are supported.
    """
    if not isinstance(self.superset, Literal):
        Log.error("Not supported")
    j_value = json2value(self.superset.json)
    if j_value:
        var = self.value.to_sql(schema)
        # NOTE(review): `var` is the wrapped to_sql() result (a list of
        # {"name", "sql"} records), not a plain SQL fragment, so the
        # `var + "==" + ...` concatenation looks suspect; also this
        # branch returns bare SQL while the else branch returns the
        # wrapped {"name": ".", "sql": {"b": ...}} shape — confirm
        # against callers before relying on this path.
        return SQL_OR.join(sql_iso(var + "==" + quote_value(v)) for v in j_value)
    else:
        # empty superset: nothing can match
        return wrap([{"name": ".", "sql": {"b": SQL_FALSE}}])
def _work(name, db, sigs, please_stop):
    """
    Worker for a multi-threaded transaction test.  Each phase waits on
    its `begin` signal, performs one step inside a shared transaction,
    then fires its `done` signal so the coordinator can interleave
    threads deterministically.

    :param name: value this worker inserts and then queries back
    :param db: database whose transaction() is under test
    :param sigs: list of objects with `begin` and `done` Signals, one per phase
    :param please_stop: stop signal (unused here; part of the thread API)
    """
    try:
        sigs[0].begin.wait()
        with db.transaction() as t:
            # phase 0: transaction is open
            sigs[0].done.go()

            # phase 1: insert our row
            sigs[1].begin.wait()
            t.execute("INSERT INTO my_table VALUES " + sql_iso(quote_value(name)))
            sigs[1].done.go()

            # phase 2: our own uncommitted write must be visible to us
            sigs[2].begin.wait()
            result = t.query("SELECT * FROM my_table WHERE value=" + quote_value(name))
            assert len(result.data) == 1
            assert result.data[0][0] == name
            sigs[2].done.go()
    finally:
        # RELEASE ALL SIGNALS, THIS IS ENDING BADLY
        for s in sigs:
            s.done.go()
def to_sql(self, schema, not_null=False, boolean=False):
    """
    Boolean SQL testing self.var against the literal regex pattern,
    using SQLite's REGEXP operator.
    """
    regex_literal = quote_value(json2value(self.pattern.json))
    subject = self.var.to_sql(schema)[0].sql.s
    return wrap([{
        "name": ".",
        "sql": {"b": subject + " REGEXP " + regex_literal}
    }])
def _make_digits_table(self):
    """Create and populate DIGITS_TABLE (values 0-9) when it does not exist."""
    existence = self.db.query("PRAGMA table_info(__digits__)")
    if existence.data:
        return  # table already present

    with self.db.transaction() as t:
        t.execute(
            "CREATE TABLE" + quote_column(DIGITS_TABLE) + "(value INTEGER)"
        )
        # SELECT 0 UNION ALL SELECT 1 ... SELECT 9
        digit_rows = SQL_UNION_ALL.join(
            SQL_SELECT + quote_value(digit) for digit in range(10)
        )
        t.execute("INSERT INTO" + quote_column(DIGITS_TABLE) + digit_rows)
def to_sql(self, schema, not_null=False, boolean=False):
    """
    SQL that renders the term as a string: booleans become 'true'/'false'
    (NULL when missing), strings pass through, numbers are CAST to text
    with trailing zeros and decimal point trimmed.
    """
    test = self.term.missing().to_sql(schema, boolean=True)[0].sql.b
    value = self.term.to_sql(schema, not_null=True)[0].sql

    rendered = []
    for sql_type, sql in value.items():
        if sql_type == "b":
            rendered.append(
                SQL_CASE +
                SQL_WHEN + sql_iso(test) + SQL_THEN + SQL_NULL +
                SQL_WHEN + sql_iso(sql) + SQL_THEN + "'true'" +
                SQL_ELSE + "'false'" +
                SQL_END
            )
        elif sql_type == "s":
            # already a string
            rendered.append(sql)
        else:
            # number: cast, then strip trailing zeros and the trailing dot
            rendered.append(
                "RTRIM(RTRIM(CAST" +
                sql_iso(sql + " as TEXT), " + quote_value("0")) +
                ", " + quote_value(".") + ")"
            )

    if not rendered:
        return wrap([{}])
    if len(rendered) == 1:
        return wrap([{"name": ".", "sql": {"s": rendered[0]}}])
    return wrap([{"name": ".", "sql": {"s": sql_coalesce(rendered)}}])
def _insert(self, collection):
    """
    Legacy raw-string insert: write the flattened documents in
    `collection` (keyed by nested path) to their tables via
    INSERT ... SELECT ... UNION ALL.
    """
    for nested_path, details in collection.items():
        table_name = concat_field(self.sf.fact, nested_path)
        if table_name == self.sf.fact:
            # fact table: DO NOT REQUIRE PARENT OR ORDER COLUMNS
            meta_columns = [GUID, UID]
        else:
            meta_columns = [UID, PARENT, ORDER]

        active_columns = wrap(list(details.active_columns))
        all_columns = meta_columns + active_columns.es_column

        prefix = "INSERT INTO " + quote_table(table_name) + \
                 "(" + ",".join(map(quote_table, all_columns)) + ")"
        # BUILD THE RECORDS: one SELECT per row, joined with UNION ALL
        records = " UNION ALL ".join(
            "\nSELECT " + ",".join(quote_value(row.get(c)) for c in all_columns)
            for row in unwrap(details.rows)
        )
        self.db.execute(prefix + records)
def update(self, command):
    """
    :param command:  EXPECTING dict WITH {"set": s, "clear": c, "where": w} FORMAT

    Applies the update in three steps: reject deep updates, add any new
    columns implied by `set`, rewrite nested (child-table) values, then
    UPDATE the fact table's primitive columns.

    NOTE(review): this body references `nested_tables`, `abs_schema` and
    `schema`, which are not defined in this method or visible in this
    file chunk — confirm they exist at the enclosing scope before use.
    """
    command = wrap(command)

    # REJECT DEEP UPDATES
    touched_columns = command.set.keys() | set(listwrap(command['clear']))
    for c in self.schema.columns:
        if c.name in touched_columns and len(c.nested_path) > 1:
            Log.error("Deep update not supported")

    # ADD NEW COLUMNS
    where = jx_expression(command.where)
    _vars = where.vars()
    # map query variable names to their storage columns (primitives only)
    _map = {
        v: c.es_column
        for v in _vars
        for c in self.columns.get(v, Null)
        if c.jx_type not in STRUCT
    }
    where_sql = where.map(_map).to_sql(self.schema)
    new_columns = set(command.set.keys()) - set(self.columns.keys())
    for new_column_name in new_columns:
        nested_value = command.set[new_column_name]
        ctype = get_type(nested_value)
        column = Column(
            name=new_column_name,
            jx_type=ctype,
            es_index=self.name,
            es_type=json_type_to_sqlite_type(ctype),
            es_column=typed_column(new_column_name, ctype),
            last_updated=Date.now()
        )
        self.add_column(column)

    # UPDATE THE NESTED VALUES
    for nested_column_name, nested_value in command.set.items():
        if get_type(nested_value) == "nested":
            nested_table_name = concat_field(self.name, nested_column_name)
            nested_table = nested_tables[nested_column_name]
            # the parent's full primary key, used to join child rows to parents
            self_primary_key = sql_list(
                quote_column(c.es_column)
                for u in self.uid
                for c in self.columns[u])
            extra_key_name = UID + text(len(self.uid))
            extra_key = [e for e in nested_table.columns[extra_key_name]][0]

            # DELETE any existing child rows belonging to matched parents
            sql_command = (
                SQL_DELETE + SQL_FROM + quote_column(nested_table.name) +
                SQL_WHERE + "EXISTS" +
                sql_iso(
                    SQL_SELECT + SQL_ONE +
                    SQL_FROM + sql_alias(quote_column(nested_table.name), "n") +
                    SQL_INNER_JOIN +
                    sql_iso(
                        SQL_SELECT + self_primary_key +
                        SQL_FROM + quote_column(abs_schema.fact) +
                        SQL_WHERE + where_sql
                    ) +
                    " t ON " +
                    SQL_AND.join(
                        quote_column("t", c.es_column) + SQL_EQ + quote_column("n", c.es_column)
                        for u in self.uid
                        for c in self.columns[u])))
            self.db.execute(sql_command)

            # INSERT NEW RECORDS
            if not nested_value:
                continue

            # flatten the replacement documents into per-path row sets
            doc_collection = {}
            for d in listwrap(nested_value):
                nested_table.flatten(d, Data(), doc_collection, path=nested_column_name)

            prefix = SQL_INSERT + quote_column(nested_table.name) + sql_iso(
                sql_list(
                    [self_primary_key] +
                    [quote_column(extra_key)] +
                    [
                        quote_column(c.es_column)
                        for c in doc_collection.get(".", Null).active_columns
                    ]))

            # BUILD THE PARENT TABLES
            parent = (
                SQL_SELECT + self_primary_key +
                SQL_FROM + quote_column(abs_schema.fact) +
                SQL_WHERE + jx_expression(command.where).to_sql(schema))

            # BUILD THE RECORDS
            children = SQL_UNION_ALL.join(
                SQL_SELECT +
                quote_value(i) + " " + quote_column(extra_key.es_column) + "," +
                sql_list(
                    quote_value(row[c.name]) + " " + quote_column(c.es_column)
                    for c in doc_collection.get(".", Null).active_columns)
                for i, row in enumerate(doc_collection.get(".", Null).rows))

            # cross-join every parent with every new child row
            sql_command = (
                prefix +
                SQL_SELECT +
                sql_list(
                    [
                        quote_column("p", c.es_column)
                        for u in self.uid
                        for c in self.columns[u]
                    ] +
                    [quote_column("c", extra_key)] +
                    [
                        quote_column("c", c.es_column)
                        for c in doc_collection.get(".", Null).active_columns
                    ]) +
                SQL_FROM + sql_iso(parent) + " p" +
                SQL_INNER_JOIN + sql_iso(children) + " c" + " ON " + SQL_TRUE)
            self.db.execute(sql_command)

            # THE CHILD COLUMNS COULD HAVE EXPANDED
            # ADD COLUMNS TO SELF
            for n, cs in nested_table.columns.items():
                for c in cs:
                    column = Column(
                        name=c.name,
                        jx_type=c.jx_type,
                        es_type=c.es_type,
                        es_index=c.es_index,
                        es_column=c.es_column,
                        nested_path=[nested_column_name] + c.nested_path,
                        last_updated=Date.now())
                    if c.name not in self.columns:
                        self.columns[column.name] = {column}
                    elif c.jx_type not in [c.jx_type for c in self.columns[c.name]]:
                        self.columns[column.name].add(column)

    # finally: UPDATE the fact table's primitive (non-nested) columns
    command = (
        SQL_UPDATE + quote_column(abs_schema.fact) + SQL_SET +
        sql_list(
            [
                quote_column(c) + SQL_EQ + quote_value(get_if_type(v, c.jx_type))
                for k, v in command.set.items()
                if get_type(v) != "nested"
                for c in self.columns[k]
                if c.jx_type != "nested" and len(c.nested_path) == 1
            ] +
            [
                quote_column(c) + SQL_EQ + SQL_NULL
                for k in listwrap(command['clear'])
                if k in self.columns
                for c in self.columns[k]
                if c.jx_type != "nested" and len(c.nested_path) == 1
            ]) +
        SQL_WHERE + where_sql)
    self.db.execute(command)
def _db_worker(self, please_stop):
    """
    Background loop that keeps the column-metadata table synchronized:
    pulls rows updated since last_load into memory, then pushes queued
    (action, column) updates back to the database.  Retries on SQLite
    "database is locked"; sleeps 10s between rounds.
    """
    while not please_stop:
        try:
            # PULL: read any columns changed since our last load
            with self._db_transaction():
                result = self._query(
                    SQL_SELECT + all_columns +
                    SQL_FROM + db_table_name +
                    SQL_WHERE + "last_updated > " + quote_value(self.last_load) +
                    SQL_ORDERBY + sql_list(map(quote_column, ["es_index", "name", "es_column"]))
                )

            with self.locker:
                for r in result.data:
                    c = row_to_column(result.header, r)
                    self._add(c)
                    # advance the high-water mark
                    if c.last_updated > self.last_load:
                        self.last_load = c.last_updated

            # PUSH: apply all queued column changes
            updates = self.todo.pop_all()
            DEBUG and updates and Log.note(
                "{{num}} columns to push to db", num=len(updates)
            )
            for action, column in updates:
                while not please_stop:
                    try:
                        with self._db_transaction():
                            DEBUG and Log.note(
                                "{{action}} db for {{table}}.{{column}}",
                                action=action,
                                table=column.es_index,
                                column=column.es_column,
                            )
                            if action is EXECUTE:
                                # `column` carries raw SQL in this case
                                self.db.execute(column)
                            elif action is UPDATE:
                                # guarded update: only overwrite older rows
                                self.db.execute(
                                    "UPDATE" + db_table_name + "SET" + sql_list(
                                        [
                                            "count=" + quote_value(column.count),
                                            "cardinality=" + quote_value(column.cardinality),
                                            "multi=" + quote_value(column.multi),
                                            "partitions=" + quote_value(
                                                value2json(column.partitions)
                                            ),
                                            "last_updated=" + quote_value(column.last_updated),
                                        ]
                                    ) + SQL_WHERE + SQL_AND.join(
                                        [
                                            "es_index = " + quote_value(column.es_index),
                                            "es_column = " + quote_value(column.es_column),
                                            "last_updated < " + quote_value(column.last_updated),
                                        ]
                                    )
                                )
                            elif action is DELETE:
                                self.db.execute(
                                    "DELETE FROM" + db_table_name + SQL_WHERE + SQL_AND.join(
                                        [
                                            "es_index = " + quote_value(column.es_index),
                                            "es_column = " + quote_value(column.es_column),
                                        ]
                                    )
                                )
                            else:
                                self._db_insert_column(column)
                        break
                    except Exception as e:
                        e = Except.wrap(e)
                        if "database is locked" in e:
                            # contention: give up on this item this round
                            Log.note("metadata database is locked")
                            Till(seconds=1).wait()
                            break
                        else:
                            # NOTE(review): "updataing" typo lives in a runtime
                            # log string; left as-is (doc-only change)
                            Log.warning("problem updataing database", cause=e)
        except Exception as e:
            Log.warning("problem updating database", cause=e)
        (Till(seconds=10) | please_stop).wait()
def _set_op(self, query):
    """
    Execute a set-operation (plain select, no edges) over the snowflake:
    build one SQL select list covering every nested table, run it with
    sorting on the UID columns, then reassemble the flat rows into
    nested documents and format as cube/table/list.
    """
    # GET LIST OF SELECTED COLUMNS
    vars_ = UNION([
        v.var
        for select in listwrap(query.select)
        for v in select.value.vars()
    ])
    schema = self.schema
    known_vars = schema.keys()

    # group the needed columns by the nested path that stores them
    active_columns = {".": set()}
    for v in vars_:
        for c in schema.leaves(v):
            nest = c.nested_path[0]
            active_columns.setdefault(nest, set()).add(c)

    # ANY VARS MENTIONED WITH NO COLUMNS?
    for v in vars_:
        if not any(startswith_field(cname, v) for cname in known_vars):
            # represent the unknown variable as an always-null column
            active_columns["."].add(Column(
                name=v,
                jx_type=IS_NULL,
                es_column=".",
                es_index=".",
                es_type='NULL',
                nested_path=["."],
                last_updated=Date.now()
            ))

    # EVERY COLUMN, AND THE INDEX IT TAKES UP
    index_to_column = {}  # MAP FROM INDEX TO COLUMN (OR SELECT CLAUSE)
    index_to_uid = {}  # FROM NESTED PATH TO THE INDEX OF UID
    sql_selects = []  # EVERY SELECT CLAUSE (NOT TO BE USED ON ALL TABLES, OF COURSE)
    # one short alias (__a__, __b__, ...) per nested path
    nest_to_alias = {
        nested_path[0]: "__" + unichr(ord('a') + i) + "__"
        for i, nested_path in enumerate(self.snowflake.query_paths)
    }

    sorts = []
    if query.sort:
        for select in query.sort:
            col = SQLang[select.value].to_sql(schema)[0]
            for t, sql in col.sql.items():
                json_type = sql_type_to_json_type[t]
                if json_type in STRUCT:
                    continue
                column_number = len(sql_selects)
                # SQL HAS ABS TABLE REFERENCE
                column_alias = _make_column_name(column_number)
                sql_selects.append(sql_alias(sql, column_alias))
                # nulls sort first in both directions
                if select.sort == -1:
                    sorts.append(quote_column(column_alias) + SQL_IS_NULL)
                    sorts.append(quote_column(column_alias) + " DESC")
                else:
                    sorts.append(quote_column(column_alias) + SQL_IS_NULL)
                    sorts.append(quote_column(column_alias))

    primary_doc_details = Data()
    # EVERY SELECT STATEMENT THAT WILL BE REQUIRED, NO MATTER THE DEPTH
    # WE WILL CREATE THEM ACCORDING TO THE DEPTH REQUIRED
    nested_path = []
    for step, sub_table in self.snowflake.tables:
        nested_path.insert(0, step)
        nested_doc_details = {
            "sub_table": sub_table,
            "children": [],
            "index_to_column": {},
            "nested_path": nested_path
        }

        # INSERT INTO TREE
        if not primary_doc_details:
            primary_doc_details = nested_doc_details
        else:
            # NOTE(review): `place` closes over the loop variables `step`
            # and `nested_doc_details` (late binding); it is called
            # immediately each iteration, so that appears intentional
            def place(parent_doc_details):
                if startswith_field(step, parent_doc_details['nested_path'][0]):
                    for c in parent_doc_details['children']:
                        if place(c):
                            return True
                    parent_doc_details['children'].append(nested_doc_details)

            place(primary_doc_details)

        alias = nested_doc_details['alias'] = nest_to_alias[step]

        # WE ALWAYS ADD THE UID
        column_number = index_to_uid[step] = nested_doc_details['id_coord'] = len(sql_selects)
        sql_select = quote_column(alias, UID)
        sql_selects.append(sql_alias(sql_select, _make_column_name(column_number)))
        if step != ".":
            # ID AND ORDER FOR CHILD TABLES
            index_to_column[column_number] = ColumnMapping(
                sql=sql_select,
                type="number",
                nested_path=nested_path,
                column_alias=_make_column_name(column_number))
            column_number = len(sql_selects)
            sql_select = quote_column(alias, ORDER)
            sql_selects.append(sql_alias(sql_select, _make_column_name(column_number)))
            index_to_column[column_number] = ColumnMapping(
                sql=sql_select,
                type="number",
                nested_path=nested_path,
                column_alias=_make_column_name(column_number))

        # WE DO NOT NEED DATA FROM TABLES WE REQUEST NOTHING FROM
        if step not in active_columns:
            continue

        # ADD SQL SELECT COLUMNS FOR EACH jx SELECT CLAUSE
        si = 0
        for select in listwrap(query.select):
            try:
                column_number = len(sql_selects)
                select.pull = get_column(column_number)
                db_columns = SQLang[select.value].partial_eval().to_sql(schema)

                for column in db_columns:
                    for t, unsorted_sql in column.sql.items():
                        json_type = sql_type_to_json_type[t]
                        if json_type in STRUCT:
                            continue
                        column_number = len(sql_selects)
                        column_alias = _make_column_name(column_number)
                        sql_selects.append(sql_alias(unsorted_sql, column_alias))
                        if startswith_field(schema.path, step) and is_op(select.value, LeavesOp):
                            # ONLY FLATTEN primary_nested_path AND PARENTS, NOT CHILDREN
                            index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = ColumnMapping(
                                push_name=literal_field(get_property_name(concat_field(select.name, column.name))),
                                push_child=".",
                                push_column_name=get_property_name(concat_field(select.name, column.name)),
                                push_column=si,
                                pull=get_column(column_number),
                                sql=unsorted_sql,
                                type=json_type,
                                column_alias=column_alias,
                                nested_path=nested_path)
                            si += 1
                        else:
                            index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = ColumnMapping(
                                push_name=select.name,
                                push_child=column.name,
                                push_column_name=select.name,
                                push_column=si,
                                pull=get_column(column_number),
                                sql=unsorted_sql,
                                type=json_type,
                                column_alias=column_alias,
                                nested_path=nested_path)
            finally:
                # NOTE(review): the LeavesOp branch also does si += 1, so
                # that path advances si twice per column — confirm intended
                si += 1

    where_clause = BooleanOp(query.where).partial_eval().to_sql(schema, boolean=True)[0].sql.b
    unsorted_sql = self._make_sql_for_one_nest_in_set_op(".", sql_selects, where_clause, active_columns, index_to_column)

    # always sort by every table's UID so nested rows arrive grouped
    for n, _ in self.snowflake.tables:
        sorts.append(quote_column(COLUMN + text(index_to_uid[n])))

    ordered_sql = ConcatSQL((
        SQL_SELECT, SQL_STAR,
        SQL_FROM, sql_iso(unsorted_sql),
        SQL_ORDERBY, sql_list(sorts),
        SQL_LIMIT, quote_value(query.limit)
    ))
    result = self.db.query(ordered_sql)

    def _accumulate_nested(rows, row, nested_doc_details, parent_doc_id, parent_id_coord):
        """
        :param rows: REVERSED STACK OF ROWS (WITH push() AND pop())
        :param row: CURRENT ROW BEING EXTRACTED
        :param nested_doc_details: {
                "nested_path": wrap_nested_path(nested_path),
                "index_to_column": map from column number to column details
                "children": all possible direct decedents' nested_doc_details
             }
        :param parent_doc_id: the id of the parent doc (for detecting when to step out of loop)
        :param parent_id_coord: the column number for the parent id (so we ca extract from each row)
        :return: the nested property (usually an array)
        """
        previous_doc_id = None
        doc = Null
        output = []
        id_coord = nested_doc_details['id_coord']

        while True:
            doc_id = row[id_coord]

            if doc_id == None or (parent_id_coord is not None and row[parent_id_coord] != parent_doc_id):
                rows.append(row)  # UNDO PREVIOUS POP (RECORD IS NOT A NESTED RECORD OF parent_doc)
                return output

            if doc_id != previous_doc_id:
                previous_doc_id = doc_id
                doc = Null
                curr_nested_path = nested_doc_details['nested_path'][0]
                index_to_column = nested_doc_details['index_to_column'].items()
                for i, c in index_to_column:
                    value = row[i]
                    if is_list(query.select) or is_op(query.select.value, LeavesOp):
                        # ASSIGN INNER PROPERTIES
                        relative_field = concat_field(c.push_name, c.push_child)
                    else:
                        # FACT IS EXPECTED TO BE A SINGLE VALUE, NOT AN OBJECT
                        relative_field = c.push_child

                    if relative_field == ".":
                        if exists(value):
                            doc = value
                    elif exists(value):
                        if doc is Null:
                            doc = Data()
                        doc[relative_field] = value

            for child_details in nested_doc_details['children']:
                # EACH NESTED TABLE MUST BE ASSEMBLED INTO A LIST OF OBJECTS
                child_id = row[child_details['id_coord']]
                if child_id is not None:
                    nested_value = _accumulate_nested(rows, row, child_details, doc_id, id_coord)
                    if nested_value != None:
                        push_name = child_details['nested_path'][0]
                        if is_list(query.select) or is_op(query.select.value, LeavesOp):
                            # ASSIGN INNER PROPERTIES
                            # NOTE(review): `relative_field(...)` calls the
                            # string bound above as if it were a function —
                            # this path looks broken; confirm before use
                            relative_field = relative_field(push_name, curr_nested_path)
                        else:
                            # FACT IS EXPECTED TO BE A SINGLE VALUE, NOT AN OBJECT
                            relative_field = "."

                        if relative_field == ".":
                            doc = unwraplist(nested_value)
                        else:
                            doc[relative_field] = unwraplist(nested_value)

            output.append(doc)

            try:
                row = rows.pop()
            except IndexError:
                return output

    cols = tuple([i for i in index_to_column.values() if i.push_name != None])
    rows = list(reversed(unwrap(result.data)))
    if rows:
        row = rows.pop()
        data = _accumulate_nested(rows, row, primary_doc_details, None, None)
    else:
        data = result.data

    if query.format == "cube":
        # (legacy commented-out cube-formatting code removed for clarity)
        if is_list(query.select) or is_op(query.select.value, LeavesOp):
            num_rows = len(data)
            temp_data = {
                c.push_column_name: [None] * num_rows
                for c in cols
            }
            for rownum, d in enumerate(data):
                for c in cols:
                    temp_data[c.push_column_name][rownum] = d[c.push_name]
            return Data(
                meta={"format": "cube"},
                data=temp_data,
                edges=[{
                    "name": "rownum",
                    "domain": {
                        "type": "rownum",
                        "min": 0,
                        "max": num_rows,
                        "interval": 1
                    }
                }])
        else:
            num_rows = len(data)
            map_index_to_name = {
                c.push_column: c.push_column_name
                for c in cols
            }
            temp_data = [data]
            return Data(
                meta={"format": "cube"},
                data={
                    n: temp_data[c]
                    for c, n in map_index_to_name.items()
                },
                edges=[{
                    "name": "rownum",
                    "domain": {
                        "type": "rownum",
                        "min": 0,
                        "max": num_rows,
                        "interval": 1
                    }
                }])
    elif query.format == "table":
        # (legacy commented-out table-formatting code removed for clarity)
        if is_list(query.select) or is_op(query.select.value, LeavesOp):
            column_names = [None] * (max(c.push_column for c in cols) + 1)
            for c in cols:
                column_names[c.push_column] = c.push_column_name
            temp_data = []
            for rownum, d in enumerate(data):
                row = [None] * len(column_names)
                for c in cols:
                    row[c.push_column] = d[c.push_name]
                temp_data.append(row)
            return Data(
                meta={"format": "table"},
                header=column_names,
                data=temp_data)
        else:
            column_names = listwrap(query.select).name
            return Data(
                meta={"format": "table"},
                header=column_names,
                data=[[d] for d in data])
    else:
        # (legacy commented-out list-formatting code removed for clarity)
        if is_list(query.select) or is_op(query.select.value, LeavesOp):
            temp_data = []
            for rownum, d in enumerate(data):
                row = {}
                for c in cols:
                    row[c.push_column_name] = d[c.push_name]
                temp_data.append(row)
            return Data(meta={"format": "list"}, data=temp_data)
        else:
            return Data(meta={"format": "list"}, data=data)
pull=pull, sql=sql, type=sql_type_to_json_type[json_type], column_alias=sql_name) vals = [v for t, v in edge_values] if query_edge.domain.type == "set": domain_name = quote_column("d" + text_type(edge_index) + "c" + text_type(column_index)) domain_names = [domain_name] if len(edge_names) > 1: Log.error("Do not know how to handle") if query_edge.value: domain = SQL_UNION_ALL.join( SQL_SELECT + sql_alias(quote_value(coalesce(p.dataIndex, i)), quote_column("rownum")) + SQL_COMMA + sql_alias(quote_value(p.value), domain_name) for i, p in enumerate(query_edge.domain.partitions)) if query_edge.allowNulls: domain += (SQL_UNION_ALL + SQL_SELECT + sql_alias( quote_value(len(query_edge.domain.partitions)), quote_column("rownum")) + SQL_COMMA + sql_alias(SQL_NULL, domain_name)) where = None join_type = SQL_LEFT_JOIN if query_edge.allowNulls else SQL_INNER_JOIN on_clause = (SQL_OR.join( join_column(edge_alias, k) + " = " + v for k, v in zip(domain_names, vals)) + SQL_OR + sql_iso( join_column(edge_alias, domain_name) +
pull=pull, sql=sql, type=sql_type_to_json_type[sql_type], column_alias=sql_name ) vals = [v for t, v in edge_values] if query_edge.domain.type == "set": domain_name = "d" + text(edge_index) + "c" + text(column_index) domain_names = [domain_name] if len(edge_names) > 1: Log.error("Do not know how to handle") if query_edge.value: domain = SQL_UNION_ALL.join( SQL_SELECT + sql_alias(quote_value(coalesce(p.dataIndex, i)), quote_column("rownum")) + SQL_COMMA + sql_alias(quote_value(p.value), domain_name) for i, p in enumerate(query_edge.domain.partitions) ) if query_edge.allowNulls: domain += ( SQL_UNION_ALL + SQL_SELECT + sql_alias(quote_value(len(query_edge.domain.partitions)), quote_column("rownum")) + SQL_COMMA + sql_alias(SQL_NULL, domain_name) ) where = None join_type = SQL_LEFT_JOIN if query_edge.allowNulls else SQL_INNER_JOIN on_clause = ( SQL_OR.join( quote_column(edge_alias, k) + " = " + v for k, v in zip(domain_names, vals)
# Fetch TUIDs (temporally-unique ids) for `files` at `revision` on `branch`:
# consult the local `tuid` cache table first, then POST the remainder to the
# remote endpoint, caching its answers.  After 3 consecutive endpoint
# failures the service disables itself.
# NOTE(review): `r.tuids != None` is left as-is — presumably the project's
# Null sentinel compares equal to None, so `is not None` would change
# behavior; confirm against the mo-dots library before "fixing".
# NOTE(review): code below is byte-identical to the original (docstring typo
# NUNMBER -> NUMBER only).
def get_tuids(self, branch, revision, files): """ GET TUIDS FROM ENDPOINT, AND STORE IN DB :param branch: BRANCH TO FIND THE REVISION/FILE :param revision: THE REVISION NUMBER :param files: THE FULL PATHS TO THE FILES :return: MAP FROM FILENAME TO TUID LIST """ # SCRUB INPUTS revision = revision[:12] files = [file.lstrip('/') for file in files] with Timer( "ask tuid service for {{num}} files at {{revision|left(12)}}", {"num": len(files), "revision": revision}, silent=not self.enabled ): response = self.db.query( "SELECT file, tuids FROM tuid WHERE revision=" + quote_value(revision) + " AND file IN " + quote_list(files) ) found = {file: json2value(tuids) for file, tuids in response.data} try: remaining = set(files) - set(found.keys()) new_response = None if remaining: request = wrap({ "from": "files", "where": {"and": [ {"eq": {"revision": revision}}, {"in": {"path": remaining}}, {"eq": {"branch": branch}} ]}, "branch": branch, "meta": { "format": "list", "request_time": Date.now() } }) if self.push_queue is not None: if DEBUG: Log.note("record tuid request to SQS: {{timestamp}}", timestamp=request.meta.request_time) self.push_queue.add(request) else: if DEBUG: Log.note("no recorded tuid request") if not self.enabled: return found new_response = http.post_json( self.endpoint, json=request, timeout=self.timeout ) with self.db.transaction() as transaction: command = "INSERT INTO tuid (revision, file, tuids) VALUES " + sql_list( quote_list((revision, r.path, value2json(r.tuids))) for r in new_response.data if r.tuids != None ) if not command.endswith(" VALUES "): transaction.execute(command) self.num_bad_requests = 0 found.update({r.path: r.tuids for r in new_response.data} if new_response else {}) return found except Exception as e: self.num_bad_requests += 1 Till(seconds=SLEEP_ON_ERROR).wait() if self.enabled and self.num_bad_requests >= 3: self.enabled = False Log.error("TUID service has problems.", cause=e) return found
def test_very_distant_files(service):
    """
    REWIND latestFileMod FOR A FILE TO AN OLD REVISION, THEN VERIFY THAT
    LINES KNOWN TO HAVE MOVED BETWEEN TWO DISTANT REVISIONS KEEP THEIR
    TUIDS, AND THAT ADDED LINES GET BRAND-NEW TUIDS.
    """
    new_rev = "6e8e861540e6"
    old_rev = "1e2c9151a09e"
    test_files = ["docshell/base/nsDocShell.cpp"]

    # RESET SERVICE STATE SO BOTH REVISIONS ARE COMPUTED FRESH
    with service.conn.transaction() as t:
        t.execute("DELETE FROM annotations WHERE revision = " + quote_value(new_rev))
        for file in test_files:
            t.execute(
                "UPDATE latestFileMod SET revision = " + quote_value(old_rev) +
                " WHERE file = " + quote_value(file)
            )

    old_tuids, _ = service.get_tuids_from_files(test_files, old_rev, use_thread=False, max_csets_proc=10000)
    new_tuids, _ = service.get_tuids_from_files(test_files, new_rev, use_thread=False, max_csets_proc=10000)

    # old line -> new line FOR LINES KNOWN TO HAVE MOVED; LINES KNOWN TO BE NEW
    lines_moved = {"docshell/base/nsDocShell.cpp": {1028: 1026, 1097: 1029}}
    lines_added = {"docshell/base/nsDocShell.cpp": [2770]}

    Log.note("Check output manually for any abnormalities as well.")
    completed = 0
    for file, old_file_tuids in old_tuids:
        if file in lines_moved:
            old_moved_tuids = {}
            print("OLD:")
            for tuid_map in old_file_tuids:
                print(str(tuid_map.line) + ":" + str(tuid_map.tuid))
                if tuid_map.line in lines_moved[file].keys():
                    old_moved_tuids[tuid_map.line] = tuid_map.tuid
            assert len(old_moved_tuids) == len(lines_moved[file].keys())

            print("\n\nNEW:")
            new_moved_tuids = {}
            for new_file, tmp_tuids in new_tuids:
                if new_file == file:
                    tmp_lines = [
                        lines_moved[file][line]
                        for line in lines_moved[file]
                    ]
                    for tuid_map in tmp_tuids:
                        print(str(tuid_map.line) + ":" + str(tuid_map.tuid))
                        if tuid_map.line in tmp_lines:
                            new_moved_tuids[tuid_map.line] = tuid_map.tuid
                    break
            assert len(new_moved_tuids) == len(old_moved_tuids)

            # MOVED LINES MUST KEEP THEIR ORIGINAL TUID
            for line_moved in old_moved_tuids:
                old_tuid = old_moved_tuids[line_moved]
                new_line = lines_moved[file][line_moved]
                assert new_line in new_moved_tuids
                assert old_tuid == new_moved_tuids[new_line]
            completed += 1

        if file in lines_added:
            for new_file, tmp_tuids in new_tuids:
                if new_file == file:
                    new_file_tuids = [
                        tuid_map.tuid
                        for tuid_map in tmp_tuids
                        if tuid_map.line in lines_added[file]
                    ]
                    # BUG FIX: the old code overwrote new_file_tuids with the
                    # raw tuid_map list, so the intersection below compared
                    # map objects against integer tuids and was always empty,
                    # making the assertion vacuous.
                    # ADDED LINES MUST NOT REUSE ANY OLD TUID.
                    assert len(set(new_file_tuids) & set(tm.tuid for tm in old_file_tuids)) == 0
                    completed += 1
                    break

    assert completed == len(lines_moved.keys()) + len(lines_added.keys())
# NOTE(review): documentation only — the code below is byte-identical to the
# original (including its collapsed formatting).  _set_op BUILDS ONE BIG SQL
# SELECT OVER THE FACT TABLE AND ITS NESTED-ARRAY SUB-TABLES (aliased
# __a__, __b__, ...), RUNS IT, THEN RE-ASSEMBLES THE FLAT ROW SET INTO
# NESTED DOCUMENTS (_accumulate_nested) AND FORMATS THE RESULT AS "cube",
# "table" OR "list" PER query.format.
# The statement order is load-bearing: every entry in sql_selects is
# addressed later by its POSITION (column_number / index_to_uid /
# index_to_column), so no restructuring was attempted here.
def _set_op(self, query, frum): # GET LIST OF COLUMNS frum_path = split_field(frum) primary_nested_path = join_field(frum_path[1:]) vars_ = UNION([s.value.vars() for s in listwrap(query.select)]) schema = self.sf.tables[primary_nested_path].schema nest_to_alias = { nested_path: "__" + unichr(ord('a') + i) + "__" for i, (nested_path, sub_table) in enumerate(self.sf.tables.items()) } active_columns = {".": []} for cname, cols in schema.items(): if any(startswith_field(cname, v) for v in vars_): for c in cols: if c.type in STRUCT: continue nest = c.nested_path[0] active = active_columns.get(nest) if not active: active = active_columns[nest] = [] active.append(c) for nested_path, s in self.sf.tables.items(): for cname, cols in s.schema.items(): if not any(startswith_field(cname, c.names[c.nested_path[0]]) for n, cc in active_columns.items() for c in cc): for c in cols: if c.type in STRUCT: continue nest = c.nested_path[0] active = active_columns.get(nest) if not active: active = active_columns[nest] = [] active.append(c) # ANY VARS MENTIONED WITH NO COLUMNS?
for v in vars_: if not any(startswith_field(cname, v) for cname in schema.keys()): active_columns["."].append(Column( names={".": v}, type="null", es_column=".", es_index=".", nested_path=["."] )) # EVERY COLUMN, AND THE INDEX IT TAKES UP index_to_column = {} # MAP FROM INDEX TO COLUMN (OR SELECT CLAUSE) index_to_uid = {} # FROM NESTED PATH TO THE INDEX OF UID sql_selects = [] # EVERY SELECT CLAUSE (NOT TO BE USED ON ALL TABLES, OF COURSE) nest_to_alias = { nested_path: "__" + unichr(ord('a') + i) + "__" for i, (nested_path, sub_table) in enumerate(self.sf.tables.items()) } sorts = [] if query.sort: for s in query.sort: col = s.value.to_sql(schema)[0] for t, sql in col.sql.items(): json_type = sql_type_to_json_type[t] if json_type in STRUCT: continue column_number = len(sql_selects) # SQL HAS ABS TABLE REFERENCE column_alias = _make_column_name(column_number) sql_selects.append(sql + " AS " + column_alias) if s.sort == -1: sorts.append(column_alias + " IS NOT NULL") sorts.append(column_alias + " DESC") else: sorts.append(column_alias + " IS NULL") sorts.append(column_alias) selects = [] primary_doc_details = Data() # EVERY SELECT STATEMENT THAT WILL BE REQUIRED, NO MATTER THE DEPTH # WE WILL CREATE THEM ACCORDING TO THE DEPTH REQUIRED for nested_path, sub_table in self.sf.tables.items(): nested_doc_details = { "sub_table": sub_table, "children": [], "index_to_column": {}, "nested_path": [nested_path] # fake the real nested path, we only look at [0] anyway } # INSERT INTO TREE if not primary_doc_details: primary_doc_details = nested_doc_details else: def place(parent_doc_details): if startswith_field(nested_path, parent_doc_details['nested_path'][0]): for c in parent_doc_details['children']: if place(c): return True parent_doc_details['children'].append(nested_doc_details) place(primary_doc_details) alias = nested_doc_details['alias'] = nest_to_alias[nested_path] if nested_path=="."
and quoted_GUID in vars_: column_number = index_to_uid[nested_path] = nested_doc_details['id_coord'] = len(sql_selects) sql_select = alias + "." + quoted_GUID sql_selects.append(sql_select + " AS " + _make_column_name(column_number)) index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = ColumnMapping( push_name="_id", push_column_name="_id", push_column=0, push_child=".", sql=sql_select, pull=get_column(column_number), type="string", column_alias=_make_column_name(column_number), nested_path=[nested_path] # fake the real nested path, we only look at [0] anyway ) query.select = [s for s in listwrap(query.select) if s.name!="_id"] # WE ALWAYS ADD THE UID AND ORDER column_number = index_to_uid[nested_path] = nested_doc_details['id_coord'] = len(sql_selects) sql_select = alias + "." + quoted_UID sql_selects.append(sql_select + " AS " + _make_column_name(column_number)) if nested_path !=".": index_to_column[column_number]=ColumnMapping( sql=sql_select, type="number", nested_path=[nested_path], # fake the real nested path, we only look at [0] anyway column_alias=_make_column_name(column_number) ) column_number = len(sql_selects) sql_select = alias + "."
+ quote_table(ORDER) sql_selects.append(sql_select + " AS " + _make_column_name(column_number)) index_to_column[column_number]=ColumnMapping( sql=sql_select, type="number", nested_path=[nested_path], # fake the real nested path, we only look at [0] anyway column_alias=_make_column_name(column_number) ) # WE DO NOT NEED DATA FROM TABLES WE REQUEST NOTHING FROM if nested_path not in active_columns: continue if len(active_columns[nested_path]) != 0: # ADD SQL SELECT COLUMNS FOR EACH jx SELECT CLAUSE si = 0 for s in listwrap(query.select): try: column_number = len(sql_selects) s.pull = get_column(column_number) db_columns = s.value.to_sql(schema) if isinstance(s.value, LeavesOp): for column in db_columns: if isinstance(column.nested_path, list): column.nested_path=column.nested_path[0] if column.nested_path and column.nested_path!=nested_path: continue for t, unsorted_sql in column.sql.items(): json_type = sql_type_to_json_type[t] if json_type in STRUCT: continue column_number = len(sql_selects) # SQL HAS ABS TABLE REFERENCE column_alias = _make_column_name(column_number) if concat_field(alias, unsorted_sql) in selects and len(unsorted_sql.split())==1: continue selects.append(concat_field(alias, unsorted_sql)) sql_selects.append(alias + "."
+ unsorted_sql + " AS " + column_alias) index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = ColumnMapping( push_name=literal_field(get_property_name(concat_field(s.name, column.name))), push_column_name=get_property_name(concat_field(s.name, column.name)), push_column=si, push_child=".", pull=get_column(column_number), sql=unsorted_sql, type=json_type, column_alias=column_alias, nested_path=[nested_path] # fake the real nested path, we only look at [0] anyway ) si += 1 else: for column in db_columns: if isinstance(column.nested_path, list): column.nested_path=column.nested_path[0] if column.nested_path and column.nested_path!=nested_path: continue for t, unsorted_sql in column.sql.items(): json_type = sql_type_to_json_type[t] if json_type in STRUCT: continue column_number = len(sql_selects) # SQL HAS ABS TABLE REFERENCE column_alias = _make_column_name(column_number) if concat_field(alias, unsorted_sql) in selects and len(unsorted_sql.split())==1: continue selects.append(concat_field(alias, unsorted_sql)) sql_selects.append(alias + "." + unsorted_sql + " AS " + column_alias) index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = ColumnMapping( push_name=s.name, push_column_name=s.name, push_column=si, push_child=column.name, pull=get_column(column_number), sql=unsorted_sql, type=json_type, column_alias=column_alias, nested_path=[nested_path] # fake the real nested path, we only look at [0] anyway ) finally: si += 1 elif startswith_field(nested_path, primary_nested_path): # ADD REQUIRED COLUMNS, FOR DEEP STUFF for ci, c in enumerate(active_columns[nested_path]): if c.type in STRUCT: continue column_number = len(sql_selects) nested_path = c.nested_path unsorted_sql = nest_to_alias[nested_path[0]] + "."
+ quote_table(c.es_column) column_alias = _make_column_name(column_number) if concat_field(alias, unsorted_sql) in selects and len(unsorted_sql.split())==1: continue selects.append(concat_field(alias, unsorted_sql)) sql_selects.append(alias + "." + unsorted_sql + " AS " + column_alias) index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = ColumnMapping( push_name=s.name, push_column_name=s.name, push_column=si, push_child=relative_field(c.names["."], s.name), pull=get_column(column_number), sql=unsorted_sql, type=c.type, column_alias=column_alias, nested_path=nested_path ) where_clause = query.where.to_sql(schema, boolean=True)[0].sql.b unsorted_sql = self._make_sql_for_one_nest_in_set_op( ".", sql_selects, where_clause, active_columns, index_to_column ) for n, _ in self.sf.tables.items(): sorts.append(COLUMN + text_type(index_to_uid[n])) ordered_sql = ( "SELECT * FROM (\n" + unsorted_sql + "\n)" + "\nORDER BY\n" + ",\n".join(sorts) + "\nLIMIT " + quote_value(query.limit) ) self.db.create_new_functions() #creating new functions: regexp result = self.db.query(ordered_sql) def _accumulate_nested(rows, row, nested_doc_details, parent_doc_id, parent_id_coord): """ :param rows: REVERSED STACK OF ROWS (WITH push() AND pop()) :param row: CURRENT ROW BEING EXTRACTED :param nested_doc_details: { "nested_path": wrap_nested_path(nested_path), "index_to_column": map from column number to column details "children": all possible direct decedents' nested_doc_details } :param parent_doc_id: the id of the parent doc (for detecting when to step out of loop) :param parent_id_coord: the column number for the parent id (so we ca extract from each row) :return: the nested property (usually an array) """ previous_doc_id = None doc = Null output = [] id_coord = nested_doc_details['id_coord'] while True: doc_id = row[id_coord] if doc_id == None or (parent_id_coord is not None and row[parent_id_coord] != parent_doc_id): rows.append(row) # UNDO PREVIOUS POP
(RECORD IS NOT A NESTED RECORD OF parent_doc) return output if doc_id != previous_doc_id: previous_doc_id = doc_id doc = Null curr_nested_path = nested_doc_details['nested_path'][0] index_to_column = nested_doc_details['index_to_column'].items() if index_to_column: for i, c in index_to_column: value = row[i] if value == None: continue if value == '': continue if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp): # ASSIGN INNER PROPERTIES relative_path=join_field([c.push_name]+split_field(c.push_child)) else: # FACT IS EXPECTED TO BE A SINGLE VALUE, NOT AN OBJECT relative_path=c.push_child if relative_path == ".": doc = value elif doc is Null: doc = Data() doc[relative_path] = value else: doc[relative_path] = value for child_details in nested_doc_details['children']: # EACH NESTED TABLE MUST BE ASSEMBLED INTO A LIST OF OBJECTS child_id = row[child_details['id_coord']] if child_id is not None: nested_value = _accumulate_nested(rows, row, child_details, doc_id, id_coord) if nested_value: push_name = child_details['nested_path'][0] if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp): # ASSIGN INNER PROPERTIES relative_path=relative_field(push_name, curr_nested_path) else: # FACT IS EXPECTED TO BE A SINGLE VALUE, NOT AN OBJECT relative_path="." if relative_path == "."
and doc is Null: doc = nested_value elif relative_path == ".": doc[push_name] = unwraplist([v[push_name] for v in nested_value]) elif doc is Null: doc = Data() doc[relative_path] = unwraplist(nested_value) else: doc[relative_path] = unwraplist(nested_value) output.append(doc) try: row = rows.pop() except IndexError: return output cols = tuple([i for i in index_to_column.values() if i.push_name != None]) rows = list(reversed(unwrap(result.data))) if rows: row = rows.pop() data = _accumulate_nested(rows, row, primary_doc_details, None, None) else: data = result.data if query.format == "cube": for f, _ in self.sf.tables.items(): if frum.endswith(f) or (test_dots(cols) and isinstance(query.select, list)): num_rows = len(result.data) num_cols = MAX([c.push_column for c in cols]) + 1 if len(cols) else 0 map_index_to_name = {c.push_column: c.push_column_name for c in cols} temp_data = [[None]*num_rows for _ in range(num_cols)] for rownum, d in enumerate(result.data): for c in cols: if c.push_child == ".": temp_data[c.push_column][rownum] = c.pull(d) else: column = temp_data[c.push_column][rownum] if column is None: column = temp_data[c.push_column][rownum] = Data() column[c.push_child] = c.pull(d) output = Data( meta={"format": "cube"}, data={n: temp_data[c] for c, n in map_index_to_name.items()}, edges=[{ "name": "rownum", "domain": { "type": "rownum", "min": 0, "max": num_rows, "interval": 1 } }] ) return output if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp): num_rows = len(data) map_index_to_name = {c.push_column: c.push_column_name for c in cols} temp_data = Data() for rownum, d in enumerate(data): for k, v in d.items(): if temp_data[k] == None: temp_data[k] = [None] * num_rows temp_data[k][rownum] = v return Data( meta={"format": "cube"}, data={n: temp_data[literal_field(n)] for c, n in map_index_to_name.items()}, edges=[{ "name": "rownum", "domain": { "type": "rownum", "min": 0, "max": num_rows, "interval": 1 } }] ) else: num_rows =
len(data) map_index_to_name = {c.push_column: c.push_column_name for c in cols} temp_data = [data] return Data( meta={"format": "cube"}, data={n: temp_data[c] for c, n in map_index_to_name.items()}, edges=[{ "name": "rownum", "domain": { "type": "rownum", "min": 0, "max": num_rows, "interval": 1 } }] ) elif query.format == "table": for f, _ in self.sf.tables.items(): if frum.endswith(f): num_column = MAX([c.push_column for c in cols])+1 header = [None]*num_column for c in cols: header[c.push_column] = c.push_column_name output_data = [] for d in result.data: row = [None] * num_column for c in cols: set_column(row, c.push_column, c.push_child, c.pull(d)) output_data.append(row) return Data( meta={"format": "table"}, header=header, data=output_data ) if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp): num_rows = len(data) column_names= [None]*(max(c.push_column for c in cols) + 1) for c in cols: column_names[c.push_column] = c.push_column_name temp_data = [] for rownum, d in enumerate(data): row =[None] * len(column_names) for i, (k, v) in enumerate(sorted(d.items())): for c in cols: if k==c.push_name: row[c.push_column] = v temp_data.append(row) return Data( meta={"format": "table"}, header=column_names, data=temp_data ) else: column_names = listwrap(query.select).name return Data( meta={"format": "table"}, header=column_names, data=[[d] for d in data] ) else: for f, _ in self.sf.tables.items(): if frum.endswith(f) or (test_dots(cols) and isinstance(query.select, list)): data = [] for d in result.data: row = Data() for c in cols: if c.push_child == ".": row[c.push_name] = c.pull(d) elif c.num_push_columns: tuple_value = row[c.push_name] if not tuple_value: tuple_value = row[c.push_name] = [None] * c.num_push_columns tuple_value[c.push_child] = c.pull(d) elif not isinstance(query.select, list): # select is value type row[c.push_child]=c.pull(d) else: row[c.push_name][c.push_child] = c.pull(d) data.append(row) return Data( meta={"format":
"list"}, data=data ) if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp): temp_data=[] for rownum, d in enumerate(data): row = {} for k, v in d.items(): for c in cols: if c.push_name==c.push_column_name==k: row[c.push_column_name] = v elif c.push_name==k and c.push_column_name!=k: row[c.push_column_name] = v temp_data.append(row) return Data( meta={"format": "list"}, data=temp_data ) else: return Data( meta={"format": "list"}, data=data )
# NOTE(review): documentation only — the code below is byte-identical to the
# original.  Applies a jx-style {"set": s, "clear": c, "where": w} update:
# rejects deep (nested-path) updates, registers brand-new columns, rebuilds
# nested child tables via DELETE + INSERT...SELECT (parent keys joined to
# generated child rows), then issues the final UPDATE on the fact table.
# NOTE(review): `nested_tables[nested_column_name]` is read but
# nested_tables is not defined anywhere in this method — presumably an
# attribute or module global elsewhere; verify (possible NameError on the
# nested-value path).
def update(self, command): """ :param command: EXPECTING dict WITH {"set": s, "clear": c, "where": w} FORMAT """ command = wrap(command) # REJECT DEEP UPDATES touched_columns = command.set.keys() | set(listwrap(command['clear'])) for c in self.get_leaves(): if c.name in touched_columns and c.nested_path and len( c.name) > len(c.nested_path[0]): Log.error("Deep update not supported") # ADD NEW COLUMNS where = jx_expression(command.where) _vars = where.vars() _map = { v: c.es_column for v in _vars for c in self.columns.get(v, Null) if c.type not in STRUCT } where_sql = where.map(_map).to_sql() new_columns = set(command.set.keys()) - set(self.columns.keys()) for new_column_name in new_columns: nested_value = command.set[new_column_name] ctype = get_type(nested_value) column = Column(names={".": new_column_name}, type=ctype, es_index=self.sf.fact, es_column=typed_column(new_column_name, ctype)) self.add_column(column) # UPDATE THE NESTED VALUES for nested_column_name, nested_value in command.set.items(): if get_type(nested_value) == "nested": nested_table_name = concat_field(self.sf.fact, nested_column_name) nested_table = nested_tables[nested_column_name] self_primary_key = ",".join( quote_table(c.es_column) for u in self.uid for c in self.columns[u]) extra_key_name = UID_PREFIX + "id" + text_type(len(self.uid)) extra_key = [e for e in nested_table.columns[extra_key_name]][0] sql_command = "DELETE FROM " + quote_table(nested_table.name) + \
"\nWHERE EXISTS (" + \
"\nSELECT 1 " + \
"\nFROM " + quote_table(nested_table.name) + " n" + \
"\nJOIN (" + \
"\nSELECT " + self_primary_key + \
"\nFROM " + quote_table(self.sf.fact) + \
"\nWHERE " + where_sql + \
"\n) t ON " + \
" AND ".join( "t." + quote_table(c.es_column) + " = n."
+ quote_table(c.es_column) for u in self.uid for c in self.columns[u] ) + \
")" self.db.execute(sql_command) # INSERT NEW RECORDS if not nested_value: continue doc_collection = {} for d in listwrap(nested_value): nested_table.flatten(d, Data(), doc_collection, path=nested_column_name) prefix = "INSERT INTO " + quote_table(nested_table.name) + \
"(" + \
self_primary_key + "," + \
quote_column(extra_key) + "," + \
",".join( quote_table(c.es_column) for c in doc_collection.get(".", Null).active_columns ) + ")" # BUILD THE PARENT TABLES parent = "\nSELECT " + \
self_primary_key + \
"\nFROM " + quote_table(self.sf.fact) + \
"\nWHERE " + jx_expression(command.where).to_sql() # BUILD THE RECORDS children = " UNION ALL ".join( "\nSELECT " + quote_value(i) + " " + quote_table(extra_key.es_column) + "," + ",".join( quote_value(row[c.name]) + " " + quote_table(c.es_column) for c in doc_collection.get(".", Null).active_columns) for i, row in enumerate( doc_collection.get(".", Null).rows)) sql_command = prefix + \
"\nSELECT " + \
",".join( "p." + quote_table(c.es_column) for u in self.uid for c in self.columns[u] ) + "," + \
"c." + quote_column(extra_key) + "," + \
",".join( "c."
+ quote_table(c.es_column) for c in doc_collection.get(".", Null).active_columns ) + \
"\nFROM (" + parent + ") p " + \
"\nJOIN (" + children + \
"\n) c on 1=1" self.db.execute(sql_command) # THE CHILD COLUMNS COULD HAVE EXPANDED # ADD COLUMNS TO SELF for n, cs in nested_table.columns.items(): for c in cs: column = Column(names={".": c.name}, type=c.type, es_index=c.es_index, es_column=c.es_column, nested_path=[nested_column_name] + c.nested_path) if c.name not in self.columns: self.columns[column.name] = {column} elif c.type not in [ c.type for c in self.columns[c.name] ]: self.columns[column.name].add(column) command = ( "UPDATE " + quote_table(self.sf.fact) + " SET " + ",\n".join([ quote_column(c) + "=" + quote_value(get_if_type(v, c.type)) for k, v in command.set.items() if get_type(v) != "nested" for c in self.columns[k] if c.type != "nested" and len(c.nested_path) == 1 ] + [ quote_column(c) + "=NULL" for k in listwrap(command['clear']) if k in self.columns for c in self.columns[k] if c.type != "nested" and len(c.nested_path) == 1 ]) + " WHERE " + where_sql) self.db.execute(command)
def execute(self, sql, params=None): if params: for p in params: sql = sql.replace('?', quote_value(p), 1) return self.transaction.execute(sql)
# NOTE(review): documentation only — the code below is byte-identical to the
# original.  Assigns TUIDs to self.lines at `revision`: reuses TUIDs already
# stored in the `temporal` table, mints new ones for the remaining lines and
# bulk-inserts them (insert_into_db_chunked); sets self.failed_file and
# returns early on any failure.
# NOTE(review): `line_origins[linenum - 1]` indexes by LINE NUMBER, which is
# only safe if line_origins holds an entry for EVERY line of the file — the
# indentation of the append was lost in this file, so whether it sits inside
# the `if` cannot be confirmed here; verify against upstream history.
def create_and_insert_tuids(self, revision): self.replace_line_with_tuidline() line_origins = [] all_new_lines = [] for line_obj in self.lines: line_entry = (line_obj.filename, revision, line_obj.line) if not line_obj.tuid or line_obj.is_new_line: all_new_lines.append(line_obj.line) line_origins.append(line_entry) with self.tuid_service.conn.transaction() as t: # Get the new lines, excluding those that have existing tuids existing_tuids = {} if len(all_new_lines) > 0: try: existing_tuids = { line: tuid for tuid, file, revision, line in t.query( "SELECT tuid, file, revision, line FROM temporal" " WHERE file = " + quote_value(self.filename)+ " AND revision = " + quote_value(revision) + " AND line IN " + quote_set(all_new_lines) ).data } except Exception as e: # Log takes out important output, use print instead self.failed_file = True print("Trying to find new lines: " + str(all_new_lines)) Log.error("Error encountered:", cause=e) insert_entries = [] insert_lines = set(all_new_lines) - set(existing_tuids.keys()) if len(insert_lines) > 0: try: insert_entries = [ (self.tuid_service.tuid(),) + line_origins[linenum-1] for linenum in insert_lines ] insert_into_db_chunked( t, insert_entries, "INSERT INTO temporal (tuid, file, revision, line) VALUES " ) except Exception as e: Log.note( "Failed to insert new tuids (likely due to merge conflict) on {{file}}: {{cause}}", file=self.filename, cause=e ) self.failed_file = True return fmt_inserted_lines = {line: tuid for tuid, _, _, line in insert_entries} for line_obj in self.lines: # If a tuid already exists for this line, # replace, otherwise, use the newly created one. if line_obj.line in existing_tuids: line_obj.tuid = existing_tuids[line_obj.line] elif line_obj.line in fmt_inserted_lines: line_obj.tuid = fmt_inserted_lines[line_obj.line] if not line_obj.tuid: Log.warning( "Cannot find TUID at {{file}} and {{rev}}for: {{line}}", file=self.filename, rev=revision, line=str(line_obj) ) self.failed_file = True return
def get(self, sql, params=None): if params: for p in params: sql = sql.replace('?', quote_value(p), 1) return self.transaction.query(sql).data
def to_sql(self, schema, not_null=False, boolean=False):
    """
    Render this literal as a one-entry SQL selection; the "n" key marks the
    quoted value as the numeric-typed variant.
    """
    quoted = quote_value(self.value)
    selection = {"name": ".", "sql": {"n": quoted}}
    return wrap([selection])
# Fetch TUIDs (temporally-unique ids) for `files` at `revision` on `branch`:
# consult the local `tuid` cache table first, then POST the remainder to the
# remote endpoint, caching its answers.  After 3 consecutive endpoint
# failures the service disables itself.
# NOTE(review): `r.tuids != None` is left as-is — presumably the project's
# Null sentinel compares equal to None, so `is not None` would change
# behavior; confirm against the mo-dots library before "fixing".
# NOTE(review): code below is byte-identical to the original (docstring typo
# NUNMBER -> NUMBER only).
def get_tuids(self, branch, revision, files): """ GET TUIDS FROM ENDPOINT, AND STORE IN DB :param branch: BRANCH TO FIND THE REVISION/FILE :param revision: THE REVISION NUMBER :param files: THE FULL PATHS TO THE FILES :return: MAP FROM FILENAME TO TUID LIST """ # SCRUB INPUTS revision = revision[:12] files = [file.lstrip('/') for file in files] with Timer( "ask tuid service for {{num}} files at {{revision|left(12)}}", { "num": len(files), "revision": revision }, silent=not self.enabled): response = self.db.query( "SELECT file, tuids FROM tuid WHERE revision=" + quote_value(revision) + " AND file IN " + quote_list(files)) found = {file: json2value(tuids) for file, tuids in response.data} try: remaining = set(files) - set(found.keys()) new_response = None if remaining: request = wrap({ "from": "files", "where": { "and": [{ "eq": { "revision": revision } }, { "in": { "path": remaining } }, { "eq": { "branch": branch } }] }, "branch": branch, "meta": { "format": "list", "request_time": Date.now() } }) if self.push_queue is not None: if DEBUG: Log.note( "record tuid request to SQS: {{timestamp}}", timestamp=request.meta.request_time) self.push_queue.add(request) else: if DEBUG: Log.note("no recorded tuid request") if not self.enabled: return found new_response = http.post_json(self.endpoint, json=request, timeout=self.timeout) with self.db.transaction() as transaction: command = "INSERT INTO tuid (revision, file, tuids) VALUES " + sql_list( quote_list((revision, r.path, value2json(r.tuids))) for r in new_response.data if r.tuids != None) if not command.endswith(" VALUES "): transaction.execute(command) self.num_bad_requests = 0 found.update( {r.path: r.tuids for r in new_response.data} if new_response else {}) return found except Exception as e: self.num_bad_requests += 1 Till(seconds=SLEEP_ON_ERROR).wait() if self.enabled and self.num_bad_requests >= 3: self.enabled = False Log.error("TUID service has problems.", cause=e) return found
# NOTE(review): documentation only — the code below is byte-identical to the
# original.  Applies a jx update command to the in-memory column catalog:
# locates the matching Column objects (fast paths keyed on es_index /
# es_column from command.where.eq, slow path via jx.filter), then either
# removes them (clear ".") or mutates their properties, queueing
# EXECUTE/DELETE/UPDATE work items on self.todo for the background persister.
# All catalog mutation happens under self.locker.
def update(self, command): self.dirty = True try: command = wrap(command) DEBUG and Log.note( "Update {{timestamp}}: {{command|json}}", command=command, timestamp=Date(command["set"].last_updated), ) eq = command.where.eq if eq.es_index: if len(eq) == 1: if unwraplist(command.clear) == ".": with self.locker: del self.data[eq.es_index] self.todo.add( ( EXECUTE, "DELETE FROM " + db_table_name + SQL_WHERE + " es_index=" + quote_value(eq.es_index), ) ) return # FASTEST all_columns = self.data.get(eq.es_index, {}).values() with self.locker: columns = [c for cs in all_columns for c in cs] elif eq.es_column and len(eq) == 2: # FASTER all_columns = self.data.get(eq.es_index, {}).values() with self.locker: columns = [ c for cs in all_columns for c in cs if c.es_column == eq.es_column ] else: # SLOWER all_columns = self.data.get(eq.es_index, {}).values() with self.locker: columns = [ c for cs in all_columns for c in cs if all( c[k] == v for k, v in eq.items() ) # THIS LINE IS VERY SLOW ] else: columns = list(self) columns = jx.filter(columns, command.where) with self.locker: for col in columns: DEBUG and Log.note( "update column {{table}}.{{column}}", table=col.es_index, column=col.es_column, ) for k in command["clear"]: if k == ".": self.todo.add((DELETE, col)) lst = self.data[col.es_index] cols = lst[col.name] cols.remove(col) if len(cols) == 0: del lst[col.name] if len(lst) == 0: del self.data[col.es_index] break else: col[k] = None else: # DID NOT DELETE COLUMNM ("."), CONTINUE TO SET PROPERTIES for k, v in command.set.items(): col[k] = v self.todo.add((UPDATE, col)) except Exception as e: Log.error("should not happen", cause=e)