def map_to_sql(self, var=""):
    """
    RETURN A MAP FROM THE RELATIVE AND ABSOLUTE NAME SPACE TO COLUMNS
    """
    origin = self.nested_path[0]
    # MAKE var RELATIVE TO THE ORIGIN OF THIS NESTED PATH
    if startswith_field(var, origin) and origin != var:
        var = relative_field(var, origin)

    fact_dict = {}
    origin_dict = {}
    for key, cols in self.namespace.items():
        for col in cols:
            if col.jx_type in STRUCT:
                continue  # ONLY LEAF COLUMNS ARE MAPPED
            if startswith_field(get_property_name(key), var):
                rel_name = relative_field(col.name, origin)
                origin_dict.setdefault(rel_name, []).append(col)
                if origin != col.nested_path[0]:
                    fact_dict.setdefault(col.name, []).append(col)
            elif origin == var:
                abs_name = concat_field(var, relative_field(col.name, origin))
                origin_dict.setdefault(abs_name, []).append(col)
                if origin != col.nested_path[0]:
                    fact_dict.setdefault(concat_field(var, col.name), []).append(col)
    return set_default(origin_dict, fact_dict)
def _nest_column(self, column):
    """
    MOVE column INTO ITS OWN (NESTED) CHILD TABLE, CREATING THAT TABLE IF
    IT DOES NOT EXIST, AND DROPPING THE COLUMN FROM THE PARENT TABLE.

    :param column: Column whose es_column carries the nested-type marker
    Raises (via Log.error) when column is not a nested type.
    """
    new_path, type_ = untyped_column(column.es_column)
    if type_ != SQL_NESTED_TYPE:
        Log.error("only nested types can be nested")

    destination_table = concat_field(self.fact_name, new_path)
    existing_table = concat_field(self.fact_name, column.nested_path[0])

    # FIND THE INNER COLUMNS WE WILL BE MOVING
    moving_columns = []
    for c in self.columns:
        if destination_table != column.es_index and column.es_column == c.es_column:
            moving_columns.append(c)
            # NOTE(review): sibling versions assign a list here
            # ([new_path] + ...); this assigns new_path directly — confirm
            # the expected type of nested_path
            c.nested_path = new_path
    # TODO: IF THERE ARE CHILD TABLES, WE MUST UPDATE THEIR RELATIONS TOO?

    # LOAD THE COLUMNS
    data = self.namespace.db.about(destination_table)
    if not data:
        # DEFINE A NEW TABLE, WITH uid/parent/order BOOKKEEPING COLUMNS
        # NOTE(review): no space between quoted_* and "INTEGER" — assumes the
        # quoted identifier ends with a quote so SQLite tokenizes it; confirm
        command = (
            SQL_CREATE + quote_column(destination_table) + sql_iso(sql_list([
                quoted_UID + "INTEGER",
                quoted_PARENT + "INTEGER",
                quoted_ORDER + "INTEGER",
                "PRIMARY KEY " + sql_iso(quoted_UID),
                "FOREIGN KEY " + sql_iso(quoted_PARENT) + " REFERENCES " + quote_column(existing_table) + sql_iso(quoted_UID)
            ]))
        )
        with self.namespace.db.transaction() as t:
            t.execute(command)
        self.add_table([new_path] + column.nested_path)

    # TEST IF THERE IS ANY DATA IN THE NEW NESTED ARRAY
    if not moving_columns:
        return

    column.es_index = destination_table
    with self.namespace.db.transaction() as t:
        # ADD THE MOVED COLUMN TO THE CHILD TABLE
        t.execute(
            "ALTER TABLE " + quote_column(destination_table) +
            " ADD COLUMN " + quote_column(column.es_column) + " " + column.es_type
        )

        # Deleting parent columns
        # SQLite (pre-3.35) CANNOT DROP COLUMNS: rename the table, re-create
        # it without the moved column, then drop the renamed original
        for col in moving_columns:
            column = col.es_column  # NOTE(review): rebinds the parameter name
            tmp_table = "tmp_" + existing_table
            # LIMIT 0 QUERY USED ONLY TO DISCOVER THE CURRENT COLUMN NAMES
            columns = list(map(text, t.query(SQL_SELECT + SQL_STAR + SQL_FROM + quote_column(existing_table) + SQL_LIMIT + SQL_ZERO).header))
            t.execute(
                "ALTER TABLE " + quote_column(existing_table) +
                " RENAME TO " + quote_column(tmp_table)
            )
            t.execute(
                SQL_CREATE + quote_column(existing_table) + SQL_AS +
                SQL_SELECT + sql_list([quote_column(c) for c in columns if c != column]) +
                SQL_FROM + quote_column(tmp_table)
            )
            t.execute("DROP TABLE " + quote_column(tmp_table))
def _nest_column(self, column, new_path):
    """
    MOVE column INTO ITS OWN (NESTED) CHILD TABLE, CREATING THAT TABLE IF
    IT DOES NOT EXIST, AND DROPPING THE COLUMN FROM THE PARENT TABLE.

    :param column: Column to move
    :param new_path: nested path (list, deepest first) for the new location
    """
    destination_table = concat_field(self.fact, new_path[0])
    existing_table = concat_field(self.fact, column.nested_path[0])

    # FIND THE INNER COLUMNS WE WILL BE MOVING
    moving_columns = []
    for c in self._columns:
        if destination_table != column.es_index and column.es_column == c.es_column:
            moving_columns.append(c)
            c.nested_path = new_path
    # TODO: IF THERE ARE CHILD TABLES, WE MUST UPDATE THEIR RELATIONS TOO?

    # DEFINE A NEW TABLE?
    # LOAD THE COLUMNS (PRAGMA table_info RETURNS NOTHING FOR A MISSING TABLE)
    command = "PRAGMA table_info" + sql_iso(
        quote_column(destination_table))
    details = self.db.query(command)

    if not details.data:
        # CREATE THE CHILD TABLE WITH uid/parent/order BOOKKEEPING COLUMNS
        # NOTE(review): no space between quoted_* and "INTEGER" — assumes the
        # quoted identifier ends with a quote so SQLite tokenizes it; confirm
        command = (
            "CREATE TABLE " + quote_column(destination_table) + sql_iso(
                sql_list([
                    quoted_UID + "INTEGER",
                    quoted_PARENT + "INTEGER",
                    quoted_ORDER + "INTEGER",
                    "PRIMARY KEY " + sql_iso(quoted_UID),
                    "FOREIGN KEY " + sql_iso(quoted_PARENT) + " REFERENCES " + quote_column(existing_table) + sql_iso(quoted_UID)
                ])))
        self.db.execute(command)
        self.add_table_to_schema(new_path)

    # TEST IF THERE IS ANY DATA IN THE NEW NESTED ARRAY
    if not moving_columns:
        return

    column.es_index = destination_table
    self.db.execute(
        "ALTER TABLE " + quote_column(destination_table) +
        " ADD COLUMN " + quote_column(column.es_column) + " " +
        sql_types[column.type])

    # Deleting parent columns
    # SQLite (pre-3.35) CANNOT DROP COLUMNS: rename the table, re-create it
    # without the moved column, then drop the renamed original
    for col in moving_columns:
        column = col.es_column  # NOTE(review): rebinds the parameter name
        tmp_table = "tmp_" + existing_table
        # LIMIT 0 QUERY USED ONLY TO DISCOVER THE CURRENT COLUMN NAMES
        columns = list(
            map(
                text_type,
                self.db.query(SQL_SELECT + SQL_STAR + SQL_FROM +
                              quote_column(existing_table) +
                              SQL_LIMIT + SQL_ZERO).header))
        self.db.execute("ALTER TABLE " + quote_column(existing_table) +
                        " RENAME TO " + quote_column(tmp_table))
        self.db.execute(
            "CREATE TABLE " + quote_column(existing_table) + " AS " +
            SQL_SELECT +
            sql_list([quote_column(c) for c in columns if c != column]) +
            SQL_FROM + quote_column(tmp_table))
        self.db.execute("DROP TABLE " + quote_column(tmp_table))
def _edges_op(self, query, frum): schema = frum query = query.copy() # WE WILL BE MARKING UP THE QUERY index_to_column = {} # MAP FROM INDEX TO COLUMN (OR SELECT CLAUSE) outer_selects = [] # EVERY SELECT CLAUSE (NOT TO BE USED ON ALL TABLES, OF COURSE) base_table, path = schema.snowflake.fact_name, schema.nested_path nest_to_alias = { nested_path: "__" + unichr(ord('a') + i) + "__" for i, (nested_path, sub_table) in enumerate(self.snowflake.tables) } tables = [] for n, a in nest_to_alias.items(): if startswith_field(path, n): tables.append({"nest": n, "alias": a}) tables = jx.sort(tables, {"value": {"length": "nest"}}) from_sql = sql_alias(quote_column(concat_field(base_table, tables[0].nest)), tables[0].alias) for previous, t in zip(tables, tables[1::]): from_sql += ( SQL_LEFT_JOIN + sql_alias(quote_column(concat_field(base_table, t.nest)), t.alias) + SQL_ON + quote_column(t.alias, PARENT) + SQL_EQ + quote_column(previous.alias, UID) ) main_filter = SQLang[query.where].to_sql(schema, boolean=True)[0].sql.b # SHIFT THE COLUMN DEFINITIONS BASED ON THE NESTED QUERY DEPTH ons = [] join_types = [] wheres = [] null_ons = [EXISTS_COLUMN + SQL_IS_NULL] groupby = [] null_groupby = [] orderby = [] domains = [] select_clause = [SQL_ONE + EXISTS_COLUMN] + [quote_column(c.es_column) for c in self.snowflake.columns] for edge_index, query_edge in enumerate(query.edges): edge_alias = "e" + text(edge_index) if query_edge.value: edge_values = [p for c in SQLang[query_edge.value].to_sql(schema).sql for p in c.items()] elif not query_edge.value and any(query_edge.domain.partitions.where): case = SQL_CASE for pp, p in enumerate(query_edge.domain.partitions): w = SQLang[p.where].to_sql(schema)[0].sql.b t = quote_value(pp) case += SQL_WHEN + w + SQL_THEN + t case += SQL_ELSE + SQL_NULL + SQL_END # quote value with length of partitions edge_values = [("n", case)] elif query_edge.range: edge_values = SQLang[query_edge.range.min].to_sql(schema)[0].sql.items() + 
SQLang[query_edge.range.max].to_sql(schema)[ 0].sql.items()
def _nest_column(self, column, new_path):
    """
    MOVE column INTO ITS OWN (NESTED) CHILD TABLE, CREATING THAT TABLE IF
    IT DOES NOT EXIST, AND DROPPING THE COLUMN FROM THE PARENT TABLE.

    :param column: Column to move
    :param new_path: dot-delimited path (string) for the new location
    """
    destination_table = concat_field(self.fact, new_path)
    existing_table = concat_field(self.fact, column.nested_path[0])

    # FIND THE INNER COLUMNS WE WILL BE MOVING
    moving_columns = []
    for c in self.columns:
        if destination_table != column.es_index and column.es_column == c.es_column:
            moving_columns.append(c)
            c.nested_path = [new_path] + c.nested_path
    # TODO: IF THERE ARE CHILD TABLES, WE MUST UPDATE THEIR RELATIONS TOO?

    # DEFINE A NEW TABLE?
    # LOAD THE COLUMNS (PRAGMA table_info RETURNS NOTHING FOR A MISSING TABLE)
    command = "PRAGMA table_info(" + quote_table(destination_table) + ")"
    details = self.db.query(command)

    if not details.data:
        # CREATE THE CHILD TABLE WITH uid/parent/order BOOKKEEPING COLUMNS
        command = (
            "CREATE TABLE " + quote_table(destination_table) + "(" +
            (",".join([
                quoted_UID + " INTEGER",
                quoted_PARENT + " INTEGER",
                quoted_ORDER + " INTEGER"
            ])) +
            ", PRIMARY KEY (" + quoted_UID + ")" +
            ", FOREIGN KEY (" + quoted_PARENT + ") REFERENCES " +
            quote_table(existing_table) + "(" + quoted_UID + ")"
            ")")
        self.db.execute(command)
        self.add_table_to_schema([new_path])

    # TEST IF THERE IS ANY DATA IN THE NEW NESTED ARRAY
    if not moving_columns:
        return

    column.es_index = destination_table
    self.db.execute("ALTER TABLE " + quote_table(destination_table) +
                    " ADD COLUMN " + quote_column(column.es_column) + " " +
                    sql_types[column.type])

    # Deleting parent columns
    # SQLite (pre-3.35) CANNOT DROP COLUMNS: rename the table, re-create it
    # without the moved column, then drop the renamed original
    for col in moving_columns:
        column = col.es_column  # NOTE(review): rebinds the parameter name
        tmp_table = "tmp_" + existing_table
        # LIMIT 0 QUERY USED ONLY TO DISCOVER THE CURRENT COLUMN NAMES
        columns = self.db.query("select * from " +
                                quote_table(existing_table) +
                                " LIMIT 0").header
        self.db.execute("ALTER TABLE " + quote_table(existing_table) +
                        " RENAME TO " + quote_table(tmp_table))
        self.db.execute(
            "CREATE TABLE " + quote_table(existing_table) + " AS SELECT " +
            (", ".join([quote_table(c) for c in columns if c != column])) +
            " FROM " + quote_table(tmp_table))
        self.db.execute("DROP TABLE " + quote_table(tmp_table))
def _insert(self, collection):
    """
    WRITE EACH NESTED-PATH GROUP OF ROWS TO ITS SNOWFLAKE TABLE
    (ONE INSERT PER GROUP, USING SELECT ... UNION ALL ... RECORDS)
    """
    fact_name = self.facts.snowflake.fact_name
    for nested_path, details in collection.items():
        table_name = concat_field(fact_name, nested_path)
        if table_name == fact_name:
            # DO NOT REQUIRE PARENT OR ORDER COLUMNS
            meta_columns = [GUID, UID]
        else:
            meta_columns = [UID, PARENT, ORDER]

        active_columns = wrap(list(details.active_columns))
        all_columns = meta_columns + active_columns.es_column

        prefix = (
            "INSERT INTO " +
            quote_column(table_name) +
            sql_iso(sql_list(map(quote_column, all_columns)))
        )
        # BUILD THE RECORDS: ONE SELECT PER ROW, UNION'D TOGETHER
        records = SQL_UNION_ALL.join(
            SQL_SELECT + sql_list(quote_value(r.get(c)) for c in all_columns)
            for r in unwrap(details.rows)
        )
        with self.db.transaction() as t:
            t.execute(prefix + records)
def leaves(self):
    """
    GENERATE (path, type) PAIRS FOR EVERY PRIMITIVE REACHABLE FROM self
    """
    if self in T_PRIMITIVE:
        # A PRIMITIVE IS ITS OWN (ONLY) LEAF
        yield ".", self
        return
    for name, child in self.__dict__.items():
        for path, leaf_type in child.leaves():
            yield concat_field(name, path), leaf_type
def denormalized(self):
    """
    THE INTERNAL STRUCTURE FOR THE COLUMN METADATA IS VERY DIFFERENT FROM
    THE DENORMALIZED PERSPECTIVE. THIS PROVIDES THAT PERSPECTIVE FOR QUERIES
    """
    output = []
    for tname, css in self.data.items():
        for cname, cs in css.items():
            for c in cs:
                for table, name in c.names.items():
                    output.append({
                        "table": concat_field(c.es_index, table),
                        "name": name,
                        "cardinality": c.cardinality,
                        "es_column": c.es_column,
                        "es_index": c.es_index,
                        "last_updated": c.last_updated,
                        "count": c.count,
                        "nested_path": c.nested_path,
                        "type": c.type
                    })

    if not self.meta_schema:
        # DERIVE (AND CACHE) THE SCHEMA FROM THE CURRENT COLUMN LIST
        self.meta_schema = get_schema_from_list("meta\\.columns", output)

    from pyLibrary.queries.containers.list_usingPythonList import ListContainer
    return ListContainer("meta\\.columns", data=output, schema=self.meta_schema)
def _insert(self, collection):
    """
    WRITE EACH NESTED-PATH GROUP OF ROWS TO ITS TABLE USING A SINGLE
    INSERT ... VALUES STATEMENT
    """
    for nested_path, details in collection.items():
        rows = details.rows
        num_rows = len(rows)
        table_name = concat_field(self.name, nested_path)
        if table_name == self.name:
            # DO NOT REQUIRE PARENT OR ORDER COLUMNS
            meta_columns = [GUID, UID]
        else:
            meta_columns = [UID, PARENT, ORDER]

        active_columns = wrap(list(details.active_columns))
        # ONLY THE PRIMITIVE VALUE COLUMNS
        all_columns = meta_columns + active_columns.es_column

        value_rows = sql_list(
            sql_iso(sql_list(quote_value(r.get(c)) for c in all_columns))
            for r in unwrap(rows)
        )
        command = ConcatSQL([
            SQL_INSERT,
            quote_column(table_name),
            sql_iso(sql_list(map(quote_column, all_columns))),
            SQL_VALUES,
            value_rows
        ])
        with self.db.transaction() as t:
            t.execute(command)
def denormalized(self):
    """
    THE INTERNAL STRUCTURE FOR THE COLUMN METADATA IS VERY DIFFERENT FROM
    THE DENORMALIZED PERSPECTIVE. THIS PROVIDES THAT PERSPECTIVE FOR QUERIES
    """
    with self.locker:
        self._update_meta()
        output = []
        for tname, css in self.data.items():
            for cname, cs in css.items():
                for c in cs:
                    if c.jx_type in STRUCT:
                        continue  # ONLY LEAF COLUMNS ARE REPORTED
                    for table, name in c.names.items():
                        output.append({
                            "table": concat_field(c.es_index, untype_path(table)),
                            "name": untype_path(name),
                            "cardinality": c.cardinality,
                            "es_column": c.es_column,
                            "es_index": c.es_index,
                            "last_updated": c.last_updated,
                            "count": c.count,
                            "nested_path": [unnest_path(n) for n in c.nested_path],
                            "es_type": c.es_type,
                            "type": c.jx_type
                        })

    from jx_python.containers.list_usingPythonList import ListContainer
    return ListContainer(
        self.name,
        data=output,
        schema=jx_base.Schema("meta.columns", SIMPLE_METADATA_COLUMNS)
    )
def leaves(self, prefix):
    """
    RETURN THE SET OF LEAF COLUMNS REACHED BY prefix, RELATIVE TO THIS
    NESTED PATH

    :param prefix: dot-delimited property path
    :return: set of Columns (objects and exists-markers excluded)
    """
    full_name = concat_field(self.nested_path, prefix)
    return set(
        c
        for c in self.snowflake.namespace.columns.find(
            self.snowflake.fact_name)
        for k in [c.name]  # BIND c.name TO A SHORT LOCAL NAME
        # NOTE(review): `and` binds tighter than `or`, so this reads as
        # (startswith_field(k, full_name) and k != GUID) or k == full_name
        # — confirm this grouping is intended
        if startswith_field(k, full_name) and k != GUID or k == full_name
        if c.jx_type not in [OBJECT, EXISTS])
def _add_column(self, column):
    """
    ADD column TO ITS TABLE (NESTING FIRST WHEN IT IS A NESTED TYPE)

    Tolerates races where another thread already added the same column:
    a "duplicate column name" failure is accepted as long as the column
    is present in the local metadata.
    """
    cname = column.name
    if column.jx_type == "nested":
        # WE ARE ALSO NESTING
        self._nest_column(column, [cname] + column.nested_path)

    table = concat_field(self.fact_name, column.nested_path[0])

    try:
        with self.namespace.db.transaction() as t:
            # NOTE(review): no explicit spaces around the keywords — assumes
            # quote_column() supplies surrounding whitespace/quotes; confirm
            t.execute("ALTER TABLE" + quote_column(table) + "ADD COLUMN" + quote_column(column.es_column) + column.es_type)
        self.namespace.columns.add(column)
    except Exception as e:
        if "duplicate column name" in e:
            # THIS HAPPENS WHEN MULTIPLE THREADS ARE ASKING FOR MORE COLUMNS TO STORE DATA
            # THIS SHOULD NOT BE A PROBLEM SINCE THE THREADS BOTH AGREE THE COLUMNS SHOULD EXIST
            # BUT, IT WOULD BE NICE TO MAKE LARGER TRANSACTIONS SO THIS NEVER HAPPENS
            # CONFIRM THE COLUMN EXISTS IN LOCAL DATA STRUCTURES
            for c in self.namespace.columns:
                if c.es_column == column.es_column:
                    break
            else:
                # FIX: template was "{{column}]" (unbalanced moustache)
                Log.error("Did not add column {{column}}", column=column.es_column, cause=e)
        else:
            # FIX: template was "{{column}]" (unbalanced moustache)
            Log.error("Did not add column {{column}}", column=column.es_column, cause=e)
def get_pull(column):
    """
    RETURN THE DOCUMENT PATH USED TO EXTRACT THIS COLUMN'S VALUE
    """
    nested_root = column.nested_path[0]
    if nested_root == ".":
        # TOP-LEVEL COLUMNS ARE FOUND UNDER THE "fields" PROPERTY
        return concat_field("fields", literal_field(column.es_column))
    # NESTED COLUMNS ARE ADDRESSED RELATIVE TO THE "_inner" DOCUMENT
    depth = len(split_field(nested_root))
    relative_parts = split_field(column.es_column)[depth:]
    return join_field(["_inner"] + relative_parts)
def remove_facts(self, fact_name):
    """
    DROP EVERY TABLE IN THE fact_name SNOWFLAKE, THEN FORGET ITS COLUMNS
    """
    paths = self.ns.columns._snowflakes[fact_name]
    if not paths:
        return
    with self.db.transaction() as t:
        for p in paths:
            table = concat_field(fact_name, p[0])
            t.execute("DROP TABLE " + quote_column(table))
    self.ns.columns.remove_table(fact_name)
def leaves(self):
    """
    RETURN THE SET OF ALL LEAF PATHS IN THIS SCHEMA
    """
    result = set()
    for name, child_schema in self.more.items():
        for leaf in child_schema.leaves:
            result.add(concat_field(name, leaf))
    if self.element.type is not None:
        # THIS NODE CARRIES A VALUE OF ITS OWN
        result.add('.')
    return result
def leaves(self):
    """
    RETURN THE SET OF ALL LEAF PATHS IN THIS SCHEMA
    """
    collected = {
        concat_field(name, leaf)
        for name, child_schema in self.more.items()
        for leaf in child_schema.leaves
    }
    # THIS NODE CARRIES A VALUE OF ITS OWN
    if self.element.type is not None:
        collected.add('.')
    return collected
def get_nested_path(typed_path):
    """
    CONSTRUCT THE nested_path FROM THE typed_path

    Every ARRAY_KEY step along typed_path marks a deeper nested table;
    deeper paths are prepended so the deepest path comes first.
    """
    steps = split_field(typed_path)
    result = (".",)
    for depth, step in enumerate(steps[:-1]):
        if step != ARRAY_KEY:
            continue
        deeper = concat_field(".", join_field(steps[0:depth + 1]))
        result = (deeper,) + result
    return result
def _normalize_group(edge, dim_index, limit, schema=None):
    """
    :param edge: Not normalized groupby
    :param dim_index: Dimensions are ordered; this is this groupby's index into that order
    :param schema: for context
    :return: a normalized groupby
    """
    if is_text(edge):
        if edge.endswith(".*"):
            # A LEAVES EXPRESSION: EXPAND TO ONE GROUPBY PER LEAF COLUMN
            prefix = edge[:-2]
            if schema:
                output = wrap([
                    {  # BECAUSE THIS IS A GROUPBY, EARLY SPLIT INTO LEAVES WORKS JUST FINE
                        "name": concat_field(prefix, literal_field(relative_field(untype_path(c.name), prefix))),
                        "put": {"name": literal_field(untype_path(c.name))},
                        "value": jx_expression(c.es_column, schema=schema),
                        "allowNulls": True,
                        "domain": {"type": "default"}
                    }
                    for c in schema.leaves(prefix)
                ])
                return output
            else:
                # NO SCHEMA: DEFER THE EXPANSION WITH A LeavesOp
                return wrap([{
                    "name": untype_path(prefix),
                    "put": {"name": literal_field(untype_path(prefix))},
                    "value": LeavesOp(Variable(prefix)),
                    "allowNulls": True,
                    "dim": dim_index,
                    "domain": {"type": "default"}
                }])
        # SIMPLE VARIABLE NAME
        return wrap([{
            "name": edge,
            "value": jx_expression(edge, schema=schema),
            "allowNulls": True,
            "dim": dim_index,
            "domain": Domain(type="default", limit=limit)
        }])
    else:
        # A STRUCTURED EDGE DEFINITION
        edge = wrap(edge)
        if (edge.domain and edge.domain.type != "default") or edge.allowNulls != None:
            Log.error("groupby does not accept complicated domains")
        if not edge.name and not is_text(edge.value):
            Log.error("You must name compound edges: {{edge}}", edge=edge)
        return wrap([{
            "name": coalesce(edge.name, edge.value),
            "value": jx_expression(edge.value, schema=schema),
            "allowNulls": True,
            "dim": dim_index,
            "domain": {"type": "default"}
        }])
def _normalize_group(edge, dim_index, limit, schema=None):
    """
    :param edge: Not normalized groupby
    :param dim_index: Dimensions are ordered; this is this groupby's index into that order
    :param schema: for context
    :return: a normalized groupby
    """
    if isinstance(edge, text_type):
        if edge.endswith(".*"):
            # A LEAVES EXPRESSION: EXPAND TO ONE GROUPBY PER LEAF COLUMN
            prefix = edge[:-2]
            if schema:
                output = wrap([
                    {
                        "name": concat_field(prefix, literal_field(relative_field(untype_path(c.names["."]), prefix))),
                        "put": {"name": literal_field(untype_path(c.names["."]))},
                        "value": jx_expression(c.es_column, schema=schema),
                        "allowNulls": True,
                        "domain": {"type": "default"}
                    }
                    for c in schema.leaves(prefix)
                ])
                return output
            else:
                # NO SCHEMA: GROUP BY THE PREFIX EXPRESSION DIRECTLY
                return wrap([{
                    "name": untype_path(prefix),
                    "put": {"name": literal_field(untype_path(prefix))},
                    "value": jx_expression(prefix, schema=schema),
                    "allowNulls": True,
                    "dim": dim_index,
                    "domain": {"type": "default"}
                }])
        # SIMPLE VARIABLE NAME
        return wrap([{
            "name": edge,
            "value": jx_expression(edge, schema=schema),
            "allowNulls": True,
            "dim": dim_index,
            "domain": Domain(type="default", limit=limit)
        }])
    else:
        # A STRUCTURED EDGE DEFINITION
        edge = wrap(edge)
        if (edge.domain and edge.domain.type != "default") or edge.allowNulls != None:
            Log.error("groupby does not accept complicated domains")
        if not edge.name and not isinstance(edge.value, text_type):
            Log.error("You must name compound edges: {{edge}}", edge=edge)
        return wrap([{
            "name": coalesce(edge.name, edge.value),
            "value": jx_expression(edge.value, schema=schema),
            "allowNulls": True,
            "dim": dim_index,
            "domain": {"type": "default"}
        }])
def map_to_sql(self, var=""):
    """
    RETURN A MAP FROM THE RELATIVE AND ABSOLUTE NAME SPACE TO COLUMNS

    FIX: in the `origin == var` branch the fact_dict membership test checked
    key c.names["."] while appending/assigning under key
    concat_field(var, c.names["."]) — an append to a missing key would raise
    KeyError.  Using setdefault keeps test and write on the same key
    (consistent with the newer version of this method).
    """
    origin = self.nested_path[0]
    # MAKE var RELATIVE TO THE ORIGIN OF THIS NESTED PATH
    if startswith_field(var, origin) and origin != var:
        var = relative_field(var, origin)

    fact_dict = {}
    origin_dict = {}
    for k, cs in self.map.items():
        for c in cs:
            if c.type in STRUCT:
                continue  # ONLY LEAF COLUMNS ARE MAPPED
            if startswith_field(get_property_name(k), var):
                origin_dict.setdefault(c.names[origin], []).append(c)
                if origin != c.nested_path[0]:
                    fact_dict.setdefault(c.names["."], []).append(c)
            elif origin == var:
                origin_dict.setdefault(concat_field(var, c.names[origin]), []).append(c)
                if origin != c.nested_path[0]:
                    fact_dict.setdefault(concat_field(var, c.names["."]), []).append(c)
    return set_default(origin_dict, fact_dict)
def _add_column(self, column):
    """
    ADD column TO ITS TABLE, NESTING FIRST WHEN IT IS A NESTED TYPE
    """
    cname = column.names["."]
    if column.type == "nested":
        # WE ARE ALSO NESTING
        self._nest_column(column, [cname] + column.nested_path)

    table = concat_field(self.fact, column.nested_path[0])
    command = (
        "ALTER TABLE " + quote_column(table) +
        " ADD COLUMN " + quote_column(column.es_column) +
        " " + sql_types[column.type]
    )
    self.db.execute(command)
    self.add_column_to_schema(column)
def _add_column(self, column):
    """
    ADD column TO ITS TABLE, NESTING FIRST WHEN IT IS A NESTED TYPE
    """
    cname = column.name
    if column.jx_type == "nested":
        # WE ARE ALSO NESTING
        self._nest_column(column, [cname] + column.nested_path)

    table = concat_field(self.fact_name, column.nested_path[0])
    # assumes quote_column() supplies surrounding whitespace — TODO confirm
    command = (
        "ALTER TABLE" + quote_column(table) +
        "ADD COLUMN" + quote_column(column.es_column) +
        " " + column.es_type
    )
    with self.namespace.db.transaction() as t:
        t.execute(command)
    self.namespace.columns.add(column)
def get_columns(self, table_name, column_name=None, force=False):
    """
    RETURN METADATA COLUMNS

    :param table_name: dot-delimited; first step is the ES index name
    :param column_name: optional single column to look up
    :param force: refresh the column metadata even if it is recent
    Raises (via Log.error) when no columns match.
    """
    table_path = split_field(table_name)
    es_index_name = table_path[0]
    query_path = join_field(table_path[1:])
    table = self.get_table(es_index_name)[0]
    abs_column_name = None if column_name == None else concat_field(
        query_path, column_name)

    try:
        # LAST TIME WE GOT INFO FOR THIS TABLE
        if not table:
            # FIRST SIGHTING: REGISTER THE TABLE AND PULL ITS COLUMNS
            table = Table(name=es_index_name,
                          url=None,
                          query_path=['.'],
                          timestamp=Date.now())
            with self.meta.tables.locker:
                self.meta.tables.add(table)
            self._get_columns(table=es_index_name)
        elif force or table.timestamp == None or table.timestamp < Date.now(
        ) - MAX_COLUMN_METADATA_AGE:
            # METADATA IS STALE (OR REFRESH FORCED)
            table.timestamp = Date.now()
            self._get_columns(table=es_index_name)

        with self.meta.columns.locker:
            columns = self.meta.columns.find(es_index_name, column_name)
        # NOTE(review): the wait loop is placed OUTSIDE the columns locker so
        # the updater thread can acquire it — confirm against original layout
        if columns:
            columns = jx.sort(columns, "names.\.")
            # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
            while len(self.todo) and not all(columns.get("last_updated")):
                if DEBUG:
                    Log.note(
                        "waiting for columns to update {{columns|json}}",
                        columns=[
                            c.es_index + "." + c.es_column
                            for c in columns
                            if not c.last_updated
                        ])
                Till(seconds=1).wait()
            return columns
    except Exception as e:
        Log.error("Not expected", cause=e)

    if abs_column_name:
        Log.error("no columns matching {{table}}.{{column}}",
                  table=table_name,
                  column=abs_column_name)
    else:
        self._get_columns(table=table_name)  # TO TEST WHAT HAPPENED
        Log.error("no columns for {{table}}?!", table=table_name)
def _drop_column(self, column):
    """
    DROP COLUMN BY RENAMING IT, WITH __ PREFIX TO HIDE IT
    """
    cname = column.name
    if column.jx_type == "nested":
        # WE ARE ALSO NESTING
        self._nest_column(column, [cname] + column.nested_path)

    table = concat_field(self.fact_name, column.nested_path[0])
    hidden_name = "__" + column.es_column
    # assumes quote_column() supplies surrounding whitespace — TODO confirm
    command = (
        "ALTER TABLE" + quote_column(table) +
        "RENAME COLUMN" + quote_column(column.es_column) +
        " TO " + quote_column(hidden_name)
    )
    with self.namespace.db.transaction() as t:
        t.execute(command)
    self.namespace.columns.remove(column)
def _inner(schema, parent_name, indent):
    """
    RENDER schema AS MARKDOWN BULLET LINES, ONE PER PROPERTY, RECURSING
    INTO object/array/nested PROPERTIES
    """
    lines = []
    for prop_name, prop in schema.items():
        full_name = concat_field(parent_name, prop_name)
        entry = indent + "* " + _md_code(full_name)
        if prop.type:
            entry += " - " + _md_italic(prop.type)
        else:
            Log.error("{{full_name}} is missing type", full_name=full_name)
        if prop.description:
            entry += " " + prop.description
        lines.append(entry)
        # CONTAINER TYPES GET THEIR CHILDREN INDENTED BELOW THEM
        if prop.type in ["object", "array", "nested"]:
            lines.extend(_inner(prop.properties, full_name, indent + " "))
    return lines
def _inner(schema, parent_name, indent):
    """
    RENDER schema AS MARKDOWN BULLET LINES, ONE PER PROPERTY, RECURSING
    INTO object/array/nested PROPERTIES

    :param schema: mapping of property name -> property definition
    :param parent_name: dot-delimited path of the enclosing property
    :param indent: leading whitespace for this nesting level
    :return: list of markdown lines
    """
    more_lines = []
    for k,v in schema.items():
        full_name = concat_field(parent_name, k)
        details = indent+"* "+_md_code(full_name)
        if v.type:
            details += " - "+_md_italic(v.type)
        else:
            # EVERY PROPERTY MUST DECLARE A TYPE
            Log.error("{{full_name}} is missing type", full_name=full_name)
        if v.description:
            details += " " + v.description
        more_lines.append(details)
        # CONTAINER TYPES GET THEIR CHILDREN INDENTED BELOW THEM
        if v.type in ["object", "array", "nested"]:
            more_lines.extend(_inner(v.properties, full_name, indent+" "))
    return more_lines
def inject_secrets(config):
    """
    INJECT THE SECRETS INTO THE CONFIGURATION
    :param config: CONFIG DATA

    ************************************************************************
    ** ENSURE YOU HAVE AN ENVIRONMENT VARIABLE SET:
    ** TASKCLUSTER_ROOT_URL = https://community-tc.services.mozilla.com
    ************************************************************************
    """
    with Timer("get secrets"):
        secrets = taskcluster.Secrets(taskcluster.optionsFromEnvironment())
        found = Data()
        for name in listwrap(SECRET_NAMES):
            found[name] = secrets.get(concat_field(SECRET_PREFIX, name))['secret']
        set_default(config, found)
def get_parquet_metadata(self, path='.'):
    """
    OUTPUT PARQUET METADATA COLUMNS
    :param path: FOR INTERNAL USE
    :return: LIST OF SchemaElement
    """
    children = []
    # CHILDREN ARE EMITTED IN SORTED NAME ORDER
    for name, child_schema in sort_using_key(self.more.items(), lambda p: p[0]):
        child_path = concat_field(path, name)
        children.extend(child_schema.get_parquet_metadata(child_path))
    if self.element.type:
        children.append(self.element)
    root = parquet_thrift.SchemaElement(name=path, num_children=len(children))
    return [root] + children
def get_parquet_metadata(self, path='.'):
    """
    OUTPUT PARQUET METADATA COLUMNS
    :param path: FOR INTERNAL USE
    :return: LIST OF SchemaElement
    """
    children = []
    # CHILDREN ARE EMITTED IN SORTED NAME ORDER
    for name, child_schema in sort_using_key(self.more.items(), lambda p: p[0]):
        child_path = concat_field(path, name)
        children.extend(child_schema.get_parquet_metadata(child_path))
    if path == '.':
        # THE ROOT EMITS NO ELEMENT OF ITS OWN
        return children
    self.element.num_children = len(children)
    return [self.element] + children
def inject_secrets(config):
    """
    INJECT THE SECRETS INTO THE CONFIGURATION
    :param config: CONFIG DATA

    ************************************************************************
    ** ENSURE YOU HAVE AN ENVIRONMENT VARIABLE SET:
    ** TASKCLUSTER_ROOT_URL = https://community-tc.services.mozilla.com
    ************************************************************************
    """
    with Timer("get secrets"):
        secrets = taskcluster.Secrets(config.taskcluster)
        found = Data()
        for name in listwrap(SECRET_NAMES):
            secret_name = concat_field(SECRET_PREFIX, name)
            Log.note("get secret named {{name|quote}}", name=secret_name)
            found[name] = secrets.get(secret_name)["secret"]
        set_default(config, found)
def get_columns(self, table_name, column_name=None, force=False):
    """
    RETURN METADATA COLUMNS

    :param table_name: dot-delimited; first step is the ES index name
    :param column_name: optional single column to look up
    :param force: refresh the column metadata even if it is recent
    Raises (via Log.error) when no columns match.
    """
    table_path = split_field(table_name)
    es_index_name = table_path[0]
    query_path = join_field(table_path[1:])
    table = self.get_table(es_index_name)[0]
    abs_column_name = None if column_name == None else concat_field(query_path, column_name)

    try:
        # LAST TIME WE GOT INFO FOR THIS TABLE
        if not table:
            # FIRST SIGHTING: REGISTER THE TABLE AND PULL ITS COLUMNS
            table = Table(
                name=es_index_name,
                url=None,
                query_path=None,
                timestamp=Date.now()
            )
            with self.meta.tables.locker:
                self.meta.tables.add(table)
            self._get_columns(table=es_index_name)
        elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
            # METADATA IS STALE (OR REFRESH FORCED)
            table.timestamp = Date.now()
            self._get_columns(table=es_index_name)

        with self.meta.columns.locker:
            columns = self.meta.columns.find(es_index_name, column_name)
        # NOTE(review): the wait loop is placed OUTSIDE the columns locker so
        # the updater thread can acquire it — confirm against original layout
        if columns:
            columns = jx.sort(columns, "names.\.")
            # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
            while len(self.todo) and not all(columns.get("last_updated")):
                if DEBUG:
                    Log.note("waiting for columns to update {{columns|json}}", columns=[c.es_index+"."+c.es_column for c in columns if not c.last_updated])
                Till(seconds=1).wait()
            return columns
    except Exception as e:
        Log.error("Not expected", cause=e)

    if abs_column_name:
        Log.error("no columns matching {{table}}.{{column}}", table=table_name, column=abs_column_name)
    else:
        self._get_columns(table=table_name)  # TO TEST WHAT HAPPENED
        Log.error("no columns for {{table}}?!", table=table_name)
def values(self, column_name, exclude_type=STRUCT):
    """
    RETURN ALL COLUMNS THAT column_name REFERS TO

    Each query path is tried in order; the first path with any matching
    columns wins.
    """
    column_name = unnest_path(column_name)
    all_columns = self.columns
    for path in self.query_path:
        full_path = untype_path(concat_field(path, column_name))
        matches = [
            c
            for c in all_columns
            if c.jx_type not in exclude_type
            and untype_path(c.name) == full_path
        ]
        if matches:
            return matches
    return []
def add_column(self, column):
    """
    ADD COLUMN, IF IT DOES NOT EXIST ALREADY

    Tracks the column in self.columns (a name -> set-of-Columns map),
    then either creates the nested child table or alters the fact table.
    """
    if column.name not in self.columns:
        self.columns[column.name] = {column}
    elif column.type not in [c.type for c in self.columns[column.name]]:
        # SAME NAME, NEW TYPE: TRACK BOTH
        self.columns[column.name].add(column)

    if column.type == "nested":
        nested_table_name = concat_field(self.name, column.name)
        # MAKE THE TABLE
        from jx_sqlite.query_table import QueryTable
        table = QueryTable(nested_table_name, self.db, exists=False)
        self.nested_tables[column.name] = table
    else:
        # NOTE(review): this ALTER runs even when the column was already
        # known above — presumably duplicates are prevented by callers or
        # surface as a db error; confirm
        self.db.execute(
            "ALTER TABLE " + quote_table(self.name) +
            " ADD COLUMN " + _quote_column(column) + " " +
            column.type)
def add(self, full_name, repetition_type, type):
    """
    :param full_name: dot delimited path to the property (use dot (".") for none)
    :param repetition_type: one of OPTIONAL or NESTED (REQUIRED is not possible)
    :param json_type: the json type to store
    :return:

    NOTE(review): the docstring names the third parameter json_type but the
    signature calls it `type` (shadowing the builtin) — kept for interface
    compatibility.
    """
    base_name = self.element.name
    simple_name = relative_field(full_name, base_name)
    path = split_field(simple_name)
    output = self
    if len(path) == 0:
        # PROPERTY IS THIS NODE ITSELF
        return output._add_one('.', full_name, repetition_type, type)
    else:
        # WALK (AND CREATE, IF MISSING) THE INTERMEDIATE OBJECT NODES
        fname = base_name
        for p in path[:-1]:
            fname = concat_field(fname, p)
            n = output.more.get(p)
            output = n or output._add_one(p, fname, OPTIONAL, object)
        # ATTACH THE LEAF TO THE DEEPEST NODE
        return output._add_one(path[-1], full_name, repetition_type, type)
def convert(self, expr):
    """
    ADD THE ".$value" SUFFIX TO ALL VARIABLES

    :param expr: an Expression, a literal, a variable name, a query, a
                 Mapping, or a list/set/tuple of any of these (Python 2
                 era: basestring, dict.items() indexing)
    :return: the converted equivalent
    """
    if isinstance(expr, Expression):
        # RENAME EVERY VARIABLE THE EXPRESSION MENTIONS
        vars_ = expr.vars()
        rename = {v: concat_field(v, "$value") for v in vars_}
        return expr.map(rename)

    if expr is True or expr == None or expr is False:
        return expr
    elif Math.is_number(expr):
        return expr
    elif expr == ".":
        return "."
    elif is_keyword(expr):
        # TODO: LOOKUP SCHEMA AND ADD ALL COLUMNS WITH THIS PREFIX
        return expr + ".$value"
    elif isinstance(expr, basestring):
        # A STRING THAT IS NOT A VALID VARIABLE NAME
        Log.error("{{name|quote}} is not a valid variable name", name=expr)
    elif isinstance(expr, Date):
        return expr
    elif isinstance(expr, QueryOp):
        return self._convert_query(expr)
    elif isinstance(expr, Mapping):
        if expr["from"]:
            # AN INLINE QUERY
            return self._convert_query(expr)
        elif len(expr) >= 2:
            # ASSUME WE HAVE A NAMED STRUCTURE, NOT AN EXPRESSION
            return wrap({
                name: self.convert(value)
                for name, value in expr.items()
            })
        else:
            # ASSUME SINGLE-CLAUSE EXPRESSION
            k, v = expr.items()[0]
            return self.converter_map.get(k, self._convert_bop)(k, v)
    elif isinstance(expr, (list, set, tuple)):
        return wrap([self.convert(value) for value in expr])
def denormalized(self):
    """
    THE INTERNAL STRUCTURE FOR THE COLUMN METADATA IS VERY DIFFERENT FROM
    THE DENORMALIZED PERSPECTIVE. THIS PROVIDES THAT PERSPECTIVE FOR QUERIES
    """
    output = []
    for tname, css in self.data.items():
        for cname, cs in css.items():
            for c in cs:
                for table, name in c.names.items():
                    record = {
                        "table": concat_field(c.es_index, table),
                        "name": name,
                        "cardinality": c.cardinality,
                        "es_column": c.es_column,
                        "es_index": c.es_index,
                        "last_updated": c.last_updated,
                        "count": c.count,
                        "nested_path": c.nested_path,
                        "type": c.type
                    }
                    output.append(record)

    if not self.meta_schema:
        # DERIVE (AND CACHE) THE SCHEMA FROM THE CURRENT COLUMN LIST
        self.meta_schema = get_schema_from_list("meta\\.columns", output)

    from pyLibrary.queries.containers.list_usingPythonList import ListContainer
    return ListContainer("meta\\.columns", data=output, schema=self.meta_schema)
def convert(self, expr):
    """
    ADD THE ".$value" SUFFIX TO ALL VARIABLES

    :param expr: an Expression, a literal, a variable name, a query, a
                 Mapping, or a list/set/tuple of any of these (Python 2
                 era: basestring, dict.items() indexing)
    :return: the converted equivalent
    """
    if isinstance(expr, Expression):
        # RENAME EVERY VARIABLE THE EXPRESSION MENTIONS
        vars_ = expr.vars()
        rename = {v: concat_field(v, "$value") for v in vars_}
        return expr.map(rename)

    if expr is True or expr == None or expr is False:
        return expr
    elif Math.is_number(expr):
        return expr
    elif expr == ".":
        return "."
    elif is_variable_name(expr):
        # TODO: LOOKUP SCHEMA AND ADD ALL COLUMNS WITH THIS PREFIX
        return expr + ".$value"
    elif isinstance(expr, basestring):
        # A STRING THAT IS NOT A VALID VARIABLE NAME
        Log.error("{{name|quote}} is not a valid variable name", name=expr)
    elif isinstance(expr, Date):
        return expr
    elif isinstance(expr, QueryOp):
        return self._convert_query(expr)
    elif isinstance(expr, Mapping):
        if expr["from"]:
            # AN INLINE QUERY
            return self._convert_query(expr)
        elif len(expr) >= 2:
            # ASSUME WE HAVE A NAMED STRUCTURE, NOT AN EXPRESSION
            return wrap({name: self.convert(value) for name, value in expr.items()})
        else:
            # ASSUME SINGLE-CLAUSE EXPRESSION
            k, v = expr.items()[0]
            return self.converter_map.get(k, self._convert_bop)(k, v)
    elif isinstance(expr, (list, set, tuple)):
        return wrap([self.convert(value) for value in expr])
def name(self):
    """
    THE FULL NAME OF THIS TABLE: SNOWFLAKE NAME PLUS NEAREST QUERY PATH
    """
    base = self.snowflake.name
    return concat_field(base, self.query_path[0])
def es_setop(es, query):
    """
    Execute a set-operation (plain select, no groupby) `query` against `es`,
    splitting the select clause by nested-document path and formatting the
    hits with the formatter registered for query.format.
    """
    schema = query.frum.schema
    query_path = schema.query_path[0]
    # ONE ESSelect PER NESTED PATH; "." IS THE ROOT DOCUMENT
    split_select = {".": ESSelect('.')}

    def get_select(path):
        # LAZILY CREATE THE ESSelect FOR THE GIVEN NESTED PATH
        es_select = split_select.get(path)
        if not es_select:
            es_select = split_select[path] = ESSelect(path)
        return es_select

    selects = wrap([unwrap(s.copy()) for s in listwrap(query.select)])
    new_select = FlatList()
    put_index = 0
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if is_op(select.value, LeavesOp) and is_op(select.value.term, Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                full_name = concat_field(select.name, relative_field(untype_path(c.name), term.var))
                if c.jx_type == NESTED:
                    # NESTED ARRAYS REQUIRE THE WHOLE _source
                    get_select('.').use_source = True
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {"name": literal_field(full_name), "index": put_index, "child": "."},
                        "pull": get_pull_source(c.es_column)
                    })
                    put_index += 1
                else:
                    get_select(c.nested_path[0]).fields.append(c.es_column)
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {"name": literal_field(full_name), "index": put_index, "child": "."}
                    })
                    put_index += 1
        elif is_op(select.value, Variable):
            s_column = select.value.var
            if s_column == ".":
                # PULL ALL SOURCE
                get_select('.').use_source = True
                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "put": {"name": select.name, "index": put_index, "child": "."},
                    "pull": get_pull_source(".")
                })
                continue
            leaves = schema.leaves(s_column)  # LEAVES OF OBJECT
            # nested_selects = {}
            if leaves:
                if any(c.jx_type == NESTED for c in leaves):
                    # PULL WHOLE NESTED ARRAYS
                    get_select('.').use_source = True
                    for c in leaves:
                        if len(c.nested_path) == 1:  # NESTED PROPERTIES ARE IGNORED, CAPTURED BY THESE FIRST LEVEL PROPERTIES
                            pre_child = join_field(decode_property(n) for n in split_field(c.name))
                            new_select.append({
                                "name": select.name,
                                "value": Variable(c.es_column),
                                "put": {"name": select.name, "index": put_index, "child": untype_path(relative_field(pre_child, s_column))},
                                "pull": get_pull_source(c.es_column)
                            })
                else:
                    # PULL ONLY WHAT'S NEEDED
                    for c in leaves:
                        c_nested_path = c.nested_path[0]
                        if c_nested_path == ".":
                            if c.es_column == "_id":
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": "."},
                                    "pull": lambda row: row._id
                                })
                            elif c.jx_type == NESTED:
                                get_select('.').use_source = True
                                pre_child = join_field(decode_property(n) for n in split_field(c.name))
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": untype_path(relative_field(pre_child, s_column))},
                                    "pull": get_pull_source(c.es_column)
                                })
                            else:
                                get_select(c_nested_path).fields.append(c.es_column)
                                pre_child = join_field(decode_property(n) for n in split_field(c.name))
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": untype_path(relative_field(pre_child, s_column))}
                                })
                        else:
                            # COLUMN LIVES IN A NESTED DOCUMENT; ACCUMULATE INNER DOCS
                            es_select = get_select(c_nested_path)
                            es_select.fields.append(c.es_column)
                            child = relative_field(untype_path(relative_field(c.name, schema.query_path[0])), s_column)
                            pull = accumulate_nested_doc(c_nested_path, Variable(relative_field(s_column, unnest_path(c_nested_path))))
                            new_select.append({
                                "name": select.name,
                                "value": select.value,
                                "put": {
                                    "name": select.name,
                                    "index": put_index,
                                    "child": child
                                },
                                "pull": pull
                            })
            else:
                # NO MATCHING COLUMNS: EMIT A NULL PLACEHOLDER
                new_select.append({
                    "name": select.name,
                    "value": Variable("$dummy"),
                    "put": {"name": select.name, "index": put_index, "child": "."}
                })
            put_index += 1
        else:
            # GENERAL EXPRESSION: COMPILE TO PAINLESS script_fields, SPLIT BY PATH
            split_scripts = split_expression_by_path(select.value, schema, lang=Painless)
            for p, script in split_scripts.items():
                es_select = get_select(p)
                es_select.scripts[select.name] = {"script": text_type(Painless[first(script)].partial_eval().to_es_script(schema))}
                new_select.append({
                    "name": select.name,
                    "pull": jx_expression_to_function("fields." + literal_field(select.name)),
                    "put": {"name": select.name, "index": put_index, "child": "."}
                })
                put_index += 1

    # ASSIGN A DEFAULT pull FOR ANY SELECT THAT DID NOT GET ONE ABOVE
    for n in new_select:
        if n.pull:
            continue
        elif is_op(n.value, Variable):
            if get_select('.').use_source:
                n.pull = get_pull_source(n.value.var)
            elif n.value == "_id":
                n.pull = jx_expression_to_function("_id")
            else:
                n.pull = jx_expression_to_function(concat_field("fields", literal_field(n.value.var)))
        else:
            Log.error("Do not know what to do")

    split_wheres = split_expression_by_path(query.where, schema, lang=ES52)
    es_query = es_query_proto(query_path, split_select, split_wheres, schema)
    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)
    es_query.sort = jx_sort_to_es_sort(query.sort, schema)

    with Timer("call to ES", silent=True) as call_timer:
        data = es_post(es, es_query, query.limit)

    T = data.hits.hits

    # Log.note("{{output}}", output=T)
    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]
        with Timer("formatter", silent=True):
            output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
def get_pull(column):
    """
    Return the dotted path used to extract this column's value from an ES hit:
    under "fields" for root-document columns, under "_inner" for columns that
    live inside a nested document.
    """
    nested_root = column.nested_path[0]
    if nested_root != ".":
        # nested column: address it relative to the inner-hit document
        return concat_field("_inner", relative_field(column.es_column, nested_root))
    return concat_field("fields", literal_field(column.es_column))
def parse_properties(parent_index_name, parent_name, esProperties):
    """
    RETURN THE COLUMN DEFINITIONS IN THE GIVEN esProperties OBJECT

    :param parent_index_name: index the columns belong to
    :param parent_name: dotted prefix for all columns found here
    :param esProperties: the ES mapping "properties" object to walk
    :return: FlatList of Column definitions (recursive over nested/object types)
    """
    from pyLibrary.queries.meta import Column

    columns = FlatList()
    for name, property in esProperties.items():
        index_name = parent_index_name
        column_name = concat_field(parent_name, name)

        if property.type == "nested" and property.properties:
            # NESTED TYPE IS A NEW TYPE DEFINITION
            # MARKUP CHILD COLUMNS WITH THE EXTRA DEPTH
            self_columns = parse_properties(index_name, column_name, property.properties)
            for c in self_columns:
                c.nested_path = [column_name] + c.nested_path
            columns.extend(self_columns)
            columns.append(Column(
                es_index=index_name,
                es_column=column_name,
                names={".": column_name},
                type="nested",
                nested_path=ROOT_PATH
            ))
            continue

        if property.properties:
            # PLAIN OBJECT: RECURSE, THEN RECORD THE OBJECT COLUMN ITSELF
            child_columns = parse_properties(index_name, column_name, property.properties)
            columns.extend(child_columns)
            columns.append(Column(
                names={".": column_name},
                es_index=index_name,
                es_column=column_name,
                nested_path=ROOT_PATH,
                type="source" if property.enabled == False else "object"
            ))

        if property.dynamic:
            continue
        if not property.type:
            continue

        if property.type == "multi_field":
            property.type = property.fields[name].type  # PULL DEFAULT TYPE
            for n, p in property.fields.items():
                if n == name:
                    # DEFAULT SUB-FIELD KEEPS THE PLAIN COLUMN NAME
                    columns.append(Column(
                        table=index_name,
                        es_index=index_name,
                        es_column=column_name,
                        name=column_name,
                        nested_path=ROOT_PATH,
                        type=p.type
                    ))
                else:
                    # OTHER SUB-FIELDS GET AN ESCAPED-DOT SUFFIX
                    columns.append(Column(
                        table=index_name,
                        es_index=index_name,
                        es_column=column_name + "\\." + n,
                        name=column_name + "\\." + n,
                        nested_path=ROOT_PATH,
                        type=p.type
                    ))
            continue

        if property.type in ["string", "boolean", "integer", "date", "long", "double"]:
            columns.append(Column(
                es_index=index_name,
                names={".": column_name},
                es_column=column_name,
                nested_path=ROOT_PATH,
                type=property.type
            ))
            if property.index_name and name != property.index_name:
                # ALSO RECORD THE COLUMN UNDER ITS index_name ALIAS
                columns.append(Column(
                    es_index=index_name,
                    es_column=column_name,
                    names={".": column_name},
                    nested_path=ROOT_PATH,
                    type=property.type
                ))
        elif property.enabled == None or property.enabled == False:
            columns.append(Column(
                es_index=index_name,
                names={".": column_name},
                es_column=column_name,
                nested_path=ROOT_PATH,
                type="source" if property.enabled == False else "object"
            ))
        else:
            # FIX: was `path=query_path`, an undefined name in this function
            # (would raise NameError); report the offending column instead
            Log.warning("unknown type {{type}} for property {{path}}", type=property.type, path=column_name)

    return columns
def _value_to_column(value, schema, path, counters, def_level):
    """
    Shred `value` into Dremel-style columnar form at `path`, appending to the
    module-level accumulators `values`, `reps` and `defs` and growing `schema`
    as new properties are discovered.

    :param value: the python value to shred (list, dict, or primitive)
    :param schema: schema node for this path; may be mutated (locked schemas reject new fields)
    :param path: dotted path of `value` in the document
    :param counters: tuple of per-level array indices, used to compute repetition levels
    :param def_level: current definition level
    # NOTE(review): `values`, `reps`, `defs` and the *_TYPE constants are
    # module globals not visible in this chunk — confirm their shape at the
    # module top before relying on this description.
    """
    ptype = type(value)
    ntype, dtype, ltype, jtype, itype, byte_width = python_type_to_all_types[ptype]
    if jtype is NESTED:
        if schema.element.repetition_type != REPEATED:
            Log.error("Expecting {{path|quote}} to be repeated", path=path)
        new_path = path
        if not value:
            # EMPTY ARRAY IS RECORDED AS A MISSING VALUE
            _none_to_column(schema, new_path, get_rep_level(counters), def_level)
        else:
            try:
                new_schema = schema.more.get('.')
                if not new_schema:
                    if schema.locked:
                        # DEFAULT TO REQUIRED ENTRIES
                        new_schema = schema
                        schema.element.repetition_type = REQUIRED
                    else:
                        # GROW THE SCHEMA USING THE FIRST ELEMENT AS A SAMPLE
                        new_path = path
                        new_value = value[0]
                        ptype = type(new_value)
                        new_schema = schema.add(
                            new_path,
                            OPTIONAL,
                            ptype
                        )
                        if new_value is None or python_type_to_json_type[ptype] in PRIMITIVE:
                            # BACKFILL EMPTY COLUMNS FOR ROWS ALREADY PROCESSED
                            values[new_path] = []
                            reps[new_path] = [0] * counters[0]
                            defs[new_path] = [0] * counters[0]
                for k, new_value in enumerate(value):
                    new_counters = counters + (k,)
                    _value_to_column(new_value, new_schema, new_path, new_counters, def_level+1)
            finally:
                # RESTORE repetition_type EVEN IF SHREDDING A CHILD FAILED
                schema.element.repetition_type = REPEATED
    elif jtype is OBJECT:
        if value is None:
            if schema.element.repetition_type == REQUIRED:
                Log.error("{{path|quote}} is required", path=path)
            _none_to_column(schema, path, get_rep_level(counters), def_level)
        else:
            if schema.element.repetition_type == REPEATED:
                Log.error("Expecting {{path|quote}} to be repeated", path=path)
            if schema.element.repetition_type == REQUIRED:
                new_def_level = def_level
            else:
                # OPTIONAL OBJECT ADDS ONE DEFINITION LEVEL
                counters = counters + (0,)
                new_def_level = def_level+1
            # FIRST, KNOWN PROPERTIES (MISSING ONES SHRED AS None)
            for name, sub_schema in schema.more.items():
                new_path = concat_field(path, name)
                new_value = value.get(name, None)
                _value_to_column(new_value, sub_schema, new_path, counters, new_def_level)
            # THEN, PROPERTIES NOT YET IN THE SCHEMA
            for name in set(value.keys()) - set(schema.more.keys()):
                if schema.locked:
                    Log.error("{{path}} is not allowed in the schema", path=path)
                new_path = concat_field(path, name)
                new_value = value.get(name, None)
                ptype = type(new_value)
                sub_schema = schema.add(
                    new_path,
                    REPEATED if isinstance(new_value, list) else OPTIONAL,
                    ptype
                )
                if python_type_to_json_type[ptype] in PRIMITIVE:
                    # BACKFILL EMPTY COLUMNS FOR ROWS ALREADY PROCESSED
                    values[new_path] = []
                    reps[new_path] = [0] * counters[0]
                    defs[new_path] = [0] * counters[0]
                _value_to_column(new_value, sub_schema, new_path, counters, new_def_level)
    else:
        # PRIMITIVE VALUE
        if jtype is STRING:
            value = value.encode('utf8')
        merge_schema(schema, path, value)
        values[path].append(value)
        if schema.element.repetition_type == REQUIRED:
            reps[path].append(get_rep_level(counters))
            defs[path].append(def_level)
        else:
            # OPTIONAL PRESENT VALUE: DEFINITION LEVEL IS ONE DEEPER
            reps[path].append(get_rep_level(counters))
            defs[path].append(def_level + 1)
def extract_rows(es, es_query, query):
    """
    Plan the select clause of `query` onto `es_query` (fields, _source or
    script_fields), execute it against `es`, and format the hits.

    Mutates `es_query` in place; `source` tracks whether values will be pulled
    from "fields" or the whole "_source".
    """
    is_list = isinstance(query.select, list)
    selects = wrap([s.copy() for s in listwrap(query.select)])
    new_select = FlatList()
    schema = query.frum.schema
    columns = schema.columns
    # LEAF COLUMNS LIVE AT THE ROOT (OR ARE THE NESTED ROOT ITSELF)
    leaf_columns = set(c.names["."] for c in columns if c.type not in STRUCT and (c.nested_path[0] == "." or c.es_column == c.nested_path[0]))
    nested_columns = set(c.names["."] for c in columns if c.nested_path[0] != ".")

    i = 0
    source = "fields"
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if isinstance(select.value, LeavesOp):
            new_name_prefix = select.name + "\\." if select.name != "." else ""
            term = select.value.term
            if isinstance(term, Variable):
                if term.var == ".":
                    # SELECT *: MUST PULL THE WHOLE _source
                    es_query.fields = None
                    source = "_source"
                    for cname, cs in schema.lookup.items():
                        for c in cs:
                            if c.type not in STRUCT and c.es_column != "_id":
                                new_name = new_name_prefix + literal_field(cname)
                                new_select.append({
                                    "name": new_name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": new_name, "index": i, "child": "."}
                                })
                                i += 1
                else:
                    # prefix.*: EXPAND TO ALL COLUMNS UNDER THE PREFIX
                    prefix = term.var + "."
                    prefix_length = len(prefix)
                    for cname, cs in schema.lookup.items():
                        if cname.startswith(prefix):
                            suffix = cname[prefix_length:]
                            for c in cs:
                                if c.type not in STRUCT:
                                    if es_query.fields is not None:
                                        es_query.fields.append(c.es_column)
                                    new_name = new_name_prefix + literal_field(suffix)
                                    new_select.append({
                                        "name": new_name,
                                        "value": Variable(c.es_column),
                                        "put": {"name": new_name, "index": i, "child": "."}
                                    })
                                    i += 1
        elif isinstance(select.value, Variable):
            if select.value.var == ".":
                es_query.fields = None
                source = "_source"
                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "put": {"name": select.name, "index": i, "child": "."}
                })
                i += 1
            elif select.value.var == "_id":
                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "pull": "_id",
                    "put": {"name": select.name, "index": i, "child": "."}
                })
                i += 1
            elif select.value.var in nested_columns or [c for c in nested_columns if c.startswith(select.value.var+".")]:
                # TOUCHES A NESTED COLUMN: ONLY _source CAN PROVIDE IT
                es_query.fields = None
                source = "_source"
                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "put": {"name": select.name, "index": i, "child": "."}
                })
                i += 1
            else:
                prefix = select.value.var + "."
                prefix_length = len(prefix)
                net_columns = [c for c in leaf_columns if c.startswith(prefix)]
                if not net_columns:
                    # LEAF
                    if es_query.fields is not None:
                        es_query.fields.append(encode_property(select.value.var))
                    new_select.append({
                        "name": select.name,
                        "value": select.value,
                        "put": {"name": select.name, "index": i, "child": "."}
                    })
                    i += 1
                else:
                    # LEAVES OF OBJECT
                    for cname, cs in schema.lookup.items():
                        if cname.startswith(prefix):
                            for c in cs:
                                if c.type not in STRUCT:
                                    if es_query.fields is not None:
                                        es_query.fields.append(c.es_column)
                                    new_select.append({
                                        "name": select.name,
                                        "value": Variable(c.es_column),
                                        "put": {"name": select.name, "index": i, "child": cname[prefix_length:]}
                                    })
                                    i += 1
        else:
            # GENERAL EXPRESSION: COMPILE TO A SCRIPT FIELD
            es_query.script_fields[literal_field(select.name)] = {"script": select.value.to_ruby()}
            new_select.append({
                "name": select.name,
                "pull": "fields." + literal_field(select.name),
                "put": {"name": select.name, "index": i, "child": "."}
            })
            i += 1

    # ASSIGN DEFAULT pull PATHS FOR SELECTS THAT DID NOT GET ONE
    for n in new_select:
        if n.pull:
            continue
        if source == "_source":
            n.pull = concat_field("_source", n.value.var)
        elif isinstance(n.value, Variable):
            n.pull = concat_field("fields", literal_field(encode_property(n.value.var)))
        else:
            Log.error("Do not know what to do")

    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    T = data.hits.hits
    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
def es_setop(es, query):
    """
    Execute a set-operation `query` against `es` using stored_fields and
    inner_hits.  Nested columns are fetched via nested inner_hits filters
    spliced into `filters[0]`; root columns via stored_fields or "_source".
    """
    schema = query.frum.schema
    es_query, filters = es_query_template(schema.query_path[0])
    nested_filter = None
    set_default(filters[0], query.where.partial_eval().to_esfilter(schema))
    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)
    es_query.stored_fields = FlatList()

    selects = wrap([s.copy() for s in listwrap(query.select)])
    new_select = FlatList()
    schema = query.frum.schema
    # columns = schema.columns
    # nested_columns = set(c.names["."] for c in columns if c.nested_path[0] != ".")
    es_query.sort = jx_sort_to_es_sort(query.sort, schema)

    put_index = 0
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if isinstance(select.value, LeavesOp) and isinstance(select.value.term, Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                full_name = concat_field(select.name, relative_field(untype_path(c.names["."]), term.var))
                if c.jx_type == NESTED:
                    # NESTED ARRAY: FALL BACK TO WHOLE _source
                    es_query.stored_fields = ["_source"]
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {"name": literal_field(full_name), "index": put_index, "child": "."},
                        "pull": get_pull_source(c.es_column)
                    })
                    put_index += 1
                elif c.nested_path[0] != ".":
                    pass  # THE NESTED PARENT WILL CAPTURE THIS
                else:
                    es_query.stored_fields += [c.es_column]
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {"name": literal_field(full_name), "index": put_index, "child": "."}
                    })
                    put_index += 1
        elif isinstance(select.value, Variable):
            s_column = select.value.var
            # LEAVES OF OBJECT
            leaves = schema.leaves(s_column)
            nested_selects = {}
            if leaves:
                if s_column == '.':
                    # PULL ALL SOURCE
                    es_query.stored_fields = ["_source"]
                    new_select.append({
                        "name": select.name,
                        "value": select.value,
                        "put": {"name": select.name, "index": put_index, "child": "."},
                        "pull": get_pull_source(".")
                    })
                elif any(c.jx_type == NESTED for c in leaves):
                    # PULL WHOLE NESTED ARRAYS
                    es_query.stored_fields = ["_source"]
                    for c in leaves:
                        if len(c.nested_path) == 1:  # NESTED PROPERTIES ARE IGNORED, CAPTURED BY THESE FIRST LEVEL PROPERTIES
                            jx_name = untype_path(c.names["."])
                            new_select.append({
                                "name": select.name,
                                "value": Variable(c.es_column),
                                "put": {"name": select.name, "index": put_index, "child": relative_field(jx_name, s_column)},
                                "pull": get_pull_source(c.es_column)
                            })
                else:
                    # PULL ONLY WHAT'S NEEDED
                    for c in leaves:
                        if len(c.nested_path) == 1:
                            jx_name = untype_path(c.names["."])
                            if c.jx_type == NESTED:
                                es_query.stored_fields = ["_source"]
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": relative_field(jx_name, s_column)},
                                    "pull": get_pull_source(c.es_column)
                                })
                            else:
                                es_query.stored_fields += [c.es_column]
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": relative_field(jx_name, s_column)}
                                })
                        else:
                            if not nested_filter:
                                # FIRST NESTED COLUMN: REWRITE filters[0] AS
                                # (original AND (OR of nested inner_hit filters))
                                where = filters[0].copy()
                                nested_filter = [where]
                                for k in filters[0].keys():
                                    filters[0][k] = None
                                set_default(
                                    filters[0],
                                    es_and([where, es_or(nested_filter)])
                                )

                            nested_path = c.nested_path[0]
                            if nested_path not in nested_selects:
                                # NEW NESTED PATH: ADD AN inner_hits CLAUSE
                                where = nested_selects[nested_path] = Data()
                                nested_filter += [where]
                                where.nested.path = nested_path
                                where.nested.query.match_all = {}
                                where.nested.inner_hits._source = False
                                where.nested.inner_hits.stored_fields += [c.es_column]

                                child = relative_field(untype_path(c.names[schema.query_path[0]]), s_column)
                                pull = accumulate_nested_doc(nested_path, Variable(relative_field(s_column, unnest_path(nested_path))))
                                new_select.append({
                                    "name": select.name,
                                    "value": select.value,
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": child
                                    },
                                    "pull": pull
                                })
                            else:
                                # REUSE THE EXISTING inner_hits CLAUSE
                                nested_selects[nested_path].nested.inner_hits.stored_fields += [c.es_column]
            else:
                # NO MATCHING COLUMNS: EMIT A NULL PLACEHOLDER
                new_select.append({
                    "name": select.name,
                    "value": Variable("$dummy"),
                    "put": {"name": select.name, "index": put_index, "child": "."}
                })
            put_index += 1
        else:
            # GENERAL EXPRESSION: COMPILE TO A PAINLESS SCRIPT FIELD
            painless = select.value.partial_eval().to_es_script(schema)
            es_query.script_fields[literal_field(select.name)] = es_script(painless.script(schema))
            new_select.append({
                "name": select.name,
                "pull": jx_expression_to_function("fields." + literal_field(select.name)),
                "put": {"name": select.name, "index": put_index, "child": "."}
            })
            put_index += 1

    # ASSIGN A DEFAULT pull FOR ANY SELECT THAT DID NOT GET ONE ABOVE
    for n in new_select:
        if n.pull:
            continue
        elif isinstance(n.value, Variable):
            if es_query.stored_fields[0] == "_source":
                es_query.stored_fields = ["_source"]
                n.pull = get_pull_source(n.value.var)
            elif n.value == "_id":
                n.pull = jx_expression_to_function("_id")
            else:
                n.pull = jx_expression_to_function(concat_field("fields", literal_field(n.value.var)))
        else:
            Log.error("Do not know what to do")

    with Timer("call to ES") as call_timer:
        data = es_post(es, es_query, query.limit)

    T = data.hits.hits

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]
        with Timer("formatter"):
            output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
def _get_schema_from_list(frum, table_name, parent, nested_path, columns):
    """
    :param frum: The list
    :param table_name: Name of the table this list holds records for
    :param parent: parent path
    :param nested_path: each nested array, in reverse order
    :param columns: map from full name to column definition
    :return: None (mutates `columns` in place, recursing into objects and lists)
    """
    for d in frum:
        row_type = python_type_to_json_type[d.__class__]
        if row_type != "object":
            # EXPECTING PRIMITIVE VALUE
            full_name = parent
            column = columns[full_name]
            if not column:
                column = Column(
                    name=concat_field(table_name, full_name),
                    es_column=full_name,
                    es_index=".",
                    es_type=d.__class__.__name__,
                    jx_type=None,  # WILL BE SET BELOW
                    last_updated=Date.now(),
                    nested_path=nested_path,
                )
                columns.add(column)
            # NOTE(review): this branch passes the class object `d.__class__`
            # while all other call sites pass a class-NAME string — confirm
            # _merge_python_type accepts both forms.
            column.es_type = _merge_python_type(column.es_type, d.__class__)
            column.jx_type = python_type_to_json_type[column.es_type]
        else:
            for name, value in d.items():
                full_name = concat_field(parent, name)
                column = columns[full_name]
                if not column:
                    column = Column(
                        name=concat_field(table_name, full_name),
                        es_column=full_name,
                        es_index=".",
                        es_type=value.__class__.__name__,
                        jx_type=None,  # WILL BE SET BELOW
                        last_updated=Date.now(),
                        nested_path=nested_path,
                    )
                    columns.add(column)
                if is_container(value):  # GET TYPE OF MULTIVALUE
                    v = list(value)
                    if len(v) == 0:
                        this_type = none_type.__name__
                    elif len(v) == 1:
                        this_type = v[0].__class__.__name__
                    else:
                        # MERGE THE TYPES OF ALL MEMBERS
                        this_type = reduce(
                            _merge_python_type,
                            (vi.__class__.__name__ for vi in value)
                        )
                else:
                    this_type = value.__class__.__name__
                column.es_type = _merge_python_type(column.es_type, this_type)
                column.jx_type = python_type_to_json_type[column.es_type]
                if this_type in {"object", "dict", "Mapping", "Data"}:
                    # RECURSE INTO THE CHILD OBJECT, SAME NESTED PATH
                    _get_schema_from_list(
                        [value], table_name, full_name, nested_path, columns
                    )
                elif this_type in {"list", "FlatList"}:
                    # A NEW NESTED ARRAY: PREPEND ITS PATH TO nested_path
                    np = listwrap(nested_path)
                    newpath = unwraplist([join_field(split_field(np[0]) + [name])] + np)
                    _get_schema_from_list(
                        value, table_name, full_name, newpath, columns
                    )
def es_deepop(es, query):
    """
    Execute `query` over a deep (nested) path against `es`.  Hits come back
    once per inner hit; when there are no nested where-conditions a second,
    threaded ES call also fetches documents with NO nested records.
    """
    schema = query.frum.schema
    query_path = schema.query_path[0]

    # TODO: FIX THE GREAT SADNESS CAUSED BY EXECUTING post_expressions
    # THE EXPRESSIONS SHOULD BE PUSHED TO THE CONTAINER:  ES ALLOWS
    # {"inner_hit":{"script_fields":[{"script":""}...]}}, BUT THEN YOU
    # LOOSE "_source" BUT GAIN "fields", FORCING ALL FIELDS TO BE EXPLICIT
    post_expressions = {}
    es_query, es_filters = es_query_template(query_path)

    # SPLIT WHERE CLAUSE BY DEPTH
    wheres = split_expression_by_depth(query.where, schema)
    for i, f in enumerate(es_filters):
        script = AndOp("and", wheres[i]).partial_eval().to_esfilter(schema)
        set_default(f, script)

    if not wheres[1]:
        # WITHOUT NESTED CONDITIONS, WE MUST ALSO RETURN DOCS WITH NO NESTED RECORDS
        more_filter = {
            "and": [
                es_filters[0],
                {"missing": {"field": untype_path(query_path) + "." + EXISTS_TYPE}}
            ]
        }
    else:
        more_filter = None

    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)

    # es_query.sort = jx_sort_to_es_sort(query.sort)
    map_to_es_columns = schema.map_to_es()
    # {c.names["."]: c.es_column for c in schema.leaves(".")}
    query_for_es = query.map(map_to_es_columns)
    es_query.sort = jx_sort_to_es_sort(query_for_es.sort, schema)

    es_query.fields = []

    is_list = isinstance(query.select, list)
    new_select = FlatList()

    i = 0
    for s in listwrap(query.select):
        if isinstance(s.value, LeavesOp) and isinstance(s.value.term, Variable):
            # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
            leaves = schema.leaves(s.value.term.var)
            col_names = set()
            for c in leaves:
                if c.nested_path[0] == ".":
                    if c.jx_type == NESTED:
                        continue
                    es_query.fields += [c.es_column]
                c_name = untype_path(c.names[query_path])
                col_names.add(c_name)
                new_select.append({
                    "name": concat_field(s.name, c_name),
                    "nested_path": c.nested_path[0],
                    "put": {"name": concat_field(s.name, literal_field(c_name)), "index": i, "child": "."},
                    "pull": get_pull_function(c)
                })
                i += 1

            # REMOVE DOTS IN PREFIX IF NAME NOT AMBIGUOUS
            for n in new_select:
                if n.name.startswith("..") and n.name.lstrip(".") not in col_names:
                    n.put.name = n.name = n.name.lstrip(".")
                    col_names.add(n.name)
        elif isinstance(s.value, Variable):
            net_columns = schema.leaves(s.value.var)
            if not net_columns:
                # UNKNOWN COLUMN: SELECT NULL
                new_select.append({
                    "name": s.name,
                    "nested_path": ".",
                    "put": {"name": s.name, "index": i, "child": "."},
                    "pull": NULL
                })
            else:
                for n in net_columns:
                    pull = get_pull_function(n)
                    if n.nested_path[0] == ".":
                        if n.jx_type == NESTED:
                            continue
                        es_query.fields += [n.es_column]

                    # WE MUST FIGURE OUT WHICH NAMESSPACE s.value.var IS USING SO WE CAN EXTRACT THE child
                    for np in n.nested_path:
                        c_name = untype_path(n.names[np])
                        if startswith_field(c_name, s.value.var):
                            child = relative_field(c_name, s.value.var)
                            break
                    else:
                        child = relative_field(untype_path(n.names[n.nested_path[0]]), s.value.var)

                    new_select.append({
                        "name": s.name,
                        "pull": pull,
                        "nested_path": n.nested_path[0],
                        "put": {
                            "name": s.name,
                            "index": i,
                            "child": child
                        }
                    })
            i += 1
        else:
            # GENERAL EXPRESSION: EVALUATED IN PYTHON AFTER THE PULL (SEE TODO ABOVE)
            expr = s.value
            for v in expr.vars():
                for c in schema[v.var]:
                    if c.nested_path[0] == ".":
                        es_query.fields += [c.es_column]
                    # else:
                    #     Log.error("deep field not expected")

            pull_name = EXPRESSION_PREFIX + s.name
            map_to_local = MapToLocal(schema)
            pull = jx_expression_to_function(pull_name)
            post_expressions[pull_name] = compile_expression(expr.map(map_to_local).to_python())

            new_select.append({
                "name": s.name if is_list else ".",
                "pull": pull,
                "value": expr.__data__(),
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1

    # <COMPLICATED> ES needs two calls to get all documents
    more = []
    def get_more(please_stop):
        more.append(es_post(
            es,
            Data(
                query={"filtered": {"filter": more_filter}},
                fields=es_query.fields
            ),
            query.limit
        ))
    if more_filter:
        need_more = Thread.run("get more", target=get_more)

    with Timer("call to ES") as call_timer:
        data = es_post(es, es_query, query.limit)

    # EACH A HIT IS RETURNED MULTIPLE TIMES FOR EACH INNER HIT, WITH INNER HIT INCLUDED
    def inners():
        for t in data.hits.hits:
            for i in t.inner_hits[literal_field(query_path)].hits.hits:
                t._inner = i._source
                for k, e in post_expressions.items():
                    t[k] = e(t)
                yield t
        if more_filter:
            # APPEND DOCS WITH NO NESTED RECORDS FROM THE SECOND CALL
            Thread.join(need_more)
            for t in more[0].hits.hits:
                yield t
    # </COMPLICATED>

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(inners(), new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
def __new__(cls, e=None, query=None, *args, **kwargs):
    """
    Factory dispatch: pick the concrete Decoder subclass for edge `e` of
    `query`, based on the edge's value expression and domain type.  May
    mutate `e.domain` (e.g. replacing a "default" domain with a
    SimpleSetDomain built from known column partitions).
    """
    e.allowNulls = coalesce(e.allowNulls, True)

    if e.value and e.domain.type == "default":
        # if query.groupby:
        #     return object.__new__(DefaultDecoder, e)

        if is_text(e.value):
            Log.error("Expecting Variable or Expression, not plain string")

        if is_op(e.value, LeavesOp):
            return object.__new__(ObjectDecoder)
        elif is_op(e.value, TupleOp):
            # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields
            # JUST PULL THE FIELDS
            if not all(is_op(t, Variable) for t in e.value.terms):
                Log.error("Can only handle variables in tuples")
            e.domain = Data(
                dimension={"fields": e.value.terms}
            )
            return object.__new__(DimFieldListDecoder)
        elif is_op(e.value, Variable):
            schema = query.frum.schema
            cols = schema.leaves(e.value.var)
            if not cols:
                return object.__new__(DefaultDecoder)
            if len(cols) != 1:
                return object.__new__(ObjectDecoder)
            col = first(cols)
            limit = coalesce(e.domain.limit, query.limit, DEFAULT_LIMIT)

            if col.cardinality == None:
                # COLUMN METADATA NOT READY; FALL BACK TO DEFAULT DOMAIN
                DEBUG and Log.warning(
                    "metadata for column {{name|quote}} (id={{id}}) is not ready",
                    name=concat_field(col.es_index, col.es_column),
                    id=id(col)
                )
                e.domain = set_default(DefaultDomain(limit=limit), e.domain.__data__())
                return object.__new__(DefaultDecoder)
            elif col.partitions == None:
                e.domain = set_default(DefaultDomain(limit=limit), e.domain.__data__())
                return object.__new__(DefaultDecoder)
            else:
                DEBUG and Log.note("id={{id}} has parts!!!", id=id(col))
                if col.multi > 1 and len(col.partitions) < 10:
                    return object.__new__(MultivalueDecoder)

                partitions = col.partitions[:limit:]
                if e.domain.sort == -1:
                    partitions = list(reversed(sorted(partitions)))
                else:
                    partitions = sorted(partitions)
                # DELIBERATE FALL-THROUGH: the PARTITION check below now
                # selects SetDecoder for this rebuilt domain
                e.domain = SimpleSetDomain(partitions=partitions, limit=limit)
        else:
            return object.__new__(DefaultDecoder)

    if e.value and e.domain.type in PARTITION:
        return object.__new__(SetDecoder)
    if isinstance(e.domain.dimension, Dimension):
        e.domain = e.domain.dimension.getDomain()
        return object.__new__(SetDecoder)
    if e.value and e.domain.type == "time":
        return object.__new__(TimeDecoder)
    if e.range:
        return object.__new__(GeneralRangeDecoder)
    if e.value and e.domain.type == "duration":
        return object.__new__(DurationDecoder)
    elif e.value and e.domain.type == "range":
        return object.__new__(RangeDecoder)
    elif not e.value and e.domain.dimension.fields:
        # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields
        # JUST PULL THE FIELDS
        fields = e.domain.dimension.fields
        if is_data(fields):
            Log.error("No longer allowed: All objects are expressions")
        else:
            return object.__new__(DimFieldListDecoder)
    elif not e.value and all(e.domain.partitions.where):
        return object.__new__(GeneralSetDecoder)
    else:
        Log.error("domain type of {{type}} is not supported yet", type=e.domain.type)