def _run_test(self, config):
    """
    Generate data described by `config`, convert it with rows_to_columns()
    and assert the resulting values, repetition levels and definition levels.

    :param config: list of {name: nature} objects; only the first item of
                   each object is used
    :return: None (failures are reported through self.assertEqual)
    """

    def first_item(step):
        # dict.items() IS A VIEW ON PYTHON 3 AND CAN NOT BE SLICED DIRECTLY;
        # MATERIALIZE IT FIRST (SAME RESULT ON PYTHON 2)
        return list(step.items())[:1]

    # EACH STEP WRAPS THE GENERATOR OF THE STEP AFTER IT, SO START FROM THE END
    generator = make_const
    for c in reversed(config):
        for name, rep_type in first_item(c):
            generator = rep_type_to_generator[rep_type](name, generator)

    schema = SchemaTree(locked=True)
    path = []
    for c in config:
        for name, rep_type in first_item(c):
            path.append(name)
            schema.add(join_field(path), rep_type, int)

    # THESE TESTS ASSUME ONLY ONE LEAF
    full_name = join_field(
        [name for c in config for name, rep_type in first_item(c)]
    )
    data, values, rep_level, def_level = zip(*list(generator()))
    # EACH GENERATED ITEM CARRIES LISTS; CONCATENATE THEM INTO ONE LIST PER KIND
    expected_values = {full_name: sum(values, [])}
    expected_reps = {full_name: sum(rep_level, [])}
    expected_defs = {full_name: sum(def_level, [])}

    table = rows_to_columns(list(data), schema)

    self.assertEqual(table.values, expected_values)
    self.assertEqual(table.reps, expected_reps)
    self.assertEqual(table.defs, expected_defs)
def _decode_object(self, index, c, parent_path, query_path, expected_vars):
    """
    Generator that walks the members of an object, yielding `index` values
    as it goes.

    :param index: current position, as used by skip_whitespace()/simple_token()
    :param c: the character last read at that position
    :param parent_path: list of property names leading to this object
    :param query_path: remaining property names of the path being iterated
    :param expected_vars: variables wanted; "." means the whole object
    :return: yields index; yields at least once before finishing
    """
    if "." in expected_vars:
        # THE WHOLE OBJECT IS WANTED: ASSIGN IT IN ONE STEP, NO DESCENT
        if len(self.done[0]) <= len(parent_path) and all(
                d == p for d, p in zip(self.done[0], parent_path)):
            Log.error("Can not pick up more variables, iterator is done")

        if query_path:
            Log.error(
                "Can not extract objects that contain the iteration",
                var=join_field(query_path),
            )

        index = self._assign_token(index, c, expected_vars)
        # c, index = self.skip_whitespace(index)
        yield index
        return

    did_yield = False
    while True:
        c, index = self.skip_whitespace(index)
        if c == b",":
            continue
        elif c == b'"':
            # A PROPERTY NAME, WHICH MUST BE FOLLOWED BY A COLON AND A VALUE
            name, index = self.simple_token(index, c)

            c, index = self.skip_whitespace(index)
            if c != b":":
                Log.error("Expecting colon")
            c, index = self.skip_whitespace(index)

            child_expected = needed(name, expected_vars)
            child_path = parent_path + [name]
            if any(child_expected):
                if not query_path:
                    index = self._assign_token(index, c, child_expected)
                elif query_path[0] == name:
                    # THIS PROPERTY IS ON THE ITERATION PATH: DESCEND AND RE-YIELD
                    for index in self._decode_token(
                            index, c, child_path, query_path[1:], child_expected):
                        did_yield = True
                        yield index
                else:
                    if len(self.done[0]) <= len(child_path):
                        Log.error(
                            "Can not pick up more variables, iterator over {{path}}"
                            " is done",
                            path=join_field(self.done[0]),
                        )
                    index = self._assign_token(index, c, child_expected)
            elif query_path and query_path[0] == name:
                # ON THE ITERATION PATH, BUT NO VARIABLES WANTED FROM IT
                # NOTE(review): did_yield IS NOT SET HERE, UNLIKE THE BRANCH ABOVE - CONFIRM INTENDED
                for index in self._decode_token(index, c, child_path, query_path[1:], child_expected):
                    yield index
            else:
                # NOTHING WANTED FROM THIS PROPERTY: SKIP ITS VALUE
                index = self.jump_to_end(index, c)
        elif c == b"}":
            # END OF OBJECT; GUARANTEE AT LEAST ONE YIELD
            if not did_yield:
                yield index
            break
def untype_path(encoded):
    """
    Decode every step of `encoded`, dropping steps that start with
    TYPE_PREFIX. A path that starts with ".." keeps its leading dots,
    minus one, ahead of the decoded remainder.

    :param encoded: dot-delimited path
    :return: the joined, decoded path
    """

    def _untyped(path):
        return join_field(
            decode_property(step)
            for step in split_field(path)
            if not step.startswith(TYPE_PREFIX)
        )

    if not encoded.startswith(".."):
        return _untyped(encoded)

    tail = encoded.lstrip(".")
    leading_dots = len(encoded) - len(tail)
    return ("." * (leading_dots - 1)) + _untyped(tail)
def _edges_op(self, query, frum): query = query.copy() # WE WILL BE MARKING UP THE QUERY index_to_column = {} # MAP FROM INDEX TO COLUMN (OR SELECT CLAUSE) outer_selects = [] # EVERY SELECT CLAUSE (NOT TO BE USED ON ALL TABLES, OF COURSE) frum_path = split_field(frum) base_table = join_field(frum_path[0:1]) path = join_field(frum_path[1:]) nest_to_alias = { nested_path: quote_column("__" + unichr(ord('a') + i) + "__") for i, (nested_path, sub_table) in enumerate(self.sf.tables.items()) } schema = self.sf.tables[relative_field(frum, self.sf.fact)].schema tables = [] for n, a in nest_to_alias.items(): if startswith_field(path, n): tables.append({"nest": n, "alias": a}) tables = jx.sort(tables, {"value": {"length": "nest"}}) from_sql = quote_column(join_field([base_table] + split_field(tables[0].nest))) + tables[0].alias for previous, t in zip(tables, tables[1::]): from_sql += ( SQL_LEFT_JOIN + quote_column(concat_field(base_table, t.nest)) + t.alias + SQL_ON + join_column(t.alias, quoted_PARENT) + " = " + join_column(previous.alias, quoted_UID) ) main_filter = query.where.to_sql(schema, boolean=True)[0].sql.b # SHIFT THE COLUMN DEFINITIONS BASED ON THE NESTED QUERY DEPTH ons = [] join_types = [] wheres = [] null_ons = [EXISTS_COLUMN + SQL_IS_NULL] groupby = [] null_groupby = [] orderby = [] domains = [] select_clause = [SQL_ONE + EXISTS_COLUMN] + [quote_column(c.es_column) for c in self.sf.tables['.'].columns] for edge_index, query_edge in enumerate(query.edges): edge_alias = quote_column("e" + text_type(edge_index)) if query_edge.value: edge_values = [p for c in query_edge.value.to_sql(schema).sql for p in c.items()] elif not query_edge.value and any(query_edge.domain.partitions.where): case = SQL_CASE for pp, p in enumerate(query_edge.domain.partitions): w = p.where.to_sql(schema)[0].sql.b t = quote_value(pp) case += SQL_WHEN + w + SQL_THEN + t case += SQL_ELSE + SQL_NULL + SQL_END # quote value with length of partitions edge_values = [("n", case)] elif 
query_edge.range: edge_values = query_edge.range.min.to_sql(schema)[0].sql.items() + query_edge.range.max.to_sql(schema)[ 0].sql.items()
def _get_schema_from_list(frum, table_name, prefix_path, nested_path, columns):
    """
    Scan every item of `frum`, registering a Column in `columns` for each
    property seen and merging the observed type into it; recurses into
    objects and into lists of objects.

    :param frum: The list
    :param table_name: Name of the table this list holds records for
    :param prefix_path: parent path
    :param nested_path: each nested array, in reverse order
    :param columns: map from full name to column definition (updated in place)
    :return: None
    """

    for d in frum:
        row_type = _type_to_name[d.__class__]
        if row_type != "object":
            # A PRIMITIVE ROW: THE COLUMN IS THE PREFIX PATH ITSELF
            full_name = join_field(prefix_path)
            column = columns[full_name]
            if not column:
                column = Column(names={table_name: full_name}, es_column=full_name, es_index=".", type="undefined", nested_path=nested_path)
                columns.add(column)
            column.type = _merge_type[column.type][row_type]
        else:
            for name, value in d.items():
                full_name = join_field(prefix_path + [name])
                column = columns[full_name]
                if not column:
                    column = Column(names={table_name: full_name}, es_column=full_name, es_index=".", type="undefined", nested_path=nested_path)
                    columns.add(column)
                if isinstance(value, list):
                    # THE TYPE OF A LIST IS TAKEN FROM ITS FIRST ELEMENT ONLY
                    if len(value) == 0:
                        this_type = "undefined"
                    elif len(value) == 1:
                        this_type = _type_to_name[value[0].__class__]
                    else:
                        this_type = _type_to_name[value[0].__class__]
                        # MANY OBJECTS IN A LIST ARE TREATED AS A NESTED ARRAY
                        if this_type == "object":
                            this_type = "nested"
                else:
                    this_type = _type_to_name[value.__class__]
                new_type = _merge_type[column.type][this_type]
                column.type = new_type

                if this_type == "object":
                    _get_schema_from_list([value], table_name, prefix_path + [name], nested_path, columns)
                elif this_type == "nested":
                    # PUSH THE NEW ARRAY PATH ONTO THE FRONT OF THE nested_path
                    np = listwrap(nested_path)
                    newpath = unwraplist(
                        [join_field(split_field(np[0]) + [name])] + np)
                    _get_schema_from_list(value, table_name, prefix_path + [name], newpath, columns)
def _nest_column(self, column, new_path):
    """
    Move `column` (and every column under it) into a new nested table at
    `new_path`, then copy the existing rows that have data into that table.

    :param column: the column that is becoming nested
    :param new_path: path of the new nested table, relative to self.name
    :return: None
    """
    destination_table = join_field([self.name] + split_field(new_path))
    existing_table = join_field([self.name] + split_field(column.nested_path[0]))

    # FIND THE INNER COLUMNS WE WILL BE MOVING
    new_columns = {}
    for cname, cols in self.columns.items():
        if startswith_field(cname, column.names[self.name]):
            new_columns[cname] = set()
            for col in cols:
                new_columns[cname].add(col)
                col.nested_path = [new_path] + col.nested_path

    # TODO: IF THERE ARE CHILD TABLES, WE MUST UPDATE THEIR RELATIONS TOO?

    # DEFINE A NEW TABLE?
    # LOAD THE COLUMNS
    command = "PRAGMA table_info(" + quote_table(destination_table) + ")"
    details = self.db.query(command)
    if details.data:
        # THE DESTINATION TABLE IS NOT EXPECTED TO EXIST YET
        raise Log.error("not expected, new nesting!")
    from jx_sqlite.query_table import QueryTable
    self.nested_tables[new_path] = sub_table = QueryTable(
        destination_table, self.db, exists=False)

    self.db.execute("ALTER TABLE " + quote_table(sub_table.name) +
                    " ADD COLUMN " + quoted_PARENT + " INTEGER")
    self.db.execute("ALTER TABLE " + quote_table(sub_table.name) +
                    " ADD COLUMN " + quote_table(ORDER) + " INTEGER")
    for cname, cols in new_columns.items():
        for c in cols:
            sub_table.add_column(c)

    # TEST IF THERE IS ANY DATA IN THE NEW NESTED ARRAY
    all_cols = [c for _, cols in sub_table.columns.items() for c in cols]
    if not all_cols:
        has_nested_data = "0"
    elif len(all_cols) == 1:
        has_nested_data = _quote_column(all_cols[0]) + " is NOT NULL"
    else:
        has_nested_data = "COALESCE(" + \
            ",".join(_quote_column(c) for c in all_cols) + \
            ") IS NOT NULL"

    # FILL TABLE WITH EXISTING COLUMN DATA
    # THE PARENT IS SET TO THE ROW'S OWN UID, AND THE ORDER TO ZERO
    command = "INSERT INTO " + quote_table(destination_table) + "(\n" + \
        ",\n".join(
            [quoted_UID, quoted_PARENT, quote_table(ORDER)] +
            [_quote_column(c) for _, cols in sub_table.columns.items() for c in cols]
        ) + \
        "\n)\n" + \
        "\nSELECT\n" + ",".join(
            [quoted_UID, quoted_UID, "0"] +
            [_quote_column(c) for _, cols in sub_table.columns.items() for c in cols]
        ) + \
        "\nFROM\n" + quote_table(existing_table) + \
        "\nWHERE\n" + \
        has_nested_data
    self.db.execute(command)
def _get_schema_from_list(frum, table_name, prefix_path, nested_path, columns):
    """
    Scan every item of `frum`, registering a Column in `columns` for each
    property seen and merging the observed type into it; recurses into
    objects and into lists of objects.

    :param frum: The list
    :param table_name: Name of the table this list holds records for
    :param prefix_path: parent path
    :param nested_path: each nested array, in reverse order
    :param columns: map from full name to column definition (updated in place)
    :return: None
    """

    for d in frum:
        row_type = _type_to_name[d.__class__]
        if row_type != "object":
            # A PRIMITIVE ROW: THE COLUMN IS THE PREFIX PATH ITSELF
            full_name = join_field(prefix_path)
            column = columns[full_name]
            if not column:
                column = Column(
                    names={table_name: full_name},
                    es_column=full_name,
                    es_index=".",
                    type="undefined",
                    nested_path=nested_path
                )
                columns.add(column)
            column.type = _merge_type[column.type][row_type]
        else:
            for name, value in d.items():
                full_name = join_field(prefix_path + [name])
                column = columns[full_name]
                if not column:
                    column = Column(
                        names={table_name: full_name},
                        es_column=full_name,
                        es_index=".",
                        type="undefined",
                        nested_path=nested_path
                    )
                    columns.add(column)
                if isinstance(value, list):
                    # THE TYPE OF A LIST IS TAKEN FROM ITS FIRST ELEMENT ONLY
                    if len(value) == 0:
                        this_type = "undefined"
                    elif len(value) == 1:
                        this_type = _type_to_name[value[0].__class__]
                    else:
                        this_type = _type_to_name[value[0].__class__]
                        # MANY OBJECTS IN A LIST ARE TREATED AS A NESTED ARRAY
                        if this_type == "object":
                            this_type = "nested"
                else:
                    this_type = _type_to_name[value.__class__]
                new_type = _merge_type[column.type][this_type]
                column.type = new_type

                if this_type == "object":
                    _get_schema_from_list([value], table_name, prefix_path + [name], nested_path, columns)
                elif this_type == "nested":
                    # PUSH THE NEW ARRAY PATH ONTO THE FRONT OF THE nested_path
                    np = listwrap(nested_path)
                    newpath = unwraplist([join_field(split_field(np[0])+[name])]+np)
                    _get_schema_from_list(value, table_name, prefix_path + [name], newpath, columns)
def getFrameVariables(self, body):
    """
    Build the variable declarations needed by the script `body`: one
    declaration for each column of self.fromData whose name is found in
    `body`, plus "new HashMap()" declarations for the parents of dotted names.

    :param body: script text to scan for column names
    :return: the declarations concatenated into a single string
    """
    contextVariables = []
    columns = self.fromData.columns

    parentVarNames = set()  # ALL PARENTS OF VARIABLES WITH "." IN NAME
    body = body.replace(".?", ".")

    for i, c in enumerate(columns):
        j = body.find(c.name, 0)
        while j >= 0:
            s = j
            j = body.find(c.name, s + 1)

            # CHARACTER BEFORE THE NAME, AND THE EIGHT CHARACTERS BEFORE THE NAME
            test0 = body[s - 1:s + len(c.name) + 1:]
            test3 = body[s - 8:s + len(c.name):]

            # SKIP OCCURRENCES THAT ARE QUOTED, OR ALREADY PREFIXED BY _source.
            if test0[:-1] == "\"" + c.name:
                continue
            if test3 == "_source." + c.name:
                continue

            def defParent(name):
                # DO NOT MAKE THE SAME PARENT TWICE
                if name in parentVarNames:
                    return
                parentVarNames.add(name)

                if len(split_field(name)) == 1:
                    contextVariables.append("Map " + name + " = new HashMap();\n")
                else:
                    # DECLARE THE GRANDPARENT BEFORE THIS PARENT
                    defParent(join_field(split_field(name)[0:-1]))
                    contextVariables.append(name + " = new HashMap();\n")

            # MASK EVERY OCCURRENCE SO LATER COLUMNS DO NOT MATCH INSIDE THIS NAME
            body = body.replace(c.name, "-" * len(c.name))

            if self.isLean or c.useSource:
                if len(split_field(c.name)) > 1:
                    defParent(join_field(split_field(c.name)[0:-1]))
                    contextVariables.append(c.name + " = getSourceValue(\"" + c.name + "\");\n")
                else:
                    contextVariables.append(c.name + " = _source[\"" + c.name + "\"];\n")
            else:
                if len(split_field(c.name)) > 1:
                    defParent(join_field(split_field(c.name)[0:-1]))
                    contextVariables.append(c.name + " = getDocValue(\"" + c.name + "\");\n")
                else:
                    contextVariables.append(c.name + " = getDocValue(\"" + c.name + "\");\n")
            # ONE DECLARATION PER COLUMN IS ENOUGH
            break

    return "".join(contextVariables)
def add_column(c, query_path):
    """
    Stamp `c` with the current time and upsert it once under its own index,
    then once more (as a copy) for every alias in meta.aliases.

    :param c: the column to register
    :param query_path: sequence whose first element is the nested path
    """
    c.last_updated = Date.now()
    nested_steps = split_field(query_path[0])
    c.table = join_field([c.es_index] + nested_steps)

    with self.meta.columns.locker:
        self._upsert_column(c)
        latest = c
        for alias in meta.aliases:
            # EACH ALIAS GETS ITS OWN COPY, DIFFERING ONLY IN table
            latest = copy(latest)
            latest.table = join_field([alias] + nested_steps)
            self._upsert_column(latest)
def set(constants):
    """
    REACH INTO THE MODULES AND OBJECTS TO SET CONSTANTS.
    THINK OF THIS AS PRIMITIVE DEPENDENCY INJECTION FOR MODULES.
    USEFUL FOR SETTING DEBUG FLAGS.

    Each leaf of `constants` is first set through sys.modules. If that
    fails, the path is split at every position into (module, attribute)
    and, when the calling module's name ends with the module part, the
    attribute is set in the caller's globals instead.

    :param constants: nested mapping of dotted path -> new value
    :return: None; Log.error is called when a constant can not be set
    """
    if not constants:
        return
    constants = wrap(constants)

    for k, new_value in constants.leaves():
        errors = []
        try:
            mo_dots_set_attr(sys.modules, k, new_value)
            continue
        except Exception as e:
            errors.append(e)

        # ONE MODULE IS MISSING, THE CALLING MODULE
        was_set = False
        try:
            caller_globals = sys._getframe(1).f_globals
            caller_file = caller_globals["__file__"]
            if not caller_file.endswith(".py"):
                raise Exception("do not know how to handle non-python caller")
            caller_module = caller_file[:-3].replace("/", ".")

            path = split_field(k)
            # TRY EVERY SPLIT OF THE PATH INTO <module prefix>.<attribute name>
            for i in range(1, len(path)):
                prefix = join_field(path[:i])
                name = join_field(path[i:])
                if caller_module.endswith(prefix):
                    old_value = mo_dots_set_attr(caller_globals, name, new_value)
                    was_set = True
                    if DEBUG:
                        from mo_logs import Log

                        Log.note(
                            "Changed {{module}}[{{attribute}}] from {{old_value}} to {{new_value}}",
                            module=prefix,
                            attribute=name,
                            old_value=old_value,
                            new_value=new_value
                        )
                    break
        except Exception as e:
            errors.append(e)

        # errors ALWAYS HOLDS THE FIRST FAILURE AT THIS POINT, SO IT CAN NOT
        # BE USED TO DECIDE; ONLY COMPLAIN WHEN THE FALLBACK ALSO DID NOT WORK
        if not was_set:
            from mo_logs import Log

            Log.error("Can not set constant {{path}}", path=k, cause=errors)
def getFrameVariables(self, body):
    """
    Build the variable declarations needed by the script `body`: one
    declaration for each column of self.fromData whose name is found in
    `body`, plus "new HashMap()" declarations for the parents of dotted names.

    :param body: script text to scan for column names
    :return: the declarations concatenated into a single string
    """
    contextVariables = []
    columns = self.fromData.columns

    parentVarNames = set()    # ALL PARENTS OF VARIABLES WITH "." IN NAME
    body = body.replace(".?", ".")

    for i, c in enumerate(columns):
        j = body.find(c.name, 0)
        while j >= 0:
            s = j
            j = body.find(c.name, s + 1)

            # CHARACTER BEFORE THE NAME, AND THE EIGHT CHARACTERS BEFORE THE NAME
            test0 = body[s - 1: s + len(c.name) + 1:]
            test3 = body[s - 8: s + len(c.name):]

            # SKIP OCCURRENCES THAT ARE QUOTED, OR ALREADY PREFIXED BY _source.
            if test0[:-1] == "\"" + c.name:
                continue
            if test3 == "_source." + c.name:
                continue

            def defParent(name):
                # DO NOT MAKE THE SAME PARENT TWICE
                if name in parentVarNames:
                    return
                parentVarNames.add(name)

                if len(split_field(name)) == 1:
                    contextVariables.append("Map " + name + " = new HashMap();\n")
                else:
                    # DECLARE THE GRANDPARENT BEFORE THIS PARENT
                    defParent(join_field(split_field(name)[0:-1]))
                    contextVariables.append(name + " = new HashMap();\n")

            # MASK EVERY OCCURRENCE SO LATER COLUMNS DO NOT MATCH INSIDE THIS NAME
            body = body.replace(c.name, "-"*len(c.name))

            if self.isLean or c.useSource:
                if len(split_field(c.name)) > 1:
                    defParent(join_field(split_field(c.name)[0:-1]))
                    contextVariables.append(c.name + " = getSourceValue(\"" + c.name + "\");\n")
                else:
                    contextVariables.append(c.name + " = _source[\"" + c.name + "\"];\n")
            else:
                if len(split_field(c.name)) > 1:
                    defParent(join_field(split_field(c.name)[0:-1]))
                    contextVariables.append(c.name + " = getDocValue(\"" + c.name + "\");\n")
                else:
                    contextVariables.append(c.name + " = getDocValue(\"" + c.name + "\");\n")
            # ONE DECLARATION PER COLUMN IS ENOUGH
            break

    return "".join(contextVariables)
def get_pull_stats(stats_name, median_name):
    """
    Return the function made by jx_expression_to_function() for a select
    clause that exposes the statistics found under `stats_name`, and the
    "50.0" value found under `median_name`, using short names.

    :param stats_name: path to the object holding count, sum, min, ...
    :param median_name: path to the object holding values."50.0"
    """
    # (OUTPUT NAME, PROPERTY UNDER stats_name), IN OUTPUT ORDER
    stat_properties = [
        ("count", "count"),
        ("sum", "sum"),
        ("min", "min"),
        ("max", "max"),
        ("avg", "avg"),
        ("sos", "sum_of_squares"),
        ("std", "std_deviation"),
        ("var", "variance"),
    ]
    select = []
    for short_name, prop in stat_properties:
        select.append({"name": short_name, "value": join_field([stats_name, prop])})
    select.append(
        {"name": "median", "value": join_field([median_name, "values", "50.0"])}
    )
    return jx_expression_to_function({"select": select})
def find_container(frum, after):
    """
    Resolve a from-clause to something that can be queried.

    :param frum: table name (text), a data object with a `type` or a
                 `from` property, a container, or anything else
    :param after: passed to namespace.get_columns() to force a reload
    :return: a cached or newly made container, metadata table, wrapped
             query, list container, or `frum` itself when nothing matches
    """
    global namespace
    if not namespace:
        # FIRST USE: CONNECT USING THE DEFAULT SETTINGS
        if not container.config.default.settings:
            Log.error(
                "expecting jx_base.container.config.default.settings to contain default elasticsearch connection info"
            )
        namespace = ElasticsearchMetadata(container.config.default.settings)
    if not frum:
        Log.error("expecting json query expression with from clause")

    # FORCE A RELOAD
    namespace.get_columns(frum, after=after)

    if is_text(frum):
        if frum in container_cache:
            return container_cache[frum]

        steps = split_field(frum)
        if steps[0] != "meta":
            fact_table_name = steps[0]
        elif steps[1] == "columns":
            return namespace.meta.columns.denormalized()
        elif steps[1] == "tables":
            return namespace.meta.tables
        else:
            fact_table_name = join_field(steps[:2])

        type_ = container.config.default.type
        settings = set_default(
            {"alias": fact_table_name, "name": frum, "exists": True},
            container.config.default.settings,
        )
        settings.type = None
        made = container.type2container[type_](settings)
        container_cache[frum] = made
        return made

    if is_data(frum) and frum.type and container.type2container[frum.type]:
        # TODO: Ensure the frum.name is set, so we capture the deep queries
        if not frum.type:
            Log.error("Expecting from clause to have a 'type' property")
        return container.type2container[frum.type](frum.settings)

    if is_data(frum) and (frum["from"] or is_container(frum["from"])):
        from jx_base.query import QueryOp

        return QueryOp.wrap(frum)

    if is_container(frum):
        return ListContainer("test_list", frum)

    return frum
def parse_field(fieldname, data, depth):
    """
    RETURN (first, rest) OF fieldname

    Walks `data` along the steps of `fieldname`, recording each step in the
    enclosing primary_nested / primary_column / primary_branch lists. Stops
    at the first step holding a list (when the name has more than one step).

    :param fieldname: dot-delimited name to follow
    :param data: the object to walk
    :param depth: offset into the primary_* lists for the first step
    :return: (step holding the list, remaining path), or (fieldname, None)
             when no list was met
    """
    col = split_field(fieldname)
    d = data
    for i, c in enumerate(col):
        try:
            d = d[c]
        except Exception as e:
            Log.error("{{name}} does not exist", name=fieldname)
        if is_list(d) and len(col) > 1:
            if len(primary_column) <= depth + i:
                # FIRST TIME AT THIS DEPTH
                primary_nested.append(True)
                primary_column.append(c)
                primary_branch.append(d)
            elif primary_nested[depth] and primary_column[depth + i] != c:
                # NOTE(review): TESTS primary_nested[depth], NOT [depth + i] - CONFIRM INTENDED
                Log.error("only one branch of tree allowed")
            else:
                primary_nested[depth + i] = True
                primary_column[depth + i] = c
                primary_branch[depth + i] = d

            return c, join_field(col[i + 1:])
        else:
            if len(primary_column) <= depth + i:
                # NOT A LIST: RECORD THE VALUE AS A ONE-ELEMENT BRANCH
                primary_nested.append(False)
                primary_column.append(c)
                primary_branch.append([d])
    return fieldname, None
def get_pull(column):
    """
    Return the path used to pull the value of `column`.

    :param column: object with nested_path and es_column
    :return: "fields" + literal es_column when the nearest nested path is
             "."; otherwise "_inner" followed by the steps of es_column
             beyond the depth of the nearest nested path
    """
    nearest = column.nested_path[0]
    if nearest == ".":
        return concat_field("fields", literal_field(column.es_column))

    depth = len(split_field(nearest))
    relative_steps = split_field(column.es_column)[depth:]
    return join_field(["_inner"] + relative_steps)
def get_columns(self, table_name, column_name=None, force=False):
    """
    RETURN METADATA COLUMNS

    Registers the table (first step of `table_name`) if unknown, or
    reloads its columns when `force` is set or the table's timestamp is
    missing or older than MAX_COLUMN_METADATA_AGE.

    :param table_name: dot-delimited table name
    :param column_name: optional column to look for
    :param force: reload the column metadata even if it is recent
    :return: the matching columns sorted by name, or None when none found
    """
    try:
        # LAST TIME WE GOT INFO FOR THIS TABLE
        short_name = join_field(split_field(table_name)[0:1])
        table = self.get_table(short_name)[0]

        if not table:
            table = Table(
                name=short_name,
                url=None,
                query_path=None,
                timestamp=Date.now()
            )
            with self.meta.tables.locker:
                self.meta.tables.add(table)
            self._get_columns(table=short_name)
        elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
            # `== None` IS KEPT AS WRITTEN (PRESUMABLY TO MATCH NULL-LIKE OBJECTS - CONFIRM)
            table.timestamp = Date.now()
            self._get_columns(table=short_name)

        with self.meta.columns.locker:
            columns = self.meta.columns.find(table_name, column_name)
        if columns:
            columns = jx.sort(columns, "name")
            # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
            while len(self.todo) and not all(columns.get("last_updated")):
                if DEBUG:
                    Log.note(
                        "waiting for columns to update {{columns|json}}",
                        columns=[c.table+"."+c.es_column for c in columns if not c.last_updated]
                    )
                Till(seconds=1).wait()
            return columns
    except Exception as e:
        # `except X as e` WORKS ON PYTHON 2.6+ AND 3; THE COMMA FORM DOES NOT PARSE ON 3
        Log.error("Not expected", cause=e)
def typed_encode(value, flake):
    """
    RETURN (typed_value, flake_update, added_nested) TUPLES

    After encoding, each path listed in flake._top_level_fields is moved to
    its top-level field name, and any object left empty by the move is
    cleared.

    :param value: THE RECORD TO CONVERT TO STRICT TYPED FORM
    :param flake: LOOKUP SCHEMA, WILL BE UPDATED WITH CHANGES
    :return: (record, update, nested) TUPLE
    """
    _ = flake.columns  # ENSURE WE HAVE INTERNAL STRUCTURES FILLED
    output, update, nested = _typed_encode(value, flake.schema)
    if update:
        # REFRESH COLUMNS
        flake._columns = None
        _ = flake.columns

    worker = to_data(output)
    for path, field in flake._top_level_fields.items():
        worker[field] = worker[path]
        worker[path] = None

        # DO NOT LEAVE ANY EMPTY OBJECT RESIDUE
        _path = split_field(path)
        for i, _ in jx.reverse(enumerate(_path)):
            sub_path = join_field(_path[:i])
            v = worker[sub_path]
            if not is_data(v) or worker[sub_path].keys():
                # FIRST ANCESTOR THAT STILL HOLDS SOMETHING; STOP CLEANING
                break
            worker[sub_path] = None

    return output, update, nested
def to_bq(self, schema, not_null=False, boolean=False):
    """
    List the columns found under the variable in self.term.

    :param schema: provides columns, nested_path and get_column_name()
    :param not_null: not used here
    :param boolean: not used here
    :return: wrapped list of {"name", "sql"}; name is the column name with
             the term's prefix steps removed
    """
    if not is_op(self.term, Variable):
        Log.error("Can only handle Variable")
    term = self.term.var
    prefix_length = len(split_field(term))

    selected = []
    for c in schema.columns:
        if not startswith_field(c.name, term):
            continue
        # KEEP PRIMITIVE COLUMNS REACHABLE FROM THIS NESTED PATH, OR
        # ANYTHING BUT EXISTS/OBJECT AT EXACTLY THIS NESTED PATH
        if not (
            (
                c.jx_type not in (EXISTS, OBJECT, NESTED)
                and startswith_field(schema.nested_path[0], c.nested_path[0])
            )
            or (
                c.jx_type not in (EXISTS, OBJECT)
                and schema.nested_path[0] == c.nested_path[0]
            )
        ):
            continue
        selected.append({
            "name": join_field(
                split_field(schema.get_column_name(c))[prefix_length:]
            ),
            "sql": Variable(schema.get_column_name(c)).to_bq(schema)[0].sql,
        })

    output = wrap(selected)
    return output
def parse_field(fieldname, data, depth):
    """
    RETURN (first, rest) OF fieldname

    Walks `data` along the steps of `fieldname`, recording each step in the
    enclosing primary_nested / primary_column / primary_branch lists. Stops
    at the first step holding a list (when the name has more than one step).

    :param fieldname: dot-delimited name to follow
    :param data: the object to walk
    :param depth: offset into the primary_* lists for the first step
    :return: (step holding the list, remaining path), or (fieldname, None)
             when no list was met
    """
    col = split_field(fieldname)
    d = data
    for i, c in enumerate(col):
        try:
            d = d[c]
        except Exception as e:
            Log.error("{{name}} does not exist", name=fieldname)
        if is_list(d) and len(col) > 1:
            if len(primary_column) <= depth + i:
                # FIRST TIME AT THIS DEPTH
                primary_nested.append(True)
                primary_column.append(c)
                primary_branch.append(d)
            elif primary_nested[depth] and primary_column[depth + i] != c:
                # NOTE(review): TESTS primary_nested[depth], NOT [depth + i] - CONFIRM INTENDED
                Log.error("only one branch of tree allowed")
            else:
                primary_nested[depth + i] = True
                primary_column[depth + i] = c
                primary_branch[depth + i] = d

            return c, join_field(col[i + 1 :])
        else:
            if len(primary_column) <= depth + i:
                # NOT A LIST: RECORD THE VALUE AS A ONE-ELEMENT BRANCH
                primary_nested.append(False)
                primary_column.append(c)
                primary_branch.append([d])
    return fieldname, None
def unnest_path(encoded):
    """
    Decode every step of `encoded`, dropping the NESTED_TYPE steps.

    :param encoded: dot-delimited path; if it starts with ".." all leading
                    dots are removed first (a path of only dots becomes ".")
    :return: the joined, decoded path
    """
    if encoded.startswith(".."):
        encoded = encoded.lstrip(".") or "."

    decoded_steps = (
        decode_property(step)
        for step in split_field(encoded)
        if step != NESTED_TYPE
    )
    return join_field(decoded_steps)
def get_nested_path(typed_path):
    """
    CONSTRUCT THE nested_path FROM THE typed_path

    :param typed_path: dot-delimited path that may contain ARRAY_KEY steps
    :return: tuple of paths, deepest first, always ending with "."; one
             entry for each ARRAY_KEY step that is not the last step
    """
    steps = split_field(typed_path)
    root = "."
    nested_path = (root,)
    for length, step in enumerate(steps[:-1], 1):
        if step != ARRAY_KEY:
            continue
        # PATH UP TO AND INCLUDING THIS ARRAY STEP GOES ON THE FRONT
        deeper = concat_field(root, join_field(steps[0:length]))
        nested_path = (deeper,) + nested_path
    return nested_path
def add_column_to_schema(self, nest_to_schema, column):
    """
    Give `column` a name relative to every nested table.

    :param nest_to_schema: map from nested path to schema (only the keys
                           are used)
    :param column: column whose `names` map is extended in place
    """
    own_table = literal_field(self.name)
    absolute_name = column.names[own_table]

    for nest, _schema in nest_to_schema.items():
        table_steps = [self.name] + split_field(nest)
        nested_table = literal_field(join_field(table_steps))
        relative_name = relative_field(absolute_name, nest)
        column.names[nested_table] = relative_name
def exists_variable(path):
    """
    RETURN THE VARIABLE THAT WILL INDICATE OBJECT (OR ARRAY) EXISTS (~e~)

    :param path: dot-delimited path to the object or array
    :return: EXISTS_TYPE for an empty path; otherwise the path (without a
             trailing NESTED_TYPE step) with EXISTS_TYPE appended
    """
    steps = split_field(path)
    if not steps:
        return EXISTS_TYPE

    owner = steps[:-1] if steps[-1] == NESTED_TYPE else steps
    return join_field(owner + [EXISTS_TYPE])
def untyped_column(column_name):
    """
    Split a database column name into its plain name and its type suffix.

    :param column_name: DATABASE COLUMN NAME
    :return: (NAME, TYPE) PAIR; TYPE is the last step without its first
             character, or None when the name contains no "$"
    """
    if "$" not in column_name:
        return column_name, None

    steps = split_field(column_name)
    type_step = steps[-1]
    return join_field(steps[:-1]), type_step[1:]
def get_schema_from_list(table_name, frum):
    """
    SCAN THE LIST FOR COLUMN TYPES

    :param table_name: name of the table the list represents
    :param frum: the list of records to scan
    :return: Schema for table_name holding the columns found
    """
    # COLUMNS ARE INDEXED BY THEIR NAME FOR THIS TABLE
    name_key = join_field(["names", table_name])
    columns = UniqueIndex(keys=(name_key,))
    _get_schema_from_list(
        frum,
        table_name,
        prefix_path=[],
        nested_path=ROOT_PATH,
        columns=columns,
    )
    return Schema(table_name=table_name, columns=columns)
def defParent(name):
    """
    Append a "new HashMap()" declaration for `name` to contextVariables,
    declaring its own parents first.

    :param name: dot-delimited name of the parent variable
    """
    # DO NOT MAKE THE SAME PARENT TWICE
    if name in parentVarNames:
        return
    parentVarNames.add(name)

    steps = split_field(name)
    if len(steps) == 1:
        # A TOP-LEVEL NAME IS DECLARED WITH ITS TYPE
        contextVariables.append("Map " + name + " = new HashMap();\n")
        return

    defParent(join_field(steps[0:-1]))
    contextVariables.append(name + " = new HashMap();\n")
def to_sql(self, schema, not_null=False, boolean=False):
    """
    List the columns that schema.map_to_sql() finds for the variable in
    self.term.

    :param schema: provides map_to_sql() and get_column_name()
    :param not_null: not used here
    :param boolean: not used here
    :return: wrapped list of {"name", "sql"}; name is the column name with
             the term's prefix steps removed
    """
    if not isinstance(self.term, Variable):
        Log.error("Can only handle Variable")
    term = self.term.var
    prefix_length = len(split_field(term))

    leaves = []
    for _, cols in schema.map_to_sql(term).items():
        for c in cols:
            relative_steps = split_field(schema.get_column_name(c))[prefix_length:]
            leaves.append({
                "name": join_field(relative_steps),
                "sql": Variable(schema.get_column_name(c)).to_sql(schema)[0].sql
            })
    return wrap(leaves)
def _test_mode_wait(query):
    """
    WAIT FOR METADATA TO ARRIVE ON INDEX

    Marks the columns of the queried index dirty, queues them for
    re-evaluation, then polls once a second (for up to MINUTE) until every
    column has last_updated and cardinality. Any failure is logged as a
    warning, not raised.

    :param query: dict() OF REQUEST BODY
    :return: nothing
    """
    try:
        m = meta.singlton
        now = Date.now()
        end_time = now + MINUTE

        # MARK COLUMNS DIRTY
        m.meta.columns.update({
            "clear": [
                "partitions",
                "count",
                "cardinality",
                "last_updated"
            ],
            "where": {"eq": {"es_index": join_field(split_field(query["from"])[0:1])}}
        })

        # BE SURE THEY ARE ON THE todo QUEUE FOR RE-EVALUATION
        cols = [c for c in m.get_columns(table_name=query["from"], force=True) if c.type not in STRUCT]
        for c in cols:
            Log.note("Mark {{column}} dirty at {{time}}", column=c.names["."], time=now)
            c.last_updated = now - TOO_OLD
            m.todo.push(c)

        # READ THE CLOCK ON EVERY PASS; COMPARING AGAINST THE FIXED `now`
        # WOULD NEVER LET THE DEADLINE EXPIRE
        while end_time > Date.now():
            # GET FRESH VERSIONS
            cols = [c for c in m.get_columns(table_name=query["from"]) if c.type not in STRUCT]
            for c in cols:
                if not c.last_updated or c.cardinality == None:
                    Log.note(
                        "wait for column (table={{col.es_index}}, name={{col.es_column}}) metadata to arrive",
                        col=c
                    )
                    break
            else:
                # EVERY COLUMN HAS ITS METADATA
                break
            Till(seconds=1).wait()
        for c in cols:
            Log.note(
                "fresh column name={{column.name}} updated={{column.last_updated|date}} parts={{column.partitions}}",
                column=c
            )
    except Exception as e:
        # `except X as e` WORKS ON PYTHON 2.6+ AND 3; THE COMMA FORM DOES NOT PARSE ON 3
        Log.warning("could not pickup columns", cause=e)
def _get_columns(self, table=None):
    """
    Parse the mapping properties of the index named by the first step of
    `table`, refreshing the cached cluster metadata first when the index
    is unknown or the cache is older than OLD_METADATA.

    :param table: dot-delimited table name
    """
    # TODO: HANDLE MORE THEN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
    steps = split_field(table)
    es_index = steps[0]
    query_path = join_field(steps[1:])

    index_meta = self.es_metadata.indices[es_index]
    if not index_meta or self.last_es_metadata < Date.now() - OLD_METADATA:
        self.es_metadata = self.default_es.get_metadata(force=True)
        index_meta = self.es_metadata.indices[es_index]

    for _, mapping in index_meta.mappings.items():
        # EVERY MAPPING GETS AN _id PROPERTY BEFORE IT IS PARSED
        mapping.properties["_id"] = {"type": "string", "index": "not_analyzed"}
        self._parse_properties(index_meta.index, mapping, index_meta)
def _worker(start):
    """
    Build the SchemaTree for the element at parquet_schema[index[0]],
    consuming its children by advancing the shared index[0] cursor.

    :param start: cursor value used to compute where this element's
                  children stop (start + num_children)
    :return: the SchemaTree, with one entry in `more` per child
    """
    tree = SchemaTree()
    element = parquet_schema[index[0]]
    tree.element = element

    stop = start + element.num_children
    while index[0] < stop:
        # ONLY THE LAST STEP OF THE CHILD'S NAME IS USED AS THE KEY
        last_step = split_field(parquet_schema[index[0]].name)[-1:]
        child_name = join_field(last_step)
        index[0] += 1
        tree.more[child_name] = _worker(index[0])
    return tree
def get_columns(self, table_name, column_name=None, force=False):
    """
    RETURN METADATA COLUMNS

    Registers the index (first step of `table_name`) if unknown, or reloads
    its columns when `force` is set or the table's timestamp is missing or
    older than MAX_COLUMN_METADATA_AGE.

    :param table_name: dot-delimited name; first step is the index, the
                       rest is the query path
    :param column_name: optional column to look for
    :param force: reload the column metadata even if it is recent
    :return: the matching columns, sorted; Log.error is called when none
             are found
    """
    table_path = split_field(table_name)
    es_index_name = table_path[0]
    query_path = join_field(table_path[1:])
    table = self.get_table(es_index_name)[0]
    # `== None` IS KEPT AS WRITTEN (PRESUMABLY TO MATCH NULL-LIKE OBJECTS - CONFIRM)
    abs_column_name = None if column_name == None else concat_field(
        query_path, column_name)

    try:
        # LAST TIME WE GOT INFO FOR THIS TABLE
        if not table:
            table = Table(name=es_index_name,
                          url=None,
                          query_path=['.'],
                          timestamp=Date.now())
            with self.meta.tables.locker:
                self.meta.tables.add(table)
            self._get_columns(table=es_index_name)
        elif force or table.timestamp == None or table.timestamp < Date.now(
        ) - MAX_COLUMN_METADATA_AGE:
            table.timestamp = Date.now()
            self._get_columns(table=es_index_name)

        with self.meta.columns.locker:
            columns = self.meta.columns.find(es_index_name, column_name)
        if columns:
            # "\\." IS THE SAME TWO CHARACTERS AS BEFORE (BACKSLASH, DOT),
            # WRITTEN WITHOUT AN INVALID ESCAPE SEQUENCE
            columns = jx.sort(columns, "names.\\.")
            # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
            while len(self.todo) and not all(columns.get("last_updated")):
                if DEBUG:
                    Log.note(
                        "waiting for columns to update {{columns|json}}",
                        columns=[
                            c.es_index + "." + c.es_column for c in columns
                            if not c.last_updated
                        ])
                Till(seconds=1).wait()
            return columns
    except Exception as e:
        Log.error("Not expected", cause=e)

    # ONLY REACHED WHEN NO COLUMNS WERE FOUND
    if abs_column_name:
        Log.error("no columns matching {{table}}.{{column}}",
                  table=table_name,
                  column=abs_column_name)
    else:
        self._get_columns(table=table_name)  # TO TEST WHAT HAPPENED
        Log.error("no columns for {{table}}?!", table=table_name)
def _normalize_select_no_context(select, schema=None):
    """
    SAME NORMALIZE, BUT NO SOURCE OF COLUMNS

    :param select: text (taken as the value) or a select clause with
                   value / name / aggregate / default / prefix
    :param schema: passed through to jx_expression()
    :return: copy of the clause with name, value, aggregate and default
             filled in; Null when the clause is empty
    """
    if is_text(select):
        select = Data(value=select)
    else:
        select = to_data(select)

    output = select.copy()
    if not select.value:
        # NO VALUE: A NAMED (OR AGGREGATE-ONLY) SELECT REFERS TO "."
        output.name = coalesce(select.name, select.aggregate)
        if output.name:
            output.value = jx_expression(".", schema=schema)
        elif len(select):
            Log.error(BAD_SELECT, select=select)
        else:
            return Null
    elif is_text(select.value):
        if select.value.endswith("*"):
            # SELECT THE LEAVES UNDER THE PATH THAT PRECEDES THE "*"
            path = split_field(select.value)
            var = join_field(path[:-1])
            name = var.strip(".")
            if not name:
                name = "."
            output.name = coalesce(select.name, select.aggregate, name)
            output.value = LeavesOp(Variable(var), prefix=select.prefix)
        elif select.value == ".":
            output.name = coalesce(select.name, select.aggregate, ".")
            output.value = jx_expression(select.value, schema=schema)
        else:
            # THE NAME DEFAULTS TO THE VALUE WITHOUT ITS LEADING DOTS
            output.name = coalesce(select.name, select.value.lstrip("."),
                                   select.aggregate)
            output.value = jx_expression(select.value, schema=schema)
    elif is_number(output.value):
        if not output.name:
            output.name = text(output.value)
        output.value = jx_expression(select.value, schema=schema)
    else:
        output.value = jx_expression(select.value, schema=schema)

    if not output.name:
        Log.error("expecting select to have a name: {{select}}", select=select)
    if output.name.endswith(".*"):
        Log.error("{{name|quote}} is invalid select", name=output.name)

    # AGGREGATE DEFAULTS TO "none"; THE DEFAULT VALUE COMES FROM THE AGGREGATE
    output.aggregate = coalesce(canonical_aggregates[select.aggregate].name,
                                select.aggregate, "none")
    output.default = coalesce(select.default,
                              canonical_aggregates[output.aggregate].default)
    return output
def __init__(self, table_name, columns):
    """
    :param table_name: THE FACT TABLE, optionally followed by the path to
                       an arm of the snowflake
    :param columns: ALL COLUMNS IN SNOWFLAKE
    """
    self._columns = copy(columns)

    steps = split_field(table_name)
    # USED AS AN EXPLICIT STATEMENT OF PERSPECTIVE IN THE DATABASE
    self.table = join_field(steps[:1])

    # TODO: REPLACE WITH THE nested_path ARRAY
    query_path = join_field(steps[1:])
    if query_path != ".":
        # USE THE es_column OF THE NESTED COLUMN THAT HAS THIS NAME
        nested_name = query_path + "." + NESTED_TYPE
        matches = [
            c for c in columns
            if c.type == NESTED and c.names["."] == nested_name
        ]
        query_path = matches[0].es_column
    self.query_path = query_path

    self.lookup, self.lookup_leaves, self.lookup_variables = _indexer(
        columns, self.query_path)
def unnest_path(encoded):
    """
    Decode `encoded`, dropping a trailing NESTED_TYPE step and every
    non-final step that starts with TYPE_PREFIX; the final step is always
    kept. Leading dots of a path starting with ".." are preserved.

    :param encoded: dot-delimited path
    :return: the joined, decoded path ("." when nothing is left)
    """
    if encoded.startswith(".."):
        tail = encoded.lstrip(".")
        dots = "." * (len(encoded) - len(tail))
        return dots + unnest_path(tail)

    steps = split_field(encoded)
    if not steps:
        return "."
    if steps[-1] == NESTED_TYPE:
        steps = steps[:-1]
        if not steps:
            return "."

    parents = [
        decode_property(step)
        for step in steps[:-1]
        if not step.startswith(TYPE_PREFIX)
    ]
    leaf = [decode_property(steps[-1])]
    return join_field(parents + leaf)
def _decode_object(index, c, parent_path, query_path, expected_vars):
    """
    Generator that walks the members of an object, yielding `index` values
    as it goes.

    :param index: current position, as used by skip_whitespace()/simple_token()
    :param c: the character last read at that position
    :param parent_path: list of property names leading to this object
    :param query_path: remaining property names of the path being iterated
    :param expected_vars: variables wanted; "." means the whole object
    :return: yields index; yields at least once before finishing
    """
    if "." in expected_vars:
        # THE WHOLE OBJECT IS WANTED: ASSIGN IT IN ONE STEP, NO DESCENT
        if len(done[0]) <= len(parent_path) and all(d == p for d, p in zip(done[0], parent_path)):
            Log.error("Can not pick up more variables, iterator is done")

        if query_path:
            Log.error("Can not extract objects that contain the iteration", var=join_field(query_path))

        index = _assign_token(index, c, expected_vars)
        # c, index = skip_whitespace(index)
        yield index
        return

    did_yield = False
    while True:
        c, index = skip_whitespace(index)
        if c == b',':
            continue
        elif c == b'"':
            # A PROPERTY NAME, WHICH MUST BE FOLLOWED BY A COLON AND A VALUE
            name, index = simple_token(index, c)

            c, index = skip_whitespace(index)
            if c != b':':
                Log.error("Expecting colon")
            c, index = skip_whitespace(index)

            child_expected = needed(name, expected_vars)
            child_path = parent_path + [name]
            if any(child_expected):
                if not query_path:
                    index = _assign_token(index, c, child_expected)
                elif query_path[0] == name:
                    # THIS PROPERTY IS ON THE ITERATION PATH: DESCEND AND RE-YIELD
                    for index in _decode_token(index, c, child_path, query_path[1:], child_expected):
                        did_yield = True
                        yield index
                else:
                    if len(done[0]) <= len(child_path):
                        Log.error("Can not pick up more variables, iterator over {{path}} is done", path=join_field(done[0]))
                    index = _assign_token(index, c, child_expected)
            elif query_path and query_path[0] == name:
                # ON THE ITERATION PATH, BUT NO VARIABLES WANTED FROM IT
                # NOTE(review): did_yield IS NOT SET HERE, UNLIKE THE BRANCH ABOVE - CONFIRM INTENDED
                for index in _decode_token(index, c, child_path, query_path[1:], child_expected):
                    yield index
            else:
                # NOTHING WANTED FROM THIS PROPERTY: SKIP ITS VALUE
                index = jump_to_end(index, c)
        elif c == b"}":
            # END OF OBJECT; GUARANTEE AT LEAST ONE YIELD
            if not did_yield:
                yield index
            break
def __init__(self, table_name, columns):
    """
    :param table_name: THE FACT TABLE, optionally followed by the path to
                       an arm of the snowflake
    :param columns: ALL COLUMNS IN SNOWFLAKE
    """
    steps = split_field(table_name)
    # USED AS AN EXPLICIT STATEMENT OF PERSPECTIVE IN THE DATABASE
    self.table = steps[0]
    self.query_path = join_field(steps[1:])
    self._columns = copy(columns)

    self.lookup = _index(columns, self.query_path)
    primary = self.lookup
    if self.query_path == ".":
        return

    # NAMES NOT FOUND FROM THE QUERY PATH FALL BACK TO THE ROOT'S INDEX
    for key, found in _index(columns, ".").items():
        primary.setdefault(key, found)
def get_columns(self, table_name, column_name=None, force=False):
    """
    RETURN METADATA COLUMNS

    Registers the index (first step of `table_name`) if unknown, or reloads
    its columns when `force` is set or the table's timestamp is missing or
    older than MAX_COLUMN_METADATA_AGE.

    :param table_name: dot-delimited name; first step is the index, the
                       rest is the query path
    :param column_name: optional column to look for
    :param force: reload the column metadata even if it is recent
    :return: the matching columns, sorted; Log.error is called when none
             are found
    """
    table_path = split_field(table_name)
    es_index_name = table_path[0]
    query_path = join_field(table_path[1:])
    table = self.get_table(es_index_name)[0]
    # `== None` IS KEPT AS WRITTEN (PRESUMABLY TO MATCH NULL-LIKE OBJECTS - CONFIRM)
    abs_column_name = None if column_name == None else concat_field(query_path, column_name)

    try:
        # LAST TIME WE GOT INFO FOR THIS TABLE
        if not table:
            table = Table(
                name=es_index_name,
                url=None,
                query_path=None,
                timestamp=Date.now()
            )
            with self.meta.tables.locker:
                self.meta.tables.add(table)
            self._get_columns(table=es_index_name)
        elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
            table.timestamp = Date.now()
            self._get_columns(table=es_index_name)

        with self.meta.columns.locker:
            columns = self.meta.columns.find(es_index_name, column_name)
        if columns:
            # "\\." IS THE SAME TWO CHARACTERS AS BEFORE (BACKSLASH, DOT),
            # WRITTEN WITHOUT AN INVALID ESCAPE SEQUENCE
            columns = jx.sort(columns, "names.\\.")
            # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
            while len(self.todo) and not all(columns.get("last_updated")):
                if DEBUG:
                    Log.note(
                        "waiting for columns to update {{columns|json}}",
                        columns=[c.es_index+"."+c.es_column for c in columns if not c.last_updated]
                    )
                Till(seconds=1).wait()
            return columns
    except Exception as e:
        Log.error("Not expected", cause=e)

    # ONLY REACHED WHEN NO COLUMNS WERE FOUND
    if abs_column_name:
        Log.error("no columns matching {{table}}.{{column}}", table=table_name, column=abs_column_name)
    else:
        self._get_columns(table=table_name)  # TO TEST WHAT HAPPENED
        Log.error("no columns for {{table}}?!", table=table_name)
def new_instance(type, frum, schema=None):
    """
    Factory: turn `frum` into a container (or a wrapped query).

    :param type: first positional parameter (not used in the body)
    :param frum: an existing Container/_Cube (returned unchanged), a _Query
        (run), a many-valued object (wrapped in a _ListContainer), text
        (resolved through the default settings) or data (dispatched on its
        ``type`` or ``from`` property)
    :param schema: accepted but not used in the body
    :return: the resulting container or query; unsupported input is reported
        through Log.error
    """
    if not type2container:
        _delayed_imports()

    # ALREADY A CONTAINER OF SOME KIND
    if isinstance(frum, (Container, _Cube)):
        return frum
    if isinstance(frum, _Query):
        return _run(frum)
    if is_many(frum):
        return _ListContainer(frum)

    if is_text(frum):
        # USE DEFAULT STORAGE TO FIND Container
        defaults = config.default.settings
        if not defaults:
            Log.error("expecting jx_base.container.config.default.settings to contain default elasticsearch connection info")

        overrides = {
            "index": join_field(split_field(frum)[:1]),
            "name": frum,
        }
        settings = set_default(overrides, config.default.settings)
        # WE DO NOT WANT TO INFLUENCE THE TYPE BECAUSE NONE IS IN THE frum STRING ANYWAY
        settings.type = None
        make_container = type2container["elasticsearch"]
        return make_container(settings)

    if is_data(frum):
        spec = wrap(frum)
        if spec.type and type2container[spec.type]:
            return type2container[spec.type](spec.settings)
        elif spec["from"]:
            spec = copy(spec)
            spec["from"] = Container(spec["from"])
            return _Query.wrap(spec)
        else:
            Log.error("Do not know how to handle {{frum|json}}", frum=spec)
    else:
        Log.error("Do not know how to handle {{type}}", type=frum.__class__.__name__)
def compile_expression(self, expression, constants=None):
    """
    Expand `expression` with `constants` and wrap it in a generated function.

    :param expression: the expression text to compile
    :param constants: values substituted into the expression by setValues
    :return: when there are no frame variables, the function header followed
        by the expression (plain concatenation); otherwise a Compiled object
        holding the generated script
    """
    # EXPAND EXPRESSION WITH ANY CONSTANTS
    expanded = setValues(expression, constants)

    # FIRST NAME IS THE INDEX
    source_name = self.fromData.name
    index_name = join_field(split_field(source_name)[:1])

    frame_vars = self.getFrameVariables(expanded)
    if frame_vars == "":
        return addFunctions(expanded).head + expanded

    func_name = UID()
    helpers = addFunctions(frame_vars + expanded)
    script = (
        helpers.head
        + 'var ' + func_name + ' = function(' + index_name + '){\n'
        + frame_vars
        + expanded + ";\n"
        + '};\n'
        + func_name + '(_source)\n'
    )
    return Compiled(script)
def es_query_template(path):
    """
    RETURN TEMPLATE AND PATH-TO-FILTER AS A 2-TUPLE

    :param path: dotted path; everything after the first step is treated as
        the nested sub-path
    :return: (query template, wrapped list of the empty filter dicts that are
        embedded in that template)
    """
    nested_steps = split_field(path)[1:]

    if not nested_steps:
        # NO SUB-PATH: SINGLE FILTER SLOT INSIDE A FILTERED QUERY
        root_filter = {}
        template = wrap({
            "query": {"filtered": {"filter": root_filter}},
            "from": 0,
            "size": 0,
            "sort": []
        })
        return template, wrap([root_filter])

    # SUB-PATH GIVEN: ONE SLOT FOR THE ROOT, ONE FOR THE NESTED DOCUMENTS
    root_filter = {}
    nested_filter = {}
    nested_clause = {"nested": {
        "path": join_field(nested_steps),
        "filter": nested_filter,
        "inner_hits": {"size": 100000}
    }}
    template = wrap({
        "filter": {"and": [root_filter, nested_clause]},
        "from": 0,
        "size": 0,
        "sort": []
    })
    return template, wrap([root_filter, nested_filter])
def untype_path(path):
    """
    Return `path` re-joined after dropping every step that starts with TYPE_PREFIX.

    :param path: dotted path to clean
    :return: result of join_field over the remaining steps
    """
    kept_steps = (
        step
        for step in split_field(path)
        if not step.startswith(TYPE_PREFIX)
    )
    return join_field(kept_steps)
def es_setop(es, query):
    """
    Translate the select clauses of `query` into an ES request, post it, and
    format the returned hits.

    :param es: passed through to es_post
    :param query: read for frum.schema, select, where, limit, sort and format
    :return: the output of the formatter chosen by query.format, with
        meta.timing.es, meta.content_type and meta.es_query filled in
    """
    schema = query.frum.schema
    query_path = schema.query_path[0]

    # ONE ESSelect PER PATH, CREATED ON DEMAND BY get_select()
    split_select = {".": ESSelect('.')}

    def get_select(path):
        # RETURN THE ESSelect FOR path, CREATING AND REGISTERING IT IF MISSING
        es_select = split_select.get(path)
        if not es_select:
            es_select = split_select[path] = ESSelect(path)
        return es_select

    selects = wrap([unwrap(s.copy()) for s in listwrap(query.select)])
    new_select = FlatList()

    # put_index IS WRITTEN INTO EACH "put" ENTRY AS ITS "index"
    put_index = 0
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if is_op(select.value, LeavesOp) and is_op(select.value.term, Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                full_name = concat_field(select.name, relative_field(untype_path(c.name), term.var))
                if c.jx_type == NESTED:
                    get_select('.').use_source = True
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {"name": literal_field(full_name), "index": put_index, "child": "."},
                        "pull": get_pull_source(c.es_column)
                    })
                    put_index += 1
                else:
                    get_select(c.nested_path[0]).fields.append(c.es_column)
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {"name": literal_field(full_name), "index": put_index, "child": "."}
                    })
                    put_index += 1
        elif is_op(select.value, Variable):
            s_column = select.value.var

            if s_column == ".":
                # PULL ALL SOURCE
                get_select('.').use_source = True
                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "put": {"name": select.name, "index": put_index, "child": "."},
                    "pull": get_pull_source(".")
                })
                # NOTE(review): put_index is not incremented before this
                # continue, so the next select reuses the same index - confirm
                # this is intended
                continue

            leaves = schema.leaves(s_column)  # LEAVES OF OBJECT
            # nested_selects = {}
            if leaves:
                if any(c.jx_type == NESTED for c in leaves):
                    # PULL WHOLE NESTED ARRAYS
                    get_select('.').use_source = True
                    for c in leaves:
                        if len(c.nested_path) == 1:  # NESTED PROPERTIES ARE IGNORED, CAPTURED BY THESE FIRST LEVEL PROPERTIES
                            pre_child = join_field(decode_property(n) for n in split_field(c.name))
                            new_select.append({
                                "name": select.name,
                                "value": Variable(c.es_column),
                                "put": {"name": select.name, "index": put_index, "child": untype_path(relative_field(pre_child, s_column))},
                                "pull": get_pull_source(c.es_column)
                            })
                else:
                    # PULL ONLY WHAT'S NEEDED
                    for c in leaves:
                        c_nested_path = c.nested_path[0]
                        if c_nested_path == ".":
                            if c.es_column == "_id":
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": "."},
                                    "pull": lambda row: row._id
                                })
                            elif c.jx_type == NESTED:
                                get_select('.').use_source = True
                                pre_child = join_field(decode_property(n) for n in split_field(c.name))
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": untype_path(relative_field(pre_child, s_column))},
                                    "pull": get_pull_source(c.es_column)
                                })
                            else:
                                get_select(c_nested_path).fields.append(c.es_column)
                                pre_child = join_field(decode_property(n) for n in split_field(c.name))
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {"name": select.name, "index": put_index, "child": untype_path(relative_field(pre_child, s_column))}
                                })
                        else:
                            # COLUMN LIVES UNDER A NESTED PATH OTHER THAN "."
                            es_select = get_select(c_nested_path)
                            es_select.fields.append(c.es_column)

                            child = relative_field(untype_path(relative_field(c.name, schema.query_path[0])), s_column)
                            pull = accumulate_nested_doc(c_nested_path, Variable(relative_field(s_column, unnest_path(c_nested_path))))
                            new_select.append({
                                "name": select.name,
                                "value": select.value,
                                "put": {
                                    "name": select.name,
                                    "index": put_index,
                                    "child": child
                                },
                                "pull": pull
                            })
            else:
                # NO LEAVES FOUND FOR THIS VARIABLE: SELECT A PLACEHOLDER VARIABLE
                new_select.append({
                    "name": select.name,
                    "value": Variable("$dummy"),
                    "put": {"name": select.name, "index": put_index, "child": "."}
                })
            put_index += 1
        else:
            # ANY OTHER EXPRESSION IS SENT AS A SCRIPT, SPLIT BY PATH
            split_scripts = split_expression_by_path(select.value, schema, lang=Painless)
            for p, script in split_scripts.items():
                es_select = get_select(p)
                es_select.scripts[select.name] = {"script": text_type(Painless[first(script)].partial_eval().to_es_script(schema))}
                new_select.append({
                    "name": select.name,
                    "pull": jx_expression_to_function("fields." + literal_field(select.name)),
                    "put": {"name": select.name, "index": put_index, "child": "."}
                })
                put_index += 1

    # FILL IN ANY MISSING "pull" FUNCTION
    for n in new_select:
        if n.pull:
            continue
        elif is_op(n.value, Variable):
            if get_select('.').use_source:
                n.pull = get_pull_source(n.value.var)
            # NOTE(review): this compares n.value itself (not n.value.var) to
            # "_id" - confirm that is the intended comparison
            elif n.value == "_id":
                n.pull = jx_expression_to_function("_id")
            else:
                n.pull = jx_expression_to_function(concat_field("fields", literal_field(n.value.var)))
        else:
            Log.error("Do not know what to do")

    split_wheres = split_expression_by_path(query.where, schema, lang=ES52)
    es_query = es_query_proto(query_path, split_select, split_wheres, schema)
    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)
    es_query.sort = jx_sort_to_es_sort(query.sort, schema)

    with Timer("call to ES", silent=True) as call_timer:
        data = es_post(es, es_query, query.limit)

    T = data.hits.hits

    # Log.note("{{output}}", output=T)

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        with Timer("formatter", silent=True):
            output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
def getDomain(self, **kwargs):
    """
    Build a Domain from this dimension's edges or partitions.

    :param kwargs: only ``depth`` is read; when not given it defaults to
        ``len(self.fields)-1`` if self.fields is a list, otherwise None
    :return: a Domain carrying the selected partitions plus this dimension's
        type, name, min, max, interval, label, end and isFacet
    """
    # kwargs.depth IS MEANT TO REACH INTO SUB-PARTITIONS
    kwargs = wrap(kwargs)
    kwargs.depth = coalesce(kwargs.depth, len(self.fields)-1 if is_list(self.fields) else None)

    if not self.partitions and self.edges:
        # USE EACH EDGE AS A PARTITION, BUT isFacet==True SO IT ALLOWS THE OVERLAP
        partitions = [
            {
                "name": v.name,
                "value": v.name,
                "where": v.where,
                "style": v.style,
                "weight": v.weight  # YO! WHAT DO WE *NOT* COPY?
            }
            for i, v in enumerate(self.edges)
            if i < coalesce(self.limit, DEFAULT_QUERY_LIMIT) and v.where
        ]
        self.isFacet = True
    elif kwargs.depth == None:  # ASSUME self.fields IS A dict
        partitions = FlatList()
        for i, part in enumerate(self.partitions):
            if i >= coalesce(self.limit, DEFAULT_QUERY_LIMIT):
                break
            partitions.append({
                "name":part.name,
                "value":part.value,
                "where":part.where,
                "style":coalesce(part.style, part.parent.style),
                "weight":part.weight  # YO! WHAT DO WE *NOT* COPY?
            })
    elif kwargs.depth == 0:
        # TOP-LEVEL PARTITIONS ONLY, CAPPED AT THE LIMIT
        partitions = [
            {
                "name":v.name,
                "value":v.value,
                "where":v.where,
                "style":v.style,
                "weight":v.weight  # YO! WHAT DO WE *NOT* COPY?
            }
            for i, v in enumerate(self.partitions)
            if i < coalesce(self.limit, DEFAULT_QUERY_LIMIT)]
    elif kwargs.depth == 1:
        # FLATTEN ONE LEVEL: EMIT THE SUB-PARTITIONS OF EACH TOP-LEVEL PARTITION
        partitions = FlatList()
        # NOTE(review): rownum (and j below) are assigned but never read in this function
        rownum = 0
        for i, part in enumerate(self.partitions):
            # NOTE(review): uses `continue` here where the depth==None branch
            # uses `break`; the selected parts are the same either way
            if i >= coalesce(self.limit, DEFAULT_QUERY_LIMIT):
                continue
            rownum += 1
            try:
                for j, subpart in enumerate(part.partitions):
                    partitions.append({
                        "name":join_field(split_field(subpart.parent.name) + [subpart.name]),
                        "value":subpart.value,
                        "where":subpart.where,
                        "style":coalesce(subpart.style, subpart.parent.style),
                        "weight":subpart.weight  # YO! WHAT DO WE *NOT* COPY?
                    })
            except Exception as e:
                Log.error("", e)
    else:
        Log.error("deeper than 2 is not supported yet")

    return Domain(
        type=self.type,
        name=self.name,
        partitions=wrap(partitions),
        min=self.min,
        max=self.max,
        interval=self.interval,
        # THE COMPLICATION IS THAT SOMETIMES WE WANT SIMPLE PARTITIONS, LIKE
        # STRINGS, DATES, OR NUMBERS. OTHER TIMES WE WANT PARTITION OBJECTS
        # WITH NAME, VALUE, AND OTHER MARKUP.
        # USUALLY A "set" IS MEANT TO BE SIMPLE, BUT THE end() FUNCTION
        # OVERRIDES EVERYTHING AND IS EXPLICIT. - NOT A GOOD SOLUTION BECAUSE
        # end() IS USED BOTH TO INDICATE THE QUERY PARTITIONS *AND* DISPLAY
        # COORDINATES ON CHARTS

        # PLEASE SPLIT end() INTO value() (replacing the string value) AND
        # label() (for presentation)
        value="name" if not self.value and self.partitions else self.value,
        key="value",
        label=coalesce(self.label, (self.type == "set" and self.name)),
        end=coalesce(self.end, (self.type == "set" and self.name)),
        isFacet=self.isFacet,
        dimension=self
    )
def __init__(self, dim, parent, jx):
    """
    Build a dimension from its definition, recursively building child edges
    and, when no partitions are supplied, querying for them.

    :param dim: the dimension definition; read for name, type, limit, index,
        where, edges, partitions, field/fields, path and value
    :param parent: the parent dimension; supplies full_name and a fallback index
    :param jx: used for settings.index, get_columns() and query()
    """
    dim = wrap(dim)

    self.name = dim.name
    self.parent = coalesce(parent)
    self.full_name = join_field(split_field(self.parent.full_name)+[self.name])
    self.edges = None  # FOR NOW
    dot.set_default(self, dim)
    self.where = dim.where
    self.type = coalesce(dim.type, "set")
    self.limit = coalesce(dim.limit, DEFAULT_QUERY_LIMIT)
    # INDEX FALLS BACK FROM THE DEFINITION, TO THE PARENT, TO THE jx SETTINGS
    self.index = coalesce(dim.index, coalesce(parent, Null).index, jx.settings.index)

    if not self.index:
        Log.error("Expecting an index name")

    # ALLOW ACCESS TO SUB-PART BY NAME (IF ONLY THERE IS NO NAME COLLISION)
    self.edges = Data()
    for e in listwrap(dim.edges):
        new_e = Dimension(e, self, jx)
        self.edges[new_e.full_name] = new_e

    self.partitions = wrap(coalesce(dim.partitions, []))
    parse_partition(self)

    fields = coalesce(dim.field, dim.fields)
    if not fields:
        return  # NO FIELDS TO SEARCH
    elif is_data(fields):
        self.fields = wrap(fields)
        edges = wrap([{"name": k, "value": v, "allowNulls": False} for k, v in self.fields.items()])
    else:
        self.fields = listwrap(fields)
        edges = wrap([{"name": f, "value": f, "index": i, "allowNulls": False} for i, f in enumerate(self.fields)])

    if dim.partitions:
        return  # ALREADY HAVE PARTS
    if self.type not in KNOWN - ALGEBRAIC:
        return  # PARTS OR TOO FUZZY (OR TOO NUMEROUS) TO FETCH

    # COUNT THE DOCUMENTS PER COMBINATION OF FIELD VALUES TO DISCOVER THE PARTS
    jx.get_columns()
    with Timer("Get parts of {{name}}", {"name": self.name}):
        parts = jx.query({
            "from": self.index,
            "select": {"name": "count", "aggregate": "count"},
            "edges": edges,
            "where": self.where,
            "limit": self.limit
        })
        Log.note("{{name}} has {{num}} parts", name= self.name, num= len(parts))

    d = parts.edges[0].domain

    if dim.path:
        if len(edges) > 1:
            Log.error("Not supported yet")
        # EACH TERM RETURNED IS A PATH INTO A PARTITION TREE
        temp = Data(partitions=[])
        for i, count in enumerate(parts):
            a = dim.path(d.getEnd(d.partitions[i]))
            if not is_list(a):
                Log.error("The path function on " + dim.name + " must return an ARRAY of parts")
            # NOTE(review): dim.path(...) is called a second time here instead
            # of reusing `a` - confirm the path function is safe to call twice
            addParts(
                temp,
                dim.path(d.getEnd(d.partitions[i])),
                count,
                0
            )
        self.value = coalesce(dim.value, "name")
        self.partitions = temp.partitions
    elif is_data(fields):
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS

        partitions = FlatList()
        for g, p in parts.groupby(edges):
            if p:
                partitions.append({
                    "value": g,
                    "where": {"and": [
                        {"term": {e.value: g[e.name]}}
                        for e in edges
                    ]},
                    "count": int(p)
                })
        self.partitions = partitions
    elif len(edges) == 1:
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS

        # SIMPLE LIST OF PARTS RETURNED, BE SURE TO INTERRELATE THEM
        self.partitions = wrap([
            {
                "name": str(d.partitions[i].name),  # CONVERT TO STRING
                "value": d.getEnd(d.partitions[i]),
                "where": {"term": {edges[0].value: d.partitions[i].value}},
                "count": count
            }
            for i, count in enumerate(parts)
        ])
        self.order = {p.value: i for i, p in enumerate(self.partitions)}
    elif len(edges) == 2:
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS
        d2 = parts.edges[1].domain

        # SIMPLE LIST OF PARTS RETURNED, BE SURE TO INTERRELATE THEM
        # NOTE(review): indexing .values() with [0] assumes it returns an
        # indexable sequence - confirm for the type of parts.data
        array = parts.data.values()[0].cube  # DIG DEEP INTO RESULT (ASSUME SINGLE VALUE CUBE, WITH NULL AT END)

        def edges2value(*values):
            # COMBINE ONE VALUE PER EDGE INTO A SINGLE PARTITION VALUE
            # NOTE(review): is_data(fields) was already handled by an earlier
            # elif, so the first branch looks unreachable from here - confirm
            if is_data(fields):
                output = Data()
                for e, v in transpose(edges, values):
                    output[e.name] = v
                return output
            else:
                return tuple(values)

        self.partitions = wrap([
            {
                "name": str(d.partitions[i].name),  # CONVERT TO STRING
                "value": d.getEnd(d.partitions[i]),
                "where": {"term": {edges[0].value: d.partitions[i].value}},
                "count": SUM(subcube),
                "partitions": [
                    {
                        "name": str(d2.partitions[j].name),  # CONVERT TO STRING
                        "value": edges2value(d.getEnd(d.partitions[i]), d2.getEnd(d2.partitions[j])),
                        "where": {"and": [
                            {"term": {edges[0].value: d.partitions[i].value}},
                            {"term": {edges[1].value: d2.partitions[j].value}}
                        ]},
                        "count": count2
                    }
                    for j, count2 in enumerate(subcube)
                    if count2 > 0  # ONLY INCLUDE PROPERTIES THAT EXIST
                ]
            }
            for i, subcube in enumerate(array)
        ])
    else:
        Log.error("Not supported")

    parse_partition(self)  # RELATE THE PARTS TO THE PARENTS
def _get_schema_from_list(frum, table_name, parent, nested_path, columns):
    """
    Scan the rows of a list and register or update a column in `columns` for
    every property path encountered, recursing into objects and lists.

    :param frum: The list
    :param table_name: Name of the table this list holds records for
    :param parent: parent path
    :param nested_path: each nested array, in reverse order
    :param columns: map from full name to column definition (updated in place)
    :return: None
    """

    def _register(full_name, type_name):
        # CREATE AND ADD A COLUMN; THE CALLER SETS jx_type RIGHT AFTER
        fresh = Column(
            name=concat_field(table_name, full_name),
            es_column=full_name,
            es_index=".",
            es_type=type_name,
            jx_type=None,
            last_updated=Date.now(),
            nested_path=nested_path,
        )
        columns.add(fresh)
        return fresh

    for row in frum:
        if python_type_to_json_type[row.__class__] != "object":
            # EXPECTING PRIMITIVE VALUE: THE PARENT PATH IS THE COLUMN
            column = columns[parent]
            if not column:
                column = _register(parent, row.__class__.__name__)
            column.es_type = _merge_python_type(column.es_type, row.__class__)
            column.jx_type = python_type_to_json_type[column.es_type]
            continue

        for name, value in row.items():
            full_name = concat_field(parent, name)
            column = columns[full_name]
            if not column:
                column = _register(full_name, value.__class__.__name__)

            if not is_container(value):
                this_type = value.__class__.__name__
            else:
                # GET TYPE OF MULTIVALUE
                members = list(value)
                if len(members) == 0:
                    this_type = none_type.__name__
                elif len(members) == 1:
                    this_type = members[0].__class__.__name__
                else:
                    member_types = (m.__class__.__name__ for m in value)
                    this_type = reduce(_merge_python_type, member_types)

            column.es_type = _merge_python_type(column.es_type, this_type)
            column.jx_type = python_type_to_json_type[column.es_type]

            if this_type in {"object", "dict", "Mapping", "Data"}:
                # DESCEND INTO THE OBJECT, SAME NESTING LEVEL
                _get_schema_from_list(
                    [value], table_name, full_name, nested_path, columns
                )
            elif this_type in {"list", "FlatList"}:
                # DESCEND INTO THE ARRAY, WITH ITS PATH PREPENDED TO THE NESTED PATH
                outer_paths = listwrap(nested_path)
                deeper = join_field(split_field(outer_paths[0]) + [name])
                inner_paths = unwraplist([deeper] + outer_paths)
                _get_schema_from_list(
                    value, table_name, full_name, inner_paths, columns
                )
def get_schema(self, name):
    """
    Return the schema for the dotted table name `name`.

    :param name: "meta.columns" is answered from self.meta.columns; any other
        name is split into a root (first step) and the remaining path
    :return: the schema obtained from the root's snowflake for that path
    """
    if name == "meta.columns":
        return self.meta.columns.schema

    steps = split_field(name)
    root = steps[0]
    rest = join_field(steps[1:])
    snowflake = self.get_snowflake(root)
    return snowflake.get_schema(rest)
def query_path(self):
    """
    Return self.name with its first step removed, re-joined by join_field.
    """
    steps = split_field(self.name)
    return join_field(steps[1:])