def format_table(T, select, query=None):
    # CONVERT THE LIST OF DOCS T INTO TABLE FORM: A header PLUS ROWS
    num_columns = MAX(select.put.index) + 1

    data = []
    for row in T:
        formatted = [None] * num_columns
        for s in select:
            value = unwraplist(row[s.pull])
            if value == None:
                continue
            index, child = s.put.index, s.put.child
            if child == ".":
                formatted[index] = value
            else:
                # NESTED PROPERTY: ACCUMULATE INTO A Data AT THIS COLUMN
                if formatted[index] is None:
                    formatted[index] = Data()
                formatted[index][child] = value
        data.append(formatted)

    # FIRST NAME ASSIGNED TO A COLUMN WINS
    header = [None] * num_columns
    for s in select:
        if not header[s.put.index]:
            header[s.put.index] = s.name.replace("\\.", ".")

    return Data(meta={"format": "table"}, header=header, data=data)
def _range_composer(edge, domain, es_query, to_float, schema):
    # USE RANGES
    lower = coalesce(domain.min, MIN(domain.partitions.min))
    upper = coalesce(domain.max, MAX(domain.partitions.max))

    if edge.allowNulls:
        # DOCS MISSING A VALUE, OR OUTSIDE [lower, upper), GO TO "_missing"
        in_range = AndOp("and", [
            edge.value.exists(),
            InequalityOp("gte", [edge.value, Literal(None, to_float(lower))]),
            InequalityOp("lt", [edge.value, Literal(None, to_float(upper))])
        ]).partial_eval()
        missing_filter = set_default(
            {"filter": NotOp("not", in_range).to_esfilter(schema)},
            es_query
        )
    else:
        missing_filter = None

    if isinstance(edge.value, Variable):
        # SIMPLE VARIABLE: AGGREGATE DIRECTLY ON THE ES COLUMN
        calc = {"field": schema.leaves(edge.value.var)[0].es_column}
    else:
        calc = {"script": edge.value.to_painless(schema).script(schema)}

    return wrap({"aggs": {
        "_match": set_default(
            {"range": calc},
            {"range": {"ranges": [
                {"from": to_float(p.min), "to": to_float(p.max)}
                for p in domain.partitions
            ]}},
            es_query
        ),
        "_missing": missing_filter
    }})
def _range_composer(self, edge, domain, es_query, to_float, schema):
    # USE RANGES
    overall_min = coalesce(domain.min, MIN(domain.partitions.min))
    overall_max = coalesce(domain.max, MAX(domain.partitions.max))
    output = Aggs()

    if edge.allowNulls:
        # ANYTHING MISSING, OR OUTSIDE [overall_min, overall_max), COUNTS AS NULL
        not_in_range = NotOp(
            AndOp([
                edge.value.exists(),
                GteOp([edge.value, Literal(to_float(overall_min))]),
                LtOp([edge.value, Literal(to_float(overall_max))]),
            ]).partial_eval()
        )
        output.add(FilterAggs("_missing", not_in_range, self).add(es_query))

    if is_op(edge.value, Variable):
        calc = {"field": first(schema.leaves(edge.value.var)).es_column}
    else:
        calc = {"script": text_type(Painless[edge.value].to_es_script(schema))}
    calc['ranges'] = [
        {"from": to_float(p.min), "to": to_float(p.max)}
        for p in domain.partitions
    ]

    return output.add(RangeAggs("_match", calc, self).add(es_query))
def get_branches(hg, branches, kwargs=None):
    """
    RETURN A UniqueIndex OF KNOWN BRANCHES; PREFER THE COPY IN ES,
    FALLING BACK TO PULLING FROM hg WHEN THE ES COPY IS TOO OLD
    """
    # TRY ES
    cluster = elasticsearch.Cluster(branches)
    try:
        es = cluster.get_index(kwargs=branches, read_only=False)
        esq = jx_elasticsearch.new_instance(branches)
        found_branches = esq.query({"from": "branches", "format": "list", "limit": 10000}).data

        # IF IT IS TOO OLD, THEN PULL FROM HG
        oldest = Date(MAX(found_branches.etl.timestamp))
        if oldest == None or Date.now() - oldest > OLD_BRANCH:
            found_branches = _get_branches_from_hg(hg)
            es.extend({"id": b.name + " " + b.locale, "value": b} for b in found_branches)
            es.flush()

        try:
            return UniqueIndex(["name", "locale"], data=found_branches, fail_on_dup=False)
        except Exception as e:
            Log.error("Bad branch in ES index", cause=e)
    except Exception as e:
        e = Except.wrap(e)
        if "Can not find index " in e:
            # INDEX DOES NOT EXIST YET: CREATE IT, THEN RETRY
            set_default(branches, {"schema": branches_schema})
            es = cluster.get_or_create_index(branches)
            es.add_alias()
            # NOTE(review): the retry passes only kwargs positionally (as hg);
            # presumably an @override-style decorator redistributes settings —
            # confirm, otherwise the hg/branches settings are lost on retry
            return get_branches(kwargs)
        Log.error("problem getting branches", cause=e)
def __init__(self, **desc):
    """
    A DOMAIN OVER A RANGE OF NUMBERS; BUILT EITHER FROM EXPLICIT
    partitions, OR FROM (min, max, interval)
    """
    Domain.__init__(self, **desc)
    self.type = "range"
    self.NULL = Null

    if self.partitions:
        # IGNORE THE min, max, interval
        if not self.key:
            Log.error("Must have a key value")

        parts = listwrap(self.partitions)
        for i, p in enumerate(parts):
            # WIDEN THE DOMAIN BOUNDS TO COVER EVERY PARTITION
            self.min = MIN([self.min, p.min])
            self.max = MAX([self.max, p.max])
            if p.dataIndex != None and p.dataIndex != i:
                Log.error("Expecting `dataIndex` to agree with the order of the parts")
            if p[self.key] == None:
                Log.error("Expecting all parts to have {{key}} as a property", key=self.key)
            p.dataIndex = i

        # VERIFY PARTITIONS DO NOT OVERLAP, HOLES ARE FINE
        # (all-pairs check; identity test skips comparing a part to itself)
        for p, q in itertools.product(parts, parts):
            if p.min <= q.min and q.min < p.max and unwrap(p) is not unwrap(q):
                Log.error("partitions overlap!")
        self.partitions = wrap(parts)
        return
    elif any([self.min == None, self.max == None, self.interval == None]):
        Log.error("Can not handle missing parameter")

    # NO EXPLICIT PARTITIONS: BUILD UNIFORM ONES OF WIDTH self.interval
    self.key = "min"
    self.partitions = wrap([{"min": v, "max": v + self.interval, "dataIndex": i} for i, v in enumerate(frange(self.min, self.max, self.interval))])
def get_decoders_by_depth(query):
    """
    RETURN A LIST OF DECODER ARRAYS, ONE ARRAY FOR EACH NESTED DEPTH
    """
    schema = query.frum.schema
    output = FlatList()

    if query.edges:
        if query.sort and query.format != "cube":
            # REORDER EDGES/GROUPBY TO MATCH THE SORT
            query.edges = sort_edges(query, "edges")
    elif query.groupby:
        if query.sort and query.format != "cube":
            query.groupby = sort_edges(query, "groupby")

    for edge in wrap(coalesce(query.edges, query.groupby, [])):
        limit = coalesce(edge.domain.limit, query.limit, DEFAULT_LIMIT)
        if edge.value != None and not isinstance(edge.value, NullOp):
            edge = edge.copy()
            vars_ = edge.value.vars()
            for v in vars_:
                if not schema.leaves(v, meta=True):
                    Log.error("{{var}} does not exist in schema", var=v)
        elif edge.range:
            vars_ = edge.range.min.vars() | edge.range.max.vars()
            for v in vars_:
                if not schema[v]:
                    Log.error("{{var}} does not exist in schema", var=v)
        elif edge.domain.dimension:
            vars_ = edge.domain.dimension.fields
            edge.domain.dimension = edge.domain.dimension.copy()
            edge.domain.dimension.fields = [schema[v].es_column for v in vars_]
        elif all(edge.domain.partitions.where):
            # EDGE DEFINED BY EXPLICIT PARTITION FILTERS
            vars_ = set()
            for p in edge.domain.partitions:
                vars_ |= p.where.vars()

        try:
            vars_ |= edge.value.vars()
            # DEPTH OF A COLUMN = LENGTH OF ITS nested_path (MINUS SELF)
            depths = set(len(c.nested_path) - 1 for v in vars_ for c in schema.leaves(v))
            if -1 in depths:
                Log.error(
                    "Do not know of column {{column}}",
                    column=unwraplist([v for v in vars_ if schema[v] == None])
                )
            if len(depths) > 1:
                Log.error("expression {{expr|quote}} spans tables, can not handle", expr=edge.value)
            max_depth = MAX(depths)
            while len(output) <= max_depth:
                output.append([])
        except Exception as e:
            # USUALLY THE SCHEMA IS EMPTY, SO WE ASSUME THIS IS A SIMPLE QUERY
            max_depth = 0
            output.append([])

        output[max_depth].append(AggsDecoder(edge, query, limit))
    return output
def get_branches(hg, branches, kwargs=None):
    """
    RETURN A UniqueIndex OF KNOWN BRANCHES; PREFER THE COPY IN ES,
    FALLING BACK TO PULLING FROM hg WHEN THE ES COPY IS TOO OLD
    """
    # TRY ES
    try:
        es = elasticsearch.Cluster(kwargs=branches).get_index(kwargs=branches, read_only=False)
        query = {
            "query": {"match_all": {}},
            "size": 10000
        }
        found_branches = es.search(query).hits.hits._source

        # IF IT IS TOO OLD, THEN PULL FROM HG
        oldest = Date(MAX(found_branches.etl.timestamp))
        if oldest == None or Date.now() - oldest > OLD_BRANCH:
            found_branches = _get_branches_from_hg(hg)
            es.extend({"id": b.name + " " + b.locale, "value": b} for b in found_branches)
            es.flush()

        try:
            return UniqueIndex(["name", "locale"], data=found_branches, fail_on_dup=False)
        except Exception as e:
            Log.error("Bad branch in ES index", cause=e)
    except Exception as e:
        # NOTE(review): substring test on the raw exception assumes it supports
        # __contains__ (mo_logs Except does) — confirm e is wrapped upstream
        if "Can not find index " in e:
            # INDEX DOES NOT EXIST YET: CREATE IT, THEN RETRY
            set_default(branches, {"schema": branches_schema})
            es = elasticsearch.Cluster(kwargs=branches).get_or_create_index(kwargs=branches)
            es.add_alias()
            # NOTE(review): retry passes only kwargs; presumably an @override
            # decorator redistributes parameters — confirm hg/branches survive
            return get_branches(kwargs=kwargs)
        Log.error("problem getting branches", cause=e)
def _range_composer(edge, domain, es_query, to_float):
    """
    COMPOSE AN ES "range" AGGREGATION OVER domain.partitions; WHEN
    edge.allowNulls, ALSO ADD A "_missing" AGG THAT CATCHES DOCS WHOSE
    VALUE IS MISSING OR OUTSIDE [_min, _max)
    """
    from mo_math import MIN  # local import; module-level imports not visible in this chunk

    # USE RANGES
    # BUGFIX: the overall minimum is the MIN of the partition minimums
    # (was MAX, which wrongly shrank the range so valid low values were
    # binned as "_missing"); matches the sibling _range_composer variants
    _min = coalesce(domain.min, MIN(domain.partitions.min))
    _max = coalesce(domain.max, MAX(domain.partitions.max))

    if isinstance(edge.value, Variable):
        calc = {"field": edge.value.var}
    else:
        calc = {"script_field": edge.value.to_ruby()}

    if edge.allowNulls:
        # TODO: Use Expression.missing().esfilter() TO GET OPTIMIZED FILTER
        missing_filter = set_default(
            {
                "filter": {"or": [
                    OrOp("or", [
                        InequalityOp("lt", [edge.value, Literal(None, to_float(_min))]),
                        InequalityOp("gte", [edge.value, Literal(None, to_float(_max))]),
                    ]).to_esfilter(),
                    edge.value.missing().to_esfilter()
                ]}
            },
            es_query
        )
    else:
        missing_filter = None

    return wrap({"aggs": {
        "_match": set_default(
            {"range": calc},
            {"range": {"ranges": [{"from": to_float(p.min), "to": to_float(p.max)} for p in domain.partitions]}},
            es_query
        ),
        "_missing": missing_filter
    }})
def get_columns(self, table_name, column_name=None, force=False):
    """
    RETURN METADATA COLUMNS

    :param table_name: DOTTED TABLE PATH; ONLY THE ROOT IS USED TO FIND THE ALIAS
    :param column_name: OPTIONAL FILTER ON COLUMN NAME
    :param force: RELOAD COLUMNS EVEN IF THE CACHED TABLE LOOKS FRESH
    """
    table_path = split_field(table_name)
    root_table_name = table_path[0]

    alias = self._find_alias(root_table_name)
    if not alias:
        # REFRESH CLUSTER METADATA AND TRY ONCE MORE BEFORE GIVING UP
        self.es_cluster.get_metadata(force=True)
        alias = self._find_alias(root_table_name)
        if not alias:
            Log.error("{{table|quote}} does not exist", table=table_name)

    try:
        last_update = MAX([
            self.es_cluster.index_last_updated[i]
            for i in self.index_to_alias.get_domain(alias)
        ])

        table = self.get_table(alias)[0]
        # LAST TIME WE GOT INFO FOR THIS TABLE
        if not table:
            # NEVER SEEN BEFORE: REGISTER THE TABLE, THEN LOAD ITS COLUMNS
            table = TableDesc(name=alias, url=None, query_path=['.'], timestamp=Date.MIN)
            with self.meta.tables.locker:
                self.meta.tables.add(table)
            self._reload_columns(table)
        elif force or table.timestamp < last_update:
            self._reload_columns(table)

        columns = self.meta.columns.find(alias, column_name)
        columns = jx.sort(columns, "names.\\.")
        # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
        while len(self.todo) and not all(columns.get("last_updated")):
            if DEBUG:
                if len(columns) > 10:
                    Log.note("waiting for {{num}} columns to update", num=len([c for c in columns if not c.last_updated]))
                else:
                    Log.note("waiting for columns to update {{columns|json}}", columns=[c.es_index + "." + c.es_column for c in columns if not c.last_updated])
            Till(seconds=1).wait()
        return columns
    except Exception as e:
        Log.error("Not expected", cause=e)

    return []
def diff(record, index, records):
    """
    WINDOW FUNCTIONS TAKE THE CURRENT record, THE index THAT RECORD HAS
    IN THE WINDOW, AND THE (SORTED) LIST OF ALL records

    RETURN record MINUS THE MAX OF THE 5 RECORDS BEFORE THE PREVIOUS
    ONE, OR None WHEN THAT CAN NOT BE COMPUTED
    """
    # COMPARE CURRENT VALUE TO MAX OF PAST 5, BUT NOT THE VERY LAST ONE
    # BUGFIX: clamp both slice bounds at 0 so a small index does not wrap
    # to negative positions and silently pick records from the END of the list
    # (also: "except Exception, e" was py2-only syntax, and e was unused)
    try:
        return record - MAX(records[max(0, index - 6):max(0, index - 1)])
    except Exception:
        return None
def __init__(self, select, edges, data, frum=None):
    """
    data IS EXPECTED TO BE A dict TO MATRICES, BUT OTHER COLLECTIONS ARE
    ALLOWED, USING THE select AND edges TO DESCRIBE THE data
    """
    self.is_value = False if isinstance(select, list) else True
    self.select = select
    self.meta = Data(format="cube")  # PUT EXTRA MARKUP HERE
    self.is_none = False

    if not all(data.values()):
        # BUGFIX: was `is_none = True`, assigning a dead local variable;
        # the instance flag (initialized False above) was never actually set
        self.is_none = True

    # ENSURE frum IS PROPER FORM
    if isinstance(select, list):
        if edges and OR(not isinstance(v, Matrix) for v in data.values()):
            Log.error("Expecting data to be a dict with Matrix values")

    if not edges:
        if not data:
            if isinstance(select, list):
                Log.error("not expecting a list of records")
            data = {select.name: Matrix.ZERO}
            self.edges = FlatList.EMPTY
        elif isinstance(data, Mapping):
            # EXPECTING NO MORE THAN ONE rownum EDGE IN THE DATA
            length = MAX([len(v) for v in data.values()])
            if length >= 1:
                self.edges = wrap([{"name": "rownum", "domain": {"type": "rownum"}}])
            else:
                self.edges = FlatList.EMPTY
        elif isinstance(data, list):
            if isinstance(select, list):
                Log.error("not expecting a list of records")
            data = {select.name: Matrix.wrap(data)}
            self.edges = wrap([{"name": "rownum", "domain": {"type": "rownum", "min": 0, "max": len(data), "interval": 1}}])
        elif isinstance(data, Matrix):
            if isinstance(select, list):
                Log.error("not expecting a list of records")
            data = {select.name: data}
            # NOTE(review): this branch does not set self.edges — confirm
            # callers never reach here without edges, or this is a latent bug
        else:
            if isinstance(select, list):
                Log.error("not expecting a list of records")
            data = {select.name: Matrix(value=data)}
            self.edges = FlatList.EMPTY
    else:
        self.edges = wrap(edges)

    self.data = data
def format_table_header(select, query):
    # BUILD THE header ROW FOR TABLE-FORMAT OUTPUT
    width = MAX(select.put.index) + 1
    header = [None] * width

    if is_data(query.select) and not is_op(query.select.value, LeavesOp):
        # SINGLE, NON-LEAVES SELECT: LAST NAME ASSIGNED TO A COLUMN WINS
        for s in select:
            header[s.put.index] = s.name
    else:
        # OTHERWISE THE FIRST NAME ASSIGNED TO A COLUMN WINS
        for s in select:
            if not header[s.put.index]:
                header[s.put.index] = s.name

    return header
def merge_schema(schema, name, value):
    """
    UPDATE schema WITH THE TYPE INFORMATION DERIVED FROM value;
    RETURN THE (UPDATED) schema ELEMENT

    (`name` is not used here; kept for interface compatibility)
    """
    ptype = type(value)
    ntype, dtype, ltype, jtype, ittype, length = python_type_to_all_types[ptype]

    element = schema.element
    if schema.numpy_type is None:
        schema.numpy_type = ntype
    if element.type is None:
        element.type = dtype
    elif element.type is not dtype:
        # FIX: corrected typo in error message ("mathcing" -> "matching")
        Log.error("Expecting matching types")
    # WIDEN THE DECLARED LENGTH TO FIT THE NEW VALUE
    element.type_length = MAX((element.type_length, length))
    return element
def _update_from_es(self, please_stop):
    """
    BACKGROUND LOOP: PERIODICALLY PULL COLUMN UPDATES FROM ES, AND PUSH
    LOCALLY-QUEUED COLUMN UPDATES BACK TO ES, UNTIL please_stop IS SET
    """
    try:
        last_extract = Date.now()
        while not please_stop:
            now = Date.now()
            try:
                if (now - last_extract).seconds > COLUMN_EXTRACT_PERIOD:
                    # PULL COLUMNS UPDATED SINCE OUR LAST LOAD
                    result = self.es_index.search({
                        "query": {"range": {"last_updated.~n~": {"gte": self.last_load}}},
                        "sort": ["es_index.~s~", "name.~s~", "es_column.~s~"],
                        "from": 0,
                        "size": 10000,
                    })
                    last_extract = now

                    with self.locker:
                        for r in result.hits.hits._source:
                            c = doc_to_column(r)
                            if c:
                                self._add(c)
                                # ADVANCE THE WATERMARK AS COLUMNS ARRIVE
                                self.last_load = MAX((self.last_load, c.last_updated))

                # DRAIN LOCAL UPDATES AND PUSH THEM TO ES
                while not please_stop:
                    updates = self.for_es_update.pop_all()
                    if not updates:
                        break

                    DEBUG and updates and Log.note("{{num}} columns to push to db", num=len(updates))
                    self.es_index.extend([{"value": column.__dict__()} for column in updates])
            except Exception as e:
                Log.warning("problem updating database", cause=e)

            # SLEEP, BUT WAKE EARLY ON please_stop
            (Till(seconds=COLUMN_LOAD_PERIOD) | please_stop).wait()
    finally:
        Log.note("done")
def split_expression_by_depth(where, schema, output=None, var_to_depth=None):
    """
    It is unfortunate that ES can not handle expressions that span
    nested indexes.  This will split your where clause, returning
    {"and": [filter_depth0, filter_depth1, ...]}

    :param where: EXPRESSION TO INSPECT
    :param schema: THE SCHEMA
    :param output: ACCUMULATOR; ONE LIST OF CLAUSES PER DEPTH (BUILT ON FIRST CALL)
    :param var_to_depth: MAP FROM EACH VARIABLE NAME TO THE DEPTH
    :return: output
    """
    vars_ = where.vars()

    if var_to_depth is None:
        if not vars_:
            return Null
        # MAP VARIABLE NAMES TO HOW DEEP THEY ARE
        var_to_depth = {v.var: max(len(c.nested_path) - 1, 0) for v in vars_ for c in schema[v.var]}
        all_depths = set(var_to_depth.values())
        # if -1 in all_depths:
        #     Log.error(
        #         "Can not find column with name {{column|quote}}",
        #         column=unwraplist([k for k, v in var_to_depth.items() if v == -1])
        #     )
        if len(all_depths) == 0:
            all_depths = {0}
        output = wrap([[] for _ in range(MAX(all_depths) + 1)])
    else:
        all_depths = set(var_to_depth[v.var] for v in vars_)

    if len(all_depths) == 1:
        # WHOLE EXPRESSION LIVES AT ONE DEPTH
        output[list(all_depths)[0]] += [where]
    elif isinstance(where, AndOp):
        # CONJUNCTION: EACH TERM MAY LIVE AT A DIFFERENT DEPTH
        for a in where.terms:
            split_expression_by_depth(a, schema, output, var_to_depth)
    else:
        Log.error("Can not handle complex where clause")

    return output
def format_table(T, select, query=None):
    # CONVERT THE LIST OF DOCS T INTO TABLE FORM: A header PLUS ROWS
    width = MAX(select.put.index) + 1

    data = []
    for doc in T:
        out = [None] * width
        for s in select:
            value = unwraplist(s.pull(doc))
            if value == None:
                continue
            index, child = s.put.index, s.put.child
            if child == ".":
                out[index] = value
            else:
                # NESTED PROPERTY: ACCUMULATE INTO A Data AT THIS COLUMN
                if out[index] is None:
                    out[index] = Data()
                out[index][child] = value
        data.append(out)

    header = [None] * width
    if isinstance(query.select, Mapping) and not isinstance(query.select.value, LeavesOp):
        # SINGLE, NON-LEAVES SELECT: LAST NAME ASSIGNED TO A COLUMN WINS
        for s in select:
            header[s.put.index] = s.name
    else:
        # FIRST NAME WINS (original special-cased s.name == "." but
        # assigned the same value either way)
        for s in select:
            if not header[s.put.index]:
                header[s.put.index] = s.name

    return Data(
        meta={"format": "table"},
        header=header,
        data=data
    )
def write(profile_settings):
    """
    WRITE ACCUMULATED PROFILE STATISTICS, AND THE RAW SAMPLE SERIES,
    TO TAB-DELIMITED FILES NAMED AFTER profile_settings.filename
    """
    from mo_files import File
    from mo_logs.convert import datetime2string
    from mo_math import MAX
    from pyLibrary.convert import list2tab

    profs = list(profiles.values())
    for p in profs:
        p.stats = p.stats.end()

    # SUMMARY: ONE ROW PER PROFILE WITH AT LEAST ONE CALL
    stats = [{
        "description": p.description,
        "num_calls": p.stats.count,
        "total_time": p.stats.count * p.stats.mean,
        "total_time_per_call": p.stats.mean
    }
        for p in profs if p.stats.count > 0
    ]
    stats_file = File(profile_settings.filename, suffix=datetime2string(datetime.now(), "_%Y%m%d_%H%M%S"))
    if stats:
        stats_file.write(list2tab(stats))
    else:
        stats_file.write("<no profiles>")

    # SERIES: ONE COLUMN PER PROFILE, ONE ROW PER SAMPLE INDEX
    stats_file2 = File(profile_settings.filename, suffix=datetime2string(datetime.now(), "_series_%Y%m%d_%H%M%S"))
    if not profs:
        return
    max_samples = MAX([len(p.samples) for p in profs if p.samples])
    if not max_samples:
        return
    r = range(max_samples)
    # PREPEND A SYNTHETIC "index" PROFILE SO THE FIRST COLUMN IS THE ROW NUMBER
    profs.insert(0, Data(description="index", samples=r))
    stats = [{p.description: wrap(p.samples)[i] for p in profs if p.samples} for i in r]
    if stats:
        stats_file2.write(list2tab(stats))
def partial_eval(self, lang):
    """
    FOLD ALL LITERAL TERMS INTO A SINGLE MAXIMUM, KEEPING THE
    NON-LITERAL TERMS FOR RUNTIME EVALUATION
    """
    acc = None      # RUNNING MAX OVER THE LITERAL TERMS
    residue = []    # TERMS THAT CAN NOT BE EVALUATED NOW
    for term in self.terms:
        simple = term.partial_eval(lang)
        if simple is NULL:
            continue
        if is_literal(simple):
            acc = MAX([acc, simple.value])
        else:
            residue.append(simple)

    if not residue:
        # EVERYTHING FOLDED AWAY
        return NULL if acc == None else Literal(acc)
    if acc == None:
        return MaxOp(residue)
    return MaxOp([Literal(acc)] + residue)
def _range_composer(self, edge, domain, es_query, to_float, schema):
    # USE RANGES
    domain_min = coalesce(domain.min, MIN(domain.partitions.min))
    domain_max = coalesce(domain.max, MAX(domain.partitions.max))
    output = Aggs()

    if edge.allowNulls:
        # DOCS WITH NO VALUE, OR A VALUE OUTSIDE [domain_min, domain_max),
        # ARE COUNTED UNDER "_missing"
        in_bounds = AndOp("and", [
            edge.value.exists(),
            InequalityOp("gte", [edge.value, Literal(None, to_float(domain_min))]),
            InequalityOp("lt", [edge.value, Literal(None, to_float(domain_max))])
        ]).partial_eval()
        output.add(FilterAggs("_missing", NotOp("not", in_bounds), self).add(es_query))

    if isinstance(edge.value, Variable):
        calc = {"field": first(schema.leaves(edge.value.var)).es_column}
    else:
        calc = {"script": edge.value.to_es_script(schema).script(schema)}
    calc['ranges'] = [
        {"from": to_float(p.min), "to": to_float(p.max)}
        for p in domain.partitions
    ]

    return output.add(RangeAggs("_match", calc, self).add(es_query))
def row_formatter(select):
    # RETURN A FUNCTION THAT RETURNS A FORMATTED ROW
    select = listwrap(select)
    width = MAX(select.put.index) + 1

    def format_row(doc):
        row = [None] * width
        for s in select:
            value = unwraplist(s.pull(doc))
            if value == None:
                continue
            i, child = s.put.index, s.put.child
            if child == ".":
                row[i] = value
            else:
                # NESTED PROPERTY: ACCUMULATE INTO A Data AT THIS COLUMN
                if row[i] is None:
                    row[i] = Data()
                row[i][child] = value
        return row

    return format_row
def _get_spot_prices_from_aws(self):
    """
    REFRESH THE LOCAL SPOT-PRICE CACHE FROM AWS AND RETURN A
    ListContainer OF ALL KNOWN PRICES
    """
    with Timer("Read no capacity file"):
        try:
            # FILE IS LIST OF {instance_type, last_failure} OBJECTS
            content = self.no_capacity_file.read()
            self.no_capacity = dict((r.instance_type, r.last_failure) for r in convert.json2value(content, flexible=False, leaves=False))
        except Exception as e:
            # BEST EFFORT: MISSING/CORRUPT FILE MEANS NO KNOWN CAPACITY FAILURES
            self.no_capacity = {}

    with Timer("Read pricing file"):
        try:
            content = File(self.settings.price_file).read()
            cache = convert.json2value(content, flexible=False, leaves=False)
        except Exception as e:
            # BEST EFFORT: START WITH AN EMPTY CACHE
            cache = FlatList()

    cache = ListContainer(name=None, data=cache)
    # MOST RECENT KNOWN TIMESTAMP PER (instance_type, availability_zone)
    most_recents = jx.run({
        "from": cache,
        "edges": ["instance_type", "availability_zone"],
        "select": {"value": "timestamp", "aggregate": "max"}
    })

    zones = self._get_valid_availability_zones()
    prices = set(cache)
    with Timer("Get pricing from AWS"):
        for instance_type in self.settings.utility.keys():
            for zone in zones:
                if cache:
                    # ONLY ASK AWS FOR PRICES NEWER THAN WHAT WE HAVE
                    # (BUT NEVER MORE THAN ONE WEEK BACK)
                    most_recent = most_recents[{
                        "instance_type": instance_type,
                        "availability_zone": zone
                    }].timestamp
                    start_at = MAX([Date(most_recent), Date.today() - WEEK])
                else:
                    start_at = Date.today() - WEEK

                if DEBUG_PRICING:
                    Log.note(
                        "get pricing for {{instance_type}} starting at {{start_at}}",
                        instance_type=instance_type,
                        start_at=start_at
                    )

                # PAGE THROUGH THE PRICE HISTORY
                next_token = None
                while True:
                    resultset = self.ec2_conn.get_spot_price_history(
                        product_description=coalesce(self.settings.product, "Linux/UNIX (Amazon VPC)"),
                        instance_type=instance_type,
                        availability_zone=zone,
                        start_time=start_at.format(ISO8601),
                        next_token=next_token
                    )
                    next_token = resultset.next_token

                    for p in resultset:
                        prices.add(wrap({
                            "availability_zone": p.availability_zone,
                            "instance_type": p.instance_type,
                            "price": p.price,
                            "product_description": p.product_description,
                            "region": p.region.name,
                            "timestamp": Date(p.timestamp).unix
                        }))

                    if not next_token:
                        break

    with Timer("Save prices to file"):
        # ONLY PERSIST RECENT PRICES; OLDER ONES STAY IN MEMORY ONLY
        new_prices = jx.filter(prices, {"gte": {"timestamp": {"date": "today-2day"}}})

        def stream():  # IT'S A LOT OF PRICES, STREAM THEM TO FILE
            prefix = "[\n"
            for p in new_prices:
                yield prefix
                yield convert.value2json(p)
                prefix = ",\n"
            yield "]"

        File(self.settings.price_file).write(stream())

    return ListContainer(name="prices", data=prices)
def get_decoders_by_depth(query):
    """
    RETURN A LIST OF DECODER ARRAYS, ONE ARRAY FOR EACH NESTED DEPTH
    """
    schema = query.frum.schema
    output = FlatList()

    if query.sort:
        # REORDER EDGES/GROUPBY TO MATCH THE SORT
        if query.edges:
            Log.error("can not use sort clause with edges: add sort clause to each edge")
        ordered_edges = []
        remaining_edges = copy(query.groupby)
        for s in query.sort:
            if not isinstance(s.value, Variable):
                Log.error("can only sort by terms")
            for e in remaining_edges:
                if e.value.var == s.value.var:
                    ordered_edges.append(e)
                    remaining_edges.remove(e)
                    break
        ordered_edges.extend(remaining_edges)
        query.groupby = wrap(list(reversed(ordered_edges)))

    for edge in wrap(coalesce(query.edges, query.groupby, [])):
        if edge.value != None and not isinstance(edge.value, NullOp):
            edge = edge.copy()
            vars_ = edge.value.vars()
            for v in vars_:
                if not schema[v]:
                    Log.error("{{var}} does not exist in schema", var=v)
            edge.value = edge.value.map({c.name: c.es_column for v in vars_ for c in schema.lookup[v]})
        elif edge.range:
            edge = edge.copy()
            min_ = edge.range.min
            max_ = edge.range.max
            vars_ = min_.vars() | max_.vars()
            for v in vars_:
                if not schema[v]:
                    Log.error("{{var}} does not exist in schema", var=v)
            map_ = {c.name: c.es_column for v in vars_ for c in schema[v]}
            edge.range = {"min": min_.map(map_), "max": max_.map(map_)}
        elif edge.domain.dimension:
            vars_ = edge.domain.dimension.fields
            edge.domain.dimension = edge.domain.dimension.copy()
            edge.domain.dimension.fields = [schema[v].es_column for v in vars_]
        elif all(edge.domain.partitions.where):
            vars_ = set()
            for p in edge.domain.partitions:
                vars_ |= p.where.vars()

        try:
            # DEPTH OF A COLUMN = LENGTH OF ITS nested_path (MINUS SELF)
            depths = set(len(c.nested_path) - 1 for v in vars_ for c in schema[v])
            if -1 in depths:
                Log.error(
                    "Do not know of column {{column}}",
                    column=unwraplist([v for v in vars_ if schema[v] == None])
                )
            if len(depths) > 1:
                Log.error("expression {{expr}} spans tables, can not handle", expr=edge.value)
            max_depth = MAX(depths)
            while len(output) <= max_depth:
                output.append([])
        except Exception as e:
            # BUGFIX: was "except Exception, edge" — python-2-only syntax that
            # ALSO re-bound the loop variable `edge` to the exception, so the
            # decoder below would have wrapped the exception, not the edge
            # USUALLY THE SCHEMA IS EMPTY, SO WE ASSUME THIS IS A SIMPLE QUERY
            max_depth = 0
            output.append([])

        # NOTE(review): limit is unconditionally 0 here; the sibling
        # implementation derives it from edge.domain.limit/query.limit —
        # confirm 0 is intended
        limit = 0
        output[max_depth].append(AggsDecoder(edge, query, limit))

    # BUGFIX: the accumulated result was never returned (sibling
    # implementation returns output); callers received None
    return output
def _get_spot_prices_from_aws(self):
    """
    REFRESH THE LOCAL SPOT-PRICE CACHE FROM AWS AND RETURN THE SET OF
    ALL KNOWN PRICES
    """
    with Timer("Read pricing file"):
        try:
            content = File(self.settings.price_file).read()
            cache = convert.json2value(content, flexible=False, leaves=False)
        except Exception as e:
            # BEST EFFORT: START WITH AN EMPTY CACHE
            cache = FlatList()

    # MOST RECENT KNOWN TIMESTAMP PER (instance_type, availability_zone)
    most_recents = jx.run({
        "from": cache,
        "edges": ["instance_type", "availability_zone"],
        "select": {"value": "timestamp", "aggregate": "max"}
    })

    zones = self._get_valid_availability_zones()
    prices = set(cache)
    with Timer("Get pricing from AWS"):
        for instance_type in self.settings.utility.keys():
            for zone in zones:
                if cache:
                    # ONLY ASK AWS FOR PRICES NEWER THAN WHAT WE HAVE
                    # (BUT NEVER MORE THAN ONE WEEK BACK)
                    most_recent = most_recents[{
                        "instance_type": instance_type,
                        "availability_zone": zone
                    }].timestamp
                    start_at = MAX([Date(most_recent), Date.today() - WEEK])
                else:
                    start_at = Date.today() - WEEK

                if DEBUG_PRICING:
                    Log.note("get pricing for {{instance_type}} starting at {{start_at}}",
                        instance_type=instance_type,
                        start_at=start_at
                    )

                # PAGE THROUGH THE PRICE HISTORY
                next_token = None
                while True:
                    resultset = self.ec2_conn.get_spot_price_history(
                        product_description=coalesce(self.settings.product, "Linux/UNIX (Amazon VPC)"),
                        instance_type=instance_type,
                        availability_zone=zone,
                        start_time=start_at.format(ISO8601),
                        next_token=next_token
                    )
                    next_token = resultset.next_token

                    for p in resultset:
                        prices.add(wrap({
                            "availability_zone": p.availability_zone,
                            "instance_type": p.instance_type,
                            "price": p.price,
                            "product_description": p.product_description,
                            "region": p.region.name,
                            "timestamp": Date(p.timestamp).unix
                        }))

                    if not next_token:
                        break

    with Timer("Save prices to file"):
        # ONLY PERSIST RECENT PRICES; OLDER ONES STAY IN MEMORY ONLY
        new_prices = jx.filter(prices, {"gte": {"timestamp": {"date": "today-2day"}}})

        def stream():  # IT'S A LOT OF PRICES, STREAM THEM TO FILE
            prefix = "[\n"
            for p in new_prices:
                yield prefix
                yield convert.value2json(p)
                prefix = ",\n"
            yield "]"

        File(self.settings.price_file).write(stream())

    return prices
def _set_op(self, query, frum):
    """
    EXECUTE A SIMPLE (NON-AGGREGATING) SET-OP QUERY AGAINST THE SQLITE
    SNOWFLAKE, RE-ASSEMBLING NESTED DOCUMENTS FROM THE FLAT RESULT ROWS,
    AND RETURN THE RESULT IN THE REQUESTED format (cube/table/list)
    """
    # GET LIST OF COLUMNS
    base_name, primary_nested_path = tail_field(frum)
    vars_ = UNION([v.var for select in listwrap(query.select) for v in select.value.vars()])
    schema = self.sf.tables[primary_nested_path].schema

    # GROUP THE REQUIRED COLUMNS BY THE NESTED TABLE THEY LIVE IN
    active_columns = {".": set()}
    for v in vars_:
        for c in schema.leaves(v):
            nest = c.nested_path[0]
            active_columns.setdefault(nest, set()).add(c)

    # ANY VARS MENTIONED WITH NO COLUMNS?
    for v in vars_:
        if not any(startswith_field(cname, v) for cname in schema.keys()):
            active_columns["."].add(Column(
                name=v,
                jx_type="null",
                es_column=".",
                es_index=".",
                nested_path=["."]
            ))

    # EVERY COLUMN, AND THE INDEX IT TAKES UP
    index_to_column = {}  # MAP FROM INDEX TO COLUMN (OR SELECT CLAUSE)
    index_to_uid = {}  # FROM NESTED PATH TO THE INDEX OF UID
    sql_selects = []  # EVERY SELECT CLAUSE (NOT TO BE USED ON ALL TABLES, OF COURSE)
    nest_to_alias = {
        nested_path: "__" + unichr(ord('a') + i) + "__"
        for i, (nested_path, sub_table) in enumerate(self.sf.tables.items())
    }

    sorts = []
    if query.sort:
        for select in query.sort:
            col = select.value.to_sql(schema)[0]
            for t, sql in col.sql.items():
                json_type = sql_type_to_json_type[t]
                if json_type in STRUCT:
                    continue
                column_number = len(sql_selects)
                # SQL HAS ABS TABLE REFERENCE
                column_alias = _make_column_name(column_number)
                sql_selects.append(sql_alias(sql, column_alias))
                if select.sort == -1:
                    sorts.append(column_alias + SQL_IS_NOT_NULL)
                    sorts.append(column_alias + " DESC")
                else:
                    sorts.append(column_alias + SQL_IS_NULL)
                    sorts.append(column_alias)

    primary_doc_details = Data()
    # EVERY SELECT STATEMENT THAT WILL BE REQUIRED, NO MATTER THE DEPTH
    # WE WILL CREATE THEM ACCORDING TO THE DEPTH REQUIRED
    nested_path = []
    for step, sub_table in self.sf.tables.items():
        nested_path.insert(0, step)
        nested_doc_details = {
            "sub_table": sub_table,
            "children": [],
            "index_to_column": {},
            "nested_path": nested_path
        }

        # INSERT INTO TREE
        if not primary_doc_details:
            primary_doc_details = nested_doc_details
        else:
            def place(parent_doc_details):
                # RECURSIVELY FIND THE DEEPEST ANCESTOR OF step IN THE TREE
                if startswith_field(step, parent_doc_details['nested_path'][0]):
                    for c in parent_doc_details['children']:
                        if place(c):
                            return True
                    parent_doc_details['children'].append(nested_doc_details)

            place(primary_doc_details)

        alias = nested_doc_details['alias'] = nest_to_alias[step]

        # WE ALWAYS ADD THE UID
        column_number = index_to_uid[step] = nested_doc_details['id_coord'] = len(sql_selects)
        sql_select = join_column(alias, quoted_UID)
        sql_selects.append(sql_alias(sql_select, _make_column_name(column_number)))
        if step != ".":
            # ID AND ORDER FOR CHILD TABLES
            index_to_column[column_number] = ColumnMapping(
                sql=sql_select,
                type="number",
                nested_path=nested_path,
                column_alias=_make_column_name(column_number)
            )
            column_number = len(sql_selects)
            sql_select = join_column(alias, quoted_ORDER)
            sql_selects.append(sql_alias(sql_select, _make_column_name(column_number)))
            index_to_column[column_number] = ColumnMapping(
                sql=sql_select,
                type="number",
                nested_path=nested_path,
                column_alias=_make_column_name(column_number)
            )

        # WE DO NOT NEED DATA FROM TABLES WE REQUEST NOTHING FROM
        if step not in active_columns:
            continue

        # ADD SQL SELECT COLUMNS FOR EACH jx SELECT CLAUSE
        si = 0
        for select in listwrap(query.select):
            try:
                column_number = len(sql_selects)
                select.pull = get_column(column_number)
                db_columns = select.value.partial_eval().to_sql(schema)

                for column in db_columns:
                    if isinstance(column.nested_path, list):
                        column.nested_path = column.nested_path[0]  # IN THE EVENT THIS "column" IS MULTIVALUED
                    for t, unsorted_sql in column.sql.items():
                        json_type = sql_type_to_json_type[t]
                        if json_type in STRUCT:
                            continue
                        column_number = len(sql_selects)
                        column_alias = _make_column_name(column_number)
                        sql_selects.append(sql_alias(unsorted_sql, column_alias))
                        if startswith_field(primary_nested_path, step) and isinstance(select.value, LeavesOp):
                            # ONLY FLATTEN primary_nested_path AND PARENTS, NOT CHILDREN
                            index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = ColumnMapping(
                                push_name=literal_field(get_property_name(concat_field(select.name, column.name))),
                                push_child=".",
                                push_column_name=get_property_name(concat_field(select.name, column.name)),
                                push_column=si,
                                pull=get_column(column_number),
                                sql=unsorted_sql,
                                type=json_type,
                                column_alias=column_alias,
                                nested_path=nested_path
                            )
                            # NOTE(review): si is incremented here AND in the
                            # finally below, so Leaves selects advance si twice
                            # per column — confirm this is intended
                            si += 1
                        else:
                            index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = ColumnMapping(
                                push_name=select.name,
                                push_child=column.name,
                                push_column_name=select.name,
                                push_column=si,
                                pull=get_column(column_number),
                                sql=unsorted_sql,
                                type=json_type,
                                column_alias=column_alias,
                                nested_path=nested_path
                            )
            finally:
                si += 1

    where_clause = BooleanOp("boolean", query.where).partial_eval().to_sql(schema, boolean=True)[0].sql.b
    unsorted_sql = self._make_sql_for_one_nest_in_set_op(".", sql_selects, where_clause, active_columns, index_to_column)

    # ALWAYS SORT BY THE UID COLUMNS SO NESTED DOCS ARRIVE CONTIGUOUSLY
    for n, _ in self.sf.tables.items():
        sorts.append(quote_column(COLUMN + text_type(index_to_uid[n])))

    ordered_sql = (
        SQL_SELECT + "*" + SQL_FROM + sql_iso(unsorted_sql) +
        SQL_ORDERBY + sql_list(sorts) +
        SQL_LIMIT + quote_value(query.limit)
    )
    self.db.create_new_functions()  # creating new functions: regexp
    result = self.db.query(ordered_sql)

    def _accumulate_nested(rows, row, nested_doc_details, parent_doc_id, parent_id_coord):
        """
        :param rows: REVERSED STACK OF ROWS (WITH push() AND pop())
        :param row: CURRENT ROW BEING EXTRACTED
        :param nested_doc_details: {
            "nested_path": wrap_nested_path(nested_path),
            "index_to_column": map from column number to column details
            "children": all possible direct decedents' nested_doc_details
        }
        :param parent_doc_id: the id of the parent doc (for detecting when to step out of loop)
        :param parent_id_coord: the column number for the parent id (so we ca extract from each row)
        :return: the nested property (usually an array)
        """
        previous_doc_id = None
        doc = Data()
        output = []
        id_coord = nested_doc_details['id_coord']

        while True:
            doc_id = row[id_coord]

            if doc_id == None or (parent_id_coord is not None and row[parent_id_coord] != parent_doc_id):
                rows.append(row)  # UNDO PREVIOUS POP (RECORD IS NOT A NESTED RECORD OF parent_doc)
                return output

            if doc_id != previous_doc_id:
                previous_doc_id = doc_id
                doc = Data()
                curr_nested_path = nested_doc_details['nested_path'][0]
                # NOTE: local name shadows the outer index_to_column dict
                index_to_column = nested_doc_details['index_to_column'].items()
                if index_to_column:
                    for i, c in index_to_column:
                        value = row[i]
                        if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp):
                            # ASSIGN INNER PROPERTIES
                            relative_field = concat_field(c.push_name, c.push_child)
                        else:  # FACT IS EXPECTED TO BE A SINGLE VALUE, NOT AN OBJECT
                            relative_field = c.push_child

                        if relative_field == ".":
                            if value == '':
                                doc = Null
                            else:
                                doc = value
                        elif value != None and value != '':
                            doc[relative_field] = value

            for child_details in nested_doc_details['children']:
                # EACH NESTED TABLE MUST BE ASSEMBLED INTO A LIST OF OBJECTS
                child_id = row[child_details['id_coord']]
                if child_id is not None:
                    nested_value = _accumulate_nested(rows, row, child_details, doc_id, id_coord)
                    if nested_value:
                        push_name = child_details['nested_path'][0]
                        if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp):
                            # ASSIGN INNER PROPERTIES
                            # NOTE(review): `relative_field` here calls the local
                            # string variable assigned above as if it were a
                            # function — presumably the mo_dots relative_field()
                            # helper was intended; this branch likely raises
                            # TypeError/UnboundLocalError when hit — confirm
                            relative_field = relative_field(push_name, curr_nested_path)
                        else:  # FACT IS EXPECTED TO BE A SINGLE VALUE, NOT AN OBJECT
                            relative_field = "."

                        if relative_field == "." and doc is Null:
                            doc = nested_value
                        elif relative_field == ".":
                            doc = unwraplist(nested_value)
                        else:
                            doc[relative_field] = unwraplist(nested_value)

            output.append(doc)

            try:
                row = rows.pop()
            except IndexError:
                return output

    cols = tuple([i for i in index_to_column.values() if i.push_name != None])
    rows = list(reversed(unwrap(result.data)))
    if rows:
        row = rows.pop()
        data = _accumulate_nested(rows, row, primary_doc_details, None, None)
    else:
        data = result.data

    if query.format == "cube":
        for f, _ in self.sf.tables.items():
            if frum.endswith(f) or (test_dots(cols) and isinstance(query.select, list)):
                num_rows = len(result.data)
                num_cols = MAX([c.push_column for c in cols]) + 1 if len(cols) else 0
                map_index_to_name = {c.push_column: c.push_column_name for c in cols}
                temp_data = [[None] * num_rows for _ in range(num_cols)]
                for rownum, d in enumerate(result.data):
                    for c in cols:
                        if c.push_child == ".":
                            temp_data[c.push_column][rownum] = c.pull(d)
                        else:
                            column = temp_data[c.push_column][rownum]
                            if column is None:
                                column = temp_data[c.push_column][rownum] = {}
                            column[c.push_child] = c.pull(d)
                output = Data(
                    meta={"format": "cube"},
                    data={n: temp_data[c] for c, n in map_index_to_name.items()},
                    edges=[{
                        "name": "rownum",
                        "domain": {"type": "rownum", "min": 0, "max": num_rows, "interval": 1}
                    }]
                )
                return output

        if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp):
            num_rows = len(data)
            temp_data = {c.push_column_name: [None] * num_rows for c in cols}
            for rownum, d in enumerate(data):
                for c in cols:
                    temp_data[c.push_column_name][rownum] = d[c.push_name]
            return Data(
                meta={"format": "cube"},
                data=temp_data,
                edges=[{
                    "name": "rownum",
                    "domain": {"type": "rownum", "min": 0, "max": num_rows, "interval": 1}
                }]
            )
        else:
            num_rows = len(data)
            map_index_to_name = {c.push_column: c.push_column_name for c in cols}
            temp_data = [data]
            return Data(
                meta={"format": "cube"},
                data={n: temp_data[c] for c, n in map_index_to_name.items()},
                edges=[{
                    "name": "rownum",
                    "domain": {"type": "rownum", "min": 0, "max": num_rows, "interval": 1}
                }]
            )
    elif query.format == "table":
        for f, _ in self.sf.tables.items():
            if frum.endswith(f):
                num_column = MAX([c.push_column for c in cols]) + 1
                header = [None] * num_column
                for c in cols:
                    header[c.push_column] = c.push_column_name
                output_data = []
                for d in result.data:
                    row = [None] * num_column
                    for c in cols:
                        set_column(row, c.push_column, c.push_child, c.pull(d))
                    output_data.append(row)
                return Data(meta={"format": "table"}, header=header, data=output_data)

        if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp):
            column_names = [None] * (max(c.push_column for c in cols) + 1)
            for c in cols:
                column_names[c.push_column] = c.push_column_name
            temp_data = []
            for rownum, d in enumerate(data):
                row = [None] * len(column_names)
                for c in cols:
                    row[c.push_column] = d[c.push_name]
                temp_data.append(row)
            return Data(meta={"format": "table"}, header=column_names, data=temp_data)
        else:
            column_names = listwrap(query.select).name
            return Data(meta={"format": "table"}, header=column_names, data=[[d] for d in data])
    else:
        # DEFAULT: "list" FORMAT
        for f, _ in self.sf.tables.items():
            if frum.endswith(f) or (test_dots(cols) and isinstance(query.select, list)):
                data = []
                for d in result.data:
                    row = Data()
                    for c in cols:
                        if c.push_child == ".":
                            row[c.push_name] = c.pull(d)
                        elif c.num_push_columns:
                            tuple_value = row[c.push_name]
                            if not tuple_value:
                                tuple_value = row[c.push_name] = [None] * c.num_push_columns
                            tuple_value[c.push_child] = c.pull(d)
                        else:
                            row[c.push_name][c.push_child] = c.pull(d)
                    data.append(row)
                return Data(meta={"format": "list"}, data=data)

        if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp):
            temp_data = []
            for rownum, d in enumerate(data):
                row = {}
                for c in cols:
                    row[c.push_column_name] = d[c.push_name]
                temp_data.append(row)
            return Data(meta={"format": "list"}, data=temp_data)
        else:
            return Data(meta={"format": "list"}, data=data)
def test_max(self):
    # MAX() must skip None entries and return the surviving value unchanged.
    expected = Date("2015-10-04 13:53:11", '%Y-%m-%d %H:%M:%S.%f')
    self.assertEqual(MAX([None, expected]), expected)
def _set_op(self, query, frum):
    """
    Execute a set-operation (plain select, no grouping) `query` against the
    nested-table family rooted at `frum`, and return the result shaped per
    query.format ("cube", "table", or default "list").

    Strategy: build one flat SQL statement selecting UID/ORDER bookkeeping
    columns plus one column per (select clause x sql type), run it, then
    re-assemble the flat rows into nested documents with _accumulate_nested().

    :param query: jx query object (select/where/sort/limit/format)
    :param frum: dotted path naming the (possibly nested) table queried
    :return: Data with meta.format and the formatted result
    """
    # GET LIST OF COLUMNS
    primary_nested_path = join_field(split_field(frum)[1:])
    vars_ = UNION([s.value.vars() for s in listwrap(query.select)])

    # ONE SHORT SQL ALIAS (__a__, __b__, ...) PER NESTED TABLE
    nest_to_alias = {
        nested_path: "__" + unichr(ord('a') + i) + "__"
        for i, (nested_path, sub_table) in enumerate(self.nested_tables.items())
    }

    # GROUP THE COLUMNS MENTIONED BY THE QUERY BY THEIR NESTED PATH
    active_columns = {".": []}
    for cname, cols in self.columns.items():
        if any(startswith_field(cname, v) for v in vars_):
            for c in cols:
                if c.type in STRUCT:
                    continue
                nest = c.nested_path[0]
                active = active_columns.get(nest)
                if not active:
                    active = active_columns[nest] = []
                active.append(c)

    # ANY VARS MENTIONED WITH NO COLUMNS?
    # GIVE THEM A NULL PLACEHOLDER SO SELECTS STILL RESOLVE
    for v in vars_:
        if not any(startswith_field(cname, v) for cname in self.columns.keys()):
            active_columns["."].append(Column(
                names={self.name: v},
                type="null",
                es_column=".",
                es_index=".",
                nested_path=["."]
            ))

    # EVERY COLUMN, AND THE INDEX IT TAKES UP
    index_to_column = {}  # MAP FROM INDEX TO COLUMN (OR SELECT CLAUSE)
    index_to_uid = {}  # FROM NESTED PATH TO THE INDEX OF UID
    sql_selects = []  # EVERY SELECT CLAUSE (NOT TO BE USED ON ALL TABLES, OF COURSE)
    nest_to_alias = {
        nested_path: "__" + unichr(ord('a') + i) + "__"
        for i, (nested_path, sub_table) in enumerate(self.nested_tables.items())
    }

    # TRANSLATE query.sort INTO SELECT ALIASES + ORDER BY TERMS
    # (NULLS SORT LAST FOR ASCENDING, FIRST FOR DESCENDING)
    sorts = []
    if query.sort:
        for s in query.sort:
            col = s.value.to_sql(self)[0]
            for t, sql in col.sql.items():
                json_type = sql_type_to_json_type[t]
                if json_type in STRUCT:
                    continue
                column_number = len(sql_selects)
                # SQL HAS ABS TABLE REFERENCE
                column_alias = _make_column_name(column_number)
                sql_selects.append(sql + " AS " + column_alias)
                if s.sort == -1:
                    sorts.append(column_alias + " IS NOT NULL")
                    sorts.append(column_alias + " DESC")
                else:
                    sorts.append(column_alias + " IS NULL")
                    sorts.append(column_alias)

    primary_doc_details = Data()
    # EVERY SELECT STATEMENT THAT WILL BE REQUIRED, NO MATTER THE DEPTH
    # WE WILL CREATE THEM ACCORDING TO THE DEPTH REQUIRED
    for nested_path, sub_table in self.nested_tables.items():
        nested_doc_details = {
            "sub_table": sub_table,
            "children": [],
            "index_to_column": {},
            "nested_path": [nested_path]  # fake the real nested path, we only look at [0] anyway
        }

        # INSERT INTO TREE
        if not primary_doc_details:
            primary_doc_details = nested_doc_details
        else:
            # NOTE(review): place() returns None (falsy) even after appending,
            # so the `if place(c): return True` short-circuit never fires and a
            # detail may be attached at more than one depth — confirm intent
            def place(parent_doc_details):
                if startswith_field(nested_path, parent_doc_details['nested_path'][0]):
                    for c in parent_doc_details['children']:
                        if place(c):
                            return True
                    parent_doc_details['children'].append(nested_doc_details)

            place(primary_doc_details)

        alias = nested_doc_details['alias'] = nest_to_alias[nested_path]

        # WE ALWAYS ADD THE UID AND ORDER
        column_number = index_to_uid[nested_path] = nested_doc_details['id_coord'] = len(sql_selects)
        sql_select = alias + "." + quoted_UID
        sql_selects.append(sql_select + " AS " + _make_column_name(column_number))
        if nested_path != ".":
            # BUGFIX: take a fresh column slot for ORDER; previously the UID's
            # column_number was reused, so UID and ORDER shared one alias and
            # the outer "ORDER BY __cN__" became ambiguous in SQLite
            column_number = len(sql_selects)
            sql_select = alias + "." + quote_table(ORDER)
            sql_selects.append(sql_select + " AS " + _make_column_name(column_number))

        # WE DO NOT NEED DATA FROM TABLES WE REQUEST NOTHING FROM
        if nested_path not in active_columns:
            continue

        if primary_nested_path == nested_path:
            # ADD SQL SELECT COLUMNS FOR EACH jx SELECT CLAUSE
            si = 0
            for s in listwrap(query.select):
                # try/finally (no except) guarantees si advances once per
                # select clause even if translation raises
                try:
                    column_number = len(sql_selects)
                    s.pull = get_column(column_number)
                    db_columns = s.value.to_sql(self)

                    if isinstance(s.value, LeavesOp):
                        # LEAVES EXPAND TO ONE OUTPUT COLUMN PER LEAF
                        for column in db_columns:
                            for t, unsorted_sql in column.sql.items():
                                json_type = sql_type_to_json_type[t]
                                if json_type in STRUCT:
                                    continue
                                column_number = len(sql_selects)
                                # SQL HAS ABS TABLE REFERENCE
                                column_alias = _make_column_name(column_number)
                                sql_selects.append(unsorted_sql + " AS " + column_alias)
                                index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = Data(
                                    push_name=concat_field(s.name, column.name),
                                    push_column=si,
                                    push_child=".",
                                    pull=get_column(column_number),
                                    sql=unsorted_sql,
                                    type=json_type,
                                    nested_path=[nested_path]  # fake the real nested path, we only look at [0] anyway
                                )
                                si += 1
                    else:
                        # SINGLE SELECT CLAUSE: ONE push_column, TYPED CHILDREN
                        for column in db_columns:
                            for t, unsorted_sql in column.sql.items():
                                json_type = sql_type_to_json_type[t]
                                if json_type in STRUCT:
                                    continue
                                column_number = len(sql_selects)
                                # SQL HAS ABS TABLE REFERENCE
                                column_alias = _make_column_name(column_number)
                                sql_selects.append(unsorted_sql + " AS " + column_alias)
                                index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = Data(
                                    push_name=s.name,
                                    push_column=si,
                                    push_child=column.name,
                                    pull=get_column(column_number),
                                    sql=unsorted_sql,
                                    type=json_type,
                                    nested_path=[nested_path]  # fake the real nested path, we only look at [0] anyway
                                )
                finally:
                    si += 1
        elif startswith_field(nested_path, primary_nested_path):
            # ADD REQUIRED COLUMNS, FOR DEEP STUFF
            # NOTE(review): `s` and `si` here are leftovers from the select
            # loop above (or undefined if query.select is empty), and
            # `nested_path` is rebound mid-loop — looks fragile; confirm
            for ci, c in enumerate(active_columns[nested_path]):
                if c.type in STRUCT:
                    continue
                column_number = len(sql_selects)
                nested_path = c.nested_path
                unsorted_sql = nest_to_alias[nested_path[0]] + "." + quote_table(c.es_column)
                column_alias = _make_column_name(column_number)
                sql_selects.append(unsorted_sql + " AS " + column_alias)
                index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = Data(
                    push_name=s.name,
                    push_column=si,
                    push_child=relative_field(c.name, s.name),
                    pull=get_column(column_number),
                    sql=unsorted_sql,
                    type=c.type,
                    nested_path=nested_path
                )

    where_clause = query.where.to_sql(self, boolean=True)[0].sql.b
    unsorted_sql = self._make_sql_for_one_nest_in_set_op(
        ".",
        sql_selects,
        where_clause,
        active_columns,
        index_to_column
    )

    # ALWAYS SORT BY THE UID COLUMNS SO NESTED DOCS ARRIVE CONTIGUOUSLY
    for n, _ in self.nested_tables.items():
        sorts.append(COLUMN + unicode(index_to_uid[n]))

    ordered_sql = (
        "SELECT * FROM (\n" + unsorted_sql + "\n)" +
        "\nORDER BY\n" + ",\n".join(sorts) +
        "\nLIMIT " + quote_value(query.limit)
    )
    result = self.db.query(ordered_sql)

    def _accumulate_nested(rows, row, nested_doc_details, parent_doc_id, parent_id_coord):
        """
        :param rows: REVERSED STACK OF ROWS (WITH push() AND pop())
        :param row: CURRENT ROW BEING EXTRACTED
        :param nested_doc_details: {
                "nested_path": wrap_nested_path(nested_path),
                "index_to_column": map from column number to column details
                "children": all possible direct descendants' nested_doc_details
             }
        :param parent_doc_id: the id of the parent doc (for detecting when to step out of loop)
        :param parent_id_coord: the column number for the parent id (so we can extract from each row)
        :return: the nested property (usually an array)
        """
        previous_doc_id = None
        doc = Data()
        output = []
        id_coord = nested_doc_details['id_coord']

        while True:
            doc_id = row[id_coord]

            if doc_id == None or (parent_id_coord is not None and row[parent_id_coord] != parent_doc_id):
                rows.append(row)  # UNDO
                output = unwraplist(output)
                return output if output else None

            if doc_id != previous_doc_id:
                # NEW DOCUMENT AT THIS NESTING LEVEL
                previous_doc_id = doc_id
                doc = Data()
                curr_nested_path = nested_doc_details['nested_path'][0]
                if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp):
                    # ASSIGN INNER PROPERTIES
                    for i, c in nested_doc_details['index_to_column'].items():
                        value = row[i]
                        if value == None:
                            continue
                        if value == '':
                            continue
                        relative_path = relative_field(concat_field(c.push_name, c.push_child), curr_nested_path)
                        if relative_path == ".":
                            doc = value
                        else:
                            doc[relative_path] = value
                else:
                    # ASSIGN INNER PROPERTIES
                    for i, c in nested_doc_details['index_to_column'].items():
                        value = row[i]
                        if value is not None:
                            relative_path = relative_field(c.push_child, curr_nested_path)
                            if relative_path == ".":
                                doc = value
                            else:
                                doc[relative_path] = value
                output.append(doc)

            # ASSIGN NESTED ARRAYS (recursion consumes this doc's child rows)
            for child_details in nested_doc_details['children']:
                child_id = row[child_details['id_coord']]
                if child_id is not None:
                    nested_value = _accumulate_nested(rows, row, child_details, doc_id, id_coord)
                    if nested_value is not None:
                        path = child_details['nested_path'][0]
                        doc[path] = nested_value

            try:
                row = rows.pop()
            except IndexError:
                output = unwraplist(output)
                return output if output else None

    cols = tuple(index_to_column.values())

    if query.format == "cube":
        num_rows = len(result.data)
        num_cols = MAX([c.push_column for c in cols]) + 1 if len(cols) else 0
        map_index_to_name = {c.push_column: c.push_name for c in cols}
        temp_data = [[None] * num_rows for _ in range(num_cols)]
        for rownum, d in enumerate(result.data):
            for c in cols:
                if c.push_child == ".":
                    temp_data[c.push_column][rownum] = c.pull(d)
                else:
                    column = temp_data[c.push_column][rownum]
                    if column is None:
                        column = temp_data[c.push_column][rownum] = {}
                    column[c.push_child] = c.pull(d)
        output = Data(
            meta={"format": "cube"},
            data={n: temp_data[c] for c, n in map_index_to_name.items()},
            edges=[{
                "name": "rownum",
                "domain": {
                    "type": "rownum",
                    "min": 0,
                    "max": num_rows,
                    "interval": 1
                }
            }]
        )
        return output
    elif query.format == "table":
        num_column = MAX([c.push_column for c in cols]) + 1
        header = [None] * num_column
        for c in cols:
            sf = split_field(c.push_name)
            if len(sf) == 0:
                header[c.push_column] = "."
            elif len(sf) == 1:
                header[c.push_column] = sf[0]
            else:
                # TABLES ONLY USE THE FIRST-LEVEL PROPERTY NAMES
                # PUSH ALL DEEPER NAMES TO CHILD
                header[c.push_column] = sf[0]
                c.push_child = join_field(sf[1:] + split_field(c.push_child))

        output_data = []
        for d in result.data:
            row = [None] * num_column
            for c in cols:
                set_column(row, c.push_column, c.push_child, c.pull(d))
            output_data.append(row)
        return Data(meta={"format": "table"}, header=header, data=output_data)
    else:
        rows = list(reversed(unwrap(result.data)))
        if rows:
            row = rows.pop()
            docs = listwrap(_accumulate_nested(rows, row, primary_doc_details, None, None))
        else:
            # ROBUSTNESS: empty result previously crashed on rows.pop()
            docs = []
        output = Data(meta={"format": "list"}, data=docs)
        return output
def _set_op(self, query, frum):
    """
    Execute a set-operation (plain select) `query` against the snowflake of
    tables rooted at `frum` and return the result shaped per query.format
    ("cube", "table", or default "list").

    Builds one flat SQL statement (UID/ORDER bookkeeping columns plus one
    column per select clause and sql type), runs it, then reassembles the
    flat rows into nested documents with the inner _accumulate_nested().

    :param query: jx query object (select/where/sort/limit/format)
    :param frum: dotted path naming the (possibly nested) table queried
    :return: Data with meta.format and the formatted result
    """
    # GET LIST OF COLUMNS
    frum_path = split_field(frum)
    primary_nested_path = join_field(frum_path[1:])
    vars_ = UNION([s.value.vars() for s in listwrap(query.select)])
    schema = self.sf.tables[primary_nested_path].schema

    # ONE SHORT SQL ALIAS (__a__, __b__, ...) PER TABLE IN THE SNOWFLAKE
    nest_to_alias = {
        nested_path: "__" + unichr(ord('a') + i) + "__"
        for i, (nested_path, sub_table) in enumerate(self.sf.tables.items())
    }

    # GROUP THE COLUMNS MENTIONED BY THE QUERY BY THEIR NESTED PATH
    active_columns = {".": []}
    for cname, cols in schema.items():
        if any(startswith_field(cname, v) for v in vars_):
            for c in cols:
                if c.type in STRUCT:
                    continue
                nest = c.nested_path[0]
                active = active_columns.get(nest)
                if not active:
                    active = active_columns[nest] = []
                active.append(c)

    # NOTE(review): this pass appends every column of every table whose name
    # does not match an already-active column — presumably to cover
    # select-all semantics; confirm against callers. It also rebinds
    # `nested_path` and `s` at method scope.
    for nested_path, s in self.sf.tables.items():
        for cname, cols in s.schema.items():
            if not any(startswith_field(cname, c.names[c.nested_path[0]]) for n, cc in active_columns.items() for c in cc):
                for c in cols:
                    if c.type in STRUCT:
                        continue
                    nest = c.nested_path[0]
                    active = active_columns.get(nest)
                    if not active:
                        active = active_columns[nest] = []
                    active.append(c)

    # ANY VARS MENTIONED WITH NO COLUMNS?
    # GIVE THEM A NULL PLACEHOLDER SO SELECTS STILL RESOLVE
    for v in vars_:
        if not any(startswith_field(cname, v) for cname in schema.keys()):
            active_columns["."].append(Column(
                names={".": v},
                type="null",
                es_column=".",
                es_index=".",
                nested_path=["."]
            ))

    # EVERY COLUMN, AND THE INDEX IT TAKES UP
    index_to_column = {}  # MAP FROM INDEX TO COLUMN (OR SELECT CLAUSE)
    index_to_uid = {}  # FROM NESTED PATH TO THE INDEX OF UID
    sql_selects = []  # EVERY SELECT CLAUSE (NOT TO BE USED ON ALL TABLES, OF COURSE)
    nest_to_alias = {
        nested_path: "__" + unichr(ord('a') + i) + "__"
        for i, (nested_path, sub_table) in enumerate(self.sf.tables.items())
    }

    # TRANSLATE query.sort INTO SELECT ALIASES + ORDER BY TERMS
    # (NULLS SORT LAST FOR ASCENDING, FIRST FOR DESCENDING)
    sorts = []
    if query.sort:
        for s in query.sort:
            col = s.value.to_sql(schema)[0]
            for t, sql in col.sql.items():
                json_type = sql_type_to_json_type[t]
                if json_type in STRUCT:
                    continue
                column_number = len(sql_selects)
                # SQL HAS ABS TABLE REFERENCE
                column_alias = _make_column_name(column_number)
                sql_selects.append(sql + " AS " + column_alias)
                if s.sort == -1:
                    sorts.append(column_alias + " IS NOT NULL")
                    sorts.append(column_alias + " DESC")
                else:
                    sorts.append(column_alias + " IS NULL")
                    sorts.append(column_alias)

    selects = []  # dedup list of alias-qualified select expressions
    primary_doc_details = Data()
    # EVERY SELECT STATEMENT THAT WILL BE REQUIRED, NO MATTER THE DEPTH
    # WE WILL CREATE THEM ACCORDING TO THE DEPTH REQUIRED
    for nested_path, sub_table in self.sf.tables.items():
        nested_doc_details = {
            "sub_table": sub_table,
            "children": [],
            "index_to_column": {},
            "nested_path": [nested_path]  # fake the real nested path, we only look at [0] anyway
        }

        # INSERT INTO TREE
        if not primary_doc_details:
            primary_doc_details = nested_doc_details
        else:
            # NOTE(review): place() returns None (falsy) after appending, so
            # the `if place(c): return True` short-circuit never fires —
            # a detail may attach at multiple depths; confirm intent
            def place(parent_doc_details):
                if startswith_field(nested_path, parent_doc_details['nested_path'][0]):
                    for c in parent_doc_details['children']:
                        if place(c):
                            return True
                    parent_doc_details['children'].append(nested_doc_details)

            place(primary_doc_details)

        alias = nested_doc_details['alias'] = nest_to_alias[nested_path]

        # IF "_id" IS REQUESTED, SELECT THE GUID AND STRIP THE CLAUSE FROM
        # query.select (NOTE: mutates the caller's query object)
        if nested_path == "." and quoted_GUID in vars_:
            column_number = index_to_uid[nested_path] = nested_doc_details['id_coord'] = len(sql_selects)
            sql_select = alias + "." + quoted_GUID
            sql_selects.append(sql_select + " AS " + _make_column_name(column_number))
            index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = ColumnMapping(
                push_name="_id",
                push_column_name="_id",
                push_column=0,
                push_child=".",
                sql=sql_select,
                pull=get_column(column_number),
                type="string",
                column_alias=_make_column_name(column_number),
                nested_path=[nested_path]  # fake the real nested path, we only look at [0] anyway
            )
            query.select = [s for s in listwrap(query.select) if s.name != "_id"]

        # WE ALWAYS ADD THE UID AND ORDER
        column_number = index_to_uid[nested_path] = nested_doc_details['id_coord'] = len(sql_selects)
        sql_select = alias + "." + quoted_UID
        sql_selects.append(sql_select + " AS " + _make_column_name(column_number))
        if nested_path != ".":
            index_to_column[column_number] = ColumnMapping(
                sql=sql_select,
                type="number",
                nested_path=[nested_path],  # fake the real nested path, we only look at [0] anyway
                column_alias=_make_column_name(column_number)
            )
            # ORDER column gets its own fresh slot (distinct alias from UID)
            column_number = len(sql_selects)
            sql_select = alias + "." + quote_table(ORDER)
            sql_selects.append(sql_select + " AS " + _make_column_name(column_number))
            index_to_column[column_number] = ColumnMapping(
                sql=sql_select,
                type="number",
                nested_path=[nested_path],  # fake the real nested path, we only look at [0] anyway
                column_alias=_make_column_name(column_number)
            )

        # WE DO NOT NEED DATA FROM TABLES WE REQUEST NOTHING FROM
        if nested_path not in active_columns:
            continue

        if len(active_columns[nested_path]) != 0:
            # ADD SQL SELECT COLUMNS FOR EACH jx SELECT CLAUSE
            si = 0
            for s in listwrap(query.select):
                # try/finally (no except) guarantees si advances once per
                # select clause even if translation raises
                try:
                    column_number = len(sql_selects)
                    s.pull = get_column(column_number)
                    db_columns = s.value.to_sql(schema)

                    if isinstance(s.value, LeavesOp):
                        # LEAVES EXPAND TO ONE OUTPUT COLUMN PER LEAF
                        for column in db_columns:
                            if isinstance(column.nested_path, list):
                                column.nested_path = column.nested_path[0]
                            if column.nested_path and column.nested_path != nested_path:
                                continue  # column belongs to another table
                            for t, unsorted_sql in column.sql.items():
                                json_type = sql_type_to_json_type[t]
                                if json_type in STRUCT:
                                    continue
                                column_number = len(sql_selects)
                                # SQL HAS ABS TABLE REFERENCE
                                column_alias = _make_column_name(column_number)
                                # skip simple expressions already selected
                                if concat_field(alias, unsorted_sql) in selects and len(unsorted_sql.split()) == 1:
                                    continue
                                selects.append(concat_field(alias, unsorted_sql))
                                sql_selects.append(alias + "." + unsorted_sql + " AS " + column_alias)
                                index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = ColumnMapping(
                                    push_name=literal_field(get_property_name(concat_field(s.name, column.name))),
                                    push_column_name=get_property_name(concat_field(s.name, column.name)),
                                    push_column=si,
                                    push_child=".",
                                    pull=get_column(column_number),
                                    sql=unsorted_sql,
                                    type=json_type,
                                    column_alias=column_alias,
                                    nested_path=[nested_path]  # fake the real nested path, we only look at [0] anyway
                                )
                                si += 1
                    else:
                        # SINGLE SELECT CLAUSE: ONE push_column, TYPED CHILDREN
                        for column in db_columns:
                            if isinstance(column.nested_path, list):
                                column.nested_path = column.nested_path[0]
                            if column.nested_path and column.nested_path != nested_path:
                                continue  # column belongs to another table
                            for t, unsorted_sql in column.sql.items():
                                json_type = sql_type_to_json_type[t]
                                if json_type in STRUCT:
                                    continue
                                column_number = len(sql_selects)
                                # SQL HAS ABS TABLE REFERENCE
                                column_alias = _make_column_name(column_number)
                                # skip simple expressions already selected
                                if concat_field(alias, unsorted_sql) in selects and len(unsorted_sql.split()) == 1:
                                    continue
                                selects.append(concat_field(alias, unsorted_sql))
                                sql_selects.append(alias + "." + unsorted_sql + " AS " + column_alias)
                                index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = ColumnMapping(
                                    push_name=s.name,
                                    push_column_name=s.name,
                                    push_column=si,
                                    push_child=column.name,
                                    pull=get_column(column_number),
                                    sql=unsorted_sql,
                                    type=json_type,
                                    column_alias=column_alias,
                                    nested_path=[nested_path]  # fake the real nested path, we only look at [0] anyway
                                )
                finally:
                    si += 1
        elif startswith_field(nested_path, primary_nested_path):
            # ADD REQUIRED COLUMNS, FOR DEEP STUFF
            # NOTE(review): `s` and `si` are leftovers from the select loop
            # above; `nested_path` is rebound mid-loop; and unsorted_sql is
            # already alias-qualified yet gets `alias + "."` prefixed again
            # below — all look fragile; confirm against tests
            for ci, c in enumerate(active_columns[nested_path]):
                if c.type in STRUCT:
                    continue
                column_number = len(sql_selects)
                nested_path = c.nested_path
                unsorted_sql = nest_to_alias[nested_path[0]] + "." + quote_table(c.es_column)
                column_alias = _make_column_name(column_number)
                if concat_field(alias, unsorted_sql) in selects and len(unsorted_sql.split()) == 1:
                    continue
                selects.append(concat_field(alias, unsorted_sql))
                sql_selects.append(alias + "." + unsorted_sql + " AS " + column_alias)
                index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = ColumnMapping(
                    push_name=s.name,
                    push_column_name=s.name,
                    push_column=si,
                    push_child=relative_field(c.names["."], s.name),
                    pull=get_column(column_number),
                    sql=unsorted_sql,
                    type=c.type,
                    column_alias=column_alias,
                    nested_path=nested_path
                )

    where_clause = query.where.to_sql(schema, boolean=True)[0].sql.b
    unsorted_sql = self._make_sql_for_one_nest_in_set_op(
        ".",
        sql_selects,
        where_clause,
        active_columns,
        index_to_column
    )

    # ALWAYS SORT BY THE UID COLUMNS SO NESTED DOCS ARRIVE CONTIGUOUSLY
    for n, _ in self.sf.tables.items():
        sorts.append(COLUMN + text_type(index_to_uid[n]))

    ordered_sql = (
        "SELECT * FROM (\n" + unsorted_sql + "\n)" +
        "\nORDER BY\n" + ",\n".join(sorts) +
        "\nLIMIT " + quote_value(query.limit)
    )
    self.db.create_new_functions()  # register custom SQL functions (e.g. regexp)
    result = self.db.query(ordered_sql)

    def _accumulate_nested(rows, row, nested_doc_details, parent_doc_id, parent_id_coord):
        """
        :param rows: REVERSED STACK OF ROWS (WITH push() AND pop())
        :param row: CURRENT ROW BEING EXTRACTED
        :param nested_doc_details: {
                "nested_path": wrap_nested_path(nested_path),
                "index_to_column": map from column number to column details
                "children": all possible direct descendants' nested_doc_details
             }
        :param parent_doc_id: the id of the parent doc (for detecting when to step out of loop)
        :param parent_id_coord: the column number for the parent id (so we can extract from each row)
        :return: the nested property (usually an array)
        """
        previous_doc_id = None
        doc = Null
        output = []
        id_coord = nested_doc_details['id_coord']

        while True:
            doc_id = row[id_coord]

            if doc_id == None or (parent_id_coord is not None and row[parent_id_coord] != parent_doc_id):
                rows.append(row)  # UNDO PREVIOUS POP (RECORD IS NOT A NESTED RECORD OF parent_doc)
                return output

            if doc_id != previous_doc_id:
                # NEW DOCUMENT AT THIS NESTING LEVEL
                previous_doc_id = doc_id
                doc = Null
                curr_nested_path = nested_doc_details['nested_path'][0]
                # NOTE(review): this rebinding shadows the method-level
                # index_to_column dict inside the closure
                index_to_column = nested_doc_details['index_to_column'].items()
                if index_to_column:
                    for i, c in index_to_column:
                        value = row[i]
                        if value == None:
                            continue
                        if value == '':
                            continue
                        if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp):
                            # ASSIGN INNER PROPERTIES
                            relative_path = join_field([c.push_name] + split_field(c.push_child))
                        else:
                            # FACT IS EXPECTED TO BE A SINGLE VALUE, NOT AN OBJECT
                            relative_path = c.push_child

                        if relative_path == ".":
                            doc = value
                        elif doc is Null:
                            doc = Data()
                            doc[relative_path] = value
                        else:
                            doc[relative_path] = value

            for child_details in nested_doc_details['children']:
                # EACH NESTED TABLE MUST BE ASSEMBLED INTO A LIST OF OBJECTS
                child_id = row[child_details['id_coord']]
                if child_id is not None:
                    nested_value = _accumulate_nested(rows, row, child_details, doc_id, id_coord)
                    if nested_value:
                        push_name = child_details['nested_path'][0]
                        if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp):
                            # ASSIGN INNER PROPERTIES
                            relative_path = relative_field(push_name, curr_nested_path)
                        else:
                            # FACT IS EXPECTED TO BE A SINGLE VALUE, NOT AN OBJECT
                            relative_path = "."

                        if relative_path == "." and doc is Null:
                            doc = nested_value
                        elif relative_path == ".":
                            doc[push_name] = unwraplist([v[push_name] for v in nested_value])
                        elif doc is Null:
                            doc = Data()
                            doc[relative_path] = unwraplist(nested_value)
                        else:
                            doc[relative_path] = unwraplist(nested_value)

            output.append(doc)

            try:
                row = rows.pop()
            except IndexError:
                return output

    # ONLY MAPPINGS WITH A push_name ARE REAL OUTPUT COLUMNS
    cols = tuple([i for i in index_to_column.values() if i.push_name != None])
    rows = list(reversed(unwrap(result.data)))
    if rows:
        row = rows.pop()
        data = _accumulate_nested(rows, row, primary_doc_details, None, None)
    else:
        data = result.data

    if query.format == "cube":
        # the endswith(f)/test_dots branch pulls values straight from the
        # flat SQL rows instead of the re-nested docs
        for f, _ in self.sf.tables.items():
            if frum.endswith(f) or (test_dots(cols) and isinstance(query.select, list)):
                num_rows = len(result.data)
                num_cols = MAX([c.push_column for c in cols]) + 1 if len(cols) else 0
                map_index_to_name = {c.push_column: c.push_column_name for c in cols}
                temp_data = [[None] * num_rows for _ in range(num_cols)]
                for rownum, d in enumerate(result.data):
                    for c in cols:
                        if c.push_child == ".":
                            temp_data[c.push_column][rownum] = c.pull(d)
                        else:
                            column = temp_data[c.push_column][rownum]
                            if column is None:
                                column = temp_data[c.push_column][rownum] = Data()
                            column[c.push_child] = c.pull(d)
                output = Data(
                    meta={"format": "cube"},
                    data={n: temp_data[c] for c, n in map_index_to_name.items()},
                    edges=[{
                        "name": "rownum",
                        "domain": {
                            "type": "rownum",
                            "min": 0,
                            "max": num_rows,
                            "interval": 1
                        }
                    }]
                )
                return output

        if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp):
            num_rows = len(data)
            map_index_to_name = {c.push_column: c.push_column_name for c in cols}
            temp_data = Data()
            for rownum, d in enumerate(data):
                for k, v in d.items():
                    if temp_data[k] == None:
                        temp_data[k] = [None] * num_rows
                    temp_data[k][rownum] = v
            return Data(
                meta={"format": "cube"},
                data={n: temp_data[literal_field(n)] for c, n in map_index_to_name.items()},
                edges=[{
                    "name": "rownum",
                    "domain": {
                        "type": "rownum",
                        "min": 0,
                        "max": num_rows,
                        "interval": 1
                    }
                }]
            )
        else:
            # single-value select: whole result is one column
            num_rows = len(data)
            map_index_to_name = {c.push_column: c.push_column_name for c in cols}
            temp_data = [data]
            return Data(
                meta={"format": "cube"},
                data={n: temp_data[c] for c, n in map_index_to_name.items()},
                edges=[{
                    "name": "rownum",
                    "domain": {
                        "type": "rownum",
                        "min": 0,
                        "max": num_rows,
                        "interval": 1
                    }
                }]
            )
    elif query.format == "table":
        for f, _ in self.sf.tables.items():
            if frum.endswith(f):
                num_column = MAX([c.push_column for c in cols]) + 1
                header = [None] * num_column
                for c in cols:
                    header[c.push_column] = c.push_column_name
                output_data = []
                for d in result.data:
                    row = [None] * num_column
                    for c in cols:
                        set_column(row, c.push_column, c.push_child, c.pull(d))
                    output_data.append(row)
                return Data(
                    meta={"format": "table"},
                    header=header,
                    data=output_data
                )

        if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp):
            num_rows = len(data)
            column_names = [None] * (max(c.push_column for c in cols) + 1)
            for c in cols:
                column_names[c.push_column] = c.push_column_name

            temp_data = []
            for rownum, d in enumerate(data):
                row = [None] * len(column_names)
                for i, (k, v) in enumerate(sorted(d.items())):
                    for c in cols:
                        if k == c.push_name:
                            row[c.push_column] = v
                temp_data.append(row)
            return Data(
                meta={"format": "table"},
                header=column_names,
                data=temp_data
            )
        else:
            column_names = listwrap(query.select).name
            return Data(
                meta={"format": "table"},
                header=column_names,
                data=[[d] for d in data]
            )
    else:
        for f, _ in self.sf.tables.items():
            if frum.endswith(f) or (test_dots(cols) and isinstance(query.select, list)):
                data = []
                for d in result.data:
                    row = Data()
                    for c in cols:
                        if c.push_child == ".":
                            row[c.push_name] = c.pull(d)
                        elif c.num_push_columns:
                            # tuple-valued select: fill fixed-size list by index
                            tuple_value = row[c.push_name]
                            if not tuple_value:
                                tuple_value = row[c.push_name] = [None] * c.num_push_columns
                            tuple_value[c.push_child] = c.pull(d)
                        elif not isinstance(query.select, list):  # select is value type
                            row[c.push_child] = c.pull(d)
                        else:
                            row[c.push_name][c.push_child] = c.pull(d)
                    data.append(row)
                return Data(
                    meta={"format": "list"},
                    data=data
                )

        if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp):
            temp_data = []
            for rownum, d in enumerate(data):
                row = {}
                for k, v in d.items():
                    for c in cols:
                        if c.push_name == c.push_column_name == k:
                            row[c.push_column_name] = v
                        elif c.push_name == k and c.push_column_name != k:
                            row[c.push_column_name] = v
                temp_data.append(row)
            return Data(
                meta={"format": "list"},
                data=temp_data
            )
        else:
            return Data(
                meta={"format": "list"},
                data=data
            )