def _normalize_sort(sort=None):
    """
    CONVERT SORT PARAMETERS TO A NORMAL FORM SO EASIER TO USE

    Accepts a single item or list of: field name (string), Expression,
    integer column offset, {name: direction} dict, or {value, sort} clause.
    Returns a DictList of {"value": Expression, "sort": +1/-1} clauses.
    """
    if sort == None:
        return DictList.EMPTY

    output = DictList()
    for s in listwrap(sort):
        if isinstance(s, basestring):
            # BARE FIELD NAME: ASCENDING
            output.append({"value": jx_expression(s), "sort": 1})
        elif isinstance(s, Expression):
            output.append({"value": s, "sort": 1})
        elif Math.is_integer(s):
            # INTEGER IS A COLUMN OFFSET
            output.append({"value": OffsetOp("offset", s), "sort": 1})
        elif all(d in sort_direction for d in s.values()) and not s.sort and not s.value:
            # {name: direction} FORM
            for v, d in s.items():
                # FIX: was hard-coded "sort": -1, which ignored each key's own
                # direction even though every value was validated against sort_direction
                output.append({"value": jx_expression(v), "sort": sort_direction[d]})
        else:
            output.append({"value": jx_expression(coalesce(s.value, s.field)), "sort": coalesce(sort_direction[s.sort], 1)})
    return output
def post(sql):
    # POST-PROCESS A MULTI-RESULTSET QUERY INTO CUBES
    # NOTE(review): closure — reads self, edges, select, query from enclosing scope

    # FIND OUT THE default DOMAIN SIZES
    result = self.db.column_query(sql)
    num_edges = len(edges)
    for e, edge in enumerate(edges):
        domain = edge.domain
        if domain.type == "default":
            # MATERIALIZE THE DEFAULT DOMAIN FROM THE DISTINCT VALUES RETURNED
            # NOTE(review): set iteration order fixes partition order here — presumably
            # order is not significant for "default" domains; confirm
            domain.type = "set"
            parts = set(result[e])
            domain.partitions = [{"index": i, "value": p} for i, p in enumerate(parts)]
            domain.map = {p: i for i, p in enumerate(parts)}
        else:
            Log.error("Do not know what to do here, yet")

    # FILL THE DATA CUBE
    # maps PAIRS EACH EDGE'S value->index MAP WITH ITS RESULT COLUMN
    maps = [(unwrap(e.domain.map), result[i]) for i, e in enumerate(edges)]
    cubes = DictList()
    for c, s in enumerate(select):
        # ONE EXTRA SLOT PER EDGE WHEN NULLS ARE ALLOWED
        data = Matrix(*[len(e.domain.partitions) + (1 if e.allow_nulls else 0) for e in edges])
        for rownum, value in enumerate(result[c + num_edges]):
            # TRANSLATE EACH ROW'S EDGE VALUES INTO CUBE COORDINATES
            coord = [m[r[rownum]] for m, r in maps]
            data[coord] = value
        cubes.append(data)

    # LIST-OF-SELECTS RETURNS A CUBE PER SELECT; SINGLE SELECT RETURNS ONE CUBE
    if isinstance(query.select, list):
        return cubes
    else:
        return cubes[0]
def getDomain(self, **kwargs):
    # BUILD A PARTITION LIST FOR THIS DIMENSION, AT THE REQUESTED DEPTH
    # NOTE(review): this view of the function ends after building `partitions`;
    # the return/domain-construction tail appears to be outside this chunk — confirm

    # kwargs.depth IS MEANT TO REACH INTO SUB-PARTITIONS
    kwargs = wrap(kwargs)
    kwargs.depth = coalesce(kwargs.depth, len(self.fields)-1 if isinstance(self.fields, list) else None)

    if not self.partitions and self.edges:
        # USE EACH EDGE AS A PARTITION, BUT isFacet==True SO IT ALLOWS THE OVERLAP
        partitions = [
            {
                "name": v.name,
                "value": v.name,
                "where": v.where,
                "style": v.style,
                "weight": v.weight  # YO! WHAT DO WE *NOT* COPY?
            }
            for i, v in enumerate(self.edges)
            if i < coalesce(self.limit, DEFAULT_QUERY_LIMIT) and v.where
        ]
        self.isFacet = True
    elif kwargs.depth == None:  # ASSUME self.fields IS A dict
        # NO DEPTH GIVEN: COPY TOP-LEVEL PARTITIONS, INHERITING PARENT STYLE
        partitions = DictList()
        for i, part in enumerate(self.partitions):
            if i >= coalesce(self.limit, DEFAULT_QUERY_LIMIT):
                break
            partitions.append({
                "name": part.name,
                "value": part.value,
                "where": part.where,
                "style": coalesce(part.style, part.parent.style),
                "weight": part.weight  # YO! WHAT DO WE *NOT* COPY?
            })
    elif kwargs.depth == 0:
        partitions = [
            {
                "name": v.name,
                "value": v.value,
                "where": v.where,
                "style": v.style,
                "weight": v.weight  # YO! WHAT DO WE *NOT* COPY?
            }
            for i, v in enumerate(self.partitions)
            if i < coalesce(self.limit, DEFAULT_QUERY_LIMIT)]
    elif kwargs.depth == 1:
        # FLATTEN ONE LEVEL OF SUB-PARTITIONS, QUALIFYING NAMES WITH PARENT PATH
        partitions = DictList()
        rownum = 0
        for i, part in enumerate(self.partitions):
            if i >= coalesce(self.limit, DEFAULT_QUERY_LIMIT):
                continue
            rownum += 1
            try:
                for j, subpart in enumerate(part.partitions):
                    partitions.append({
                        "name": join_field(split_field(subpart.parent.name) + [subpart.name]),
                        "value": subpart.value,
                        "where": subpart.where,
                        "style": coalesce(subpart.style, subpart.parent.style),
                        "weight": subpart.weight  # YO! WHAT DO WE *NOT* COPY?
                    })
            except Exception, e:
                Log.error("", e)
def _select(template, data, fields, depth):
    """
    RECURSIVELY SELECT fields FROM data, USING template AS THE BASE RECORD.
    Returns a DictList of completed records; recurses into at most one
    deeper branch per pass (selecting into two branches is an error).
    """
    output = DictList()
    deep_path = []
    deep_fields = UniqueIndex(["name"])
    for d in data:
        if isinstance(d, Dict):
            Log.error("programmer error, _select can not handle Dict")

        record = template.copy()
        children = None
        for f in fields:
            index, c = _select_deep(d, f, depth, record)
            children = c if children is None else children
            if index:
                path = f.value[0:index:]
                if not deep_fields[f]:
                    deep_fields.add(f)  # KEEP TRACK OF WHICH FIELDS NEED DEEPER SELECT
                # FIX: pass a list to MIN, consistent with the other call sites
                # in this file (MIN expects a collection of values)
                short = MIN([len(deep_path), len(path)])
                if path[:short:] != deep_path[:short:]:
                    Log.error("Dangerous to select into more than one branch at time")
                if len(deep_path) < len(path):
                    deep_path = path

        if not children:
            output.append(record)
        else:
            output.extend(_select(record, children, deep_fields, depth + 1))

    return output
class _Stats(WindowFunction):
    """
    TRACK STATS, BUT IGNORE OUTLIERS

    Keeps the raw window of samples; at end() the highest and lowest
    (1 - middle)/2 fractions are trimmed before computing statistics.
    """

    def __init__(self, middle=None, *args, **kwargs):
        object.__init__(self)
        self.middle = middle
        self.samples = DictList()

    def add(self, value):
        # IGNORE NULLS ENTERING THE WINDOW
        if value != None:
            self.samples.append(value)

    def sub(self, value):
        # IGNORE NULLS LEAVING THE WINDOW
        if value != None:
            self.samples.remove(value)

    def merge(self, agg):
        Log.error("Do not know how to handle")

    def end(self):
        total = len(self.samples)
        # HOW MANY SAMPLES TO TRIM FROM EACH TAIL
        trim = Math.ceiling(total * (1 - self.middle) / 2)
        if trim * 2 >= total:
            # NOT ENOUGH SAMPLES LEFT AFTER TRIMMING
            return stats.Stats()
        trimmed = sorted(self.samples)[trim:total - trim:]
        result = stats.Stats(samples=trimmed)
        # EXPOSE THE FULL (UNTRIMMED) SAMPLE SET TO CALLERS
        result.samples = list(self.samples)
        return result
def _select(template, data, fields, depth):
    # RECURSIVELY SELECT fields FROM data, USING template AS THE BASE RECORD;
    # RETURNS A DictList OF COMPLETED RECORDS
    output = DictList()
    deep_path = []
    deep_fields = UniqueIndex(["name"])
    for d in data:
        if isinstance(d, Dict):
            Log.error("programmer error, _select can not handle Dict")

        record = template.copy()
        children = None
        for f in fields:
            # _select_deep FILLS record IN PLACE; index/c SIGNAL A DEEPER BRANCH
            index, c = _select_deep(d, f, depth, record)
            children = c if children is None else children
            if index:
                path = f.value[0:index:]
                if not deep_fields[f]:
                    deep_fields.add(f)  # KEEP TRACK OF WHICH FIELDS NEED DEEPER SELECT

                # ONLY ONE DEEP BRANCH IS ALLOWED PER PASS
                short = MIN([len(deep_path), len(path)])
                if path[:short:] != deep_path[:short:]:
                    Log.error("Dangerous to select into more than one branch at time")
                if len(deep_path) < len(path):
                    deep_path = path

        if not children:
            output.append(record)
        else:
            # RECURSE INTO THE DEEPER BRANCH, ONE LEVEL DOWN
            output.extend(_select(record, children, deep_fields, depth + 1))

    return output
def post(sql):
    # POST-PROCESS A MULTI-RESULTSET QUERY INTO CUBES
    # NOTE(review): closure — reads self, edges, select, query from enclosing scope

    # FIND OUT THE default DOMAIN SIZES
    result = self.db.column_query(sql)
    num_edges = len(edges)
    for e, edge in enumerate(edges):
        domain = edge.domain
        if domain.type == "default":
            # MATERIALIZE THE DEFAULT DOMAIN FROM THE DISTINCT VALUES RETURNED
            domain.type = "set"
            parts = set(result[e])
            domain.partitions = [{
                "index": i,
                "value": p
            } for i, p in enumerate(parts)]
            domain.map = {p: i for i, p in enumerate(parts)}
        else:
            Log.error("Do not know what to do here, yet")

    # FILL THE DATA CUBE
    # maps PAIRS EACH EDGE'S value->index MAP WITH ITS RESULT COLUMN
    maps = [(unwrap(e.domain.map), result[i]) for i, e in enumerate(edges)]
    cubes = DictList()
    for c, s in enumerate(select):
        # ONE EXTRA SLOT PER EDGE WHEN NULLS ARE ALLOWED
        data = Matrix(*[
            len(e.domain.partitions) + (1 if e.allow_nulls else 0)
            for e in edges
        ])
        for rownum, value in enumerate(result[c + num_edges]):
            # TRANSLATE EACH ROW'S EDGE VALUES INTO CUBE COORDINATES
            coord = [m[r[rownum]] for m, r in maps]
            data[coord] = value
        cubes.append(data)

    # LIST-OF-SELECTS RETURNS A CUBE PER SELECT; SINGLE SELECT RETURNS ONE CUBE
    if isinstance(query.select, list):
        return cubes
    else:
        return cubes[0]
def select(self, fields):
    # SELECT fields FROM THIS FlatList
    # fields MAY BE: a Mapping (unwrapped to its .value), a dotted field name
    # string, or a list of {name, value} select clauses
    if isinstance(fields, Mapping):
        fields = fields.value

    if isinstance(fields, basestring):
        # RETURN LIST OF VALUES
        if len(split_field(fields)) == 1:
            # SHALLOW FIELD: self.data ROWS ARE (parent_record, value) PAIRS
            if self.path[0] == fields:
                return [d[1] for d in self.data]
            else:
                return [d[0][fields] for d in self.data]
        else:
            keys = split_field(fields)
            depth = coalesce(MIN([i for i, (k, p) in enumerate(zip(keys, self.path)) if k != p]), len(self.path))  # LENGTH OF COMMON PREFIX
            short_key = keys[depth:]

            output = DictList()
            _select1((wrap(d[depth]) for d in self.data), short_key, 0, output)
            return output

    if isinstance(fields, list):
        output = DictList()

        # BUILD (name, accessor) PAIRS; CALLABLE VALUES ARE USED AS-IS,
        # OTHERS BECOME INDEX LOOKUPS INTO THE ROW
        meta = []
        for f in fields:
            if hasattr(f.value, "__call__"):
                meta.append((f.name, f.value))
            else:
                meta.append((f.name, functools.partial(lambda v, d: d[v], f.value)))

        for row in self._values():
            agg = Dict()
            for name, f in meta:
                agg[name] = f(row)
            output.append(agg)

        return output

    # meta = []
    # for f in fields:
    #     keys = split_field(f.value)
    #     depth = coalesce(MIN([i for i, (k, p) in enumerate(zip(keys, self.path)) if k != p]), len(self.path))  # LENGTH OF COMMON PREFIX
    #     short_key = join_field(keys[depth:])
    #
    #     meta.append((f.name, depth, short_key))
    #
    # for row in self._data:
    #     agg = Dict()
    #     for name, depth, short_key in meta:
    #         if short_key:
    #             agg[name] = row[depth][short_key]
    #         else:
    #             agg[name] = row[depth]
    #     output.append(agg)
    # return output

    Log.error("multiselect over FlatList not supported")
def more():
    # PULL UP TO size ITEMS FROM iterator (closure variable);
    # APPEND True TO done WHEN THE ITERATOR IS EXHAUSTED
    batch = DictList()
    pulled = 0
    while pulled < size:
        try:
            batch.append(iterator.next())
        except StopIteration:
            done.append(True)
            break
        pulled += 1
    return batch
def _aggop(self, query):
    """
    SINGLE ROW RETURNED WITH AGGREGATES

    Returns (sql, post) where post() executes sql and shapes the result:
    a record of aggregates when query.select is a list, a scalar otherwise.
    """
    if isinstance(query.select, list):
        # RETURN SINGLE OBJECT WITH AGGREGATES
        for s in query.select:
            if s.aggregate not in aggregates:
                Log.error("Expecting all columns to have an aggregate: {{select}}", select=s)

        selects = DictList()
        for s in query.select:
            selects.append(aggregates[s.aggregate].replace("{{code}}", s.value) + " AS " + self.db.quote_column(s.name))

        sql = expand_template("""
            SELECT
                {{selects}}
            FROM
                {{table}}
            {{where}}
        """, {
            "selects": SQL(",\n".join(selects)),
            "table": self._subquery(query["from"])[0],
            # FIX: was query.filter; normalized queries carry the filter in
            # query.where (the single-value branch below already uses it)
            "where": self._where2sql(query.where)
        })

        return sql, lambda sql: self.db.column(sql)[0]  # RETURNING SINGLE OBJECT WITH AGGREGATE VALUES
    else:
        # RETURN SINGLE VALUE
        s0 = query.select
        if s0.aggregate not in aggregates:
            Log.error("Expecting all columns to have an aggregate: {{select}}", select=s0)

        select = aggregates[s0.aggregate].replace("{{code}}", s0.value) + " AS " + self.db.quote_column(s0.name)

        sql = expand_template("""
            SELECT
                {{selects}}
            FROM
                {{table}}
            {{where}}
        """, {
            "selects": SQL(select),
            "table": self._subquery(query["from"])[0],
            "where": self._where2sql(query.where)
        })

        def post(sql):
            result = self.db.column_query(sql)
            return result[0][0]

        return sql, post  # RETURN SINGLE VALUE
def update(self, command):
    """
    EXPECTING command == {"set":term, "where":where}
    THE set CLAUSE IS A DICT MAPPING NAMES TO VALUES
    THE where CLAUSE IS AN ES FILTER

    Translates the command into an Elasticsearch _bulk request of
    update actions, one per (matching document, set clause) pair.
    """
    command = wrap(command)
    schema = self._es.get_schema()

    # GET IDS OF DOCUMENTS (ROUTING FIELD IS NEEDED TO ADDRESS EACH DOC)
    results = self._es.search(
        {
            "fields": listwrap(schema._routing.path),
            "query": {
                "filtered": {"query": {"match_all": {}}, "filter": jx_expression(command.where).to_esfilter()}
            },
            "size": 200000,  # NOTE(review): hard cap — docs beyond this are silently skipped
        }
    )

    # SCRIPT IS SAME FOR ALL (CAN ONLY HANDLE ASSIGNMENT TO CONSTANT)
    scripts = DictList()
    for k, v in command.set.items():
        if not is_keyword(k):
            Log.error("Only support simple paths for now")
        if isinstance(v, Mapping) and v.doc:
            # PARTIAL-DOCUMENT UPDATE
            scripts.append({"doc": v.doc})
        else:
            # GROOVY/MVEL ASSIGNMENT SCRIPT
            scripts.append({"script": "ctx._source." + k + " = " + jx_expression(v).to_ruby()})

    if results.hits.hits:
        # BULK FORMAT: ACTION LINE FOLLOWED BY ITS PAYLOAD LINE
        updates = []
        for h in results.hits.hits:
            for s in scripts:
                updates.append(
                    {
                        "update": {
                            "_id": h._id,
                            "_routing": unwraplist(h.fields[literal_field(schema._routing.path)]),
                        }
                    }
                )
                updates.append(s)
        content = ("\n".join(convert.value2json(c) for c in updates) + "\n").encode("utf-8")
        response = self._es.cluster.post(
            self._es.path + "/_bulk", data=content, headers={"Content-Type": "application/json"}
        )
        if response.errors:
            # SURFACE PER-ITEM FAILURES FROM THE BULK RESPONSE
            Log.error(
                "could not update: {{error}}",
                error=[e.error for i in response["items"] for e in i.values() if e.status not in (200, 201)],
            )
class DefaultDomain(Domain):
    """
    DOMAIN IS A LIST OF OBJECTS, EACH WITH A value PROPERTY

    Partitions are created lazily: any key not yet seen gets a new
    {name, value} part appended to self.partitions and cached in self.map.
    """

    __slots__ = ["NULL", "partitions", "map", "limit"]

    def __init__(self, **desc):
        Domain.__init__(self, **desc)

        self.NULL = Null
        self.partitions = DictList()
        self.map = dict()
        self.map[None] = self.NULL
        self.limit = desc.get('limit')

    def compare(self, a, b):
        return value_compare(a.value, b.value)

    def getCanonicalPart(self, part):
        return self.getPartByKey(part.value)

    def getPartByKey(self, key):
        canonical = self.map.get(key)
        # FIX: was `if canonical:` — map[None] holds Null, which is falsy, so a
        # None key fell through and fabricated a new part, clobbering the NULL
        # mapping; test for presence, not truthiness
        if canonical is not None:
            return canonical

        canonical = Dict(name=key, value=key)
        self.partitions.append(canonical)
        self.map[key] = canonical
        return canonical

    # def getIndexByKey(self, key):
    #     return self.map.get(key).dataIndex;

    def getKey(self, part):
        return part.value

    def getEnd(self, part):
        return part.value

    def getLabel(self, part):
        return part.value

    def as_dict(self):
        output = Domain.as_dict(self)
        output.partitions = self.partitions
        output.limit = self.limit
        return output
def _normalize_sort(sort=None):
    """
    CONVERT SORT PARAMETERS TO A NORMAL FORM SO EASIER TO USE

    Each clause becomes {"field": name, "sort": +1/-1}; bare strings and
    integers default to ascending.
    """
    if not sort:
        return DictList.EMPTY

    normalized = DictList()
    for clause in listwrap(sort):
        if isinstance(clause, basestring) or Math.is_integer(clause):
            field, direction = clause, 1
        else:
            field = coalesce(clause.field, clause.value)
            direction = coalesce(sort_direction[clause.sort], 1)
        normalized.append({"field": field, "sort": direction})
    return wrap(normalized)
def _getAllEdges(facetEdges, edgeDepth):
    """
    RETURN ALL PARTITION COMBINATIONS: A LIST OF ORDERED TUPLES

    Cartesian product of the partitions of facetEdges[edgeDepth:],
    built by prepending each partition of the current edge onto every
    combination of the deeper edges.
    """
    if edgeDepth == len(facetEdges):
        # BASE CASE: ONE EMPTY COMBINATION
        return [()]

    suffixes = _getAllEdges(facetEdges, edgeDepth + 1)
    combos = DictList()
    for part in facetEdges[edgeDepth].domain.partitions:
        for suffix in suffixes:
            combos.append((part,) + suffix)
    return combos
def _getAllEdges(facetEdges, edgeDepth):
    """
    RETURN ALL PARTITION COMBINATIONS: A LIST OF ORDERED TUPLES

    Recursive cartesian product over the partitions of every edge from
    edgeDepth onward.
    """
    if edgeDepth == len(facetEdges):
        # NO EDGES LEFT: SINGLE EMPTY TUPLE
        return [()]

    current_edge = facetEdges[edgeDepth]
    tails = _getAllEdges(facetEdges, edgeDepth + 1)

    result = DictList()
    for head in current_edge.domain.partitions:
        for tail in tails:
            result.append((head,) + tail)
    return result
def _iter(): g = 0 out = DictList() try: for i, d in enumerate(data): out.append(d) if (i + 1) % max_size == 0: yield g, out g += 1 out = DictList() if out: yield g, out except Exception, e: if out: # AT LEAST TRY TO RETURN WHAT HAS BEEN PROCESSED SO FAR yield g, out Log.error("Problem inside qb.groupby", e)
def _tuple(template, data, fields, depth, output):
    # BUILD TUPLES FROM data, RECURSING INTO DEEPER BRANCHES; RESULTS ARE
    # APPENDED TO output (which is also returned)
    deep_path = None
    deep_fields = DictList()
    for row in data:
        record = template
        for field in fields:
            index, children, record = _tuple_deep(row, field, depth, record)
            if index:
                branch = field.value[:index]
                deep_fields.append(field)
                # NOTE(review): deep_path is never reassigned here, so this
                # guard can never fire — looks vestigial; confirm
                if deep_path and branch != deep_path:
                    Log.error("Dangerous to select into more than one branch at time")
        if not children:
            output.append(record)
        else:
            _tuple(record, children, deep_fields, depth + 1, output)

    return output
def update(self, command):
    """
    EXPECTING command == {"set":term, "where":where}
    THE set CLAUSE IS A DICT MAPPING NAMES TO VALUES
    THE where CLAUSE IS AN ES FILTER

    Builds one shared assignment script from all set clauses, then issues
    a _bulk request with one scripted update per matching document id.
    """
    command = wrap(command)

    # GET IDS OF DOCUMENTS
    results = self._es.search({
        "fields": [],
        "query": {
            "filtered": {
                "query": {
                    "match_all": {}
                },
                "filter": _normalize_where(command.where, self)
            }
        },
        "size": 200000  # NOTE(review): hard cap — docs beyond this are silently skipped
    })

    # SCRIPT IS SAME FOR ALL (CAN ONLY HANDLE ASSIGNMENT TO CONSTANT)
    scripts = DictList()
    for k, v in command.set.items():
        if not is_keyword(k):
            Log.error("Only support simple paths for now")
        scripts.append("ctx._source." + k + " = " + expressions.qb_expression_to_ruby(v) + ";\n")
    script = "".join(scripts)

    if results.hits.hits:
        # BULK FORMAT: ACTION LINE FOLLOWED BY ITS SCRIPT LINE
        # (NOTE: rebinds the `command` parameter as the bulk payload list)
        command = []
        for id in results.hits.hits._id:
            command.append({"update": {"_id": id}})
            command.append({"script": script})
        content = ("\n".join(convert.value2json(c) for c in command) + "\n").encode('utf-8')
        self._es.cluster._post(
            self._es.path + "/_bulk",
            data=content,
            headers={"Content-Type": "application/json"})
def _where_terms(master, where, schema):
    """
    USE THE SCHEMA TO CONVERT DIMENSION NAMES TO ES FILTERS
    master - TOP LEVEL WHERE (FOR PLACING NESTED FILTERS)
    """
    if isinstance(where, Mapping):
        if where.term:
            # MAP TERM
            try:
                output = _map_term_using_schema(master, [], where.term, schema.edges)
                return output
            except Exception, e:
                Log.error("programmer problem?", e)
        elif where.terms:
            # MAP TERM
            output = DictList()
            for k, v in where.terms.items():
                if not isinstance(v, (list, set)):
                    Log.error("terms filter expects list of values")
                edge = schema.edges[k]
                if not edge:
                    # NOT A DIMENSION: PASS THE terms FILTER THROUGH UNCHANGED
                    output.append({"terms": {k: v}})
                else:
                    if isinstance(edge, basestring):
                        # DIRECT FIELD REFERENCE
                        # NOTE(review): early return drops any clauses already
                        # accumulated in output — confirm this is intended
                        return {"terms": {edge: v}}
                    try:
                        domain = edge.getDomain()
                    except Exception, e:
                        Log.error("programmer error", e)
                    fields = domain.dimension.fields
                    if isinstance(fields, Mapping):
                        # COMPOSITE KEY: EACH VALUE IS A RECORD; MATCH ALL ITS FIELDS
                        or_agg = []
                        for vv in v:
                            and_agg = []
                            for local_field, es_field in fields.items():
                                vvv = vv[local_field]
                                if vvv != None:
                                    and_agg.append({"term": {es_field: vvv}})
                            or_agg.append({"and": and_agg})
                        output.append({"or": or_agg})
                    elif isinstance(fields, list) and len(fields) == 1 and is_keyword(
                            fields[0]):
                        # SINGLE SIMPLE FIELD: PLAIN terms FILTER
                        output.append({"terms": {fields[0]: v}})
                    elif domain.partitions:
                        # FALL BACK TO EACH PARTITION'S PRE-BUILT esfilter
                        output.append({
                            "or": [domain.getPartByKey(vv).esfilter for vv in v]
                        })
            return {"and": output}
def _normalize_sort(sort=None):
    """
    CONVERT SORT PARAMETERS TO A NORMAL FORM SO EASIER TO USE

    Accepts a single item or list of: field name (string), Expression,
    integer column offset, {name: direction} dict, or {value, sort} clause.
    Returns a DictList of {"value": Expression, "sort": +1/-1} clauses.
    """
    if sort == None:
        return DictList.EMPTY

    output = DictList()
    for s in listwrap(sort):
        if isinstance(s, basestring):
            output.append({"value": jx_expression(s), "sort": 1})
        elif isinstance(s, Expression):
            output.append({"value": s, "sort": 1})
        elif Math.is_integer(s):
            # INTEGER IS A COLUMN OFFSET
            output.append({"value": OffsetOp("offset", s), "sort": 1})
        elif all(d in sort_direction for d in s.values()) and not s.sort and not s.value:
            # {name: direction} FORM
            for v, d in s.items():
                # FIX: was hard-coded "sort": -1, which ignored each key's own
                # direction even though every value was validated against sort_direction
                output.append({"value": jx_expression(v), "sort": sort_direction[d]})
        else:
            output.append({"value": jx_expression(coalesce(s.value, s.field)), "sort": coalesce(sort_direction[s.sort], 1)})
    return output
def _where_terms(master, where, schema):
    """
    USE THE SCHEMA TO CONVERT DIMENSION NAMES TO ES FILTERS
    master - TOP LEVEL WHERE (FOR PLACING NESTED FILTERS)
    """
    if isinstance(where, Mapping):
        if where.term:
            # MAP TERM
            try:
                output = _map_term_using_schema(master, [], where.term, schema.edges)
                return output
            except Exception, e:
                Log.error("programmer problem?", e)
        elif where.terms:
            # MAP TERM
            output = DictList()
            for k, v in where.terms.items():
                if not isinstance(v, (list, set)):
                    Log.error("terms filter expects list of values")
                edge = schema.edges[k]
                if not edge:
                    # NOT A DIMENSION: PASS THE terms FILTER THROUGH UNCHANGED
                    output.append({"terms": {k: v}})
                else:
                    if isinstance(edge, basestring):
                        # DIRECT FIELD REFERENCE
                        # NOTE(review): early return drops any clauses already
                        # accumulated in output — confirm this is intended
                        return {"terms": {edge: v}}
                    try:
                        domain = edge.getDomain()
                    except Exception, e:
                        Log.error("programmer error", e)
                    fields = domain.dimension.fields
                    if isinstance(fields, Mapping):
                        # COMPOSITE KEY: EACH VALUE IS A RECORD; MATCH ALL ITS FIELDS
                        or_agg = []
                        for vv in v:
                            and_agg = []
                            for local_field, es_field in fields.items():
                                vvv = vv[local_field]
                                if vvv != None:
                                    and_agg.append({"term": {es_field: vvv}})
                            or_agg.append({"and": and_agg})
                        output.append({"or": or_agg})
                    elif isinstance(fields, list) and len(fields) == 1 and is_keyword(fields[0]):
                        # SINGLE SIMPLE FIELD: PLAIN terms FILTER
                        output.append({"terms": {fields[0]: v}})
                    elif domain.partitions:
                        # FALL BACK TO EACH PARTITION'S PRE-BUILT esfilter
                        output.append({"or": [domain.getPartByKey(vv).esfilter for vv in v]})
            return {"and": output}
def groupby(data, keys=None, size=None, min_size=None, max_size=None, contiguous=False):
    """
    return list of (keys, values) pairs where
        group by the set of keys
        values IS LIST OF ALL data that has those keys
    contiguous - MAINTAIN THE ORDER OF THE DATA, STARTING THE NEW GROUP WHEN THE SELECTOR CHANGES

    NOTE(review): only the size-based and contiguous paths are visible in this
    chunk; the general (non-contiguous) grouping path appears truncated — confirm
    """
    if size != None or min_size != None or max_size != None:
        # SIZE-BASED BATCHING, NOT KEY-BASED GROUPING
        if size != None:
            max_size = size
        return groupby_min_max_size(data, min_size=min_size, max_size=max_size)

    if isinstance(data, Cube):
        return data.groupby(keys)

    keys = listwrap(keys)

    def get_keys(d):
        # EXTRACT THE KEY VALUES FROM ONE RECORD
        output = Dict()
        for k in keys:
            output[k] = d[k]
        return output

    if contiguous:
        try:
            if not data:
                return wrap([])

            agg = DictList()
            acc = DictList()
            curr_key = value2key(keys, data[0])
            for d in data:
                key = value2key(keys, d)
                if key != curr_key:
                    # SELECTOR CHANGED: CLOSE THE CURRENT GROUP, START A NEW ONE
                    agg.append((get_keys(acc[0]), acc))
                    curr_key = key
                    acc = [d]
                else:
                    acc.append(d)
            # CLOSE THE FINAL GROUP
            agg.append((get_keys(acc[0]), acc))
            return wrap(agg)
        except Exception, e:
            Log.error("Problem grouping contiguous values", e)
def _normalize_sort(sort=None):
    """
    CONVERT SORT PARAMETERS TO A NORMAL FORM SO EASIER TO USE

    Returns a wrapped list of {"value": name, "sort": +1/-1} clauses.
    """
    if not sort:
        return DictList.EMPTY

    output = DictList()
    for s in listwrap(sort):
        if isinstance(s, basestring) or Math.is_integer(s):
            output.append({"value": s, "sort": 1})
        elif all(d == "desc" for d in s.values()) and not s.sort and not s.value:
            # {name: "desc", ...} FORM
            # FIX: was `list(set(s.values()))[0] == "desc"`, which inspected an
            # arbitrary element of an unordered set — nondeterministic whenever
            # the dict mixes directions; require ALL values to be "desc"
            for v, d in s.items():
                output.append({"value": v, "sort": -1})
        else:
            output.append({"value": coalesce(s.value, s.field), "sort": coalesce(sort_direction[s.sort], 1)})
    return wrap(output)
def groupby(data, keys=None, size=None, min_size=None, max_size=None, contiguous=False):
    """
    return list of (keys, values) pairs where
        group by the set of keys
        values IS LIST OF ALL data that has those keys
    contiguous - MAINTAIN THE ORDER OF THE DATA, STARTING THE NEW GROUP WHEN THE SELECTOR CHANGES

    NOTE(review): only the size-based and contiguous paths are visible in this
    chunk; the general (non-contiguous) grouping path appears truncated — confirm
    """
    if size != None or min_size != None or max_size != None:
        # SIZE-BASED BATCHING, NOT KEY-BASED GROUPING
        if size != None:
            max_size = size
        return groupby_min_max_size(data, min_size=min_size, max_size=max_size)

    if isinstance(data, Cube):
        return data.groupby(keys)

    # NORMALIZE A SINGLE KEY TO A TUPLE
    if not isinstance(keys, (tuple, list)):
        keys = (keys,)

    def get_keys(d):
        # EXTRACT THE KEY VALUES FROM ONE RECORD
        output = Dict()
        for k in keys:
            output[k] = d[k]
        return output

    if contiguous:
        try:
            if not data:
                return wrap([])

            agg = DictList()
            acc = DictList()
            curr_key = value2key(keys, data[0])
            for d in data:
                key = value2key(keys, d)
                if key != curr_key:
                    # SELECTOR CHANGED: CLOSE THE CURRENT GROUP, START A NEW ONE
                    agg.append((get_keys(acc[0]), acc))
                    curr_key = key
                    acc = [d]
                else:
                    acc.append(d)
            # CLOSE THE FINAL GROUP
            agg.append((get_keys(acc[0]), acc))
            return wrap(agg)
        except Exception, e:
            Log.error("Problem grouping contiguous values", e)
def normalize_sort(sort=None):
    """
    CONVERT SORT PARAMETERS TO A NORMAL FORM SO EASIER TO USE

    Each clause becomes {"value": name, "sort": +1/-1}; the {name: direction}
    dict form expands to one clause per entry.
    """
    if not sort:
        return DictList.EMPTY

    result = DictList()
    for spec in listwrap(sort):
        if isinstance(spec, basestring) or Math.is_integer(spec):
            result.append({"value": spec, "sort": 1})
        elif not spec.field and not spec.value and spec.sort == None:
            # ASSUME {name: sort} FORM
            for name, direction in spec.items():
                result.append({"value": name, "sort": sort_direction[direction]})
        else:
            result.append({
                "value": coalesce(spec.field, spec.value),
                "sort": coalesce(sort_direction[spec.sort], 1)
            })
    return wrap(result)
def normalize_sort(sort=None):
    """
    CONVERT SORT PARAMETERS TO A NORMAL FORM SO EASIER TO USE

    Returns a wrapped DictList of {"value": name, "sort": +1/-1} clauses.
    """
    if not sort:
        return DictList.EMPTY

    clauses = DictList()
    for item in listwrap(sort):
        if isinstance(item, basestring) or Math.is_integer(item):
            # BARE NAME OR COLUMN OFFSET: ASCENDING
            clauses.append({"value": item, "sort": 1})
            continue
        if not item.field and not item.value and item.sort == None:
            # ASSUME {name: sort} FORM
            for key, dir in item.items():
                clauses.append({"value": key, "sort": sort_direction[dir]})
            continue
        clauses.append({
            "value": coalesce(item.field, item.value),
            "sort": coalesce(sort_direction[item.sort], 1)
        })
    return wrap(clauses)
def update(self, command):
    """
    EXPECTING command == {"set":term, "where":where}
    THE set CLAUSE IS A DICT MAPPING NAMES TO VALUES
    THE where CLAUSE IS AN ES FILTER

    Translates the command into an Elasticsearch _bulk request of
    update actions, one per (matching document, set clause) pair.
    """
    command = wrap(command)
    schema = self._es.get_schema()

    # GET IDS OF DOCUMENTS (ROUTING FIELD IS NEEDED TO ADDRESS EACH DOC)
    results = self._es.search({
        "fields": listwrap(schema._routing.path),
        "query": {
            "filtered": {
                "query": {
                    "match_all": {}
                },
                "filter": jx_expression(command.where).to_esfilter()
            }
        },
        "size": 200000  # NOTE(review): hard cap — docs beyond this are silently skipped
    })

    # SCRIPT IS SAME FOR ALL (CAN ONLY HANDLE ASSIGNMENT TO CONSTANT)
    scripts = DictList()
    for k, v in command.set.items():
        if not is_keyword(k):
            Log.error("Only support simple paths for now")
        if isinstance(v, Mapping) and v.doc:
            # PARTIAL-DOCUMENT UPDATE
            scripts.append({"doc": v.doc})
        else:
            # GROOVY/MVEL ASSIGNMENT SCRIPT
            scripts.append({
                "script": "ctx._source." + k + " = " + jx_expression(v).to_ruby()
            })

    if results.hits.hits:
        # BULK FORMAT: ACTION LINE FOLLOWED BY ITS PAYLOAD LINE
        updates = []
        for h in results.hits.hits:
            for s in scripts:
                updates.append({
                    "update": {
                        "_id": h._id,
                        "_routing": unwraplist(h.fields[literal_field(schema._routing.path)])
                    }
                })
                updates.append(s)
        content = ("\n".join(convert.value2json(c) for c in updates) + "\n").encode('utf-8')
        response = self._es.cluster.post(
            self._es.path + "/_bulk",
            data=content,
            headers={"Content-Type": "application/json"})
        if response.errors:
            # SURFACE PER-ITEM FAILURES FROM THE BULK RESPONSE
            Log.error("could not update: {{error}}", error=[
                e.error for i in response["items"] for e in i.values()
                if e.status not in (200, 201)
            ])
def pe_filter(filter, data, depth):
    """
    PARTIAL EVALUATE THE filter BASED ON data GIVEN

    Each clause is evaluated against the fields of data that are reachable at
    this depth; clauses that reference deeper fields are returned as a residual
    filter.  Returns True/False when fully decided, otherwise a reduced filter.
    """
    if filter is TRUE_FILTER:
        return True
    if filter is FALSE_FILTER:
        return False

    filter = wrap(filter)

    if filter["and"]:
        result = True
        output = DictList()
        for a in filter["and"]:
            f = pe_filter(a, data, depth)
            if f is False:
                result = False
            elif f is not True:
                # UNDECIDED SUB-CLAUSE: KEEP AS RESIDUAL
                output.append(f)
        if result and output:
            return {"and": output}
        else:
            return result
    elif filter["or"]:
        output = DictList()
        for o in filter["or"]:
            f = pe_filter(o, data, depth)
            if f is True:
                return True
            elif f is not False:
                output.append(f)
        if output:
            return {"or": output}
        else:
            return False
    elif filter["not"]:
        f = pe_filter(filter["not"], data, depth)
        if f is True:
            return False
        elif f is False:
            return True
        else:
            return {"not": f}
    elif filter.term or filter.eq:
        eq = coalesce(filter.term, filter.eq)
        result = True
        output = {}
        for col, val in eq.items():
            # parse_field SPLITS col INTO (reachable-now, deeper-remainder)
            first, rest = parse_field(col, data, depth)
            d = data[first]
            if not rest:
                if d != val:
                    result = False
            else:
                output[rest] = val

        if result and output:
            return {"term": output}
        else:
            return result
    elif filter.equal:
        a, b = filter["equal"]
        first_a, rest_a = parse_field(a, data, depth)
        first_b, rest_b = parse_field(b, data, depth)
        val_a = data[first_a]
        val_b = data[first_b]

        if not rest_a:
            if not rest_b:
                # BOTH SIDES REACHABLE: DECIDE NOW
                if val_a != val_b:
                    return False
                else:
                    return True
            else:
                # ONE SIDE KNOWN: REDUCE TO A term ON THE OTHER
                return {"term": {rest_b: val_a}}
        else:
            if not rest_b:
                return {"term": {rest_a: val_b}}
            else:
                return {"equal": [rest_a, rest_b]}
    elif filter.terms:
        result = True
        output = {}
        for col, vals in filter["terms"].items():
            first, rest = parse_field(col, data, depth)
            d = data[first]
            if not rest:
                if d not in vals:
                    result = False
            else:
                output[rest] = vals
        if result and output:
            return {"terms": output}
        else:
            return result
    elif filter.range:
        result = True
        output = {}
        for col, ranges in filter["range"].items():
            first, rest = parse_field(col, data, depth)
            d = data[first]
            if not rest:
                # DECIDE EACH BOUND AGAINST THE KNOWN VALUE
                for sign, val in ranges.items():
                    if sign in ("gt", ">") and d <= val:
                        result = False
                    if sign == "gte" and d < val:
                        result = False
                    if sign == "lte" and d > val:
                        result = False
                    if sign == "lt" and d >= val:
                        result = False
            else:
                output[rest] = ranges
        if result and output:
            return {"range": output}
        else:
            return result
    elif filter.missing:
        # ACCEPT BOTH {"missing": field} AND {"missing": {"field": field}} FORMS
        if isinstance(filter.missing, basestring):
            field = filter["missing"]
        else:
            field = filter["missing"]["field"]

        first, rest = parse_field(field, data, depth)
        d = data[first]
        if not rest:
            if d == None:
                return True
            return False
        else:
            return {"missing": rest}
    elif filter.prefix:
        result = True
        output = {}
        for col, val in filter["prefix"].items():
            first, rest = parse_field(col, data, depth)
            d = data[first]
            if not rest:
                if d == None or not d.startswith(val):
                    result = False
            else:
                output[rest] = val
        if result and output:
            return {"prefix": output}
        else:
            return result
    elif filter.exists:
        # ACCEPT BOTH {"exists": field} AND {"exists": {"field": field}} FORMS
        if isinstance(filter["exists"], basestring):
            field = filter["exists"]
        else:
            field = filter["exists"]["field"]

        first, rest = parse_field(field, data, depth)
        d = data[first]
        if not rest:
            if d != None:
                return True
            return False
        else:
            return {"exists": rest}
    else:
        Log.error("Can not interpret esfilter: {{esfilter}}", {"esfilter": filter})
class SetDomain(Domain):
    # A DOMAIN DEFINED BY AN EXPLICIT SET OF PARTITIONS; key(s) MAP VALUES TO PARTS
    __slots__ = ["NULL", "partitions", "map", "order"]

    def __init__(self, **desc):
        Domain.__init__(self, **desc)
        desc = wrap(desc)

        self.type = "set"
        self.order = {}       # key -> partition index (None maps past the end)
        self.NULL = Null
        self.partitions = DictList()

        if isinstance(self.key, set):
            Log.error("problem")

        if isinstance(desc.partitions[0], basestring):
            # ASSUME PARTS ARE STRINGS, CONVERT TO REAL PART OBJECTS
            # NOTE(review): self.map is assigned-to here but not initialized in
            # this branch — presumably Domain.__init__ provides it; confirm
            self.key = ("value", )
            self.order[None] = len(desc.partitions)
            for i, p in enumerate(desc.partitions):
                part = {"name": p, "value": p}
                self.partitions.append(part)
                self.map[p] = part
                self.order[p] = i
        elif desc.partitions and desc.dimension.fields and len(
                desc.dimension.fields) > 1:
            # MULTI-FIELD DIMENSION: INDEX PARTS BY THE DIMENSION'S FIELD SET
            self.key = desc.key
            self.map = UniqueIndex(keys=desc.dimension.fields)
        elif desc.partitions and isinstance(desc.key, (list, set)):
            # TODO: desc.key CAN BE MUCH LIKE A SELECT, WHICH UniqueIndex CAN NOT HANDLE
            self.key = desc.key
            self.map = UniqueIndex(keys=desc.key)
        elif desc.partitions and isinstance(desc.partitions[0][desc.key], Mapping):
            self.key = desc.key
            self.map = UniqueIndex(keys=desc.key)
        # self.key = UNION(set(d[desc.key].keys()) for d in desc.partitions)
        # self.map = UniqueIndex(keys=self.key)
        elif desc.key == None:
            Log.error("Domains must have keys")
        elif self.key:
            # SINGLE SIMPLE KEY: PLAIN dict LOOKUP
            self.key = desc.key
            self.map = dict()
            self.map[None] = self.NULL
            self.order[None] = len(desc.partitions)
            for i, p in enumerate(desc.partitions):
                self.map[p[self.key]] = p
                self.order[p[self.key]] = i
        elif all(p.esfilter for p in self.partitions):
            # EVERY PART HAS AN esfilter DEFINED, SO USE THEM
            for i, p in enumerate(self.partitions):
                p.dataIndex = i
        else:
            Log.error("Can not hanldle")

        self.label = coalesce(self.label, "name")

        # NOTE(review): this overwrites any partitions built in the string
        # branch above with the raw descriptor list — confirm intended
        if isinstance(desc.partitions, list):
            self.partitions = desc.partitions.copy()
        else:
            Log.error("expecting a list of partitions")

    def compare(self, a, b):
        return value_compare(self.getKey(a), self.getKey(b))

    def getCanonicalPart(self, part):
        return self.getPartByKey(part.value)

    def getIndexByKey(self, key):
        # UNKNOWN KEYS SORT PAST THE END OF THE PARTITION LIST
        try:
            output = self.order.get(key)
            if output is None:
                return len(self.partitions)
            return output
        except Exception, e:
            Log.error("problem", e)
def __init__(self, dim, parent, qb):
    # BUILD A Dimension FROM ITS DESCRIPTOR dim, NESTED UNDER parent,
    # QUERYING qb FOR PARTITIONS WHEN THEY ARE NOT GIVEN EXPLICITLY
    self.name = dim.name
    self.parent = parent
    self.full_name = join_field(split_field(self.parent.full_name) + [self.name])
    dot.set_default(self, dim)
    self.esfilter = dim.esfilter
    self.type = coalesce(dim.type, "set")
    self.limit = coalesce(dim.limit, DEFAULT_QUERY_LIMIT)
    self.index = coalesce(dim.index, coalesce(parent, Null).index, qb.es.settings.name)

    if not self.index:
        Log.error("Expecting an index name")

    # ALLOW ACCESS TO SUB-PART BY NAME (IF ONLY THERE IS NO NAME COLLISION)
    self.edges = Dict()
    for e in listwrap(dim.edges):
        new_e = Dimension(e, self, qb)
        self.edges[new_e.full_name] = new_e

    self.partitions = wrap(coalesce(dim.partitions, []))
    parse_partition(self)

    fields = coalesce(dim.field, dim.fields)
    if not fields:
        return  # NO FIELDS TO SEARCH
    elif isinstance(fields, Mapping):
        # COMPOSITE KEY: ONE EDGE PER (name, es_field) PAIR
        self.fields = wrap(fields)
        edges = wrap([{
            "name": k,
            "value": v,
            "allowNulls": False
        } for k, v in self.fields.items()])
    else:
        self.fields = listwrap(fields)
        edges = wrap([{
            "name": f,
            "value": f,
            "index": i,
            "allowNulls": False
        } for i, f in enumerate(self.fields)])

    if dim.partitions:
        return  # ALREADY HAVE PARTS
    # NOTE(review): KNOWN - ALGEBRAIC is presumably a set difference of two
    # module-level type sets — confirm they are sets, not a typo
    if dim.type not in KNOWN - ALGEBRAIC:
        return  # PARTS OR TOO FUZZY (OR TOO NUMEROUS) TO FETCH

    with Timer("Get parts of {{name}}", {"name": self.name}):
        parts = qb.query({
            "from": self.index,
            "select": {
                "name": "count",
                "aggregate": "count"
            },
            "edges": edges,
            "esfilter": self.esfilter,
            "limit": self.limit
        })
        Log.note("{{name}} has {{num}} parts", name=self.name, num=len(parts))

    d = parts.edges[0].domain

    if dim.path:
        if len(edges) > 1:
            Log.error("Not supported yet")
        # EACH TERM RETURNED IS A PATH INTO A PARTITION TREE
        temp = Dict(partitions=[])
        for i, count in enumerate(parts):
            a = dim.path(d.getEnd(d.partitions[i]))
            if not isinstance(a, list):
                Log.error("The path function on " + dim.name + " must return an ARRAY of parts")
            addParts(temp, dim.path(d.getEnd(d.partitions[i])), count, 0)
        self.value = coalesce(dim.value, "name")
        self.partitions = temp.partitions
    elif isinstance(fields, Mapping):
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS

        partitions = DictList()
        for g, p in parts.groupby(edges):
            if p:
                partitions.append({
                    "value": g,
                    "esfilter": {
                        "and": [{
                            "term": {
                                e.value: g[e.name]
                            }
                        } for e in edges]
                    },
                    "count": int(p)
                })
        self.partitions = partitions
    elif len(edges) == 1:
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS

        # SIMPLE LIST OF PARTS RETURNED, BE SURE TO INTERRELATE THEM
        self.partitions = wrap([
            {
                "name": str(d.partitions[i].name),  # CONVERT TO STRING
                "value": d.getEnd(d.partitions[i]),
                "esfilter": {
                    "term": {
                        edges[0].value: d.partitions[i].value
                    }
                },
                "count": count
            } for i, count in enumerate(parts)
        ])
        self.order = {p.value: i for i, p in enumerate(self.partitions)}
    elif len(edges) == 2:
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS
        d2 = parts.edges[1].domain

        # SIMPLE LIST OF PARTS RETURNED, BE SURE TO INTERRELATE THEM
        array = parts.data.values()[0].cube  # DIG DEEP INTO RESULT (ASSUME SINGLE VALUE CUBE, WITH NULL AT END)

        def edges2value(*values):
            # COMPOSITE VALUE: Dict WHEN FIELDS ARE NAMED, tuple OTHERWISE
            if isinstance(fields, Mapping):
                output = Dict()
                for e, v in zip(edges, values):
                    output[e.name] = v
                return output
            else:
                return tuple(values)

        self.partitions = wrap([
            {
                "name": str(d.partitions[i].name),  # CONVERT TO STRING
                "value": d.getEnd(d.partitions[i]),
                "esfilter": {
                    "term": {
                        edges[0].value: d.partitions[i].value
                    }
                },
                "count": SUM(subcube),
                "partitions": [
                    {
                        "name": str(d2.partitions[j].name),  # CONVERT TO STRING
                        "value": edges2value(d.getEnd(d.partitions[i]), d2.getEnd(d2.partitions[j])),
                        "esfilter": {
                            "and": [{
                                "term": {
                                    edges[0].value: d.partitions[i].value
                                }
                            }, {
                                "term": {
                                    edges[1].value: d2.partitions[j].value
                                }
                            }]
                        },
                        "count": count2
                    }
                    for j, count2 in enumerate(subcube)
                    if count2 > 0  # ONLY INCLUDE PROPERTIES THAT EXIST
                ]
            } for i, subcube in enumerate(array)
        ])
    else:
        Log.error("Not supported")

    parse_partition(self)  # RELATE THE PARTS TO THE PARENTS
class SetDomain(Domain): __slots__ = ["NULL", "partitions", "map", "order"] def __init__(self, **desc): Domain.__init__(self, **desc) desc = wrap(desc) self.type = "set" self.order = {} self.NULL = Null self.partitions = DictList() if isinstance(self.key, set): Log.error("problem") if isinstance(desc.partitions[0], (int, float, basestring)): # ASSMUE PARTS ARE STRINGS, CONVERT TO REAL PART OBJECTS self.key = "value" self.order[None] = len(desc.partitions) for i, p in enumerate(desc.partitions): part = {"name": p, "value": p, "dataIndex": i} self.partitions.append(part) self.map[p] = part self.order[p] = i elif desc.partitions and desc.dimension.fields and len(desc.dimension.fields) > 1: self.key = desc.key self.map = UniqueIndex(keys=desc.dimension.fields) elif desc.partitions and isinstance(desc.key, (list, set)): # TODO: desc.key CAN BE MUCH LIKE A SELECT, WHICH UniqueIndex CAN NOT HANDLE self.key = desc.key self.map = UniqueIndex(keys=desc.key) elif desc.partitions and isinstance(desc.partitions[0][desc.key], Mapping): self.key = desc.key self.map = UniqueIndex(keys=desc.key) # self.key = UNION(set(d[desc.key].keys()) for d in desc.partitions) # self.map = UniqueIndex(keys=self.key) elif desc.key == None: Log.error("Domains must have keys") elif self.key: self.key = desc.key self.map = dict() self.map[None] = self.NULL self.order[None] = len(desc.partitions) for i, p in enumerate(desc.partitions): self.map[p[self.key]] = p self.order[p[self.key]] = i elif all(p.esfilter for p in self.partitions): # EVERY PART HAS AN esfilter DEFINED, SO USE THEM for i, p in enumerate(self.partitions): p.dataIndex = i else: Log.error("Can not hanldle") self.label = coalesce(self.label, "name") def compare(self, a, b): return value_compare(self.getKey(a), self.getKey(b)) def getCanonicalPart(self, part): return self.getPartByKey(part.value) def getIndexByKey(self, key): try: output = self.order.get(key) if output is None: return len(self.partitions) return output except Exception, 
e: Log.error("problem", e)
def __init__(self, dim, parent, qb):
    """
    BUILD A Dimension FROM ITS DEFINITION dim: COPY SIMPLE PROPERTIES,
    RECURSE INTO SUB-edges, AND (WHEN NO PARTITIONS WERE DECLARED) QUERY
    THE INDEX TO DISCOVER THE PARTS.

    :param dim: DIMENSION DEFINITION (wrapped to Dict)
    :param parent: ENCLOSING Dimension, OR None
    :param qb: QUERY FACILITY (assumed to expose .settings.index, .get_columns(), .query() — confirm)
    """
    dim = wrap(dim)
    self.name = dim.name
    self.parent = coalesce(parent)
    self.full_name = join_field(split_field(self.parent.full_name) + [self.name])
    self.edges = None  # FOR NOW
    dot.set_default(self, dim)  # COPY ANY REMAINING PROPERTIES FROM THE DEFINITION
    self.where = dim.where
    self.type = coalesce(dim.type, "set")
    self.limit = coalesce(dim.limit, DEFAULT_QUERY_LIMIT)
    self.index = coalesce(dim.index, coalesce(parent, Null).index, qb.settings.index)

    if not self.index:
        Log.error("Expecting an index name")

    # ALLOW ACCESS TO SUB-PART BY NAME (IF ONLY THERE IS NO NAME COLLISION)
    self.edges = Dict()
    for e in listwrap(dim.edges):
        new_e = Dimension(e, self, qb)
        self.edges[new_e.full_name] = new_e

    self.partitions = wrap(coalesce(dim.partitions, []))
    parse_partition(self)

    fields = coalesce(dim.field, dim.fields)
    if not fields:
        return  # NO FIELDS TO SEARCH
    elif isinstance(fields, Mapping):
        # MAPPING FROM EDGE NAME TO ES FIELD
        self.fields = wrap(fields)
        edges = wrap([{"name": k, "value": v, "allowNulls": False} for k, v in self.fields.items()])
    else:
        self.fields = listwrap(fields)
        edges = wrap([{"name": f, "value": f, "index": i, "allowNulls": False} for i, f in enumerate(self.fields)])

    if dim.partitions:
        return  # ALREADY HAVE PARTS
    if self.type not in KNOWN - ALGEBRAIC:
        return  # PARTS OR TOO FUZZY (OR TOO NUMEROUS) TO FETCH

    qb.get_columns()
    # COUNT THE DISTINCT COMBINATIONS TO DISCOVER THE PARTS
    with Timer("Get parts of {{name}}", {"name": self.name}):
        parts = qb.query({
            "from": self.index,
            "select": {"name": "count", "aggregate": "count"},
            "edges": edges,
            "where": self.where,
            "limit": self.limit
        })
        Log.note("{{name}} has {{num}} parts", name=self.name, num=len(parts))

    d = parts.edges[0].domain

    if dim.path:
        if len(edges) > 1:
            Log.error("Not supported yet")
        # EACH TERM RETURNED IS A PATH INTO A PARTITION TREE
        temp = Dict(partitions=[])
        for i, count in enumerate(parts):
            a = dim.path(d.getEnd(d.partitions[i]))
            if not isinstance(a, list):
                Log.error("The path function on " + dim.name + " must return an ARRAY of parts")
            addParts(
                temp,
                dim.path(d.getEnd(d.partitions[i])),
                count,
                0
            )
        self.value = coalesce(dim.value, "name")
        self.partitions = temp.partitions
    elif isinstance(fields, Mapping):
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS

        partitions = DictList()
        for g, p in parts.groupby(edges):
            if p:
                partitions.append({
                    "value": g,
                    "where": {"and": [
                        {"term": {e.value: g[e.name]}}
                        for e in edges
                    ]},
                    "count": int(p)
                })
        self.partitions = partitions
    elif len(edges) == 1:
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS

        # SIMPLE LIST OF PARTS RETURNED, BE SURE TO INTERRELATE THEM
        self.partitions = wrap([
            {
                "name": str(d.partitions[i].name),  # CONVERT TO STRING
                "value": d.getEnd(d.partitions[i]),
                "where": {"term": {edges[0].value: d.partitions[i].value}},
                "count": count
            }
            for i, count in enumerate(parts)
        ])
        self.order = {p.value: i for i, p in enumerate(self.partitions)}
    elif len(edges) == 2:
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS
        d2 = parts.edges[1].domain

        # SIMPLE LIST OF PARTS RETURNED, BE SURE TO INTERRELATE THEM
        array = parts.data.values()[0].cube  # DIG DEEP INTO RESULT (ASSUME SINGLE VALUE CUBE, WITH NULL AT END)

        def edges2value(*values):
            # PACK PER-EDGE VALUES INTO THE SHAPE THE DEFINITION DECLARED
            if isinstance(fields, Mapping):
                output = Dict()
                for e, v in zip(edges, values):
                    output[e.name] = v
                return output
            else:
                return tuple(values)

        self.partitions = wrap([
            {
                "name": str(d.partitions[i].name),  # CONVERT TO STRING
                "value": d.getEnd(d.partitions[i]),
                "where": {"term": {edges[0].value: d.partitions[i].value}},
                "count": SUM(subcube),
                "partitions": [
                    {
                        "name": str(d2.partitions[j].name),  # CONVERT TO STRING
                        "value": edges2value(d.getEnd(d.partitions[i]), d2.getEnd(d2.partitions[j])),
                        "where": {"and": [
                            {"term": {edges[0].value: d.partitions[i].value}},
                            {"term": {edges[1].value: d2.partitions[j].value}}
                        ]},
                        "count": count2
                    }
                    for j, count2 in enumerate(subcube)
                    if count2 > 0  # ONLY INCLUDE PROPERTIES THAT EXIST
                ]
            }
            for i, subcube in enumerate(array)
        ])
    else:
        Log.error("Not supported")

    parse_partition(self)  # RELATE THE PARTS TO THE PARENTS
def decode(json):
    """
    THIS IS CURRENTLY 50% SLOWER THAN PyPy DEFAULT IMPLEMENTATION

    THE INTENT IS TO NEVER ACTUALLY PARSE ARRAYS OF PRIMITIVE VALUES, RATHER FIND
    THE START AND END OF THOSE ARRAYS AND SIMPLY STRING COPY THEM TO THE
    INEVITABLE JSON OUTPUT
    """
    var = ""            # NAME OF THE OBJECT PROPERTY CURRENTLY BEING ASSIGNED
    curr = DictList()   # THE CONTAINER CURRENTLY BEING FILLED
    mode = ARRAY        # PARSER STATE: ARRAY | OBJECT (EXPECT KEY) | VALUE (EXPECT PROPERTY VALUE)
    stack = DictList()  # STACK OF ENCLOSING CONTAINERS

    # FIRST PASS SIMPLY GETS STRUCTURE
    i = 0
    while i < len(json):
        c = json[i]
        i += 1
        if mode == ARRAY:
            if c in [" ", "\t", "\n", "\r", ","]:
                pass
            elif c == "]":
                curr = stack.pop()
                if isinstance(curr, Mapping):
                    mode = OBJECT
                else:
                    mode = ARRAY
            elif c == "[":
                # TRY TO STRING-COPY A PRIMITIVE-ONLY ARRAY; None MEANS IT WAS NOT PRIMITIVE
                i, arr = jump_array(i, json)
                if arr is None:
                    arr = []
                    stack.append(curr)
                    curr.append(arr)
                    curr = arr
                    mode = ARRAY
                else:
                    # WHOLE ARRAY WAS JUMPED; APPEND IT AS-IS
                    curr.append(arr)
            elif c == "{":
                obj = {}
                stack.append(curr)
                curr.append(obj)
                curr = obj
                mode = OBJECT
            elif c == "\"":
                i, val = fast_parse_string(i, json)
                # NOTE(review): curr IS A LIST HERE; .children LOOKS WRONG —
                # PRESUMABLY SHOULD BE curr.append(val); CONFIRM AGAINST TESTS
                curr.children.append(val)
            else:
                # NOTE(review): PARSED CONSTANT IS NEVER APPENDED TO curr — VERIFY
                i, val = parse_const(i, json)
        elif mode == OBJECT:
            if c in [" ", "\t", "\n", "\r", ","]:
                pass
            elif c == ":":
                mode = VALUE
            elif c == "}":
                curr = stack.pop()
                if isinstance(curr, Mapping):
                    mode = OBJECT
                else:
                    mode = ARRAY
            elif c == "\"":
                # PROPERTY NAME
                i, var = fast_parse_string(i, json)
        elif mode == VALUE:
            if c in [" ", "\t", "\n", "\r"]:
                pass
            elif c == "}":
                curr = stack.pop()
                if isinstance(curr, Mapping):
                    mode = OBJECT
                else:
                    mode = ARRAY
            elif c == "[":
                i, arr = jump_array(i, json)
                if arr is None:
                    arr = []
                    stack.append(curr)
                    curr[var] = arr
                    curr = arr
                    mode = ARRAY
                else:
                    curr[var] = arr
                    mode = OBJECT
            elif c == "{":
                obj = {}
                stack.append(curr)
                curr[var] = obj
                curr = obj
                mode = OBJECT
            elif c == "\"":
                i, val = fast_parse_string(i, json)
                curr[var] = val
                mode = OBJECT
            else:
                i, val = parse_const(i, json)
                curr[var] = val
                mode = OBJECT

    # THE TOP-LEVEL DictList HOLDS EXACTLY ONE PARSED DOCUMENT
    return curr[0]
def _setop(self, query):
    """
    NO AGGREGATION, SIMPLE LIST COMPREHENSION

    RETURNS (sql, post_processor) WHERE post_processor TURNS THE RAW DB ROWS
    INTO THE REQUESTED SHAPE
    """
    if isinstance(query.select, list):
        # RETURN BORING RESULT SET
        selects = DictList()
        for s in listwrap(query.select):
            if isinstance(s.value, Mapping):
                # FIX: WAS `s.value.items` (METHOD NOT CALLED) — ITERATING IT RAISES
                for k, v in s.value.items():
                    selects.append(v + " AS " + self.db.quote_column(s.name + "." + k))
            elif isinstance(s.value, list):
                # FIX: WAS A SEPARATE `if`, SO THE Mapping CASE ALSO FELL INTO `else`
                for i, ss in enumerate(s.value):
                    # FIX: WAS `s.value` (THE WHOLE LIST); EACH ITEM ss IS THE COLUMN EXPRESSION
                    selects.append(ss + " AS " + self.db.quote_column(s.name + "," + str(i)))
            else:
                selects.append(s.value + " AS " + self.db.quote_column(s.name))

        sql = expand_template("""
            SELECT
                {{selects}}
            FROM
                {{table}}
            {{where}}
            {{sort}}
            {{limit}}
        """, {
            "selects": SQL(",\n".join(selects)),
            "table": self._subquery(query["from"])[0],
            "where": self._where2sql(query.where),
            "limit": self._limit2sql(query.limit),
            "sort": self._sort2sql(query.sort)
        })

        def post_process(sql):
            # RE-ASSEMBLE THE FLATTENED "name.k" / "name,i" COLUMNS INTO Mapping/TUPLE VALUES
            result = self.db.query(sql)
            for s in listwrap(query.select):
                if isinstance(s.value, Mapping):
                    for r in result:
                        r[s.name] = {}
                        # FIX: WAS `for k, v in s.value:` — MUST ITERATE items()
                        for k, v in s.value.items():
                            r[s.name][k] = r[s.name + "." + k]
                            r[s.name + "." + k] = None

                if isinstance(s.value, list):
                    # REWRITE AS TUPLE
                    for r in result:
                        r[s.name] = tuple(r[s.name + "," + str(i)] for i, ss in enumerate(s.value))
                        for i, ss in enumerate(s.value):
                            r[s.name + "," + str(i)] = None

            expand_json(result)
            return result

        return sql, post_process  # RETURN BORING RESULT SET
    else:
        # RETURN LIST OF VALUES
        if query.select.value == ".":
            select = "*"
        else:
            name = query.select.name
            select = query.select.value + " AS " + self.db.quote_column(name)

        sql = expand_template("""
            SELECT
                {{selects}}
            FROM
                {{table}}
            {{where}}
            {{sort}}
            {{limit}}
        """, {
            "selects": SQL(select),
            "table": self._subquery(query["from"])[0],
            "where": self._where2sql(query.where),
            "limit": self._limit2sql(query.limit),
            "sort": self._sort2sql(query.sort)
        })

        if query.select.value == ".":
            def post(sql):
                result = self.db.query(sql)
                expand_json(result)
                return result

            return sql, post
        else:
            return sql, lambda sql: [r[name] for r in self.db.query(sql)]  # RETURNING LIST OF VALUES
def es_terms_stats(esq, mvel, query):
    """
    RUN AN ES terms_stats FACET QUERY: ONE FACET PER COMBINATION OF "FACET"
    EDGE PARTS, WITH THE REMAINING ("TERMS") EDGES ENCODED INTO THE FACET KEY.
    RETURNS A Cube OF THE RESULTS.
    """
    select = listwrap(query.select)
    facetEdges = []  # EDGES THAT WILL REQUIRE A FACET FOR EACH PART
    termsEdges = DictList()
    specialEdge = None
    special_index = -1

    # A SPECIAL EDGE IS ONE THAT HAS AN UNDEFINED NUMBER OF PARTITIONS AT QUERY TIME
    # FIND THE specialEdge, IF ONE
    for f, tedge in enumerate(query.edges):
        if tedge.domain.type in domains.KNOWN:
            for p, part in enumerate(tedge.domain.partitions):
                part.dataIndex = p

            # FACETS ARE ONLY REQUIRED IF SQL JOIN ON DOMAIN IS REQUIRED (RANGE QUERY)
            # OR IF WE ARE NOT SIMPLY COUNTING
            # OR IF NO SCRIPTING IS ALLOWED (SOME OTHER CODE IS RESPONSIBLE FOR SETTING isFacet)
            # OR IF WE JUST WANT TO FORCE IT :)
            # OF COURSE THE default EDGE IS NOT EXPLICIT, SO MUST BE A TERM
            facetEdges.append(tedge)
        else:
            if specialEdge:
                Log.error("There is more than one open-ended edge: self can not be handled")
            specialEdge = tedge
            special_index = f
            termsEdges.append(tedge)

    if not specialEdge:
        # WE SERIOUSLY WANT A SPECIAL EDGE, OTHERWISE WE WILL HAVE TOO MANY FACETS
        # THE BIGGEST EDGE MAY BE COLLAPSED TO A TERM, MAYBE?
        num_parts = 0
        special_index = -1
        for i, e in enumerate(facetEdges):
            l = len(e.domain.partitions)
            if ((e.value and is_keyword(e.value)) or len(e.domain.dimension.fields) == 1) and l > num_parts:
                num_parts = l
                specialEdge = e
                special_index = i

        facetEdges.pop(special_index)
        termsEdges.append(specialEdge)

    total_facets = PRODUCT(len(f.domain.partitions) for f in facetEdges) * len(select)
    if total_facets > 100:
        # WE GOT A PROBLEM, LETS COUNT THE SIZE OF REALITY:
        counts = esq.query({
            "from": query.frum,
            "select": {"aggregate": "count"},
            "edges": facetEdges,
            "where": query.where,
            "limit": query.limit
        })

        esFacets = []

        def add_facet(value, parts, cube):
            # KEEP ONLY THE COMBINATIONS THAT ACTUALLY HAVE DATA
            if value:
                esFacets.append(parts)

        counts["count"].forall(add_facet)

        Log.note("{{theory_count}} theoretical combinations, {{real_count}} actual combos found", real_count=len(esFacets), theory_count=total_facets)

        if not esFacets:
            # MAKE EMPTY CUBE
            matricies = {}
            dims = [len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges]
            for s in select:
                matricies[s.name] = Matrix(*dims)
            cube = Cube(query.select, query.edges, matricies)
            cube.frum = query
            return cube

    else:
        # GENERATE ALL COMBOS
        esFacets = getAllEdges(facetEdges)

    calcTerm = compileEdges2Term(mvel, termsEdges, DictList())
    term2parts = calcTerm.term2parts

    if len(esFacets) * len(select) > 1000:
        Log.error("not implemented yet")  # WE HAVE SOME SERIOUS PERMUTATIONS, WE MUST ISSUE MULTIPLE QUERIES
        pass

    FromES = build_es_query(query)
    for s in select:
        for parts in esFacets:
            condition = DictList()
            constants = DictList()
            name = [literal_field(s.name)]
            for f, fedge in enumerate(facetEdges):
                name.append(str(parts[f].dataIndex))
                condition.append(buildCondition(mvel, fedge, parts[f]))
                constants.append({"name": fedge.domain.name, "value": parts[f]})
            condition.append(query.where)
            name = ",".join(name)

            FromES.facets[name] = {
                "terms_stats": {
                    "key_field": calcTerm.field,
                    "value_field": s.value if is_keyword(s.value) else None,
                    "value_script": mvel.compile_expression(s.value) if not is_keyword(s.value) else None,
                    "size": coalesce(query.limit, 200000)
                }
            }
            if condition:
                FromES.facets[name].facet_filter = simplify_esfilter({"and": condition})

    data = es09.util.post(esq.es, FromES, query.limit)

    if specialEdge.domain.type not in domains.KNOWN:
        # WE BUILD THE PARTS BASED ON THE RESULTS WE RECEIVED
        partitions = DictList()
        map = {}
        for facetName, parts in data.facets.items():
            for stats in parts.terms:
                # FIX: WAS `if not map[stats]:` — KeyError ON FIRST SIGHT OF A TERM
                if not map.get(stats):
                    part = {"value": stats, "name": stats}
                    partitions.append(part)
                    map[stats] = part

        partitions.sort(specialEdge.domain.compare)
        for p, part in enumerate(partitions):
            part.dataIndex = p

        specialEdge.domain.map = map
        specialEdge.domain.partitions = partitions

    # MAKE CUBE
    matricies = {}
    dims = [len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges]
    for s in select:
        matricies[s.name] = Matrix(*dims)

    name2agg = {s.name: aggregates[s.aggregate] for s in select}

    # FILL CUBE
    for edgeName, parts in data.facets.items():
        temp = edgeName.split(",")
        pre_coord = tuple(int(c) for c in temp[1:])
        sname = temp[0]

        for stats in parts.terms:
            if specialEdge:
                special = term2parts(stats.term)[0]
                coord = pre_coord[:special_index] + (special.dataIndex, ) + pre_coord[special_index:]
            else:
                coord = pre_coord
            matricies[sname][coord] = stats[name2agg[sname]]

    cube = Cube(query.select, query.edges, matricies)
    cube.frum = query
    return cube
def _map_term_using_schema(master, path, term, schema_edges):
    """
    IF THE WHERE CLAUSE REFERS TO FIELDS IN THE SCHEMA, THEN EXPAND THEM

    :param master: UNUSED HERE EXCEPT TO PASS THROUGH TO RECURSIVE CALLS
    :param path: PREFIX PATH OF PROPERTY NAMES SEEN SO FAR
    :param term: {field: value} FILTER TO EXPAND
    :param schema_edges: MAP FROM FIELD NAME TO Dimension (OR NESTED MAP)
    :return: ES "and" FILTER
    """
    output = DictList()
    for k, v in term.items():
        dimension = schema_edges[k]
        if isinstance(dimension, Dimension):
            domain = dimension.getDomain()
            if dimension.fields:
                if isinstance(dimension.fields, Mapping):
                    # EXPECTING A TUPLE
                    for local_field, es_field in dimension.fields.items():
                        local_value = v[local_field]
                        if local_value == None:
                            output.append({"missing": {"field": es_field}})
                        else:
                            output.append({"term": {es_field: local_value}})
                    continue

                if len(dimension.fields) == 1 and is_keyword(dimension.fields[0]):
                    # SIMPLE SINGLE-VALUED FIELD
                    if domain.getPartByKey(v) is domain.NULL:
                        output.append({"missing": {"field": dimension.fields[0]}})
                    else:
                        output.append({"term": {dimension.fields[0]: v}})
                    continue

                if AND(is_keyword(f) for f in dimension.fields):
                    # EXPECTING A TUPLE
                    if not isinstance(v, tuple):
                        Log.error("expecing {{name}}={{value}} to be a tuple", name=k, value=v)
                    for i, f in enumerate(dimension.fields):
                        vv = v[i]
                        if vv == None:
                            output.append({"missing": {"field": f}})
                        else:
                            output.append({"term": {f: vv}})
                    continue

            # NOTE(review): THIS DUPLICATES THE SINGLE-FIELD CHECK ABOVE; IT IS ONLY
            # REACHED WHEN dimension.fields IS FALSY, SO dimension.fields[0] LOOKS
            # SUSPECT — CONFIRM INTENT
            if len(dimension.fields) == 1 and is_keyword(dimension.fields[0]):
                if domain.getPartByKey(v) is domain.NULL:
                    output.append({"missing": {"field": dimension.fields[0]}})
                else:
                    output.append({"term": {dimension.fields[0]: v}})
                continue

            if domain.partitions:
                part = domain.getPartByKey(v)
                if part is domain.NULL or not part.esfilter:
                    Log.error("not expected to get NULL")
                output.append(part.esfilter)
                continue
            else:
                Log.error("not expected")
        elif isinstance(v, Mapping):
            # NESTED PROPERTY: RECURSE WITH THE SUB-SCHEMA
            sub = _map_term_using_schema(master, path + [k], v, schema_edges[k])
            output.append(sub)
            continue

        # NOT A KNOWN DIMENSION: PASS THE TERM THROUGH UNCHANGED
        output.append({"term": {k: v}})
    return {"and": output}
def _grouped(self, query, stacked=False): select = listwrap(query.select) # RETURN SINGLE OBJECT WITH AGGREGATES for s in select: if s.aggregate not in aggregates: Log.error("Expecting all columns to have an aggregate: {{select}}", select=s) selects = DictList() groups = DictList() edges = query.edges for e in edges: if e.domain.type != "default": Log.error("domain of type {{type}} not supported, yet", type=e.domain.type) groups.append(e.value) selects.append(e.value + " AS " + self.db.quote_column(e.name)) for s in select: selects.append(aggregates[s.aggregate].replace("{{code}}", s.value) + " AS " + self.db.quote_column(s.name)) sql = expand_template(""" SELECT {{selects}} FROM {{table}} {{where}} GROUP BY {{groups}} """, { "selects": SQL(",\n".join(selects)), "groups": SQL(",\n".join(groups)), "table": self._subquery(query["from"])[0], "where": self._where2sql(query.where) }) def post_stacked(sql): # RETURN IN THE USUAL DATABASE RESULT SET FORMAT return self.db.query(sql) def post(sql): # FIND OUT THE default DOMAIN SIZES result = self.db.column_query(sql) num_edges = len(edges) for e, edge in enumerate(edges): domain = edge.domain if domain.type == "default": domain.type = "set" parts = set(result[e]) domain.partitions = [{"index": i, "value": p} for i, p in enumerate(parts)] domain.map = {p: i for i, p in enumerate(parts)} else: Log.error("Do not know what to do here, yet") # FILL THE DATA CUBE maps = [(unwrap(e.domain.map), result[i]) for i, e in enumerate(edges)] cubes = DictList() for c, s in enumerate(select): data = Matrix(*[len(e.domain.partitions) + (1 if e.allow_nulls else 0) for e in edges]) for rownum, value in enumerate(result[c + num_edges]): coord = [m[r[rownum]] for m, r in maps] data[coord] = value cubes.append(data) if isinstance(query.select, list): return cubes else: return cubes[0] return sql, post if not stacked else post_stacked
def transform(self, uid, talos_test_result):
    # CONVERT ONE RAW TALOS TEST RESULT INTO (POSSIBLY MANY) NORMALIZED RECORDS.
    # NOTE(review): THIS BLOCK APPEARS TRUNCATED IN THIS CHUNK — THE OUTER try:
    # HAS NO VISIBLE MATCHING except/finally, AND THE dromaeo BRANCH ENDS
    # MID-FUNCTION; THE REMAINDER IS OUTSIDE THIS VIEW
    try:
        r = talos_test_result

        def mainthread_transform(r):
            # PIVOT THE PARALLEL [value, filename] LISTS INTO ONE RECORD PER FILE
            if r == None:
                return None

            output = Dict()

            for i in r.mainthread_readbytes:
                output[literal_field(i[1])].name = i[1]
                output[literal_field(i[1])].readbytes = i[0]
            r.mainthread_readbytes = None

            for i in r.mainthread_writebytes:
                output[literal_field(i[1])].name = i[1]
                output[literal_field(i[1])].writebytes = i[0]
            r.mainthread_writebytes = None

            for i in r.mainthread_readcount:
                output[literal_field(i[1])].name = i[1]
                output[literal_field(i[1])].readcount = i[0]
            r.mainthread_readcount = None

            for i in r.mainthread_writecount:
                output[literal_field(i[1])].name = i[1]
                output[literal_field(i[1])].writecount = i[0]
            r.mainthread_writecount = None

            r.mainthread = output.values()

        mainthread_transform(r.results_aux)
        mainthread_transform(r.results_xperf)

        # NORMALIZE BRANCH NAME; A "-Non-PGO" SUFFIX ENCODES pgo=False
        branch = r.build.branch
        if branch.lower().endswith("-non-pgo"):
            branch = branch[0:-8]
            r.build.branch = branch
            r.build.pgo = False
        else:
            r.build.pgo = True

        # A ".e" SUFFIX ON OS VERSION ENCODES e10s
        if r.machine.osversion.endswith(".e"):
            r.machine.osversion = r.machine.osversion[:-2]
            r.machine.e10s = True

        #ADD PUSH LOG INFO
        try:
            with Profiler("get from pushlog"):
                revision = Revision(**{"branch": {"name": branch}, "changeset": {"id": r.build.revision}})
                with self.locker:
                    revision = self.repo.get_revision(revision)
                with self.locker:
                    push = self.repo.get_push(revision)
                r.build.push_date = push.date
        except Exception, e:
            Log.warning("{{build.branch}} @ {{build.revision}} (perf_id=={{treeherder.perf_id}}) has no pushlog", r, e)
            # TRY AGAIN LATER
            return []

        new_records = []

        # RECORD THE UNKNOWN PART OF THE TEST RESULTS
        remainder = r.copy()
        remainder.results = None
        if not r.results or len(remainder.keys()) > 4:
            new_records.append(remainder)

        #RECORD TEST RESULTS
        total = DictList()
        if r.run.suite in ["dromaeo_css", "dromaeo_dom"]:
            #dromaeo IS SPECIAL, REPLICATES ARE IN SETS OF FIVE
            #RECORD ALL RESULTS
            for i, (test_name, replicates) in enumerate(r.results.items()):
                for g, sub_results in qb.groupby(replicates, size=5):
                    new_record = Dict(
                        machine=r.machine,
                        treeherder=r.treeherder,
                        run=r.run,
                        build=r.build,
                        result={
                            "test_name": unicode(test_name) + "." + unicode(g),
                            "ordering": i,
                            "samples": sub_results
                        }
                    )
                    try:
                        # SUMMARIZE THE REPLICATES; KEEP GOING IF THE MATH FAILS
                        s = stats(sub_results)
                        new_record.result.stats = s
                        total.append(s)
                    except Exception, e:
                        Log.warning("can not reduce series to moments", e)
                    new_records.append(new_record)
def extract_rows(es, es_query, query):
    """
    TRANSLATE query.select INTO es_query fields/script_fields, RUN THE QUERY,
    AND FORMAT THE HITS USING THE format_dispatch TABLE
    """
    is_list = isinstance(query.select, list)
    select = wrap([s.copy() for s in listwrap(query.select)])
    new_select = DictList()
    columns = query.frum.get_columns()
    # LEAF COLUMNS ARE PRIMITIVES THAT CAN BE PULLED DIRECTLY FROM "fields"
    leaf_columns = set(c.name for c in columns if c.type not in STRUCT and (not c.nested_path or c.es_column == c.nested_path))
    nested_columns = set(c.name for c in columns if c.nested_path)

    i = 0
    source = "fields"  # WHERE VALUES WILL BE PULLED FROM ("fields" OR "_source")
    for s in select:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if isinstance(s.value, LeavesOp):
            term = s.value.term
            if isinstance(term, Variable):
                if term.var == ".":
                    # SELECT ALL LEAVES: MUST PULL THE WHOLE _source
                    es_query.fields = None
                    source = "_source"

                    net_columns = leaf_columns - set(select.name)
                    for n in net_columns:
                        new_select.append({
                            "name": n,
                            "value": Variable(n),
                            "put": {"name": n, "index": i, "child": "."}
                        })
                        i += 1
                else:
                    # SELECT THE LEAVES UNDER A GIVEN PREFIX
                    parent = term.var + "."
                    prefix = len(parent)
                    for c in leaf_columns:
                        if c.startswith(parent):
                            if es_query.fields is not None:
                                es_query.fields.append(c)

                            new_select.append({
                                "name": s.name + "." + c[prefix:],
                                "value": Variable(c),
                                "put": {"name": s.name + "." + c[prefix:], "index": i, "child": "."}
                            })
                            i += 1
        elif isinstance(s.value, Variable):
            if s.value.var == ".":
                # SELECT THE WHOLE DOCUMENT
                es_query.fields = None
                source = "_source"

                new_select.append({
                    "name": s.name,
                    "value": s.value,
                    "put": {"name": s.name, "index": i, "child": "."}
                })
                i += 1
            elif s.value.var == "_id":
                new_select.append({
                    "name": s.name,
                    "value": s.value,
                    "pull": "_id",
                    "put": {"name": s.name, "index": i, "child": "."}
                })
                i += 1
            elif s.value.var in nested_columns:
                # NESTED VALUES CAN ONLY COME FROM _source
                es_query.fields = None
                source = "_source"

                new_select.append({
                    "name": s.name,
                    "value": s.value,
                    "put": {"name": s.name, "index": i, "child": "."}
                })
                i += 1
            else:
                parent = s.value.var + "."
                prefix = len(parent)
                net_columns = [c for c in leaf_columns if c.startswith(parent)]
                if not net_columns:
                    # LEAF
                    if es_query.fields is not None:
                        es_query.fields.append(s.value.var)
                    new_select.append({
                        "name": s.name,
                        "value": s.value,
                        "put": {"name": s.name, "index": i, "child": "."}
                    })
                else:
                    # LEAVES OF OBJECT
                    for n in net_columns:
                        if es_query.fields is not None:
                            es_query.fields.append(n)
                        new_select.append({
                            "name": s.name,
                            "value": Variable(n),
                            "put": {"name": s.name, "index": i, "child": n[prefix:]}
                        })
                i += 1
        else:
            # GENERAL EXPRESSION: COMPUTE SERVER-SIDE AS A SCRIPT FIELD
            es_query.script_fields[literal_field(s.name)] = {"script": s.value.to_ruby()}
            new_select.append({
                "name": s.name,
                "pull": "fields." + literal_field(s.name),
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1

    # DECIDE HOW EACH VALUE IS PULLED OUT OF A HIT
    for n in new_select:
        if n.pull:
            continue
        if source == "_source":
            n.pull = join_field(["_source"] + split_field(n.value.var))
        elif isinstance(n.value, Variable):
            n.pull = "fields." + literal_field(n.value.var)
        else:
            Log.error("Do not know what to do")

    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    T = data.hits.hits

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception, e:
        Log.error("problem formatting", e)
def es_terms_stats(esq, mvel, query):
    """
    RUN AN ES terms_stats FACET QUERY: ONE FACET PER COMBINATION OF "FACET"
    EDGE PARTS, WITH THE REMAINING ("TERMS") EDGES ENCODED INTO THE FACET KEY.
    RETURNS A Cube OF THE RESULTS.
    """
    select = listwrap(query.select)
    facetEdges = []  # EDGES THAT WILL REQUIRE A FACET FOR EACH PART
    termsEdges = DictList()
    specialEdge = None
    special_index = -1

    # A SPECIAL EDGE IS ONE THAT HAS AN UNDEFINED NUMBER OF PARTITIONS AT QUERY TIME
    # FIND THE specialEdge, IF ONE
    for f, tedge in enumerate(query.edges):
        if tedge.domain.type in domains.KNOWN:
            for p, part in enumerate(tedge.domain.partitions):
                part.dataIndex = p

            # FACETS ARE ONLY REQUIRED IF SQL JOIN ON DOMAIN IS REQUIRED (RANGE QUERY)
            # OR IF WE ARE NOT SIMPLY COUNTING
            # OR IF NO SCRIPTING IS ALLOWED (SOME OTHER CODE IS RESPONSIBLE FOR SETTING isFacet)
            # OR IF WE JUST WANT TO FORCE IT :)
            # OF COURSE THE default EDGE IS NOT EXPLICIT, SO MUST BE A TERM
            facetEdges.append(tedge)
        else:
            if specialEdge:
                Log.error("There is more than one open-ended edge: self can not be handled")
            specialEdge = tedge
            special_index = f
            termsEdges.append(tedge)

    if not specialEdge:
        # WE SERIOUSLY WANT A SPECIAL EDGE, OTHERWISE WE WILL HAVE TOO MANY FACETS
        # THE BIGGEST EDGE MAY BE COLLAPSED TO A TERM, MAYBE?
        num_parts = 0
        special_index = -1
        for i, e in enumerate(facetEdges):
            l = len(e.domain.partitions)
            if ((e.value and is_keyword(e.value)) or len(e.domain.dimension.fields) == 1) and l > num_parts:
                num_parts = l
                specialEdge = e
                special_index = i

        facetEdges.pop(special_index)
        termsEdges.append(specialEdge)

    total_facets = PRODUCT(len(f.domain.partitions) for f in facetEdges) * len(select)
    if total_facets > 100:
        # WE GOT A PROBLEM, LETS COUNT THE SIZE OF REALITY:
        counts = esq.query({
            "from": query.frum,
            "select": {"aggregate": "count"},
            "edges": facetEdges,
            "where": query.where,
            "limit": query.limit
        })

        esFacets = []

        def add_facet(value, parts, cube):
            # KEEP ONLY THE COMBINATIONS THAT ACTUALLY HAVE DATA
            if value:
                esFacets.append(parts)

        counts["count"].forall(add_facet)

        Log.note("{{theory_count}} theoretical combinations, {{real_count}} actual combos found", real_count=len(esFacets), theory_count=total_facets)

        if not esFacets:
            # MAKE EMPTY CUBE
            matricies = {}
            dims = [len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges]
            for s in select:
                matricies[s.name] = Matrix(*dims)
            cube = Cube(query.select, query.edges, matricies)
            cube.frum = query
            return cube

    else:
        # GENERATE ALL COMBOS
        esFacets = getAllEdges(facetEdges)

    calcTerm = compileEdges2Term(mvel, termsEdges, DictList())
    term2parts = calcTerm.term2parts

    if len(esFacets) * len(select) > 1000:
        Log.error("not implemented yet")  # WE HAVE SOME SERIOUS PERMUTATIONS, WE MUST ISSUE MULTIPLE QUERIES
        pass

    FromES = build_es_query(query)
    for s in select:
        for parts in esFacets:
            condition = DictList()
            constants = DictList()
            name = [literal_field(s.name)]
            for f, fedge in enumerate(facetEdges):
                name.append(str(parts[f].dataIndex))
                condition.append(buildCondition(mvel, fedge, parts[f]))
                constants.append({"name": fedge.domain.name, "value": parts[f]})
            condition.append(query.where)
            name = ",".join(name)

            FromES.facets[name] = {
                "terms_stats": {
                    "key_field": calcTerm.field,
                    "value_field": s.value if is_keyword(s.value) else None,
                    "value_script": mvel.compile_expression(s.value) if not is_keyword(s.value) else None,
                    "size": coalesce(query.limit, 200000)
                }
            }
            if condition:
                FromES.facets[name].facet_filter = simplify_esfilter({"and": condition})

    data = es09.util.post(esq.es, FromES, query.limit)

    if specialEdge.domain.type not in domains.KNOWN:
        # WE BUILD THE PARTS BASED ON THE RESULTS WE RECEIVED
        partitions = DictList()
        map = {}
        for facetName, parts in data.facets.items():
            for stats in parts.terms:
                # FIX: WAS `if not map[stats]:` — KeyError ON FIRST SIGHT OF A TERM
                if not map.get(stats):
                    part = {"value": stats, "name": stats}
                    partitions.append(part)
                    map[stats] = part

        partitions.sort(specialEdge.domain.compare)
        for p, part in enumerate(partitions):
            part.dataIndex = p

        specialEdge.domain.map = map
        specialEdge.domain.partitions = partitions

    # MAKE CUBE
    matricies = {}
    dims = [len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges]
    for s in select:
        matricies[s.name] = Matrix(*dims)

    name2agg = {s.name: aggregates[s.aggregate] for s in select}

    # FILL CUBE
    for edgeName, parts in data.facets.items():
        temp = edgeName.split(",")
        pre_coord = tuple(int(c) for c in temp[1:])
        sname = temp[0]

        for stats in parts.terms:
            if specialEdge:
                special = term2parts(stats.term)[0]
                coord = pre_coord[:special_index] + (special.dataIndex, ) + pre_coord[special_index:]
            else:
                coord = pre_coord
            matricies[sname][coord] = stats[name2agg[sname]]

    cube = Cube(query.select, query.edges, matricies)
    cube.frum = query
    return cube
def _map_term_using_schema(master, path, term, schema_edges):
    """
    IF THE WHERE CLAUSE REFERS TO FIELDS IN THE SCHEMA, THEN EXPAND THEM

    EACH {field: value} PAIR IN term IS TRANSLATED TO ONE (OR MORE) ES
    FILTERS USING THE Dimension FOUND IN schema_edges; UNKNOWN FIELDS PASS
    THROUGH AS PLAIN term FILTERS. RETURNS AN ES "and" FILTER.
    """
    clauses = DictList()
    for key, val in term.items():
        dim = schema_edges[key]
        if isinstance(dim, Dimension):
            dom = dim.getDomain()
            if dim.fields:
                if isinstance(dim.fields, Mapping):
                    # EXPECTING A TUPLE
                    for local_field, es_field in dim.fields.items():
                        local_value = val[local_field]
                        if local_value == None:
                            clauses.append({"missing": {"field": es_field}})
                        else:
                            clauses.append({"term": {es_field: local_value}})
                    continue

                single = len(dim.fields) == 1 and is_keyword(dim.fields[0])
                if single:
                    # SIMPLE SINGLE-VALUED FIELD
                    if dom.getPartByKey(val) is dom.NULL:
                        clauses.append({"missing": {"field": dim.fields[0]}})
                    else:
                        clauses.append({"term": {dim.fields[0]: val}})
                    continue

                if AND(is_keyword(f) for f in dim.fields):
                    # EXPECTING A TUPLE
                    if not isinstance(val, tuple):
                        Log.error("expecing {{name}}={{value}} to be a tuple", name=key, value=val)
                    for idx, f in enumerate(dim.fields):
                        item = val[idx]
                        if item == None:
                            clauses.append({"missing": {"field": f}})
                        else:
                            clauses.append({"term": {f: item}})
                    continue

            # SAME SINGLE-FIELD HANDLING, KEPT FOR THE CASE dim.fields IS FALSY
            if len(dim.fields) == 1 and is_keyword(dim.fields[0]):
                if dom.getPartByKey(val) is dom.NULL:
                    clauses.append({"missing": {"field": dim.fields[0]}})
                else:
                    clauses.append({"term": {dim.fields[0]: val}})
                continue

            if dom.partitions:
                part = dom.getPartByKey(val)
                if part is dom.NULL or not part.esfilter:
                    Log.error("not expected to get NULL")
                clauses.append(part.esfilter)
                continue
            else:
                Log.error("not expected")
        elif isinstance(val, Mapping):
            # NESTED PROPERTY: RECURSE WITH THE SUB-SCHEMA
            clauses.append(_map_term_using_schema(master, path + [key], val, schema_edges[key]))
            continue

        # NOT A KNOWN DIMENSION: PASS THE TERM THROUGH UNCHANGED
        clauses.append({"term": {key: val}})

    return {"and": clauses}
def parse_columns(parent_path, esProperties):
    """
    RETURN THE COLUMN DEFINITIONS IN THE GIVEN esProperties OBJECT

    :param parent_path: DOTTED PATH OF THE ENCLOSING PROPERTY ("" AT ROOT)
    :param esProperties: ES MAPPING "properties" OBJECT
    :return: DictList OF {name, type, useSource, ...} COLUMN DESCRIPTIONS
    """
    columns = DictList()
    for name, property in esProperties.items():
        if parent_path:
            path = join_field(split_field(parent_path) + [name])
        else:
            path = name

        if property.type == "nested" and property.properties:
            # NESTED TYPE IS A NEW TYPE DEFINITION
            # MARKUP CHILD COLUMNS WITH THE EXTRA DEPTH
            child_columns = deepcopy(parse_columns(path, property.properties))
            self_columns = deepcopy(child_columns)
            for c in self_columns:
                c.depth += 1
            columns.extend(self_columns)
            columns.append({
                "name": join_field(split_field(path)[1::]),
                "type": "nested",
                "useSource": False
            })

            if path not in INDEX_CACHE:
                # INHERIT THE NEAREST ANCESTOR'S CACHE ENTRY
                pp = split_field(parent_path)
                for i in qb.reverse(range(len(pp))):
                    c = INDEX_CACHE.get(join_field(pp[:i + 1]), None)
                    if c:
                        INDEX_CACHE[path] = c.copy()
                        break
                else:
                    Log.error("Can not find parent")

                INDEX_CACHE[path].name = path
            INDEX_CACHE[path].columns = child_columns
            continue

        if property.properties:
            # PLAIN OBJECT: FLATTEN ITS CHILDREN INTO THIS LEVEL
            child_columns = parse_columns(path, property.properties)
            columns.extend(child_columns)
            columns.append({
                "name": join_field(split_field(path)[1::]),
                "type": "object",
                "useSource": False
            })

        if property.dynamic:
            continue
        if not property.type:
            continue
        if property.type == "multi_field":
            property.type = property.fields[name].type  # PULL DEFAULT TYPE
            # FIX: WAS `for i, n, p in enumerate(property.fields):` — enumerate
            # YIELDS (index, key) PAIRS, SO THE 3-WAY UNPACK FAILS; ITERATE items()
            for i, (n, p) in enumerate(property.fields.items()):
                if n == name:
                    # DEFAULT
                    columns.append({
                        "name": join_field(split_field(path)[1::]),
                        "type": p.type,
                        "useSource": p.index == "no"
                    })
                else:
                    columns.append({
                        "name": join_field(split_field(path)[1::]) + "\\." + n,
                        "type": p.type,
                        "useSource": p.index == "no"
                    })
            continue

        if property.type in ["string", "boolean", "integer", "date", "long", "double"]:
            columns.append({
                "name": join_field(split_field(path)[1::]),
                "type": property.type,
                "useSource": property.index == "no"
            })
            if property.index_name and name != property.index_name:
                columns.append({
                    "name": property.index_name,
                    "type": property.type,
                    "useSource": property.index == "no"
                })
        elif property.enabled == None or property.enabled == False:
            # DISABLED PROPERTY: ONLY AVAILABLE FROM _source
            columns.append({
                "name": join_field(split_field(path)[1::]),
                "type": "object",
                "useSource": True
            })
        else:
            Log.warning("unknown type {{type}} for property {{path}}", type=property.type, path=path)

    return columns
def _grouped(self, query, stacked=False): select = listwrap(query.select) # RETURN SINGLE OBJECT WITH AGGREGATES for s in select: if s.aggregate not in aggregates: Log.error( "Expecting all columns to have an aggregate: {{select}}", select=s) selects = DictList() groups = DictList() edges = query.edges for e in edges: if e.domain.type != "default": Log.error("domain of type {{type}} not supported, yet", type=e.domain.type) groups.append(e.value) selects.append(e.value + " AS " + self.db.quote_column(e.name)) for s in select: selects.append( aggregates[s.aggregate].replace("{{code}}", s.value) + " AS " + self.db.quote_column(s.name)) sql = expand_template( """ SELECT {{selects}} FROM {{table}} {{where}} GROUP BY {{groups}} """, { "selects": SQL(",\n".join(selects)), "groups": SQL(",\n".join(groups)), "table": self._subquery(query["from"])[0], "where": self._where2sql(query.where) }) def post_stacked(sql): # RETURN IN THE USUAL DATABASE RESULT SET FORMAT return self.db.query(sql) def post(sql): # FIND OUT THE default DOMAIN SIZES result = self.db.column_query(sql) num_edges = len(edges) for e, edge in enumerate(edges): domain = edge.domain if domain.type == "default": domain.type = "set" parts = set(result[e]) domain.partitions = [{ "index": i, "value": p } for i, p in enumerate(parts)] domain.map = {p: i for i, p in enumerate(parts)} else: Log.error("Do not know what to do here, yet") # FILL THE DATA CUBE maps = [(unwrap(e.domain.map), result[i]) for i, e in enumerate(edges)] cubes = DictList() for c, s in enumerate(select): data = Matrix(*[ len(e.domain.partitions) + (1 if e.allow_nulls else 0) for e in edges ]) for rownum, value in enumerate(result[c + num_edges]): coord = [m[r[rownum]] for m, r in maps] data[coord] = value cubes.append(data) if isinstance(query.select, list): return cubes else: return cubes[0] return sql, post if not stacked else post_stacked
def pe_filter(filter, data, depth):
    """
    PARTIAL EVALUATE THE filter BASED ON data GIVEN

    FOR EACH LEAF CLAUSE, parse_field SPLITS THE COLUMN INTO THE PART
    RESOLVABLE AT THIS depth (first) AND THE REMAINDER (rest).  CLAUSES THAT
    RESOLVE NOW BECOME True/False; UNRESOLVED REMAINDERS ARE RETURNED AS A
    RESIDUAL FILTER TO BE EVALUATED DEEPER.
    """
    if filter is TRUE_FILTER:
        return True
    if filter is FALSE_FILTER:
        return False

    filter = wrap(filter)

    if filter["and"]:
        # AND: ANY False MAKES THE WHOLE THING False; KEEP UNRESOLVED PARTS
        result = True
        output = DictList()
        for a in filter[u"and"]:
            f = pe_filter(a, data, depth)
            if f is False:
                result = False
            elif f is not True:
                output.append(f)
        if result and output:
            return {"and": output}
        else:
            return result
    elif filter["or"]:
        # OR: ANY True SHORT-CIRCUITS; KEEP UNRESOLVED PARTS
        output = DictList()
        for o in filter[u"or"]:
            f = pe_filter(o, data, depth)
            if f is True:
                return True
            elif f is not False:
                output.append(f)
        if output:
            return {"or": output}
        else:
            return False
    elif filter["not"]:
        # NOT: INVERT A RESOLVED RESULT, OTHERWISE WRAP THE RESIDUAL
        f = pe_filter(filter["not"], data, depth)
        if f is True:
            return False
        elif f is False:
            return True
        else:
            return {"not": f}
    elif filter.term or filter.eq:
        # term/eq: EVERY RESOLVABLE COLUMN MUST MATCH; REMAINDERS ARE KEPT
        eq = coalesce(filter.term, filter.eq)
        result = True
        output = {}
        for col, val in eq.items():
            first, rest = parse_field(col, data, depth)
            d = data[first]
            if not rest:
                if d != val:
                    result = False
            else:
                output[rest] = val
        if result and output:
            return {"term": output}
        else:
            return result
    elif filter.equal:
        # equal: COMPARE TWO COLUMNS; A RESOLVED SIDE BECOMES A term ON THE OTHER
        a, b = filter["equal"]
        first_a, rest_a = parse_field(a, data, depth)
        first_b, rest_b = parse_field(b, data, depth)
        val_a = data[first_a]
        val_b = data[first_b]
        if not rest_a:
            if not rest_b:
                if val_a != val_b:
                    return False
                else:
                    return True
            else:
                return {"term": {rest_b: val_a}}
        else:
            if not rest_b:
                return {"term": {rest_a: val_b}}
            else:
                return {"equal": [rest_a, rest_b]}
    elif filter.terms:
        # terms: MEMBERSHIP TEST PER COLUMN
        result = True
        output = {}
        for col, vals in filter["terms"].items():
            first, rest = parse_field(col, data, depth)
            d = data[first]
            if not rest:
                if d not in vals:
                    result = False
            else:
                output[rest] = vals
        if result and output:
            return {"terms": output}
        else:
            return result
    elif filter.range:
        # range: EACH BOUND IS TESTED SEPARATELY
        # NOTE(review): only "gt" has a symbol alias (">"); ">=", "<=", "<"
        # are not recognized here — confirm whether that is intended
        result = True
        output = {}
        for col, ranges in filter["range"].items():
            first, rest = parse_field(col, data, depth)
            d = data[first]
            if not rest:
                for sign, val in ranges.items():
                    if sign in ("gt", ">") and d <= val:
                        result = False
                    if sign == "gte" and d < val:
                        result = False
                    if sign == "lte" and d > val:
                        result = False
                    if sign == "lt" and d >= val:
                        result = False
            else:
                output[rest] = ranges
        if result and output:
            return {"range": output}
        else:
            return result
    elif filter.missing:
        # missing: ACCEPTS EITHER A BARE FIELD NAME OR {"field": name}
        if isinstance(filter.missing, basestring):
            field = filter["missing"]
        else:
            field = filter["missing"]["field"]
        first, rest = parse_field(field, data, depth)
        d = data[first]
        if not rest:
            if d == None:
                return True
            return False
        else:
            return {"missing": rest}
    elif filter.prefix:
        # prefix: STRING startswith TEST PER COLUMN
        result = True
        output = {}
        for col, val in filter["prefix"].items():
            first, rest = parse_field(col, data, depth)
            d = data[first]
            if not rest:
                if d == None or not d.startswith(val):
                    result = False
            else:
                output[rest] = val
        if result and output:
            return {"prefix": output}
        else:
            return result
    elif filter.exists:
        # exists: COMPLEMENT OF missing; SAME TWO ACCEPTED SHAPES
        if isinstance(filter["exists"], basestring):
            field = filter["exists"]
        else:
            field = filter["exists"]["field"]
        first, rest = parse_field(field, data, depth)
        d = data[first]
        if not rest:
            if d != None:
                return True
            return False
        else:
            return {"exists": rest}
    else:
        Log.error(u"Can not interpret esfilter: {{esfilter}}", {u"esfilter": filter})
def parse_properties(parent_index_name, parent_query_path, esProperties):
    """
    RETURN THE COLUMN DEFINITIONS IN THE GIVEN esProperties OBJECT

    RECURSIVELY WALKS AN ELASTICSEARCH MAPPING; EMITS ONE Column PER LEAF
    PROPERTY, PLUS A "nested"/"object" COLUMN FOR EACH CONTAINER.
    """
    from pyLibrary.queries.meta import Column

    columns = DictList()
    for name, property in esProperties.items():
        # BUILD THE DOTTED PATH OF THIS PROPERTY UNDER THE PARENT
        if parent_query_path:
            index_name, query_path = parent_index_name, join_field(split_field(parent_query_path) + [name])
        else:
            index_name, query_path = parent_index_name, name

        if property.type == "nested" and property.properties:
            # NESTED TYPE IS A NEW TYPE DEFINITION
            # MARKUP CHILD COLUMNS WITH THE EXTRA DEPTH
            self_columns = parse_properties(index_name, query_path, property.properties)
            for c in self_columns:
                c.nested_path = unwraplist([query_path] + listwrap(c.nested_path))
            columns.extend(self_columns)
            columns.append(Column(
                table=index_name,
                es_index=index_name,
                name=query_path,
                es_column=query_path,
                type="nested",
                nested_path=query_path
            ))
            continue

        if property.properties:
            # PLAIN OBJECT: EMIT CHILDREN PLUS A COLUMN FOR THE OBJECT ITSELF
            # (NO continue: A PROPERTY MAY ALSO CARRY ITS OWN type BELOW)
            child_columns = parse_properties(index_name, query_path, property.properties)
            columns.extend(child_columns)
            columns.append(Column(
                table=index_name,
                es_index=index_name,
                name=query_path,
                es_column=query_path,
                type="source" if property.enabled == False else "object"
            ))

        if property.dynamic:
            continue
        if not property.type:
            continue
        if property.type == "multi_field":
            property.type = property.fields[name].type  # PULL DEFAULT TYPE
            for i, (n, p) in enumerate(property.fields.items()):
                if n == name:
                    # DEFAULT
                    columns.append(Column(
                        table=index_name,
                        es_index=index_name,
                        name=query_path,
                        es_column=query_path,
                        type=p.type
                    ))
                else:
                    # SECONDARY FIELDS GET AN ESCAPED-DOT SUFFIX
                    columns.append(Column(
                        table=index_name,
                        es_index=index_name,
                        name=query_path + "\\." + n,
                        es_column=query_path + "\\." + n,
                        type=p.type
                    ))
            continue

        if property.type in ["string", "boolean", "integer", "date", "long", "double"]:
            columns.append(Column(
                table=index_name,
                es_index=index_name,
                name=query_path,
                es_column=query_path,
                type=property.type
            ))
            if property.index_name and name != property.index_name:
                # PROPERTY IS ALSO REACHABLE UNDER ITS index_name
                columns.append(Column(
                    table=index_name,
                    es_index=index_name,
                    es_column=query_path,
                    name=query_path,
                    type=property.type
                ))
        elif property.enabled == None or property.enabled == False:
            # NOT INDEXED: ONLY AVAILABLE FROM _source
            columns.append(Column(
                table=index_name,
                es_index=index_name,
                name=query_path,
                es_column=query_path,
                type="source" if property.enabled==False else "object"
            ))
        else:
            Log.warning("unknown type {{type}} for property {{path}}", type=property.type, path=query_path)

    return columns
def select(self, fields):
    """
    PROJECT THE FLAT ROWS ONTO THE REQUESTED FIELD(S)

    fields - A FIELD NAME (POSSIBLY DOTTED), A Mapping WITH A value, OR A
             LIST OF SELECT CLAUSES (EACH WITH name AND value)
    RETURNS A PLAIN LIST OF VALUES FOR A SINGLE FIELD, OR A DictList OF
    Dict RECORDS FOR A LIST OF CLAUSES
    """
    if isinstance(fields, Mapping):
        fields = fields.value

    if isinstance(fields, basestring):
        # SINGLE FIELD NAME: RETURN A SIMPLE LIST OF VALUES
        path = split_field(fields)
        if len(path) == 1:
            if self.path[0] == fields:
                return [row[1] for row in self.data]
            return [row[0][fields] for row in self.data]

        # DEEP FIELD: FIND WHERE path DIVERGES FROM self.path, THEN DESCEND
        depth = coalesce(
            MIN([i for i, (k, p) in enumerate(zip(path, self.path)) if k != p]),
            len(self.path)
        )  # LENGTH OF COMMON PREFIX
        remainder = path[depth:]
        acc = DictList()
        _select1((wrap(row[depth]) for row in self.data), remainder, 0, acc)
        return acc

    if isinstance(fields, list):
        # LIST OF SELECT CLAUSES: BUILD ONE Dict PER ROW
        extractors = []
        for clause in fields:
            if hasattr(clause.value, "__call__"):
                # CALLABLE value IS APPLIED TO THE ROW DIRECTLY
                extractors.append((clause.name, clause.value))
            else:
                # OTHERWISE TREAT value AS A KEY INTO THE ROW
                extractors.append((clause.name, functools.partial(lambda v, d: d[v], clause.value)))

        acc = DictList()
        for row in self._values():
            record = Dict()
            for out_name, extract in extractors:
                record[out_name] = extract(row)
            acc.append(record)
        return acc

    Log.error("multiselect over FlatList not supported")
def _setop(self, query):
    """
    NO AGGREGATION, SIMPLE LIST COMPREHENSION

    TRANSLATE A SET-OPERATION query INTO (sql, post_processor).  A LIST
    select RETURNS RECORD-SHAPED ROWS; A SINGLE select RETURNS EITHER WHOLE
    ROWS (value ".") OR A PLAIN LIST OF VALUES.
    """
    if isinstance(query.select, list):
        # RETURN BORING RESULT SET
        selects = DictList()
        for s in listwrap(query.select):
            if isinstance(s.value, Mapping):
                # FIX: was `s.value.items` (bound method, not iterable of pairs)
                for k, v in s.value.items():
                    selects.append(v + " AS " + self.db.quote_column(s.name + "." + k))
            # FIX: was a separate `if`, so Mapping values also fell into the
            # `else` branch and crashed on Mapping + str concatenation
            elif isinstance(s.value, list):
                for i, ss in enumerate(s.value):
                    # FIX: was `s.value` (the whole list); each element ss is
                    # the expression for the i-th tuple slot
                    selects.append(ss + " AS " + self.db.quote_column(s.name + "," + str(i)))
            else:
                selects.append(s.value + " AS " + self.db.quote_column(s.name))

        sql = expand_template("""
            SELECT
                {{selects}}
            FROM
                {{table}}
            {{where}}
            {{sort}}
            {{limit}}
        """, {
            "selects": SQL(",\n".join(selects)),
            "table": self._subquery(query["from"])[0],
            "where": self._where2sql(query.where),
            "limit": self._limit2sql(query.limit),
            "sort": self._sort2sql(query.sort)
        })

        def post_process(sql):
            # REASSEMBLE THE FLATTENED "name.k" / "name,i" COLUMNS INTO
            # THE SHAPES THE CALLER ASKED FOR
            result = self.db.query(sql)
            for s in listwrap(query.select):
                if isinstance(s.value, Mapping):
                    for r in result:
                        r[s.name] = {}
                        # FIX: was `for k, v in s.value:` (iterates keys only)
                        for k, v in s.value.items():
                            r[s.name][k] = r[s.name + "." + k]
                            r[s.name + "." + k] = None

                if isinstance(s.value, list):
                    # REWRITE AS TUPLE
                    for r in result:
                        r[s.name] = tuple(r[s.name + "," + str(i)] for i, ss in enumerate(s.value))
                        for i, ss in enumerate(s.value):
                            r[s.name + "," + str(i)] = None

            expand_json(result)
            return result

        return sql, post_process  # RETURN BORING RESULT SET
    else:
        # RETURN LIST OF VALUES
        if query.select.value == ".":
            select = "*"
        else:
            name = query.select.name
            select = query.select.value + " AS " + self.db.quote_column(name)

        sql = expand_template("""
            SELECT
                {{selects}}
            FROM
                {{table}}
            {{where}}
            {{sort}}
            {{limit}}
        """, {
            "selects": SQL(select),
            "table": self._subquery(query["from"])[0],
            "where": self._where2sql(query.where),
            "limit": self._limit2sql(query.limit),
            "sort": self._sort2sql(query.sort)
        })

        if query.select.value == ".":
            def post(sql):
                result = self.db.query(sql)
                expand_json(result)
                return result

            return sql, post
        else:
            return sql, lambda sql: [r[name] for r in self.db.query(sql)]  # RETURNING LIST OF VALUES
class SimpleSetDomain(Domain):
    """
    DOMAIN IS A LIST OF OBJECTS, EACH WITH A value PROPERTY
    """

    __slots__ = ["NULL", "partitions", "map", "order"]

    def __init__(self, **desc):
        """
        BUILD key/map/order FROM desc.partitions.  SEVERAL PARTITION SHAPES
        ARE ACCEPTED: PRIMITIVE VALUES, KEYED OBJECTS, OR where/esfilter
        CLAUSES.  map TAKES A KEY TO ITS PART; order TAKES A KEY TO ITS
        dataIndex (None MAPS PAST THE END, FOR THE NULL PART).
        """
        Domain.__init__(self, **desc)
        desc = wrap(desc)

        self.type = "set"
        self.order = {}
        self.NULL = Null
        self.partitions = DictList()
        self.primitive = True  # True IF DOMAIN IS A PRIMITIVE VALUE SET

        if isinstance(self.key, set):
            Log.error("problem")

        if not desc.key and (len(desc.partitions)==0 or isinstance(desc.partitions[0], (basestring, Number, tuple))):
            # ASSUME PARTS ARE STRINGS, CONVERT TO REAL PART OBJECTS
            self.key = "value"
            self.map = {}
            self.order[None] = len(desc.partitions)
            for i, p in enumerate(desc.partitions):
                part = {"name": p, "value": p, "dataIndex": i}
                self.partitions.append(part)
                self.map[p] = part
                self.order[p] = i
            self.label = coalesce(self.label, "name")
            self.primitive = True
            return

        if desc.partitions and desc.dimension.fields and len(desc.dimension.fields) > 1:
            # COMPOUND KEY TAKEN FROM THE DIMENSION'S FIELDS
            self.key = desc.key
            self.map = UniqueIndex(keys=desc.dimension.fields)
        elif desc.partitions and isinstance(desc.key, (list, set)):
            # TODO: desc.key CAN BE MUCH LIKE A SELECT, WHICH UniqueIndex CAN NOT HANDLE
            self.key = desc.key
            self.map = UniqueIndex(keys=desc.key)
        elif desc.partitions and isinstance(desc.partitions[0][desc.key], Mapping):
            self.key = desc.key
            self.map = UniqueIndex(keys=desc.key)
            # self.key = UNION(set(d[desc.key].keys()) for d in desc.partitions)
            # self.map = UniqueIndex(keys=self.key)
        elif len(desc.partitions) == 0:
            # CREATE AN EMPTY DOMAIN
            self.key = "value"
            self.map = {}
            self.order[None] = 0
            self.label = coalesce(self.label, "name")
            return
        elif desc.key == None:
            # NOTE(review): precedence is (desc.partitions and all(where)) or
            # all(esfilter); since partitions are non-empty on this path the
            # result matches the presumably-intended A and (B or C) — confirm
            if desc.partitions and all(desc.partitions.where) or all(desc.partitions.esfilter):
                # EVERY PART CARRIES A where/esfilter CLAUSE; KEY ON name
                if not all(desc.partitions.name):
                    Log.error("Expecting all partitions to have a name")
                from pyLibrary.queries.expressions import jx_expression
                self.key = "name"
                self.map = dict()
                self.map[None] = self.NULL
                self.order[None] = len(desc.partitions)
                for i, p in enumerate(desc.partitions):
                    self.partitions.append({
                        "where": jx_expression(coalesce(p.where, p.esfilter)),
                        "name": p.name,
                        "dataIndex": i
                    })
                    self.map[p.name] = p
                    self.order[p.name] = i
                return
            elif desc.partitions and len(set(desc.partitions.value)-{None}) == len(desc.partitions):
                # TRY A COMMON KEY CALLED "value".  IT APPEARS UNIQUE
                self.key = "value"
                self.map = dict()
                self.map[None] = self.NULL
                self.order[None] = len(desc.partitions)
                for i, p in enumerate(desc.partitions):
                    self.map[p[self.key]] = p
                    self.order[p[self.key]] = i
                self.primitive = False
            else:
                Log.error("Domains must have keys, or partitions")
        elif self.key:
            self.key = desc.key
            self.map = dict()
            self.map[None] = self.NULL
            self.order[None] = len(desc.partitions)
            for i, p in enumerate(desc.partitions):
                self.map[p[self.key]] = p
                self.order[p[self.key]] = i
            self.primitive = False
        else:
            Log.error("Can not hanldle")

        self.label = coalesce(self.label, "name")

        if hasattr(desc.partitions, "__iter__"):
            self.partitions = wrap(list(desc.partitions))
        else:
            Log.error("expecting a list of partitions")

    def compare(self, a, b):
        # ORDER PARTS BY THEIR KEYS
        return value_compare(self.getKey(a), self.getKey(b))

    def getCanonicalPart(self, part):
        # RESOLVE AN ARBITRARY PART OBJECT TO THIS DOMAIN'S OWN PART
        return self.getPartByKey(part.value)

    def getIndexByKey(self, key):
        # RETURN dataIndex FOR key; UNKNOWN KEYS MAP TO THE NULL SLOT AT THE END
        try:
            output = self.order.get(key)
            if output is None:
                return len(self.partitions)
            return output
        except Exception, e:
            Log.error("problem", e)
def getDomain(self, **kwargs):
    """
    COLLECT PARTITIONS FOR THIS DIMENSION AT THE REQUESTED DEPTH.

    kwargs.depth - HOW FAR TO REACH INTO SUB-PARTITIONS; DEFAULTS TO
                   len(self.fields) - 1 WHEN fields IS A LIST
    """
    # kwargs.depth IS MEANT TO REACH INTO SUB-PARTITIONS
    kwargs = wrap(kwargs)
    kwargs.depth = coalesce(kwargs.depth, len(self.fields) - 1 if isinstance(self.fields, list) else None)

    if not self.partitions and self.edges:
        # USE EACH EDGE AS A PARTITION, BUT isFacet==True SO IT ALLOWS THE OVERLAP
        partitions = [
            {
                "name": v.name,
                "value": v.name,
                "esfilter": v.esfilter,
                "style": v.style,
                "weight": v.weight  # YO! WHAT DO WE *NOT* COPY?
            }
            for i, v in enumerate(self.edges)
            if i < coalesce(self.limit, DEFAULT_QUERY_LIMIT) and v.esfilter
        ]
        self.isFacet = True
    elif kwargs.depth == None:  # ASSUME self.fields IS A dict
        partitions = DictList()
        for i, part in enumerate(self.partitions):
            if i >= coalesce(self.limit, DEFAULT_QUERY_LIMIT):
                break
            partitions.append({
                "name": part.name,
                "value": part.value,
                "esfilter": part.esfilter,
                "style": coalesce(part.style, part.parent.style),
                "weight": part.weight  # YO! WHAT DO WE *NOT* COPY?
            })
    elif kwargs.depth == 0:
        # TOP-LEVEL PARTITIONS ONLY
        partitions = [
            {
                "name": v.name,
                "value": v.value,
                "esfilter": v.esfilter,
                "style": v.style,
                "weight": v.weight  # YO! WHAT DO WE *NOT* COPY?
            }
            for i, v in enumerate(self.partitions)
            if i < coalesce(self.limit, DEFAULT_QUERY_LIMIT)
        ]
    elif kwargs.depth == 1:
        # FLATTEN ONE LEVEL OF SUB-PARTITIONS, PREFIXING NAMES WITH THE PARENT
        partitions = DictList()
        rownum = 0
        for i, part in enumerate(self.partitions):
            if i >= coalesce(self.limit, DEFAULT_QUERY_LIMIT):
                continue
            rownum += 1
            try:
                for j, subpart in enumerate(part.partitions):
                    partitions.append({
                        "name": join_field(split_field(subpart.parent.name) + [subpart.name]),
                        "value": subpart.value,
                        "esfilter": subpart.esfilter,
                        "style": coalesce(subpart.style, subpart.parent.style),
                        "weight": subpart.weight  # YO! WHAT DO WE *NOT* COPY?
                    })
            except Exception, e:
                Log.error("", e)
    # NOTE(review): the accumulated `partitions` is never returned in the
    # visible code — presumably the function continues (builds a domain from
    # partitions) beyond this excerpt; confirm against the full file
def extract_rows(es, es_query, query):
    """
    PLAN THE ES COLUMN RETRIEVAL FOR A SET-OPERATION query, RUN IT, AND
    FORMAT THE HITS.

    EACH ENTRY OF new_select CARRIES:
      value - THE ES COLUMN REQUESTED
      pull  - WHERE TO READ IT FROM IN EACH HIT ("fields.x" OR "_source...")
      put   - WHERE IT LANDS IN THE OUTPUT (name, ORDINAL index, child PATH)
    SELECTING "*" OR "." FORCES source="_source" (FULL DOCUMENTS) INSTEAD OF
    STORED FIELDS.
    """
    is_list = isinstance(query.select, list)
    select = wrap([s.copy() for s in listwrap(query.select)])
    new_select = DictList()
    # NOTE(review): `not c.nested_path` appears twice in this predicate —
    # presumably one copy was meant to be a different condition; confirm
    column_names = set(c.name for c in query.frum.get_columns() if c.type not in ["object"] and (not c.nested_path or c.abs_name == c.nested_path or not c.nested_path))
    source = "fields"

    i = 0
    for s in select:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if s.value == "*":
            es_query.fields = None
            source = "_source"

            # ADD EVERY KNOWN COLUMN NOT ALREADY SELECTED EXPLICITLY
            net_columns = column_names - set(select.name)
            for n in net_columns:
                new_select.append({
                    "name": n,
                    "value": n,
                    "put": {"name": n, "index": i, "child": "."}
                })
                i += 1
        elif s.value == ".":
            # WHOLE DOCUMENT
            es_query.fields = None
            source = "_source"
            new_select.append({
                "name": s.name,
                "value": s.value,
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1
        elif s.value == "_id":
            # _id IS PULLED FROM THE HIT METADATA, NOT FROM fields/_source
            new_select.append({
                "name": s.name,
                "value": s.value,
                "pull": "_id",
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1
        elif isinstance(s.value, basestring) and s.value.endswith(".*") and is_keyword(s.value[:-2]):
            # PREFIX WILDCARD: ONE OUTPUT COLUMN PER MATCHING LEAF
            parent = s.value[:-1]
            prefix = len(parent)
            for c in column_names:
                if c.startswith(parent):
                    if es_query.fields is not None:
                        es_query.fields.append(c)

                    new_select.append({
                        "name": s.name + "." + c[prefix:],
                        "value": c,
                        "put": {"name": s.name + "." + c[prefix:], "index": i, "child": "."}
                    })
                    i += 1
        elif isinstance(s.value, basestring) and is_keyword(s.value):
            parent = s.value + "."
            prefix = len(parent)
            net_columns = [c for c in column_names if c.startswith(parent)]
            if not net_columns:
                # A LEAF COLUMN: PULL IT DIRECTLY
                if es_query.fields is not None:
                    es_query.fields.append(s.value)
                new_select.append({
                    "name": s.name,
                    "value": s.value,
                    "put": {"name": s.name, "index": i, "child": "."}
                })
            else:
                # AN OBJECT: PULL EACH LEAF, ALL SHARING ONE OUTPUT index
                for n in net_columns:
                    if es_query.fields is not None:
                        es_query.fields.append(n)
                    new_select.append({
                        "name": s.name,
                        "value": n,
                        "put": {"name": s.name, "index": i, "child": n[prefix:]}
                    })
            i += 1
        elif isinstance(s.value, list):
            Log.error("need an example")
            if es_query.fields is not None:
                es_query.fields.extend([v for v in s.value])
        else:
            # GENERAL EXPRESSION: EVALUATE SERVER-SIDE AS A SCRIPT FIELD
            es_query.script_fields[literal_field(s.name)] = {"script": qb_expression(s.value).to_ruby()}
            new_select.append({
                "name": s.name,
                "pull": "fields." + literal_field(s.name),
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1

    # DERIVE THE pull PATH FOR EVERY ENTRY THAT DOES NOT HAVE ONE YET
    for n in new_select:
        if n.pull:
            continue
        if source == "_source":
            n.pull = join_field(["_source"] + split_field(n.value))
        else:
            n.pull = "fields." + literal_field(n.value)

    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    T = data.hits.hits

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(T, new_select, query)
        output.meta.es_response_time = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception, e:
        Log.error("problem formatting", e)
def extract_rows(es, es_query, query):
    """
    EXPRESSION-BASED VARIANT OF THE ES SET-OPERATION PLANNER.

    SELECT CLAUSES ARE Expression OBJECTS: LeavesOp EXPANDS TO ALL LEAF
    COLUMNS UNDER A PREFIX, Variable PULLS A SINGLE COLUMN, AND ANYTHING
    ELSE IS COMPILED TO A SERVER-SIDE SCRIPT FIELD.  new_select ENTRIES
    CARRY value (ES COLUMN), pull (PATH INTO THE HIT), AND put (OUTPUT
    name/index/child).
    """
    is_list = isinstance(query.select, list)
    select = wrap([s.copy() for s in listwrap(query.select)])
    new_select = DictList()
    columns = query.frum.get_columns()
    # LEAVES: SCALAR COLUMNS NOT BURIED IN A DEEPER NESTED SCOPE
    leaf_columns = set(c.name for c in columns if c.type not in ["object", "nested"] and (not c.nested_path or c.es_column == c.nested_path))
    nested_columns = set(c.name for c in columns if c.nested_path)

    i = 0
    source = "fields"
    for s in select:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if isinstance(s.value, LeavesOp):
            if isinstance(s.value.term, Variable):
                if s.value.term.var == ".":
                    # ALL LEAVES: SWITCH TO FULL-DOCUMENT RETRIEVAL
                    es_query.fields = None
                    source = "_source"

                    net_columns = leaf_columns - set(select.name)
                    for n in net_columns:
                        new_select.append({
                            "name": n,
                            "value": n,
                            "put": {"name": n, "index": i, "child": "."}
                        })
                        i += 1
                else:
                    # NOTE(review): reads s.value.var on a LeavesOp —
                    # presumably s.value.term.var was intended; confirm
                    parent = s.value.var + "."
                    prefix = len(parent)
                    for c in leaf_columns:
                        if c.startswith(parent):
                            if es_query.fields is not None:
                                es_query.fields.append(c)

                            new_select.append({
                                "name": s.name + "." + c[prefix:],
                                "value": c,
                                "put": {"name": s.name + "." + c[prefix:], "index": i, "child": "."}
                            })
                            i += 1
        elif isinstance(s.value, Variable):
            if s.value.var == ".":
                # WHOLE DOCUMENT
                es_query.fields = None
                source = "_source"

                new_select.append({
                    "name": s.name,
                    "value": s.value.var,
                    "put": {"name": s.name, "index": i, "child": "."}
                })
                i += 1
            elif s.value.var == "_id":
                # _id COMES FROM HIT METADATA
                new_select.append({
                    "name": s.name,
                    "value": s.value.var,
                    "pull": "_id",
                    "put": {"name": s.name, "index": i, "child": "."}
                })
                i += 1
            elif s.value.var in nested_columns:
                # NESTED COLUMNS ARE ONLY COMPLETE IN _source
                es_query.fields = None
                source = "_source"

                new_select.append({
                    "name": s.name,
                    "value": s.value,
                    "put": {"name": s.name, "index": i, "child": "."}
                })
                i += 1
            else:
                parent = s.value.var + "."
                prefix = len(parent)
                net_columns = [c for c in leaf_columns if c.startswith(parent)]
                if not net_columns:
                    # LEAF
                    if es_query.fields is not None:
                        es_query.fields.append(s.value.var)
                    new_select.append({
                        "name": s.name,
                        "value": s.value,
                        "put": {"name": s.name, "index": i, "child": "."}
                    })
                else:
                    # LEAVES OF OBJECT
                    for n in net_columns:
                        if es_query.fields is not None:
                            es_query.fields.append(n)
                        new_select.append({
                            "name": s.name,
                            "value": n,
                            "put": {"name": s.name, "index": i, "child": n[prefix:]}
                        })
                i += 1
        else:
            # GENERAL EXPRESSION: COMPILE TO A SCRIPT FIELD
            es_query.script_fields[literal_field(s.name)] = {"script": s.value.to_ruby()}
            new_select.append({
                "name": s.name,
                "pull": "fields." + literal_field(s.name),
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1

    # DERIVE THE pull PATH FOR EVERY ENTRY THAT DOES NOT HAVE ONE YET
    for n in new_select:
        if n.pull:
            continue
        if source == "_source":
            n.pull = join_field(["_source"] + split_field(n.value))
        else:
            n.pull = "fields." + literal_field(n.value)

    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    T = data.hits.hits

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception, e:
        Log.error("problem formatting", e)
class SimpleSetDomain(Domain):
    """
    DOMAIN IS A LIST OF OBJECTS, EACH WITH A value PROPERTY
    """

    __slots__ = ["NULL", "partitions", "map", "order"]

    def __init__(self, **desc):
        """
        BUILD key/map/order FROM desc.partitions.  map TAKES A KEY TO ITS
        PART; order TAKES A KEY TO ITS dataIndex (None MAPS PAST THE END,
        FOR THE NULL PART).
        """
        Domain.__init__(self, **desc)
        desc = wrap(desc)

        self.type = "set"
        self.order = {}
        self.NULL = Null
        self.partitions = DictList()
        self.primitive = True  # True IF DOMAIN IS A PRIMITIVE VALUE SET

        if isinstance(self.key, set):
            Log.error("problem")

        if not desc.key and isinstance(desc.partitions[0], (basestring, Number)):
            # ASSUME PARTS ARE STRINGS, CONVERT TO REAL PART OBJECTS
            self.key = "value"
            self.map = {}
            self.order[None] = len(desc.partitions)
            for i, p in enumerate(desc.partitions):
                part = {"name": p, "value": p, "dataIndex": i}
                self.partitions.append(part)
                self.map[p] = part
                self.order[p] = i
            self.label = coalesce(self.label, "name")
            self.primitive = True
            return

        if desc.partitions and desc.dimension.fields and len(desc.dimension.fields) > 1:
            # COMPOUND KEY TAKEN FROM THE DIMENSION'S FIELDS
            self.key = desc.key
            self.map = UniqueIndex(keys=desc.dimension.fields)
        elif desc.partitions and isinstance(desc.key, (list, set)):
            # TODO: desc.key CAN BE MUCH LIKE A SELECT, WHICH UniqueIndex CAN NOT HANDLE
            self.key = desc.key
            self.map = UniqueIndex(keys=desc.key)
        elif desc.partitions and isinstance(desc.partitions[0][desc.key], Mapping):
            self.key = desc.key
            self.map = UniqueIndex(keys=desc.key)
            # self.key = UNION(set(d[desc.key].keys()) for d in desc.partitions)
            # self.map = UniqueIndex(keys=self.key)
        elif len(desc.partitions) == 0:
            # CREATE AN EMPTY DOMAIN
            self.key = "value"
            self.map = {}
            self.order[None] = 0
            self.label = coalesce(self.label, "name")
            return
        elif desc.key == None:
            if desc.partitions and len(set(desc.partitions.value)) == len(desc.partitions):
                # TRY A COMMON KEY CALLED "value".  IT APPEARS UNIQUE
                self.key = "value"
                self.map = dict()
                self.map[None] = self.NULL
                self.order[None] = len(desc.partitions)
                for i, p in enumerate(desc.partitions):
                    self.map[p[self.key]] = p
                    self.order[p[self.key]] = i
                self.primitive = False
            else:
                Log.error("Domains must have keys")
        elif self.key:
            self.key = desc.key
            self.map = dict()
            self.map[None] = self.NULL
            self.order[None] = len(desc.partitions)
            for i, p in enumerate(desc.partitions):
                self.map[p[self.key]] = p
                self.order[p[self.key]] = i
            self.primitive = False
        elif all(p.esfilter for p in self.partitions):
            # EVERY PART HAS AN esfilter DEFINED, SO USE THEM
            # NOTE(review): self.partitions is still the empty DictList here,
            # so this all(...) is vacuously True — presumably desc.partitions
            # was intended; confirm
            for i, p in enumerate(self.partitions):
                p.dataIndex = i
        else:
            Log.error("Can not hanldle")

        self.label = coalesce(self.label, "name")

        if hasattr(desc.partitions, "__iter__"):
            self.partitions = list(desc.partitions)
        else:
            Log.error("expecting a list of partitions")

    def compare(self, a, b):
        # ORDER PARTS BY THEIR KEYS
        return value_compare(self.getKey(a), self.getKey(b))

    def getCanonicalPart(self, part):
        # RESOLVE AN ARBITRARY PART OBJECT TO THIS DOMAIN'S OWN PART
        return self.getPartByKey(part.value)

    def getIndexByKey(self, key):
        # RETURN dataIndex FOR key; UNKNOWN KEYS MAP TO THE NULL SLOT AT THE END
        try:
            output = self.order.get(key)
            if output is None:
                return len(self.partitions)
            return output
        except Exception, e:
            Log.error("problem", e)