def _map_term_using_schema(master, path, term, schema_edges):
    """
    IF THE WHERE CLAUSE REFERS TO FIELDS IN THE SCHEMA, THEN EXPAND THEM
    INTO PRIMITIVE ES FILTERS

    master - TOP-LEVEL WHERE (KEPT FOR INTERFACE PARITY WITH CALLERS)
    path   - PREFIX PATH, EXTENDED ON RECURSIVE CALLS
    term   - {name: value} PAIRS TO TRANSLATE
    schema_edges - MAP FROM name TO Dimension (OR NESTED MAP OF SAME)
    RETURNS {"and": [...]} ES FILTER
    """
    output = DictList()
    for k, v in term.items():
        dimension = schema_edges[k]
        if isinstance(dimension, Dimension):
            domain = dimension.getDomain()
            if dimension.fields:
                if isinstance(dimension.fields, Mapping):
                    # EXPECTING A TUPLE: ONE FILTER PER (local_field -> es_field) PAIR
                    for local_field, es_field in dimension.fields.items():
                        local_value = v[local_field]
                        if local_value == None:
                            output.append({"missing": {"field": es_field}})
                        else:
                            output.append({"term": {es_field: local_value}})
                    continue

                if len(dimension.fields) == 1 and is_keyword(dimension.fields[0]):
                    # SIMPLE SINGLE-VALUED FIELD
                    if domain.getPartByKey(v) is domain.NULL:
                        output.append({"missing": {"field": dimension.fields[0]}})
                    else:
                        output.append({"term": {dimension.fields[0]: v}})
                    continue

                if AND(is_keyword(f) for f in dimension.fields):
                    # EXPECTING A TUPLE OF VALUES, ONE PER FIELD
                    if not isinstance(v, tuple):
                        Log.error("expecing {{name}}={{value}} to be a tuple", name=k, value=v)
                    for i, f in enumerate(dimension.fields):
                        vv = v[i]
                        if vv == None:
                            output.append({"missing": {"field": f}})
                        else:
                            output.append({"term": {f: vv}})
                    continue

            # BUG FIX: REMOVED A SECOND, IDENTICAL single-keyword-field CHECK THAT
            # WAS UNREACHABLE (THE FIRST OCCURRENCE ALWAYS `continue`s WHEN TRUE)
            if domain.partitions:
                part = domain.getPartByKey(v)
                if part is domain.NULL or not part.esfilter:
                    Log.error("not expected to get NULL")
                output.append(part.esfilter)
                continue
            else:
                Log.error("not expected")
        elif isinstance(v, Mapping):
            # NESTED NAME: RECURSE INTO THE SUB-SCHEMA
            sub = _map_term_using_schema(master, path + [k], v, schema_edges[k])
            output.append(sub)
            continue

        # NOT A SCHEMA DIMENSION: PASS THROUGH AS A PLAIN term FILTER
        output.append({"term": {k: v}})
    return {"and": output}
def _range_composer(edge, domain, es_query, to_float):
    """
    COMPOSE AN ES range AGGREGATION FOR AN ALGEBRAIC EDGE, PLUS A _missing
    BUCKET FOR VALUES OUTSIDE [_min, _max)

    to_float - CONVERTS DOMAIN VALUES TO THE NUMERIC FORM ES EXPECTS
    """
    # USE RANGES
    _min = coalesce(domain.min, MAX(domain.partitions.min))
    _max = coalesce(domain.max, MAX(domain.partitions.max))

    if is_keyword(edge.value):
        calc = {"field": edge.value}
    else:
        calc = {"script": qb_expression_to_ruby(edge.value)}

    if is_keyword(edge.value):
        # OUT-OF-RANGE: value < _min OR value >= _max
        missing_range = {"or": [
            {"range": {edge.value: {"lt": to_float(_min)}}},
            {"range": {edge.value: {"gte": to_float(_max)}}}
        ]}
    else:
        # BUG FIX: WAS "gt", WHICH DISAGREED WITH THE "gte" USED IN THE
        # KEYWORD BRANCH AND DROPPED THE VALUE EXACTLY EQUAL TO _max
        missing_range = {"script": {"script": qb_expression_to_ruby({"or": [
            {"lt": [edge.value, to_float(_min)]},
            {"gte": [edge.value, to_float(_max)]},
        ]})}}

    return wrap({"aggs": {
        "_match": set_default(
            {"range": calc},
            {"range": {"ranges": [{"from": to_float(p.min), "to": to_float(p.max)} for p in domain.partitions]}},
            es_query
        ),
        "_missing": set_default(
            {"filter": {"or": [
                missing_range,
                {"missing": {"field": get_all_vars(edge.value)}}
            ]}},
            es_query
        ),
    }})
def es_countop(es, mvel, query):
    """
    RETURN SINGLE COUNT

    BUILDS ONE terms FACET PER SELECT CLAUSE AND READS BACK ONLY THE total.
    NOTE(review): `mvel` is not used in this body — confirm it is required
    by the caller's dispatch signature.
    """
    select = listwrap(query.select)
    FromES = build_es_query(query)
    for s in select:
        if is_keyword(s.value):
            # PLAIN FIELD: COUNT ONLY DOCUMENTS WHERE THE FIELD EXISTS
            FromES.facets[s.name] = {
                "terms": {
                    "field": s.value,
                    "size": query.limit,
                },
                "facet_filter": {"exists": {"field": s.value}}
            }
        else:
            # COMPLICATED value IS PROBABLY A SCRIPT, USE IT
            FromES.facets[s.name] = {
                "terms": {
                    "script_field": es09.expressions.compile_expression(s.value, query),
                    "size": 200000
                }
            }
    data = es09.util.post(es, FromES, query.limit)

    # NOTE(review): reads data.hits.facets while sibling functions read
    # data.facets — confirm the response path is correct here
    matricies = {}
    for s in select:
        matricies[s.name] = Matrix(value=data.hits.facets[s.name].total)

    cube = Cube(query.select, query.edges, matricies)
    cube.frum = query
    return cube
def es_setop(es, query):
    """
    BUILD AND EXECUTE A SIMPLE SET-OPERATION (NO EDGES) QUERY;
    PLAIN FIELDS ARE FETCHED DIRECTLY, EXPRESSIONS BECOME SCRIPT FIELDS
    """
    es_query = es14.util.es_query_template()
    select = listwrap(query.select)

    es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
    es_query.fields = DictList()
    es_query.sort = qb_sort_to_es_sort(query.sort)

    source = "fields"
    for s in select:
        clause = s.value
        if clause == "*" or clause == ".":
            # WHOLE DOCUMENT REQUESTED: ABANDON FIELD LISTS, PULL _source
            es_query.fields = None
            es_query.script_fields = None
            source = "_source"
        elif isinstance(clause, basestring) and is_keyword(clause):
            es_query.fields.append(clause)
        elif isinstance(clause, list) and es_query.fields is not None:
            es_query.fields.extend(clause)
        else:
            # EXPRESSION: TRANSLATE TO A SCRIPT FIELD
            es_query.script_fields[literal_field(s.name)] = {"script": qb_expression_to_ruby(clause)}

    return extract_rows(es, es_query, source, select, query)
def convert(self, expr):
    """
    EXPAND INSTANCES OF name TO value

    RECURSIVELY WALKS expr, REPLACING KEYWORD NAMES WITH THEIR DIMENSION
    DEFINITIONS FROM self.dimensions. BRANCH ORDER MATTERS: is_keyword MUST
    BE TESTED BEFORE THE GENERIC basestring CASE, AND QueryOp BEFORE Mapping.
    """
    if expr is True or expr == None or expr is False:
        return expr
    elif Math.is_number(expr):
        return expr
    elif expr == ".":
        return "."
    elif is_keyword(expr):
        # KNOWN NAME: SUBSTITUTE ITS DIMENSION, OR LEAVE AS-IS
        return coalesce(self.dimensions[expr], expr)
    elif isinstance(expr, basestring):
        Log.error("{{name|quote}} is not a valid variable name", name=expr)
    elif isinstance(expr, Date):
        return expr
    elif isinstance(expr, QueryOp):
        return self._convert_query(expr)
    elif isinstance(expr, Mapping):
        if expr["from"]:
            # A SUB-QUERY
            return self._convert_query(expr)
        elif len(expr) >= 2:
            # ASSUME WE HAVE A NAMED STRUCTURE, NOT AN EXPRESSION
            return wrap({name: self.convert(value) for name, value in expr.leaves()})
        else:
            # ASSUME SINGLE-CLAUSE EXPRESSION
            k, v = expr.items()[0]
            return converter_map.get(k, self._convert_bop)(self, k, v)
    elif isinstance(expr, (list, set, tuple)):
        return wrap([self.convert(value) for value in expr])
    else:
        # UNRECOGNIZED TYPE: PASS THROUGH UNCHANGED
        return expr
def is_fieldop(es, query):
    """
    DECIDE WHETHER query CAN BE ANSWERED BY A SIMPLE FIELD-FETCH (fieldop)
    AGAINST AN ES 1.4/1.5 CLUSTER
    """
    cluster_version = es.cluster.version
    if not (cluster_version.startswith("1.4.") or cluster_version.startswith("1.5.")):
        return False

    select = listwrap(query.select)

    if query.edges:
        # THESE SMOOTH EDGES REQUIRE ALL DATA (SETOP)
        smooth = AND((e.domain.type in domains.ALGEBRAIC and e.domain.interval == "none") for e in query.edges)
        if smooth:
            return True
        return False

    # NO EDGES: SHALLOW, NON-AGGREGATED, SIMPLE SELECTS QUALIFY
    deep = len(split_field(query.frum.name)) > 1  # LOOKING INTO NESTED WILL REQUIRE A SCRIPT
    plain = AND(s.value != None and (s.value in ["*", "."] or is_keyword(s.value)) for s in select)
    no_agg = AND(s.aggregate == "none" for s in select)
    if not deep and plain and no_agg:
        return True
    return False
def es_fieldop(es, query):
    """
    FETCH RAW FIELD VALUES (NO AGGREGATION) USING A filtered match_all QUERY.
    NOTE(review): iterates select.value — assumes `select` is a wrapped list
    whose .value yields each clause's value; confirm against caller.
    """
    es_query = es14.util.es_query_template()
    select = listwrap(query.select)
    es_query.query = {
        "filtered": {
            "query": {
                "match_all": {}
            },
            "filter": simplify_esfilter(qb_expression_to_esfilter(query.where))
        }
    }
    es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
    es_query.sort = qb_sort_to_es_sort(query.sort)
    es_query.fields = DictList()
    source = "fields"
    for s in select.value:
        if s == "*":
            es_query.fields=None
            source = "_source"
        elif s == ".":
            es_query.fields=None
            source = "_source"
        elif isinstance(s, basestring) and is_keyword(s):
            es_query.fields.append(s)
        elif isinstance(s, list) and es_query.fields is not None:
            es_query.fields.extend(s)
        elif isinstance(s, Mapping) and es_query.fields is not None:
            es_query.fields.extend(s.values())
        elif es_query.fields is not None:
            es_query.fields.append(s)
    # NOTE(review): this overwrites the qb_sort_to_es_sort assignment above —
    # confirm which sort construction is intended
    es_query.sort = [{s.field: "asc" if s.sort >= 0 else "desc"} for s in query.sort]

    return extract_rows(es, es_query, source, select, query)
def convert(self, expr):
    """
    ADD THE ".$value" SUFFIX TO ALL VARIABLES

    RECURSIVELY WALKS expr; KEYWORD NAMES GET THE SUFFIX, STRUCTURES AND
    LISTS ARE CONVERTED ELEMENT-WISE. BRANCH ORDER MATTERS (is_keyword
    BEFORE basestring, Query BEFORE Mapping).
    NOTE(review): an expr matching none of these branches (e.g. an opaque
    object) falls off the end and returns None implicitly — confirm intended.
    """
    if expr is True or expr == None or expr is False:
        return expr
    elif Math.is_number(expr):
        return expr
    elif expr == ".":
        return "."
    elif is_keyword(expr):
        #TODO: LOOKUP SCHEMA AND ADD ALL COLUMNS WITH THIS PREFIX
        return expr + ".$value"
    elif isinstance(expr, basestring):
        Log.error("{{name|quote}} is not a valid variable name", name=expr)
    elif isinstance(expr, Date):
        return expr
    elif isinstance(expr, Query):
        return self._convert_query(expr)
    elif isinstance(expr, Mapping):
        if expr["from"]:
            return self._convert_query(expr)
        elif len(expr) >= 2:
            # ASSUME WE HAVE A NAMED STRUCTURE, NOT AN EXPRESSION
            return wrap({name: self.convert(value) for name, value in expr.items()})
        else:
            # ASSUME SINGLE-CLAUSE EXPRESSION
            k, v = expr.items()[0]
            return self.converter_map.get(k, self._convert_bop)(k, v)
    elif isinstance(expr, (list, set, tuple)):
        return wrap([self.convert(value) for value in expr])
def convert(self, expr):
    """
    EXPAND INSTANCES OF name TO value

    RECURSIVELY SUBSTITUTES KEYWORD NAMES WITH THEIR DEFINITIONS FROM
    self.dimensions. BRANCH ORDER MATTERS: is_keyword MUST PRECEDE the
    GENERIC basestring CASE, AND Query MUST PRECEDE Mapping.
    """
    if expr is True or expr == None or expr is False:
        return expr
    elif Math.is_number(expr):
        return expr
    elif expr == ".":
        return "."
    elif is_keyword(expr):
        # KNOWN NAME: SUBSTITUTE ITS DIMENSION, OR LEAVE AS-IS
        return coalesce(self.dimensions[expr], expr)
    elif isinstance(expr, basestring):
        Log.error("{{name|quote}} is not a valid variable name", name=expr)
    elif isinstance(expr, Date):
        return expr
    elif isinstance(expr, Query):
        return self._convert_query(expr)
    elif isinstance(expr, Mapping):
        if expr["from"]:
            # A SUB-QUERY
            return self._convert_query(expr)
        elif len(expr) >= 2:
            # ASSUME WE HAVE A NAMED STRUCTURE, NOT AN EXPRESSION
            return wrap({name: self.convert(value) for name, value in expr.leaves()})
        else:
            # ASSUME SINGLE-CLAUSE EXPRESSION
            k, v = expr.items()[0]
            return converter_map.get(k, self._convert_bop)(self, k, v)
    elif isinstance(expr, (list, set, tuple)):
        return wrap([self.convert(value) for value in expr])
    else:
        # UNRECOGNIZED TYPE: PASS THROUGH UNCHANGED
        return expr
def es_fieldop(es, query):
    """
    FETCH RAW FIELD VALUES (NO AGGREGATION) USING A filtered match_all QUERY.
    NOTE(review): iterates select.value — assumes `select` is a wrapped list
    whose .value yields each clause's value; confirm against caller.
    """
    es_query = es14.util.es_query_template()
    select = listwrap(query.select)
    es_query.query = {
        "filtered": {
            "query": {
                "match_all": {}
            },
            "filter": simplify_esfilter(qb_expression_to_esfilter(query.where))
        }
    }
    es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
    es_query.sort = qb_sort_to_es_sort(query.sort)
    es_query.fields = DictList()
    source = "fields"
    for s in select.value:
        if s == "*":
            es_query.fields = None
            source = "_source"
        elif s == ".":
            es_query.fields = None
            source = "_source"
        elif isinstance(s, basestring) and is_keyword(s):
            es_query.fields.append(s)
        elif isinstance(s, list) and es_query.fields is not None:
            es_query.fields.extend(s)
        elif isinstance(s, Mapping) and es_query.fields is not None:
            es_query.fields.extend(s.values())
        elif es_query.fields is not None:
            es_query.fields.append(s)
    # NOTE(review): this overwrites the qb_sort_to_es_sort assignment above —
    # confirm which sort construction is intended
    es_query.sort = [{s.field: "asc" if s.sort >= 0 else "desc"} for s in query.sort]

    return extract_rows(es, es_query, source, select, query)
def es_setop(es, query):
    """
    BUILD AND EXECUTE A SIMPLE SET-OPERATION (NO EDGES) QUERY;
    PLAIN FIELDS ARE FETCHED DIRECTLY, EXPRESSIONS BECOME SCRIPT FIELDS
    """
    es_query = es14.util.es_query_template()
    select = listwrap(query.select)

    es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
    es_query.fields = DictList()
    es_query.sort = qb_sort_to_es_sort(query.sort)

    source = "fields"
    for s in select:
        clause = s.value
        if clause == "*" or clause == ".":
            # WHOLE DOCUMENT REQUESTED: ABANDON FIELD LISTS, PULL _source
            es_query.fields = None
            es_query.script_fields = None
            source = "_source"
        elif isinstance(clause, basestring) and is_keyword(clause):
            es_query.fields.append(clause)
        elif isinstance(clause, list) and es_query.fields is not None:
            es_query.fields.extend(clause)
        else:
            # EXPRESSION: TRANSLATE TO A SCRIPT FIELD
            es_query.script_fields[literal_field(s.name)] = {"script": qb_expression_to_ruby(clause)}

    return extract_rows(es, es_query, source, select, query)
def get_all_vars(expr):
    """
    RETURN THE SET OF VARIABLE NAMES REFERENCED BY A QB EXPRESSION
    """
    if expr == None:
        return set()
    elif isinstance(expr, unicode):
        if expr == "." or is_keyword(expr):
            return set([expr])
        else:
            Log.error("Expecting a json path")
    elif expr is True:
        return set()
    elif expr is False:
        return set()
    elif Math.is_number(expr):
        return set()

    op, term = expr.items()[0]

    mop = ruby_multi_operators.get(op)
    if mop:
        if isinstance(term, list):
            output = set()
            for t in term:
                output |= get_all_vars(t)
            return output
        elif isinstance(term, Mapping):
            a, b = term.items()[0]
            return get_all_vars(a) | get_all_vars(b)
        else:
            # BUG FIX: RESULT WAS COMPUTED BUT NOT RETURNED, SO CONTROL FELL
            # THROUGH AND SCALAR TERMS ENDED IN THE "not recognized" ERROR
            return get_all_vars(term)

    bop = ruby_binary_operators.get(op)
    if bop:
        if isinstance(term, list):
            output = set()
            for t in term:
                output |= get_all_vars(t)
            return output
        elif isinstance(term, Mapping):
            if op == "eq":
                output = set()
                for a, b in term.items():
                    output |= get_all_vars(a)  # {k:v} k IS VARIABLE, v IS A VALUE
                return output
            else:
                a, b = term.items()[0]
                return get_all_vars(a)
        else:
            Log.error("Expecting binary term")

    uop = ruby_unary_operators.get(op)
    if uop:
        return get_all_vars(term)

    cop = complex_operators.get(op)
    if cop:
        return cop(op, term).vars()

    Log.error("`{{op}}` is not a recognized operation", op=op)
def get_all_vars(expr):
    """
    RETURN THE SET OF VARIABLE NAMES REFERENCED BY A QB EXPRESSION
    """
    if expr == None:
        return set()
    elif isinstance(expr, unicode):
        if expr == "." or is_keyword(expr):
            return set([expr])
        else:
            Log.error("Expecting a json path")
    elif expr is True:
        return set()
    elif expr is False:
        return set()
    elif Math.is_number(expr):
        return set()

    op, term = expr.items()[0]

    mop = ruby_multi_operators.get(op)
    if mop:
        if isinstance(term, list):
            output = set()
            for t in term:
                output |= get_all_vars(t)
            return output
        elif isinstance(term, Mapping):
            a, b = term.items()[0]
            return get_all_vars(a) | get_all_vars(b)
        else:
            # BUG FIX: RESULT WAS COMPUTED BUT NOT RETURNED, SO CONTROL FELL
            # THROUGH AND SCALAR TERMS ENDED IN THE "not recognized" ERROR
            return get_all_vars(term)

    bop = ruby_binary_operators.get(op)
    if bop:
        if isinstance(term, list):
            output = set()
            for t in term:
                output |= get_all_vars(t)
            return output
        elif isinstance(term, Mapping):
            if op == "eq":
                output = set()
                for a, b in term.items():
                    output |= get_all_vars(a)  # {k:v} k IS VARIABLE, v IS A VALUE
                return output
            else:
                a, b = term.items()[0]
                return get_all_vars(a)
        else:
            Log.error("Expecting binary term")

    uop = ruby_unary_operators.get(op)
    if uop:
        return get_all_vars(term)

    cop = complex_operators.get(op)
    if cop:
        return cop(op, term).vars()

    Log.error("`{{op}}` is not a recognized operation", op=op)
def update(self, command):
    """
    EXPECTING command == {"set":term, "where":where}
    THE set CLAUSE IS A DICT MAPPING NAMES TO VALUES
    THE where CLAUSE IS AN ES FILTER

    IMPLEMENTED AS SEARCH-THEN-BULK: FIND MATCHING IDS, THEN POST A BULK
    UPDATE (ONE update/script PAIR PER DOCUMENT PER set CLAUSE)
    """
    command = wrap(command)
    schema = self._es.get_schema()

    # GET IDS OF DOCUMENTS (ROUTING FIELD INCLUDED SO UPDATES HIT THE RIGHT SHARD)
    # NOTE(review): hard-coded size of 200000 caps how many documents can be updated
    results = self._es.search(
        {
            "fields": listwrap(schema._routing.path),
            "query": {
                "filtered": {"query": {"match_all": {}}, "filter": jx_expression(command.where).to_esfilter()}
            },
            "size": 200000,
        }
    )

    # SCRIPT IS SAME FOR ALL (CAN ONLY HANDLE ASSIGNMENT TO CONSTANT)
    scripts = FlatList()
    for k, v in command.set.items():
        if not is_keyword(k):
            Log.error("Only support simple paths for now")
        if isinstance(v, Mapping) and v.doc:
            # PARTIAL-DOCUMENT UPDATE
            scripts.append({"doc": v.doc})
        else:
            scripts.append({"script": "ctx._source." + k + " = " + jx_expression(v).to_ruby()})

    if results.hits.hits:
        updates = []
        for h in results.hits.hits:
            for s in scripts:
                # BULK FORMAT: ACTION LINE FOLLOWED BY ITS SCRIPT/DOC LINE
                updates.append(
                    {
                        "update": {
                            "_id": h._id,
                            "_routing": unwraplist(h.fields[literal_field(schema._routing.path)]),
                        }
                    }
                )
                updates.append(s)
        content = ("\n".join(convert.value2json(c) for c in updates) + "\n").encode("utf-8")
        response = self._es.cluster.post(
            self._es.path + "/_bulk",
            data=content,
            headers={"Content-Type": "application/json"},
            timeout=self.settings.timeout,
            params={"consistency": self.settings.consistency},
        )
        if response.errors:
            Log.error(
                "could not update: {{error}}",
                error=[e.error for i in response["items"] for e in i.values() if e.status not in (200, 201)],
            )
def _get_nested_path(field, schema):
    """
    FIND THE LONGEST PREFIX OF field (QUALIFIED BY THE INDEX ALIAS) THAT IS
    A KNOWN NESTED PATH IN INDEX_CACHE; RETURN IT WITH THE ALIAS STRIPPED,
    OR None WHEN NO PREFIX MATCHES.
    """
    if not INDEX_CACHE:
        # LAZY IMPORT TO AVOID A CIRCULAR-IMPORT PROBLEM AT MODULE LOAD
        _late_import()

    if is_keyword(field):
        # QUALIFY WITH THE INDEX ALIAS, THEN TEST PREFIXES LONGEST-FIRST
        # NOTE(review): `reverse` here is a project helper (not the builtin) —
        # presumed to yield enumerate pairs in reverse order; confirm.
        field = join_field([schema.es.alias] + split_field(field))
        for i, f in reverse(enumerate(split_field(field))):
            # f IS UNUSED; ONLY THE INDEX i IS NEEDED TO BUILD THE PREFIX
            path = join_field(split_field(field)[0:i + 1:])
            if path in INDEX_CACHE:
                # STRIP THE LEADING ALIAS SEGMENT BEFORE RETURNING
                return join_field(split_field(path)[1::])
    return None
def _where_terms(master, where, schema):
    """
    USE THE SCHEMA TO CONVERT DIMENSION NAMES TO ES FILTERS
    master - TOP LEVEL WHERE (FOR PLACING NESTED FILTERS)

    NOTE(review): returns None implicitly when where is not a Mapping or has
    neither term nor terms — confirm callers handle that.
    """
    if isinstance(where, Mapping):
        if where.term:
            # MAP TERM
            try:
                output = _map_term_using_schema(master, [], where.term, schema.edges)
                return output
            except Exception, e:
                Log.error("programmer problem?", e)
        elif where.terms:
            # MAP TERM
            output = DictList()
            for k, v in where.terms.items():
                if not isinstance(v, (list, set)):
                    Log.error("terms filter expects list of values")
                edge = schema.edges[k]
                if not edge:
                    # UNKNOWN NAME: PASS THROUGH AS A PLAIN terms FILTER
                    output.append({"terms": {k: v}})
                else:
                    if isinstance(edge, basestring):
                        # DIRECT FIELD REFERENCE
                        # NOTE(review): this early return abandons any remaining
                        # (k, v) pairs in where.terms — confirm intended
                        return {"terms": {edge: v}}
                    try:
                        domain = edge.getDomain()
                    except Exception, e:
                        Log.error("programmer error", e)
                    fields = domain.dimension.fields
                    if isinstance(fields, Mapping):
                        # COMPOUND DIMENSION: OR TOGETHER ONE AND-CLAUSE PER VALUE
                        or_agg = []
                        for vv in v:
                            and_agg = []
                            for local_field, es_field in fields.items():
                                vvv = vv[local_field]
                                if vvv != None:
                                    and_agg.append({"term": {es_field: vvv}})
                            or_agg.append({"and": and_agg})
                        output.append({"or": or_agg})
                    elif isinstance(fields, list) and len(fields) == 1 and is_keyword(fields[0]):
                        # SIMPLE SINGLE-FIELD DIMENSION
                        output.append({"terms": {fields[0]: v}})
                    elif domain.partitions:
                        # PARTITIONED DOMAIN: USE EACH PARTITION'S PRE-BUILT FILTER
                        output.append({"or": [domain.getPartByKey(vv).esfilter for vv in v]})
            return {"and": output}
def qb_expression(expr):
    """
    WRAP A QB EXPRESSION WITH OBJECT REPRESENTATION

    LITERALS (BOOLEANS, None, NUMBERS, DATES) BECOME Literal; KEYWORD NAMES
    BECOME Variable.
    NOTE(review): `expr in (True, False, None)` also matches 0/1 via numeric
    equality, but the numeric isinstance check makes the outcome the same.
    NOTE(review): for Mapping inputs the code below extracts items() but the
    body shown never uses them and the function returns None implicitly —
    confirm against the full module source.
    """
    if expr in (True, False, None) or expr == None or isinstance(expr, (float, int, Decimal)) or isinstance(expr, Date):
        return Literal(None, expr)
    elif is_keyword(expr):
        return Variable(expr)

    try:
        items = expr.items()
    except Exception, e:
        Log.error("programmer error expr = {{value|quote}}", value=expr, cause=e)
def select(self, select):
    """
    PROJECT self.data DOWN TO THE REQUESTED SIMPLE COLUMNS, RETURNING A NEW
    ListContainer (ONLY PLAIN KEYWORD VALUES ARE SUPPORTED)
    """
    clauses = listwrap(select)

    # IDENTITY PROJECTION: NOTHING TO DO
    if clauses[0].name == "." and clauses[0].value == ".":
        return self

    for c in clauses:
        if not isinstance(c.value, basestring) or not is_keyword(c.value):
            Log.error("selecting on structure, or expressions, not supported yet")

    #TODO: DO THIS WITH JUST A SCHEMA TRANSFORM, DO NOT TOUCH DATA
    #TODO: HANDLE STRUCTURE AND EXPRESSIONS
    projected_schema = {c.name: self.schema[c.value] for c in clauses}
    projected_data = [{c.name: row[c.value] for c in clauses} for row in self.data]
    return ListContainer("from " + self.name, data=projected_data, schema=projected_schema)
def _where_terms(master, where, schema):
    """
    USE THE SCHEMA TO CONVERT DIMENSION NAMES TO ES FILTERS
    master - TOP LEVEL WHERE (FOR PLACING NESTED FILTERS)

    NOTE(review): returns None implicitly when where is not a Mapping or has
    neither term nor terms — confirm callers handle that.
    """
    if isinstance(where, Mapping):
        if where.term:
            # MAP TERM
            try:
                output = _map_term_using_schema(master, [], where.term, schema.edges)
                return output
            except Exception, e:
                Log.error("programmer problem?", e)
        elif where.terms:
            # MAP TERM
            output = DictList()
            for k, v in where.terms.items():
                if not isinstance(v, (list, set)):
                    Log.error("terms filter expects list of values")
                edge = schema.edges[k]
                if not edge:
                    # UNKNOWN NAME: PASS THROUGH AS A PLAIN terms FILTER
                    output.append({"terms": {k: v}})
                else:
                    if isinstance(edge, basestring):
                        # DIRECT FIELD REFERENCE
                        # NOTE(review): this early return abandons any remaining
                        # (k, v) pairs in where.terms — confirm intended
                        return {"terms": {edge: v}}
                    try:
                        domain = edge.getDomain()
                    except Exception, e:
                        Log.error("programmer error", e)
                    fields = domain.dimension.fields
                    if isinstance(fields, Mapping):
                        # COMPOUND DIMENSION: OR TOGETHER ONE AND-CLAUSE PER VALUE
                        or_agg = []
                        for vv in v:
                            and_agg = []
                            for local_field, es_field in fields.items():
                                vvv = vv[local_field]
                                if vvv != None:
                                    and_agg.append({"term": {es_field: vvv}})
                            or_agg.append({"and": and_agg})
                        output.append({"or": or_agg})
                    elif isinstance(fields, list) and len(fields) == 1 and is_keyword(fields[0]):
                        # SIMPLE SINGLE-FIELD DIMENSION
                        output.append({"terms": {fields[0]: v}})
                    elif domain.partitions:
                        # PARTITIONED DOMAIN: USE EACH PARTITION'S PRE-BUILT FILTER
                        output.append({"or": [domain.getPartByKey(vv).esfilter for vv in v]})
            return {"and": output}
def is_fieldop(query):
    """
    DECIDE WHETHER query CAN BE ANSWERED BY A SIMPLE FIELD-FETCH (fieldop)
    """
    select = listwrap(query.select)

    if query.edges:
        # THESE SMOOTH EDGES REQUIRE ALL DATA (SETOP)
        smooth = AND((e.domain.type in domains.ALGEBRAIC and e.domain.interval == "none") for e in query.edges)
        if smooth:
            return True
        return False

    # NO EDGES: SHALLOW, NON-AGGREGATED, SIMPLE SELECTS QUALIFY
    deep = len(split_field(query.frum.name)) > 1  # LOOKING INTO NESTED WILL REQUIRE A SCRIPT
    plain = AND(s.value != None and (s.value == "*" or is_keyword(s.value)) for s in select)
    no_agg = AND(s.aggregate == "none" for s in select)
    if not deep and plain and no_agg:
        return True
    return False
def update(self, command):
    """
    EXPECTING command == {"set":term, "where":where}
    THE set CLAUSE IS A DICT MAPPING NAMES TO VALUES
    THE where CLAUSE IS AN ES FILTER

    IMPLEMENTED AS SEARCH-THEN-BULK: FIND MATCHING IDS, THEN POST A BULK
    UPDATE (ONE update/script PAIR PER DOCUMENT PER set CLAUSE)
    """
    command = wrap(command)
    schema = self._es.get_schema()

    # GET IDS OF DOCUMENTS (ROUTING FIELD INCLUDED SO UPDATES HIT THE RIGHT SHARD)
    # NOTE(review): hard-coded size of 200000 caps how many documents can be updated
    results = self._es.search({
        "fields": listwrap(schema._routing.path),
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": jx_expression(command.where).to_esfilter()
        }},
        "size": 200000
    })

    # SCRIPT IS SAME FOR ALL (CAN ONLY HANDLE ASSIGNMENT TO CONSTANT)
    scripts = FlatList()
    for k, v in command.set.items():
        if not is_keyword(k):
            Log.error("Only support simple paths for now")
        if isinstance(v, Mapping) and v.doc:
            # PARTIAL-DOCUMENT UPDATE
            scripts.append({"doc": v.doc})
        else:
            scripts.append({"script": "ctx._source." + k + " = " + jx_expression(v).to_ruby()})

    if results.hits.hits:
        updates = []
        for h in results.hits.hits:
            for s in scripts:
                # BULK FORMAT: ACTION LINE FOLLOWED BY ITS SCRIPT/DOC LINE
                updates.append({"update": {"_id": h._id, "_routing": unwraplist(h.fields[literal_field(schema._routing.path)])}})
                updates.append(s)
        content = ("\n".join(convert.value2json(c) for c in updates) + "\n").encode('utf-8')
        response = self._es.cluster.post(
            self._es.path + "/_bulk",
            data=content,
            headers={"Content-Type": "application/json"},
            timeout=self.settings.timeout,
            params={"consistency": self.settings.consistency}
        )
        if response.errors:
            Log.error("could not update: {{error}}", error=[e.error for i in response["items"] for e in i.values() if e.status not in (200, 201)])
def update(self, command):
    """
    EXPECTING command == {"set":term, "where":where}
    THE set CLAUSE IS A DICT MAPPING NAMES TO VALUES
    THE where CLAUSE IS AN ES FILTER

    OLDER VARIANT: BUILDS ONE COMBINED SCRIPT AND POSTS A BULK UPDATE.
    NOTE(review): no routing is supplied and the response is not checked for
    errors — confirm acceptable for this index.
    """
    command = wrap(command)

    # GET IDS OF DOCUMENTS
    # NOTE(review): hard-coded size of 200000 caps how many documents can be updated
    results = self._es.search({
        "fields": [],
        "query": {
            "filtered": {
                "query": {
                    "match_all": {}
                },
                "filter": _normalize_where(command.where, self)
            }
        },
        "size": 200000
    })

    # SCRIPT IS SAME FOR ALL (CAN ONLY HANDLE ASSIGNMENT TO CONSTANT)
    scripts = DictList()
    for k, v in command.set.items():
        if not is_keyword(k):
            Log.error("Only support simple paths for now")
        scripts.append("ctx._source." + k + " = " + expressions.qb_expression_to_ruby(v) + ";\n")
    script = "".join(scripts)

    if results.hits.hits:
        # `command` IS REBOUND HERE AS THE LIST OF BULK ACTION LINES
        command = []
        for id in results.hits.hits._id:
            command.append({"update": {"_id": id}})
            command.append({"script": script})
        content = ("\n".join(convert.value2json(c) for c in command) + "\n").encode('utf-8')
        self._es.cluster._post(
            self._es.path + "/_bulk",
            data=content,
            headers={"Content-Type": "application/json"})
def convert(self, expr):
    """
    ADD THE ".$value" SUFFIX TO ALL VARIABLES

    Expression OBJECTS ARE HANDLED VIA THEIR OWN vars()/map() API; RAW
    STRUCTURES FALL THROUGH TO THE RECURSIVE BRANCHES BELOW (ORDER MATTERS:
    is_keyword BEFORE basestring, QueryOp BEFORE Mapping).
    NOTE(review): an expr matching none of these branches falls off the end
    and returns None implicitly — confirm intended.
    """
    if isinstance(expr, Expression):
        # RENAME EVERY VARIABLE v TO v.$value USING THE EXPRESSION'S OWN MAPPER
        vars_ = expr.vars()
        rename = {v: join_field(split_field(v) + ["$value"]) for v in vars_}
        return expr.map(rename)

    if expr is True or expr == None or expr is False:
        return expr
    elif Math.is_number(expr):
        return expr
    elif expr == ".":
        return "."
    elif is_keyword(expr):
        #TODO: LOOKUP SCHEMA AND ADD ALL COLUMNS WITH THIS PREFIX
        return expr + ".$value"
    elif isinstance(expr, basestring):
        Log.error("{{name|quote}} is not a valid variable name", name=expr)
    elif isinstance(expr, Date):
        return expr
    elif isinstance(expr, QueryOp):
        return self._convert_query(expr)
    elif isinstance(expr, Mapping):
        if expr["from"]:
            return self._convert_query(expr)
        elif len(expr) >= 2:
            # ASSUME WE HAVE A NAMED STRUCTURE, NOT AN EXPRESSION
            return wrap({name: self.convert(value) for name, value in expr.items()})
        else:
            # ASSUME SINGLE-CLAUSE EXPRESSION
            k, v = expr.items()[0]
            return self.converter_map.get(k, self._convert_bop)(k, v)
    elif isinstance(expr, (list, set, tuple)):
        return wrap([self.convert(value) for value in expr])
def es_aggop(es, mvel, query):
    """
    EXECUTE AGGREGATES VIA ES statistical FACETS, ONE FACET PER DISTINCT
    SELECT value. PURE COUNTS ARE DELEGATED TO es_countop.
    NOTE(review): `mvel` is unused here, and es_countop is called with two
    arguments while the es_countop in this codebase is declared with
    (es, mvel, query) — confirm arity.
    """
    select = listwrap(query.select)
    FromES = build_es_query(query)

    isSimple = AND(aggregates[s.aggregate] == "count" for s in select)
    if isSimple:
        return es_countop(es, query)  # SIMPLE, USE TERMS FACET INSTEAD

    value2facet = dict()  # ONLY ONE FACET NEEDED PER
    name2facet = dict()   # MAP name TO FACET WITH STATS
    for s in select:
        if s.value not in value2facet:
            if is_keyword(s.value):
                unwrap(FromES.facets)[s.name] = {
                    "statistical": {
                        "field": s.value
                    },
                    "facet_filter": simplify_esfilter(query.where)
                }
            else:
                unwrap(FromES.facets)[s.name] = {
                    "statistical": {
                        "script": es09.expressions.compile_expression(s.value, query)
                    },
                    "facet_filter": simplify_esfilter(query.where)
                }
            value2facet[s.value] = s.name
        name2facet[s.name] = value2facet[s.value]

    data = es09.util.post(es, FromES, query.limit)

    matricies = {
        s.name: Matrix(value=fix_es_stats(data.facets[literal_field(s.name)])[
            aggregates[s.aggregate]])
        for s in select
    }
    cube = Cube(query.select, [], matricies)
    cube.frum = query
    return cube
def es_aggop(es, mvel, query):
    """
    EXECUTE AGGREGATES VIA ES statistical FACETS, ONE FACET PER DISTINCT
    SELECT value. PURE COUNTS ARE DELEGATED TO es_countop.
    NOTE(review): `mvel` is unused here, and es_countop is called with two
    arguments while the es_countop in this codebase is declared with
    (es, mvel, query) — confirm arity.
    """
    select = listwrap(query.select)
    FromES = build_es_query(query)

    isSimple = AND(aggregates[s.aggregate] == "count" for s in select)
    if isSimple:
        return es_countop(es, query)  # SIMPLE, USE TERMS FACET INSTEAD

    value2facet = dict()  # ONLY ONE FACET NEEDED PER
    name2facet = dict()   # MAP name TO FACET WITH STATS
    for s in select:
        if s.value not in value2facet:
            if is_keyword(s.value):
                unwrap(FromES.facets)[s.name] = {
                    "statistical": {
                        "field": s.value
                    },
                    "facet_filter": simplify_esfilter(query.where)
                }
            else:
                unwrap(FromES.facets)[s.name] = {
                    "statistical": {
                        "script": es09.expressions.compile_expression(s.value, query)
                    },
                    "facet_filter": simplify_esfilter(query.where)
                }
            value2facet[s.value] = s.name
        name2facet[s.name] = value2facet[s.value]

    data = es09.util.post(es, FromES, query.limit)

    matricies = {s.name: Matrix(value=fix_es_stats(data.facets[literal_field(s.name)])[aggregates[s.aggregate]]) for s in select}
    cube = Cube(query.select, [], matricies)
    cube.frum = query
    return cube
def qb_expression_to_ruby(expr):
    """
    TRANSLATE A QB EXPRESSION INTO THE EQUIVALENT ES SCRIPT SOURCE
    """
    if expr == None:
        return "nil"
    elif Math.is_number(expr):
        return unicode(expr)
    elif is_keyword(expr):
        return "doc[" + convert.string2quote(expr) + "].value"
    elif isinstance(expr, basestring):
        Log.error("{{name|quote}} is not a valid variable name", name=expr)
    elif isinstance(expr, CODE):
        return expr.code
    elif isinstance(expr, Date):
        return unicode(expr.unix)
    elif expr is True:
        return "true"
    elif expr is False:
        return "false"

    op, term = expr.items()[0]

    mop = ruby_multi_operators.get(op)
    if mop:
        if isinstance(term, list):
            if not term:
                return mop[1]  # RETURN DEFAULT
            else:
                output = mop[0].join(["(" + qb_expression_to_ruby(t) + ")" for t in term])
                return output
        elif isinstance(term, Mapping):
            a, b = term.items()[0]
            output = "(" + qb_expression_to_ruby(a) + ")" + mop[0] + "(" + qb_expression_to_ruby(b) + ")"
            return output
        else:
            # BUG FIX: RESULT WAS COMPUTED BUT NOT RETURNED, SO CONTROL FELL
            # THROUGH AND SCALAR TERMS ENDED IN THE "not recognized" ERROR
            return qb_expression_to_ruby(term)

    bop = ruby_binary_operators.get(op)
    if bop:
        if isinstance(term, list):
            output = bop.join(["(" + qb_expression_to_ruby(t) + ")" for t in term])
            return output
        elif isinstance(term, Mapping):
            if op == "eq":
                # eq CAN ACCEPT A WHOLE OBJECT OF key:value PAIRS TO COMPARE
                output = " and ".join("(" + qb_expression_to_ruby(a) + ")" + bop + "(" + qb_expression_to_ruby(b) + ")" for a, b in term.items())
                return output
            else:
                a, b = term.items()[0]
                output = "(" + qb_expression_to_ruby(a) + ")" + bop + "(" + qb_expression_to_ruby(b) + ")"
                return output
        else:
            Log.error("Expecting binary term")

    uop = ruby_unary_operators.get(op)
    if uop:
        output = expand_template(uop, {"term": qb_expression_to_ruby(term)})
        return output

    cop = complex_operators.get(op)
    if cop:
        output = cop(term).to_ruby()
        return output

    Log.error("`{{op}}` is not a recognized operation", op=op)
def qb_expression_to_python(expr):
    """
    TRANSLATE A QB EXPRESSION INTO PYTHON SOURCE OPERATING ON `row`
    """
    if expr == None:
        return "None"
    elif Math.is_number(expr):
        return unicode(expr)
    elif isinstance(expr, Date):
        return unicode(expr.unix)
    elif isinstance(expr, unicode):
        if expr == ".":
            return "row"
        elif is_keyword(expr):
            return "row[" + convert.value2quote(expr) + "]"
        else:
            Log.error("Expecting a json path")
    elif isinstance(expr, CODE):
        return expr.code
    elif expr is True:
        return "True"
    elif expr is False:
        return "False"

    op, term = expr.items()[0]

    mop = python_multi_operators.get(op)
    if mop:
        if isinstance(term, list):
            if not term:
                return mop[1]  # RETURN DEFAULT
            else:
                output = mop[0].join(["(" + qb_expression_to_python(t) + ")" for t in term])
                return output
        elif isinstance(term, Mapping):
            a, b = term.items()[0]
            output = "(" + qb_expression_to_python(a) + ")" + mop[0] + "(" + qb_expression_to_python(b) + ")"
            return output
        else:
            # BUG FIX: RESULT WAS COMPUTED BUT NOT RETURNED, SO CONTROL FELL
            # THROUGH AND SCALAR TERMS ENDED IN THE "not recognized" ERROR
            return qb_expression_to_python(term)

    bop = python_binary_operators.get(op)
    if bop:
        if isinstance(term, list):
            output = bop.join(["(" + qb_expression_to_python(t) + ")" for t in term])
            return output
        elif isinstance(term, Mapping):
            if op == "eq":
                # eq CAN ACCEPT A WHOLE OBJECT OF key:value PAIRS TO COMPARE
                output = " and ".join("(" + qb_expression_to_python(a) + ")" + bop + "(" + qb_expression_to_python(b) + ")" for a, b in term.items())
                return output
            else:
                a, b = term.items()[0]
                output = "(" + qb_expression_to_python(a) + ")" + bop + "(" + qb_expression_to_python(b) + ")"
                return output
        else:
            Log.error("Expecting binary term")

    uop = python_unary_operators.get(op)
    if uop:
        output = uop + "(" + qb_expression_to_python(term) + ")"
        return output

    Log.error("`{{op}}` is not a recognized operation", op=op)
def qb_expression_to_ruby(expr):
    """
    TRANSLATE A QB EXPRESSION INTO THE EQUIVALENT ES SCRIPT SOURCE
    """
    if expr == None:
        return "nil"
    elif Math.is_number(expr):
        return unicode(expr)
    elif is_keyword(expr):
        return "doc[" + convert.string2quote(expr) + "].value"
    elif isinstance(expr, basestring):
        Log.error("{{name|quote}} is not a valid variable name", name=expr)
    elif isinstance(expr, CODE):
        return expr.code
    elif isinstance(expr, Date):
        return unicode(expr.unix)
    elif expr is True:
        return "true"
    elif expr is False:
        return "false"

    op, term = expr.items()[0]

    mop = ruby_multi_operators.get(op)
    if mop:
        if isinstance(term, list):
            if not term:
                return mop[1]  # RETURN DEFAULT
            else:
                output = mop[0].join(["(" + qb_expression_to_ruby(t) + ")" for t in term])
                return output
        elif isinstance(term, Mapping):
            a, b = term.items()[0]
            output = "(" + qb_expression_to_ruby(a) + ")" + mop[0] + "(" + qb_expression_to_ruby(b) + ")"
            return output
        else:
            # BUG FIX: RESULT WAS COMPUTED BUT NOT RETURNED, SO CONTROL FELL
            # THROUGH AND SCALAR TERMS ENDED IN THE "not recognized" ERROR
            return qb_expression_to_ruby(term)

    bop = ruby_binary_operators.get(op)
    if bop:
        if isinstance(term, list):
            output = bop.join(["(" + qb_expression_to_ruby(t) + ")" for t in term])
            return output
        elif isinstance(term, Mapping):
            if op == "eq":
                # eq CAN ACCEPT A WHOLE OBJECT OF key:value PAIRS TO COMPARE
                output = " and ".join("(" + qb_expression_to_ruby(a) + ")" + bop + "(" + qb_expression_to_ruby(b) + ")" for a, b in term.items())
                return output
            else:
                a, b = term.items()[0]
                output = "(" + qb_expression_to_ruby(a) + ")" + bop + "(" + qb_expression_to_ruby(b) + ")"
                return output
        else:
            Log.error("Expecting binary term")

    uop = ruby_unary_operators.get(op)
    if uop:
        output = expand_template(uop, {"term": qb_expression_to_ruby(term)})
        return output

    cop = complex_operators.get(op)
    if cop:
        output = cop(term).to_ruby()
        return output

    Log.error("`{{op}}` is not a recognized operation", op=op)
def _range_composer(edge, domain, es_query, to_float):
    """
    COMPOSE AN ES range AGGREGATION FOR AN ALGEBRAIC EDGE, PLUS AN OPTIONAL
    _missing BUCKET (ONLY WHEN edge.allowNulls) FOR OUT-OF-RANGE VALUES

    to_float - CONVERTS DOMAIN VALUES TO THE NUMERIC FORM ES EXPECTS
    """
    # USE RANGES
    _min = coalesce(domain.min, MAX(domain.partitions.min))
    _max = coalesce(domain.max, MAX(domain.partitions.max))

    if is_keyword(edge.value):
        calc = {"field": edge.value}
    else:
        calc = {"script": qb_expression_to_ruby(edge.value)}

    if edge.allowNulls:
        if is_keyword(edge.value):
            # OUT-OF-RANGE: value < _min OR value >= _max
            missing_range = {"or": [
                {"range": {edge.value: {"lt": to_float(_min)}}},
                {"range": {edge.value: {"gte": to_float(_max)}}}
            ]}
        else:
            # NOTE(review): this script branch uses "gt" where the keyword
            # branch uses "gte" — a value exactly equal to _max is treated
            # differently; confirm which is intended
            missing_range = {"script": {"script": qb_expression_to_ruby({"or": [
                {"lt": [edge.value, to_float(_min)]},
                {"gt": [edge.value, to_float(_max)]},
            ]})}}
        missing_filter = set_default(
            {"filter": {"or": [
                missing_range,
                {"missing": {"field": get_all_vars(edge.value)}}
            ]}},
            es_query
        )
    else:
        missing_filter = None

    return wrap({"aggs": {
        "_match": set_default(
            {"range": calc},
            {"range": {"ranges": [{"from": to_float(p.min), "to": to_float(p.max)} for p in domain.partitions]}},
            es_query
        ),
        "_missing": missing_filter
    }})
def es_deepop(es, query):
    """
    Execute a deep (nested-document) query against ES and format the result.

    Builds an ES query template for the nested path, splits the where clause
    by nesting depth, expands the select clause into `new_select` pull
    instructions, then issues the query (plus a second query, on a background
    thread, for documents that have NO nested children when there is no deep
    where clause).  NOTE(review): assumes es14.util.es_query_template returns
    (query, per-depth filter list) — confirm against that module.
    """
    columns = query.frum.get_columns()
    query_path = query.frum.query_path
    # DEEPEST-NESTED COLUMNS FIRST; DUPLICATE NAMES TOLERATED
    columns = UniqueIndex(keys=["name"], data=sorted(columns, lambda a, b: cmp(len(listwrap(b.nested_path)), len(listwrap(a.nested_path)))), fail_on_dup=False)
    map_ = {c.name: c.abs_name for c in columns}
    # MAP COLUMN NAMES TO WHERE THEY WILL APPEAR IN THE RESULT ROW
    map_to_local = {c.name: "_inner" + c.abs_name[len(listwrap(c.nested_path)[0]):] if c.nested_path else "fields." + literal_field(c.abs_name) for c in columns}
    # TODO: FIX THE GREAT SADNESS CAUSED BY EXECUTING post_expressions
    # THE EXPRESSIONS SHOULD BE PUSHED TO THE CONTAINER: ES ALLOWS
    # {"inner_hit":{"script_fields":[{"script":""}...]}}, BUT THEN YOU
    # LOOSE "_source" BUT GAIN "fields", FORCING ALL FIELDS TO BE EXPLICIT
    post_expressions = {}
    es_query, es_filters = es14.util.es_query_template(query.frum.name)

    # SPLIT WHERE CLAUSE BY DEPTH
    wheres = split_expression_by_depth(qb_expression(query.where), query.frum, map_)
    for i, f in enumerate(es_filters):
        # PROBLEM IS {"match_all": {}} DOES NOT SURVIVE set_default()
        for k, v in unwrap(simplify_esfilter(AndOp("and", wheres[i]).to_esfilter())).items():
            f[k] = v

    if not wheres[1]:
        # NO DEEP where: A SECOND QUERY IS NEEDED FOR PARENTS WITH NO NESTED DOCS
        more_filter = {
            "and": [
                simplify_esfilter(AndOp("and", wheres[0]).to_esfilter()),
                {"not": {
                    "nested": {
                        "path": query_path,
                        "filter": {
                            "match_all": {}
                        }
                    }
                }}
            ]
        }
    else:
        more_filter = None

    es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
    es_query.sort = qb_sort_to_es_sort(query.sort)
    es_query.fields = []

    is_list = isinstance(query.select, list)
    new_select = DictList()

    def get_pull(column):
        # WHERE TO FIND THIS COLUMN'S VALUE IN EACH RETURNED HIT
        if column.nested_path:
            return "_inner" + column.abs_name[len(listwrap(column.nested_path)[0]):]
        else:
            return "fields." + literal_field(column.abs_name)

    i = 0
    for s in listwrap(query.select):
        if s.value == "*":
            # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
            for c in columns:
                if c.relative and c.type not in ["nested", "object"]:
                    if not c.nested_path:
                        es_query.fields += [c.abs_name]
                    new_select.append({
                        "name": c.name,
                        "pull": get_pull(c),
                        "nested_path": listwrap(c.nested_path)[0],
                        "put": {"name": c.name, "index": i, "child": "."}
                    })
                    i += 1

            # REMOVE DOTS IN PREFIX IF NAME NOT AMBIGUOUS
            col_names = [c.name for c in columns if c.relative]
            for n in new_select:
                if n.name.startswith("..") and n.name.lstrip(".") not in col_names:
                    n.name = n.put.name = n.name.lstrip(".")
        elif s.value == ".":
            # SELECT THE WHOLE (RELATIVE) DOCUMENT
            for c in columns:
                if c.relative and c.type not in ["nested", "object"]:
                    if not c.nested_path:
                        es_query.fields += [c.abs_name]
                    new_select.append({
                        "name": c.name,
                        "pull": get_pull(c),
                        "nested_path": listwrap(c.nested_path)[0],
                        "put": {"name": ".", "index": i, "child": c.abs_name}
                    })
                    i += 1
        elif s.value == "_id":
            new_select.append({
                "name": s.name,
                "value": s.value,
                "pull": "_id",
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1
        elif isinstance(s.value, basestring) and s.value.endswith(".*") and is_keyword(s.value[:-2]):
            # PREFIX WILDCARD: EXPAND TO ALL LEAF COLUMNS UNDER THE PARENT
            parent = s.value[:-1]
            prefix = len(parent)
            for c in columns:
                if c.name.startswith(parent) and c.type not in ["object", "nested"]:
                    pull = get_pull(c)
                    if len(listwrap(c.nested_path)) == 0:
                        es_query.fields += [c.abs_name]
                    new_select.append({
                        "name": s.name + "." + c.name[prefix:],
                        "pull": pull,
                        "nested_path": listwrap(c.nested_path)[0],
                        "put": {"name": s.name + "." + literal_field(c.name[prefix:]), "index": i, "child": "."}
                    })
                    i += 1
        elif isinstance(s.value, basestring) and is_keyword(s.value):
            # PLAIN COLUMN NAME: MAY BE A LEAF OR A PARENT OF LEAVES
            parent = s.value + "."
            prefix = len(parent)
            net_columns = [c for c in columns if c.name.startswith(parent) and c.type not in ["object", "nested"]]
            if not net_columns:
                c = columns[(s.value,)]
                pull = get_pull(c)
                if not c.nested_path:
                    es_query.fields += [s.value]
                new_select.append({
                    "name": s.name,
                    "pull": pull,
                    "nested_path": listwrap(c.nested_path)[0],
                    "put": {"name": s.name, "index": i, "child": "."}
                })
            else:
                for n in net_columns:
                    pull = get_pull(n)
                    if not n.nested_path:
                        es_query.fields += [n.abs_name]
                    new_select.append({
                        "name": s.name,
                        "pull": pull,
                        "nested_path": listwrap(n.nested_path)[0],
                        "put": {"name": s.name, "index": i, "child": n.name[prefix:]}
                    })
            i += 1
        else:
            # GENERAL EXPRESSION: COMPILE TO PYTHON AND EVALUATE PER ROW (SEE TODO)
            expr = qb_expression(s.value)
            for v in expr.vars():
                for n in columns:
                    if n.name == v:
                        if not n.nested_path:
                            es_query.fields += [n.abs_name]
            pull = EXPRESSION_PREFIX + s.name
            post_expressions[pull] = compile_expression(expr.map(map_to_local).to_python())
            new_select.append({
                "name": s.name if is_list else ".",
                "pull": pull,
                "value": expr.to_dict(),
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1

    # <COMPLICATED> ES needs two calls to get all documents
    more = []

    def get_more(please_stop):
        more.append(es09.util.post(
            es,
            Dict(
                filter=more_filter,
                fields=es_query.fields
            ),
            query.limit
        ))

    if more_filter:
        need_more = Thread.run("get more", target=get_more)

    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    # EACH A HIT IS RETURNED MULTIPLE TIMES FOR EACH INNER HIT, WITH INNER HIT INCLUDED
    def inners():
        for t in data.hits.hits:
            for i in t.inner_hits[literal_field(query_path)].hits.hits:
                t._inner = i._source
                for k, e in post_expressions.items():
                    t[k] = e(t)
                yield t
        if more_filter:
            # WAIT FOR THE CHILDLESS-PARENT QUERY, THEN EMIT ITS HITS TOO
            Thread.join(need_more)
            for t in more[0].hits.hits:
                yield t
    # </COMPLICATED>

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(inners(), new_select, query)
        output.meta.es_response_time = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception, e:
        Log.error("problem formatting", e)
def es_terms_stats(esq, mvel, query):
    """
    Execute a multi-edge aggregation query using ES terms_stats facets.

    Edges with a known (enumerable) domain become one facet per partition
    combination; at most one "special" open-ended edge is folded into the
    facet's terms key.  The facet results are then decoded back into a Cube.
    NOTE(review): presumably ES 0.9-era facet API — confirm before reuse.
    """
    select = listwrap(query.select)
    facetEdges = []  # EDGES THAT WILL REQUIRE A FACET FOR EACH PART
    termsEdges = FlatList()
    specialEdge = None
    special_index = -1

    # A SPECIAL EDGE IS ONE THAT HAS AN UNDEFINED NUMBER OF PARTITIONS AT QUERY TIME
    # FIND THE specialEdge, IF ONE
    for f, tedge in enumerate(query.edges):
        if tedge.domain.type in domains.KNOWN:
            for p, part in enumerate(tedge.domain.partitions):
                part.dataIndex = p

            # FACETS ARE ONLY REQUIRED IF SQL JOIN ON DOMAIN IS REQUIRED (RANGE QUERY)
            # OR IF WE ARE NOT SIMPLY COUNTING
            # OR IF NO SCRIPTING IS ALLOWED (SOME OTHER CODE IS RESPONSIBLE FOR SETTING isFacet)
            # OR IF WE JUST WANT TO FORCE IT :)
            # OF COURSE THE default EDGE IS NOT EXPLICIT, SO MUST BE A TERM
            facetEdges.append(tedge)
        else:
            if specialEdge:
                Log.error("There is more than one open-ended edge: self can not be handled")
            specialEdge = tedge
            special_index = f
            termsEdges.append(tedge)

    if not specialEdge:
        # WE SERIOUSLY WANT A SPECIAL EDGE, OTHERWISE WE WILL HAVE TOO MANY FACETS
        # THE BIGGEST EDGE MAY BE COLLAPSED TO A TERM, MAYBE?
        num_parts = 0
        special_index = -1
        for i, e in enumerate(facetEdges):
            l = len(e.domain.partitions)
            if ((e.value and is_keyword(e.value)) or len(e.domain.dimension.fields) == 1) and l > num_parts:
                num_parts = l
                specialEdge = e
                special_index = i

        facetEdges.pop(special_index)
        termsEdges.append(specialEdge)

    total_facets = PRODUCT(len(f.domain.partitions) for f in facetEdges) * len(select)
    if total_facets > 100:
        # WE GOT A PROBLEM, LETS COUNT THE SIZE OF REALITY:
        counts = esq.query({
            "from": query.frum,
            "select": {"aggregate": "count"},
            "edges": facetEdges,
            "where": query.where,
            "limit": query.limit
        })

        esFacets = []

        def add_facet(value, parts, cube):
            # KEEP ONLY THE COMBINATIONS THAT ACTUALLY HAVE DATA
            if value:
                esFacets.append(parts)

        counts["count"].forall(add_facet)

        Log.note("{{theory_count}} theoretical combinations, {{real_count}} actual combos found", real_count=len(esFacets), theory_count=total_facets)

        if not esFacets:
            # MAKE EMPTY CUBE
            matricies = {}
            dims = [len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges]
            for s in select:
                matricies[s.name] = Matrix(*dims)
            cube = Cube(query.select, query.edges, matricies)
            cube.frum = query
            return cube
    else:
        # GENERATE ALL COMBOS
        esFacets = getAllEdges(facetEdges)

    calcTerm = compileEdges2Term(mvel, termsEdges, FlatList())
    term2parts = calcTerm.term2parts

    if len(esFacets) * len(select) > 1000:
        Log.error("not implemented yet")  # WE HAVE SOME SERIOUS PERMUTATIONS, WE MUST ISSUE MULTIPLE QUERIES
        pass

    FromES = build_es_query(query)
    for s in select:
        for parts in esFacets:
            condition = FlatList()
            constants = FlatList()
            name = [literal_field(s.name)]
            for f, fedge in enumerate(facetEdges):
                name.append(str(parts[f].dataIndex))
                condition.append(buildCondition(mvel, fedge, parts[f]))
                constants.append({"name": fedge.domain.name, "value": parts[f]})
            condition.append(query.where)
            # FACET NAME ENCODES THE SELECT NAME AND THE PARTITION COORDINATES
            name = ",".join(name)

            FromES.facets[name] = {
                "terms_stats": {
                    "key_field": calcTerm.field,
                    "value_field": s.value if is_keyword(s.value) else None,
                    "value_script": mvel.compile_expression(s.value) if not is_keyword(s.value) else None,
                    "size": coalesce(query.limit, 200000)
                }
            }
            if condition:
                FromES.facets[name].facet_filter = simplify_esfilter({"and": condition})

    data = es09.util.post(esq.es, FromES, query.limit)

    if specialEdge.domain.type not in domains.KNOWN:
        # WE BUILD THE PARTS BASED ON THE RESULTS WE RECEIVED
        partitions = FlatList()
        map = {}
        for facetName, parts in data.facets.items():
            for stats in parts.terms:
                if not map[stats]:
                    part = {"value": stats, "name": stats}
                    partitions.append(part)
                    map[stats] = part

        partitions.sort(specialEdge.domain.compare)
        for p, part in enumerate(partitions):
            part.dataIndex = p

        specialEdge.domain.map = map
        specialEdge.domain.partitions = partitions

    # MAKE CUBE
    matricies = {}
    dims = [len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges]
    for s in select:
        matricies[s.name] = Matrix(*dims)

    name2agg = {s.name: aggregates[s.aggregate] for s in select}

    # FILL CUBE
    for edgeName, parts in data.facets.items():
        temp = edgeName.split(",")
        pre_coord = tuple(int(c) for c in temp[1:])
        sname = temp[0]

        for stats in parts.terms:
            if specialEdge:
                # SPLICE THE SPECIAL EDGE'S COORDINATE BACK INTO ITS ORIGINAL POSITION
                special = term2parts(stats.term)[0]
                coord = pre_coord[:special_index] + (special.dataIndex, ) + pre_coord[special_index:]
            else:
                coord = pre_coord
            matricies[sname][coord] = stats[name2agg[sname]]

    cube = Cube(query.select, query.edges, matricies)
    cube.frum = query
    return cube
def es_aggsop(es, frum, query):
    """
    Translate a qb aggregation query into an ES 1.4 aggs request, execute it,
    and format the response.

    Simple keyword selects are grouped into `new_select` (one ES agg per
    canonical field); everything else goes through `formula` as a scripted
    agg.  Each select gets a `pull` path describing where to read its value
    from the ES response.  NOTE: relies on Dict autovivification for the
    `es_query.aggs[...]...` chains.
    """
    select = wrap([s.copy() for s in listwrap(query.select)])

    es_query = Dict()
    new_select = Dict()  # MAP FROM canonical_name (USED FOR NAMES IN QUERY) TO SELECT MAPPING
    formula = []
    for s in select:
        if s.aggregate == "count" and (s.value == None or s.value == "."):
            s.pull = "doc_count"
        elif s.value == ".":
            if frum.typed:
                # STATISITCAL AGGS IMPLY $value, WHILE OTHERS CAN BE ANYTHING
                if s.aggregate in NON_STATISTICAL_AGGS:
                    # TODO: HANDLE BOTH $value AND $objects TO COUNT
                    Log.error("do not know how to handle")
                else:
                    s.value = "$value"
                    new_select["$value"] += [s]
            else:
                if s.aggregate in NON_STATISTICAL_AGGS:
                    # TODO: WE SHOULD BE ABLE TO COUNT, BUT WE MUST *OR* ALL LEAF VALUES TO DO IT
                    Log.error("do not know how to handle")
                else:
                    Log.error('Not expecting ES to have a value at "." which {{agg}} can be applied', agg=s.aggregate)
        elif is_keyword(s.value) and s.aggregate == "count":
            s.value = coalesce(frum[s.value].abs_name, s.value)
            new_select["count_" + literal_field(s.value)] += [s]
        elif is_keyword(s.value):
            s.value = coalesce(frum[s.value].abs_name, s.value)
            new_select[literal_field(s.value)] += [s]
        else:
            formula.append(s)

    for canonical_name, many in new_select.items():
        representative = many[0]
        if representative.value == ".":
            Log.error("do not know how to handle")
        else:
            field_name = representative.value

        # canonical_name=literal_field(many[0].name)
        for s in many:
            if s.aggregate == "count":
                es_query.aggs[literal_field(canonical_name)].value_count.field = field_name
                s.pull = literal_field(canonical_name) + ".value"
            elif s.aggregate == "median":
                # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
                key = literal_field(canonical_name + " percentile")

                es_query.aggs[key].percentiles.field = field_name
                es_query.aggs[key].percentiles.percents += [50]
                s.pull = key + ".values.50\.0"
            elif s.aggregate == "percentile":
                # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
                key = literal_field(canonical_name + " percentile")
                percent = Math.round(s.percentile * 100, decimal=6)

                es_query.aggs[key].percentiles.field = field_name
                es_query.aggs[key].percentiles.percents += [percent]
                s.pull = key + ".values." + literal_field(unicode(percent))
            elif s.aggregate == "cardinality":
                # ES USES DIFFERENT METHOD FOR CARDINALITY
                key = literal_field(canonical_name + " cardinality")

                es_query.aggs[key].cardinality.field = field_name
                s.pull = key + ".value"
            else:
                # PULL VALUE OUT OF THE stats AGGREGATE
                es_query.aggs[literal_field(canonical_name)].stats.field = field_name
                s.pull = literal_field(canonical_name) + "." + aggregates1_4[s.aggregate]

    for i, s in enumerate(formula):
        canonical_name = literal_field(s.name)
        # MAP VARIABLES TO THEIR ABSOLUTE NAMES BEFORE SCRIPTING
        abs_value = qb_expression(s.value).map({c.name: c.abs_name for c in frum._columns})

        if s.aggregate == "count":
            es_query.aggs[literal_field(canonical_name)].value_count.script = abs_value.to_ruby()
            s.pull = literal_field(canonical_name) + ".value"
        elif s.aggregate == "median":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")

            es_query.aggs[key].percentiles.script = abs_value.to_ruby()
            es_query.aggs[key].percentiles.percents += [50]
            s.pull = key + ".values.50\.0"
        elif s.aggregate == "percentile":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")
            percent = Math.round(s.percentile * 100, decimal=6)

            es_query.aggs[key].percentiles.script = abs_value.to_ruby()
            es_query.aggs[key].percentiles.percents += [percent]
            s.pull = key + ".values." + literal_field(unicode(percent))
        elif s.aggregate == "cardinality":
            # ES USES DIFFERENT METHOD FOR CARDINALITY
            key = canonical_name + " cardinality"

            es_query.aggs[key].cardinality.script = abs_value.to_ruby()
            s.pull = key + ".value"
        else:
            # PULL VALUE OUT OF THE stats AGGREGATE
            s.pull = canonical_name + "." + aggregates1_4[s.aggregate]
            es_query.aggs[canonical_name].stats.script = abs_value.to_ruby()

    decoders = get_decoders_by_depth(query)
    start = 0

    vars_ = qb_expression(query.where).vars()
    map_ = {v: frum[v].abs_name for v in vars_}

    # <TERRIBLE SECTION> THIS IS WHERE WE WEAVE THE where CLAUSE WITH nested
    split_where = split_expression_by_depth(qb_expression(query.where), schema=frum, map_=map_)

    if len(split_field(frum.name)) > 1:
        if any(split_where[2:]):
            Log.error("Where clause is too deep")

        for d in decoders[1]:
            es_query = d.append_query(es_query, start)
            start += d.num_columns

        if split_where[1]:
            # TODO: INCLUDE FILTERS ON EDGES
            filter_ = simplify_esfilter(AndOp("and", split_where[1]).to_esfilter())
            es_query = Dict(aggs={"_filter": set_default({"filter": filter_}, es_query)})

        es_query = wrap({"aggs": {"_nested": set_default({"nested": {"path": frum.query_path}}, es_query)}})
    else:
        if any(split_where[1:]):
            Log.error("Where clause is too deep")

        for d in decoders[0]:
            es_query = d.append_query(es_query, start)
            start += d.num_columns

        if split_where[0]:
            # TODO: INCLUDE FILTERS ON EDGES
            filter = simplify_esfilter(AndOp("and", split_where[0]).to_esfilter())
            es_query = Dict(aggs={"_filter": set_default({"filter": filter}, es_query)})
    # </TERRIBLE SECTION>

    if not es_query:
        es_query = wrap({"query": {"match_all": {}}})

    es_query.size = 0

    with Timer("ES query time") as es_duration:
        result = es09.util.post(es, es_query, query.limit)

    try:
        # FLATTEN THE PER-DEPTH DECODER LISTS FOR THE FORMATTERS
        decoders = [d for ds in decoders for d in ds]
        result.aggregations.doc_count = coalesce(result.aggregations.doc_count, result.hits.total)  # IT APPEARS THE OLD doc_count IS GONE

        formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[query.format]
        if query.edges:
            output = formatter(decoders, result.aggregations, start, query, select)
        elif query.groupby:
            output = groupby_formatter(decoders, result.aggregations, start, query, select)
        else:
            output = aggop_formatter(decoders, result.aggregations, start, query, select)

        output.meta.es_response_time = es_duration.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception, e:
        if query.format not in format_dispatch:
            Log.error("Format {{format|quote}} not supported yet", format=query.format, cause=e)
        Log.error("Some problem", e)
def buildCondition(mvel, edge, partition):
    """
    RETURN AN ES FILTER OBJECT

    Maps one (edge, partition) pair to the ES filter that selects the
    documents belonging to that partition: facet domains reuse the
    partition's own where clause, range edges build range/script filters,
    keyword edges use fast term/range syntax, and everything else falls
    back to MVEL scripting.
    """
    output = {}
    if edge.domain.isFacet:
        # MUST USE THIS' esFacet
        condition = wrap(coalesce(partition.where, {"and": []}))

        if partition.min and partition.max and is_keyword(edge.value):
            condition["and"].append({
                "range": {edge.value: {"gte": partition.min, "lt": partition.max}}
            })

        # ES WILL FREAK OUT IF WE SEND {"not":{"and":x}} (OR SOMETHING LIKE THAT)
        return simplify_esfilter(condition)
    elif edge.range:
        # THESE REALLY NEED FACETS TO PERFORM THE JOIN-TO-DOMAIN
        # USE MVEL CODE
        if edge.domain.type in domains.ALGEBRAIC:
            output = {"and": []}

            if edge.range.mode and edge.range.mode == "inclusive":
                # IF THE range AND THE partition OVERLAP, THEN MATCH IS MADE
                if is_keyword(edge.range.min):
                    output["and"].append({"range": {edge.range.min: {"lt": es09.expressions.value2value(partition.max)}}})
                else:
                    # WHOA!! SUPER SLOW!!
                    output["and"].append({"script": {"script": mvel.compile_expression(
                        edge.range.min + " < " + es09.expressions.value2MVEL(partition.max)
                    )}})

                if is_keyword(edge.range.max):
                    # BUG FIX: was {edge.range.max, {...}} — a set literal holding an
                    # unhashable dict (TypeError) — instead of a field->range dict
                    output["and"].append({"or": [
                        {"missing": {"field": edge.range.max}},
                        {"range": {edge.range.max: {"gt": es09.expressions.value2value(partition.min)}}}
                    ]})
                else:
                    # WHOA!! SUPER SLOW!!
                    output["and"].append({"script": {"script": mvel.compile_expression(
                        edge.range.max + " > " + es09.expressions.value2MVEL(partition.min))}})
            else:
                # SNAPSHOT - IF range INCLUDES partition.min, THEN MATCH IS MADE
                if is_keyword(edge.range.min):
                    output["and"].append({"range": {edge.range.min: {"lte": es09.expressions.value2value(partition.min)}}})
                else:
                    # WHOA!! SUPER SLOW!!
                    output["and"].append({"script": {"script": mvel.compile_expression(
                        edge.range.min + "<=" + es09.expressions.value2MVEL(partition.min)
                    )}})

                if is_keyword(edge.range.max):
                    # BUG FIX: same set-literal-instead-of-dict defect as above
                    output["and"].append({"or": [
                        {"missing": {"field": edge.range.max}},
                        {"range": {edge.range.max: {"gte": es09.expressions.value2value(partition.min)}}}
                    ]})
                else:
                    # WHOA!! SUPER SLOW!!
                    output["and"].append({"script": {"script": mvel.compile_expression(
                        es09.expressions.value2MVEL(partition.min) + " <= " + edge.range.max
                    )}})
            return output
        else:
            Log.error("Do not know how to handle range query on non-continuous domain")
    elif not edge.value:
        # MUST USE THIS' esFacet, AND NOT(ALL THOSE ABOVE)
        return partition.esfilter
    elif is_keyword(edge.value):
        # USE FAST ES SYNTAX
        # NOTE: output is a plain dict, so use item access; the original used
        # attribute access (output.range = ...), which raises AttributeError
        # on a builtin dict
        if edge.domain.type in domains.ALGEBRAIC:
            output["range"] = {}
            output["range"][edge.value] = {"gte": es09.expressions.value2query(partition.min), "lt": es09.expressions.value2query(partition.max)}
        elif edge.domain.type == "set":
            if partition.value:
                if partition.value != edge.domain.getKey(partition):
                    Log.error("please ensure the key attribute of the domain matches the value attribute of all partitions, if only because we are now using the former")
                # DEFAULT TO USING THE .value ATTRIBUTE, IF ONLY BECAUSE OF LEGACY REASONS
                output["term"] = {edge.value: partition.value}
            else:
                output["term"] = {edge.value: edge.domain.getKey(partition)}
        elif edge.domain.type == "default":
            output["term"] = dict()
            output["term"][edge.value] = partition.value
        else:
            Log.error("Edge \"" + edge.name + "\" is not supported")

        return output
    else:
        # USE MVEL CODE
        if edge.domain.type in domains.ALGEBRAIC:
            output["script"] = {"script": edge.value + ">=" + es09.expressions.value2MVEL(partition.min) + " and " + edge.value + "<" + es09.expressions.value2MVEL(partition.max)}
        else:
            output["script"] = {"script": "( " + edge.value + " ) ==" + es09.expressions.value2MVEL(partition.value)}

        code = es09.expressions.addFunctions(output["script"]["script"])
        output["script"]["script"] = code.head + code.body
        return output
def es_terms_stats(esq, mvel, query):
    """
    Execute a multi-edge aggregation query using ES terms_stats facets
    (DictList variant — duplicate of the FlatList implementation above).

    Enumerable-domain edges become one facet per partition combination; at
    most one open-ended "special" edge is folded into the facet terms key.
    Facet results are decoded back into a Cube.
    """
    select = listwrap(query.select)
    facetEdges = []  # EDGES THAT WILL REQUIRE A FACET FOR EACH PART
    termsEdges = DictList()
    specialEdge = None
    special_index = -1

    # A SPECIAL EDGE IS ONE THAT HAS AN UNDEFINED NUMBER OF PARTITIONS AT QUERY TIME
    # FIND THE specialEdge, IF ONE
    for f, tedge in enumerate(query.edges):
        if tedge.domain.type in domains.KNOWN:
            for p, part in enumerate(tedge.domain.partitions):
                part.dataIndex = p

            # FACETS ARE ONLY REQUIRED IF SQL JOIN ON DOMAIN IS REQUIRED (RANGE QUERY)
            # OR IF WE ARE NOT SIMPLY COUNTING
            # OR IF NO SCRIPTING IS ALLOWED (SOME OTHER CODE IS RESPONSIBLE FOR SETTING isFacet)
            # OR IF WE JUST WANT TO FORCE IT :)
            # OF COURSE THE default EDGE IS NOT EXPLICIT, SO MUST BE A TERM
            facetEdges.append(tedge)
        else:
            if specialEdge:
                Log.error("There is more than one open-ended edge: self can not be handled")
            specialEdge = tedge
            special_index = f
            termsEdges.append(tedge)

    if not specialEdge:
        # WE SERIOUSLY WANT A SPECIAL EDGE, OTHERWISE WE WILL HAVE TOO MANY FACETS
        # THE BIGGEST EDGE MAY BE COLLAPSED TO A TERM, MAYBE?
        num_parts = 0
        special_index = -1
        for i, e in enumerate(facetEdges):
            l = len(e.domain.partitions)
            if ((e.value and is_keyword(e.value)) or len(e.domain.dimension.fields) == 1) and l > num_parts:
                num_parts = l
                specialEdge = e
                special_index = i

        facetEdges.pop(special_index)
        termsEdges.append(specialEdge)

    total_facets = PRODUCT(len(f.domain.partitions) for f in facetEdges) * len(select)
    if total_facets > 100:
        # WE GOT A PROBLEM, LETS COUNT THE SIZE OF REALITY:
        counts = esq.query({
            "from": query.frum,
            "select": {"aggregate": "count"},
            "edges": facetEdges,
            "where": query.where,
            "limit": query.limit
        })

        esFacets = []

        def add_facet(value, parts, cube):
            # KEEP ONLY THE COMBINATIONS THAT ACTUALLY HAVE DATA
            if value:
                esFacets.append(parts)

        counts["count"].forall(add_facet)

        Log.note("{{theory_count}} theoretical combinations, {{real_count}} actual combos found", real_count=len(esFacets), theory_count=total_facets)

        if not esFacets:
            # MAKE EMPTY CUBE
            matricies = {}
            dims = [len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges]
            for s in select:
                matricies[s.name] = Matrix(*dims)
            cube = Cube(query.select, query.edges, matricies)
            cube.frum = query
            return cube
    else:
        # GENERATE ALL COMBOS
        esFacets = getAllEdges(facetEdges)

    calcTerm = compileEdges2Term(mvel, termsEdges, DictList())
    term2parts = calcTerm.term2parts

    if len(esFacets) * len(select) > 1000:
        Log.error("not implemented yet")  # WE HAVE SOME SERIOUS PERMUTATIONS, WE MUST ISSUE MULTIPLE QUERIES
        pass

    FromES = build_es_query(query)
    for s in select:
        for parts in esFacets:
            condition = DictList()
            constants = DictList()
            name = [literal_field(s.name)]
            for f, fedge in enumerate(facetEdges):
                name.append(str(parts[f].dataIndex))
                condition.append(buildCondition(mvel, fedge, parts[f]))
                constants.append({"name": fedge.domain.name, "value": parts[f]})
            condition.append(query.where)
            # FACET NAME ENCODES THE SELECT NAME AND THE PARTITION COORDINATES
            name = ",".join(name)

            FromES.facets[name] = {
                "terms_stats": {
                    "key_field": calcTerm.field,
                    "value_field": s.value if is_keyword(s.value) else None,
                    "value_script": mvel.compile_expression(s.value) if not is_keyword(s.value) else None,
                    "size": coalesce(query.limit, 200000)
                }
            }
            if condition:
                FromES.facets[name].facet_filter = simplify_esfilter({"and": condition})

    data = es09.util.post(esq.es, FromES, query.limit)

    if specialEdge.domain.type not in domains.KNOWN:
        # WE BUILD THE PARTS BASED ON THE RESULTS WE RECEIVED
        partitions = DictList()
        map = {}
        for facetName, parts in data.facets.items():
            for stats in parts.terms:
                if not map[stats]:
                    part = {"value": stats, "name": stats}
                    partitions.append(part)
                    map[stats] = part

        partitions.sort(specialEdge.domain.compare)
        for p, part in enumerate(partitions):
            part.dataIndex = p

        specialEdge.domain.map = map
        specialEdge.domain.partitions = partitions

    # MAKE CUBE
    matricies = {}
    dims = [len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges]
    for s in select:
        matricies[s.name] = Matrix(*dims)

    name2agg = {s.name: aggregates[s.aggregate] for s in select}

    # FILL CUBE
    for edgeName, parts in data.facets.items():
        temp = edgeName.split(",")
        pre_coord = tuple(int(c) for c in temp[1:])
        sname = temp[0]

        for stats in parts.terms:
            if specialEdge:
                # SPLICE THE SPECIAL EDGE'S COORDINATE BACK INTO ITS ORIGINAL POSITION
                special = term2parts(stats.term)[0]
                coord = pre_coord[:special_index] + (special.dataIndex, ) + pre_coord[special_index:]
            else:
                coord = pre_coord
            matricies[sname][coord] = stats[name2agg[sname]]

    cube = Cube(query.select, query.edges, matricies)
    cube.frum = query
    return cube
def test_good_var(self):
    """A dotted identifier is accepted as a valid variable name."""
    candidate = u'_a._b'
    self.assertTrue(
        is_keyword(candidate),
        "That's a good variable name!"
    )
def test_error_on_bad_var(self):
    """A full expression (function call, arithmetic) is rejected as a variable name."""
    candidate = u'coalesce(rows[rownum+1].timestamp, Date.eod())'
    self.assertFalse(
        is_keyword(candidate),
        "That's not a valid variable name!!"
    )
def __init__(self, var):
    """
    Wrap a simple variable reference.

    :param var: a plain keyword/variable name; anything else is reported
                via Log.error (NOTE(review): presumably Log.error raises,
                but self.var is still assigned if it returns — confirm)
    """
    Expression.__init__(self, "", None)
    if not is_keyword(var):
        Log.error("Expecting a variable")
    self.var = var
def extract_rows(es, es_query, query):
    """
    Execute a simple (set-op) ES query and format the resulting rows.

    Expands the select clause into `new_select` pull instructions: "*" and
    "." switch the query from explicit `fields` to `_source` retrieval;
    keyword selects map to ES fields (expanding prefixes to leaf columns);
    general expressions become ES script_fields.
    """
    is_list = isinstance(query.select, list)
    select = wrap([s.copy() for s in listwrap(query.select)])
    new_select = DictList()
    # ONLY TOP-LEVEL, NON-OBJECT COLUMNS ARE ADDRESSABLE HERE
    column_names = set(c.name for c in query.frum.get_columns() if c.type not in ["object"] and (not c.nested_path or c.abs_name == c.nested_path or not c.nested_path))
    source = "fields"
    i = 0

    for s in select:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if s.value == "*":
            es_query.fields = None
            source = "_source"

            net_columns = column_names - set(select.name)
            for n in net_columns:
                new_select.append({
                    "name": n,
                    "value": n,
                    "put": {"name": n, "index": i, "child": "."}
                })
                i += 1
        elif s.value == ".":
            es_query.fields = None
            source = "_source"

            new_select.append({
                "name": s.name,
                "value": s.value,
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1
        elif s.value == "_id":
            new_select.append({
                "name": s.name,
                "value": s.value,
                "pull": "_id",
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1
        elif isinstance(s.value, basestring) and s.value.endswith(".*") and is_keyword(s.value[:-2]):
            # PREFIX WILDCARD: EXPAND TO ALL COLUMNS UNDER THE PARENT
            parent = s.value[:-1]
            prefix = len(parent)
            for c in column_names:
                if c.startswith(parent):
                    if es_query.fields is not None:
                        es_query.fields.append(c)

                    new_select.append({
                        "name": s.name + "." + c[prefix:],
                        "value": c,
                        "put": {"name": s.name + "." + c[prefix:], "index": i, "child": "."}
                    })
                    i += 1
        elif isinstance(s.value, basestring) and is_keyword(s.value):
            # PLAIN COLUMN NAME: MAY BE A LEAF OR A PARENT OF LEAVES
            parent = s.value + "."
            prefix = len(parent)
            net_columns = [c for c in column_names if c.startswith(parent)]
            if not net_columns:
                if es_query.fields is not None:
                    es_query.fields.append(s.value)
                new_select.append({
                    "name": s.name,
                    "value": s.value,
                    "put": {"name": s.name, "index": i, "child": "."}
                })
            else:
                for n in net_columns:
                    if es_query.fields is not None:
                        es_query.fields.append(n)
                    new_select.append({
                        "name": s.name,
                        "value": n,
                        "put": {"name": s.name, "index": i, "child": n[prefix:]}
                    })
            i += 1
        elif isinstance(s.value, list):
            Log.error("need an example")
            if es_query.fields is not None:
                es_query.fields.extend([v for v in s.value])
        else:
            # GENERAL EXPRESSION: EVALUATE SERVER-SIDE AS A script_field
            es_query.script_fields[literal_field(s.name)] = {"script": qb_expression(s.value).to_ruby()}
            new_select.append({
                "name": s.name,
                "pull": "fields." + literal_field(s.name),
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1

    # ASSIGN DEFAULT pull PATHS BASED ON WHETHER WE FETCH _source OR fields
    for n in new_select:
        if n.pull:
            continue
        if source == "_source":
            n.pull = join_field(["_source"] + split_field(n.value))
        else:
            n.pull = "fields." + literal_field(n.value)

    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    T = data.hits.hits

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(T, new_select, query)
        output.meta.es_response_time = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception, e:
        Log.error("problem formatting", e)
def es_aggsop(es, frum, query): select = listwrap(query.select) es_query = Dict() new_select = Dict() formula = [] for s in select: if s.aggregate == "count" and (s.value == None or s.value == "."): s.pull = "doc_count" elif is_keyword(s.value): new_select[literal_field(s.value)] += [s] else: formula.append(s) for litral_field, many in new_select.items(): if len(many)>1: canonical_name=literal_field(many[0].name) es_query.aggs[canonical_name].stats.field = many[0].value for s in many: if s.aggregate == "count": s.pull = canonical_name + ".count" else: s.pull = canonical_name + "." + aggregates1_4[s.aggregate] else: s = many[0] s.pull = literal_field(s.value) + ".value" es_query.aggs[literal_field(s.value)][aggregates1_4[s.aggregate]].field = s.value for i, s in enumerate(formula): new_select[unicode(i)] = s s.pull = literal_field(s.name) + ".value" es_query.aggs[literal_field(s.name)][aggregates1_4[s.aggregate]].script = qb_expression_to_ruby(s.value) decoders = [AggsDecoder(e, query) for e in coalesce(query.edges, query.groupby, [])] start = 0 for d in decoders: es_query = d.append_query(es_query, start) start += d.num_columns if query.where: filter = simplify_esfilter(query.where) es_query = Dict( aggs={"_filter": set_default({"filter": filter}, es_query)} ) if len(split_field(frum.name)) > 1: es_query = wrap({ "size": 0, "aggs": {"_nested": set_default({ "nested": { "path": join_field(split_field(frum.name)[1::]) } }, es_query)} }) with Timer("ES query time") as es_duration: result = es09.util.post(es, es_query, query.limit) try: formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[query.format] if query.edges: output = formatter(decoders, result.aggregations, start, query, select) elif query.groupby: output = groupby_formatter(decoders, result.aggregations, start, query, select) else: output = aggop_formatter(decoders, result.aggregations, start, query, select) output.meta.es_response_time = es_duration.seconds output.meta.content_type = 
mime_type output.meta.es_query = es_query return output except Exception, e: if query.format not in format_dispatch: Log.error("Format {{format|quote}} not supported yet", format= query.format, cause=e) Log.error("Some problem", e)
def qb_expression_to_python(expr):
    """
    Convert a qb expression to an equivalent Python source string, evaluated
    against a `row` variable.

    Literals map directly; "." means the whole row; keywords index into the
    row.  Compound expressions are a single {op: term} mapping dispatched
    through the python_multi_operators / python_binary_operators /
    python_unary_operators tables.  Raises (via Log.error) on anything
    unrecognized.
    """
    if expr == None:
        return "None"
    elif Math.is_number(expr):
        return unicode(expr)
    elif isinstance(expr, Date):
        return unicode(expr.unix)
    elif isinstance(expr, unicode):
        if expr == ".":
            return "row"
        elif is_keyword(expr):
            return "row[" + convert.value2quote(expr) + "]"
        else:
            Log.error("Expecting a json path")
    elif isinstance(expr, CODE):
        return expr.code
    elif expr is True:
        return "True"
    elif expr is False:
        return "False"

    op, term = expr.items()[0]

    mop = python_multi_operators.get(op)
    if mop:
        if isinstance(term, list):
            if not term:
                return mop[1]  # RETURN DEFAULT
            else:
                output = mop[0].join(["(" + qb_expression_to_python(t) + ")" for t in term])
                return output
        elif isinstance(term, Mapping):
            a, b = term.items()[0]
            output = "(" + qb_expression_to_python(a) + ")" + mop[0] + "(" + qb_expression_to_python(b) + ")"
            return output
        else:
            # BUG FIX: the recursive result was computed but never returned,
            # so this branch silently yielded None
            return qb_expression_to_python(term)

    bop = python_binary_operators.get(op)
    if bop:
        if isinstance(term, list):
            output = bop.join(["(" + qb_expression_to_python(t) + ")" for t in term])
            return output
        elif isinstance(term, Mapping):
            if op == "eq":
                # eq CAN ACCEPT A WHOLE OBJECT OF key:value PAIRS TO COMPARE
                output = " and ".join("(" + qb_expression_to_python(a) + ")" + bop + "(" + qb_expression_to_python(b) + ")" for a, b in term.items())
                return output
            else:
                a, b = term.items()[0]
                output = "(" + qb_expression_to_python(a) + ")" + bop + "(" + qb_expression_to_python(b) + ")"
                return output
        else:
            Log.error("Expecting binary term")

    uop = python_unary_operators.get(op)
    if uop:
        output = uop + "(" + qb_expression_to_python(term) + ")"
        return output

    Log.error("`{{op}}` is not a recognized operation", op=op)
def es_aggsop(es, frum, query):
    """
    Execute an aggregation-style qb query against ES.

    Builds an ES aggregation request from query.select / edges / groupby /
    where, POSTs it via es09.util.post, then dispatches to a result formatter
    chosen by query.format.  As a side effect, each select clause is given an
    `s.pull` dot-path telling the formatter where that value lives in the ES
    response.

    NOTE(review): a near-identical copy of this function appears earlier in
    this file; that copy reads es_duration.seconds where this one reads
    es_duration.duration -- confirm which is intended and remove the duplicate.
    """
    select = listwrap(query.select)

    es_query = Dict()
    new_select = Dict()  # es field name -> list of selects aggregating that field
    formula = []         # selects whose value is an expression, not a simple field
    for s in select:
        if s.aggregate == "count" and (s.value == None or s.value == "."):
            # plain document count is already in every agg response
            s.pull = "doc_count"
        elif is_keyword(s.value):
            # Dict auto-vivifies: `+= [s]` appends to a fresh list on first use
            new_select[literal_field(s.value)] += [s]
        else:
            formula.append(s)

    # NOTE(review): loop variable "litral_field" (sic) is never used in the body
    for litral_field, many in new_select.items():
        if len(many) > 1:
            # several aggregates over one field: request `stats` once and let
            # each select pull its own statistic out of the stats result
            canonical_name = literal_field(many[0].name)
            es_query.aggs[canonical_name].stats.field = many[0].value
            for s in many:
                if s.aggregate == "count":
                    s.pull = canonical_name + ".count"
                else:
                    s.pull = canonical_name + "." + aggregates1_4[s.aggregate]
        else:
            # single aggregate on the field: one dedicated agg
            s = many[0]
            s.pull = literal_field(s.value) + ".value"
            es_query.aggs[literal_field(s.value)][aggregates1_4[s.aggregate]].field = s.value

    for i, s in enumerate(formula):
        # expression selects are sent to ES as scripts
        new_select[unicode(i)] = s
        s.pull = literal_field(s.name) + ".value"
        es_query.aggs[literal_field(s.name)][aggregates1_4[s.aggregate]].script = qb_expression_to_ruby(s.value)

    # each edge/groupby decoder wraps the aggs built so far in its own agg,
    # claiming a contiguous range of result columns starting at `start`
    decoders = [AggsDecoder(e, query) for e in coalesce(query.edges, query.groupby, [])]
    start = 0
    for d in decoders:
        es_query = d.append_query(es_query, start)
        start += d.num_columns

    if query.where:
        # wrap everything in a filter agg so the where clause applies to all aggs
        filter = simplify_esfilter(query.where)
        es_query = Dict(
            aggs={"_filter": set_default({"filter": filter}, es_query)})

    if len(split_field(frum.name)) > 1:
        # querying a nested path: wrap in a nested agg on the tail of the name
        es_query = wrap({
            "size": 0,
            "aggs": {
                "_nested": set_default(
                    {
                        "nested": {
                            "path": join_field(split_field(frum.name)[1::])
                        }
                    }, es_query)
            }
        })

    with Timer("ES query time") as es_duration:
        result = es09.util.post(es, es_query, query.limit)

    try:
        formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[query.format]

        if query.edges:
            output = formatter(decoders, result.aggregations, start, query, select)
        elif query.groupby:
            output = groupby_formatter(decoders, result.aggregations, start, query, select)
        else:
            output = aggop_formatter(decoders, result.aggregations, start, query, select)

        output.meta.es_response_time = es_duration.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception, e:
        # a KeyError on format_dispatch above most likely means unknown format
        if query.format not in format_dispatch:
            Log.error("Format {{format|quote}} not supported yet", format=query.format, cause=e)
        # NOTE(review): `e` is passed positionally here (as template params?),
        # unlike the cause= keyword above -- confirm Log.error's signature
        Log.error("Some problem", e)
def _map_term_using_schema(master, path, term, schema_edges):
    """
    IF THE WHERE CLAUSE REFERS TO FIELDS IN THE SCHEMA, THEN EXPAND THEM

    Expand each {field: value} pair of `term` into the es filter(s) implied by
    the matching Dimension in `schema_edges`; pairs with no Dimension fall
    through to a plain {"term": {field: value}}.  Mapping values with no
    Dimension recurse with the sub-schema for that path.

    Returns an es filter of the form {"and": [...]}.
    """
    output = DictList()
    for k, v in term.items():
        dimension = schema_edges[k]
        if isinstance(dimension, Dimension):
            domain = dimension.getDomain()
            if dimension.fields:
                if isinstance(dimension.fields, Mapping):
                    # EXPECTING A TUPLE: one filter per local->es field pair
                    for local_field, es_field in dimension.fields.items():
                        local_value = v[local_field]
                        if local_value == None:
                            output.append({"missing": {"field": es_field}})
                        else:
                            output.append({"term": {es_field: local_value}})
                    continue

                if len(dimension.fields) == 1 and is_keyword(dimension.fields[0]):
                    # SIMPLE SINGLE-VALUED FIELD
                    if domain.getPartByKey(v) is domain.NULL:
                        output.append({"missing": {"field": dimension.fields[0]}})
                    else:
                        output.append({"term": {dimension.fields[0]: v}})
                    continue

                if AND(is_keyword(f) for f in dimension.fields):
                    # EXPECTING A TUPLE: positional values for each es field
                    if not isinstance(v, tuple):
                        # BUG FIX: error message said "expecing"
                        Log.error("expecting {{name}}={{value}} to be a tuple", name=k, value=v)
                    for i, f in enumerate(dimension.fields):
                        vv = v[i]
                        if vv == None:
                            output.append({"missing": {"field": f}})
                        else:
                            output.append({"term": {f: vv}})
                    continue

            # BUG FIX: a second, byte-identical single-field check stood here;
            # it could never match (the copy above already `continue`d on that
            # case, and with empty fields the length test cannot be 1), so the
            # dead copy-paste was removed.

            if domain.partitions:
                part = domain.getPartByKey(v)
                if part is domain.NULL or not part.esfilter:
                    Log.error("not expected to get NULL")
                output.append(part.esfilter)
                continue
            else:
                Log.error("not expected")
        elif isinstance(v, Mapping):
            # nested document: recurse with the sub-schema for this path
            sub = _map_term_using_schema(master, path + [k], v, schema_edges[k])
            output.append(sub)
            continue

        # no schema information: pass the pair through as a simple term filter
        output.append({"term": {k: v}})
    return {"and": output}