def __init__(self, name, params, cwd=None, env=None, debug=False, shell=False, bufsize=-1):
    """
    Launch an external process and wire it to thread-safe queues.

    :param name: human-readable name, used in signal/queue/thread labels
    :param params: argv list handed to subprocess.Popen
    :param cwd: working directory; may be a string or a project File-like
                object (anything else is expected to expose .abspath)
    :param env: extra environment variables, overlaid on os.environ
    :param debug: when truthy (or module-level DEBUG is set) log start/activity
    :param shell: passed through to subprocess.Popen
    :param bufsize: passed through to subprocess.Popen (-1 = default buffering)
    """
    self.name = name
    self.service_stopped = Signal("stopped signal for " + strings.quote(name))
    # queues decouple callers from the child's pipes; silent=True suppresses queue logging
    self.stdin = Queue("stdin for process " + strings.quote(name), silent=True)
    self.stdout = Queue("stdout for process " + strings.quote(name), silent=True)
    self.stderr = Queue("stderr for process " + strings.quote(name), silent=True)

    try:
        self.debug = debug or DEBUG
        self.service = service = subprocess.Popen(
            params,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            bufsize=bufsize,
            # plain strings (and Null/None) pass through; otherwise assume a File-like with .abspath
            cwd=cwd if isinstance(cwd, (basestring, NullType, NoneType)) else cwd.abspath,
            env=unwrap(set_default(env, os.environ)),
            shell=shell
        )

        self.please_stop = Signal()
        self.please_stop.on_go(self._kill)
        self.thread_locker = Lock()
        # one thread per pipe, plus a waiter that reaps the child process
        self.children = [
            Thread.run(self.name + " stdin", self._writer, service.stdin, self.stdin, please_stop=self.service_stopped, parent_thread=self),
            Thread.run(self.name + " stdout", self._reader, "stdout", service.stdout, self.stdout, please_stop=self.service_stopped, parent_thread=self),
            Thread.run(self.name + " stderr", self._reader, "stderr", service.stderr, self.stderr, please_stop=self.service_stopped, parent_thread=self),
            Thread.run(self.name + " waiter", self._monitor, parent_thread=self),
        ]
    except Exception as e:
        Log.error("Can not call", e)

    if self.debug:
        Log.note("{{process}} START: {{command}}", process=self.name, command=" ".join(map(strings.quote, params)))
def quote_column(column_name, table=None):
    """
    Return a SQL object naming a (possibly table-qualified) column.

    :param column_name: unicode column name (anything else is an error)
    :param table: optional table name to prefix as table.column
    """
    if not isinstance(column_name, unicode):
        Log.error("expecting a name")
    # NOTE: `!= None` (not `is not None`) is deliberate in this codebase — mo-dots Null compares equal to None
    if table != None:
        return SQL(quote(table) + "." + quote(column_name))
    else:
        # simple identifiers matching _no_need_to_quote are emitted bare
        if _no_need_to_quote.match(column_name):
            return SQL(column_name)
        return SQL(quote(column_name))
def _convert(v):
    """
    Convert a Python literal to an EsScript (painless) expression.

    Relies on `self` and `schema` from the enclosing scope.  Booleans are
    tested with `is` before the int check because bool is a subclass of int.
    Returns None for unhandled types (caller is expected to cover them).
    """
    if v is None:
        return NULL.to_es_script(schema)
    if v is True:
        return EsScript(type=BOOLEAN, expr="true", frum=self)
    if v is False:
        return EsScript(type=BOOLEAN, expr="false", frum=self)
    if isinstance(v, text_type):
        return EsScript(type=STRING, expr=quote(v), frum=self)
    if isinstance(v, int):
        return EsScript(type=INTEGER, expr=text_type(v), frum=self)
    if isinstance(v, float):
        return EsScript(type=NUMBER, expr=text_type(v), frum=self)
    if isinstance(v, dict):
        return EsScript(
            type=OBJECT,
            # BUGFIX: use _convert(vv).expr — _convert returns an EsScript object,
            # which can not be concatenated to str (the list branch below already does this)
            expr="[" + ", ".join(quote(k) + ": " + _convert(vv).expr for k, vv in v.items()) + "]",
            frum=self
        )
    if isinstance(v, (list, tuple)):
        return EsScript(
            type=OBJECT,
            expr="[" + ", ".join(_convert(vv).expr for vv in v) + "]",
            frum=self
        )
    if isinstance(v, Date):
        # dates are emitted as unix-epoch numbers
        return EsScript(type=NUMBER, expr=text_type(v.unix), frum=self)
def to_es_script(self, schema, not_null=False, boolean=False, many=True):
    """
    Compile this missing-test expression to an ES painless script.

    A variable is "missing" when every leaf column backing it is empty;
    `_id` is never missing, so its missing-test is constant false.
    """
    if is_op(self.expr, Variable_):
        if self.expr.var == "_id":
            return EsScript(type=BOOLEAN, expr="false", frum=self, schema=schema)
        else:
            columns = schema.leaves(self.expr.var)
            # AND together the per-column emptiness tests, then simplify
            return (
                AndOp(
                    [
                        EsScript(
                            type=BOOLEAN,
                            expr="doc[" + quote(c.es_column) + "].empty",
                            frum=self,
                            schema=schema,
                        )
                        for c in columns
                    ]
                )
                .partial_eval()
                .to_es_script(schema)
            )
    elif is_literal(self.expr):
        return self.expr.missing().to_es_script(schema)
    else:
        # general case: delegate to the inner expression's own missing-test
        return self.expr.missing().partial_eval().to_es_script(schema)
def to_python(self, not_null=False, boolean=False, many=False):
    """
    Emit Python source testing this regex against the variable.

    The "$" is appended so the pattern must consume the whole string
    (re.match already anchors at the start).
    """
    return (
        "re.match("
        + quote(json2value(self.pattern.json) + "$")
        + ", "
        + Python[self.var].to_python()
        + ")"
    )
def unicode_key(key): """ CONVERT PROPERTY VALUE TO QUOTED NAME OF SAME """ if not isinstance(key, (text, binary_type)): from mo_logs import Log Log.error("{{key|quote}} is not a valid key", key=key) return quote(text(key))
def unicode_key(key): """ CONVERT PROPERTY VALUE TO QUOTED NAME OF SAME """ if not isinstance(key, (text_type, binary_type)): from mo_logs import Log Log.error("{{key|quote}} is not a valid key", key=key) return quote(text_type(key))
def to_python(self, not_null=False, boolean=False, many=False):
    """
    Emit Python source building a leaf-structured dict from self.terms.
    """
    # one "name":value pair per term
    pairs = [
        quote(term['name']) + ":" + term['value'].to_python()
        for term in self.terms
    ]
    return "wrap_leaves({" + ",".join(pairs) + "})"
def _convert(v):
    """
    Convert a Python literal to a Ruby (ES script) expression.

    Uses `self` and `schema` from the enclosing scope.  Booleans are
    checked with `is` before the int check since bool subclasses int.
    Returns None for unhandled types.
    """
    if v is None:
        return NULL.to_ruby(schema)
    if v is True:
        return Ruby(type=BOOLEAN, expr="true", frum=self)
    if v is False:
        return Ruby(type=BOOLEAN, expr="false", frum=self)
    if isinstance(v, text_type):
        return Ruby(type=STRING, expr=quote(v), frum=self)
    if isinstance(v, int):
        return Ruby(type=INTEGER, expr=text_type(v), frum=self)
    if isinstance(v, float):
        return Ruby(type=NUMBER, expr=text_type(v), frum=self)
    if isinstance(v, dict):
        return Ruby(
            type=OBJECT,
            # BUGFIX: use _convert(vv).expr — _convert returns a Ruby object,
            # which can not be concatenated to str (the list branch already does this)
            expr="[" + ", ".join(quote(k) + ": " + _convert(vv).expr for k, vv in v.items()) + "]",
            frum=self
        )
    if isinstance(v, (list, tuple)):
        return Ruby(
            type=OBJECT,
            expr="[" + ", ".join(_convert(vv).expr for vv in v) + "]",
            frum=self
        )
def to_python(self, not_null=False, boolean=False, many=False):
    """
    Emit Python source building a leaf-structured dict from self.terms.
    """
    body = ",".join(
        quote(term["name"]) + ":" + Python[term["value"]].to_python()
        for term in self.terms
    )
    return "wrap_leaves({" + body + "})"
def value2query(value):
    """
    Convert a Python value to its ES query literal: datetimes and
    Durations become milliseconds, numbers pass through, everything
    else is quoted as a string.
    """
    if isinstance(value, datetime):
        return convert.datetime2milli(value)
    if isinstance(value, Duration):
        return value.milli
    if Math.is_number(value):
        return value
    return quote(value)
def value2query(value):
    """
    Convert a Python value to its ES query literal: datetimes and
    Durations become milliseconds, numbers pass through, everything
    else is quoted as a string.
    """
    if isinstance(value, datetime):
        return convert.datetime2milli(value)
    elif isinstance(value, Duration):
        return value.milli
    elif Math.is_number(value):
        return value
    else:
        return quote(value)
def quote_column(*path):
    """
    Quote each segment of a dotted column path and join with '.',
    padded with spaces, as a ConcatSQL.

    :param path: one or more plain-text name segments (not SQL objects)
    """
    if not path:
        Log.error("expecting a name")
    if any(not is_text(p) for p in path):
        Log.error("expecting strings, not SQL")
    try:
        return ConcatSQL((SQL_SPACE, JoinSQL(SQL_DOT, [SQL(quote(p)) for p in path]), SQL_SPACE))
    except Exception as e:
        # BUGFIX: corrected message typo ("expacted" -> "expected")
        Log.error("Not expected", cause=e)
def to_python(self, not_null=False, boolean=False, many=False):
    """
    Emit Python source building a leaf-structured dict from self.terms.
    """
    entries = []
    for term in self.terms:
        entries.append(quote(term["name"]) + ":" + Python[term["value"]].to_python())
    return "leaves_to_data({" + ",".join(entries) + "})"
def _convert(v):
    """
    Convert a Python literal to an EsScript (painless) expression.

    Uses `self` and `schema` from the enclosing scope.  Dispatches on the
    exact class (not isinstance) to keep bool/int and subclasses distinct.
    Returns None for unhandled types.
    """
    if v is None:
        return null_script
    if v is True:
        return true_script
    if v is False:
        return false_script

    class_ = v.__class__
    if class_ is text_type:
        return EsScript(type=STRING, expr=quote(v), frum=self, schema=schema)
    if class_ in integer_types:
        # painless requires the L suffix for values outside 32-bit range
        if MIN_INT32 <= v <= MAX_INT32:
            return EsScript(type=INTEGER, expr=text_type(v), frum=self, schema=schema)
        else:
            return EsScript(type=INTEGER, expr=text_type(v) + "L", frum=self, schema=schema)
    if class_ is float:
        return EsScript(type=NUMBER, expr=text_type(v) + "D", frum=self, schema=schema)
    if class_ in data_types:
        return EsScript(
            type=OBJECT,
            # BUGFIX: use _convert(vv).expr — _convert returns an EsScript object,
            # which can not be concatenated to str (the list branch already does this)
            expr="[" + ", ".join(quote(k) + ": " + _convert(vv).expr for k, vv in v.items()) + "]",
            frum=self,
            schema=schema,
        )
    if class_ in (FlatList, list, tuple):
        return EsScript(
            type=OBJECT,
            expr="[" + ", ".join(_convert(vv).expr for vv in v) + "]",
            frum=self,
            schema=schema,
        )
    if class_ is Date:
        # dates are emitted as unix-epoch numbers
        return EsScript(type=NUMBER, expr=text_type(v.unix), frum=self, schema=schema)
def to_ruby(self, schema, not_null=False, boolean=True):
    """
    Compile this missing-test expression to an ES (Ruby-style) script.
    A variable is missing when all its leaf columns are empty; `_id`
    is never missing.
    """
    if isinstance(self.expr, Variable):
        if self.expr.var == "_id":
            return Ruby(type=BOOLEAN, expr="false", frum=self)
        else:
            columns = schema.leaves(self.expr.var)
            if len(columns) == 1:
                # single column: test it directly, no AndOp needed
                return Ruby(type=BOOLEAN, expr="doc[" + quote(columns[0].es_column) + "].isEmpty()", frum=self)
            else:
                # multiple leaves: all must be empty for the variable to be missing
                return AndOp("and", [
                    Ruby(
                        type=BOOLEAN,
                        expr="doc[" + quote(c.es_column) + "].isEmpty()",
                        frum=self
                    )
                    for c in columns
                ]).partial_eval().to_ruby(schema)
    else:
        return self.expr.missing().partial_eval().to_ruby(schema)
def to_python(self, not_null=False, boolean=False, many=False):
    """
    Emit Python source testing whether self.substring occurs in the
    variable, guarded so a None haystack yields False instead of raising.
    """
    # hoist: the haystack expression is needed twice (test and None-guard)
    haystack = Python[self.var].to_python()
    return (
        "((" + quote(self.substring) + " in " + haystack + ") if " + haystack + "!=None else False)"
    )
def append_query(self, es_query, start):
    # TODO: USE "reverse_nested" QUERY TO PULL THESE
    """
    Wrap es_query in one terms-aggregation per value; each script emits
    1/0 for whether the multivalued field contains the value.
    NOTE(review): each iteration nests the previous es_query one level
    deeper — confirm this layering is intended for multiple values.
    """
    self.start = start
    for i, v in enumerate(self.values):
        es_query = wrap({
            "aggs": {
                "_match": set_default(
                    {
                        "terms": {
                            # assumes v is a string — quote() is applied directly; verify for numeric values
                            "script": 'doc[' + quote(self.var) + '].values.contains(' + quote(v) + ') ? 1 : 0'
                        }
                    },
                    es_query)
            }
        })
    return es_query
def append_query(self, es_query, start):
    """
    Wrap es_query in a terms-aggregation whose script pipe-joins the
    multivalued field (LIST_TO_PIPE template) so each combination
    becomes one term.
    """
    self.start = start
    # resolve the variable to its first backing ES column
    es_field = self.query.frum.schema.leaves(self.var)[0].es_column
    es_query = wrap({"aggs": {
        "_match": set_default({"terms": {
            "script": expand_template(LIST_TO_PIPE, {"expr": 'doc[' + quote(es_field) + '].values'})
        }}, es_query)
    }})
    return es_query
def append_query(self, query_path, es_query):
    """
    Build a terms-aggregation over the pipe-joined values of the
    multivalued field (LIST_TO_PIPE template), nesting es_query inside.
    """
    # resolve the variable to its first backing ES column
    es_field = first(self.query.frum.schema.leaves(self.var)).es_column

    return Aggs().add(
        TermsAggs(
            "_match",
            {
                "script": expand_template(
                    LIST_TO_PIPE,
                    {"expr": 'doc[' + quote(es_field) + '].values'})
            },
            self).add(es_query))
def append_query(self, es_query, start):
    """
    Wrap es_query in a terms-aggregation whose script pipe-joins the
    multivalued field (LIST_TO_PIPE template).
    """
    self.start = start
    # first ES column backing this variable
    leaf = self.query.frum.schema.leaves(self.var)[0]
    script = expand_template(LIST_TO_PIPE, {"expr": 'doc[' + quote(leaf.es_column) + '].values'})
    terms = set_default({"terms": {"script": script}}, es_query)
    return wrap({"aggs": {"_match": terms}})
def test_unicode1(self):
    # typed_encode must pass non-ASCII characters (the em-dash) through unescaped
    value = {
        "comment": u"Open all links in the current tab, except the pages opened from external apps — open these ones in new windows"
    }
    test1 = typed_encode(value)
    # expected: string wrapped under STRING_KEY, plus the EXISTS_KEY marker
    expected = u'{"comment":{' + quote(
        STRING_KEY
    ) + u':"Open all links in the current tab, except the pages opened from external apps — open these ones in new windows"},' + quote(
        EXISTS_KEY) + u':1}'
    self.assertEqual(test1, expected)
def value2MVEL(value): """ FROM PYTHON VALUE TO MVEL EQUIVALENT """ if isinstance(value, datetime): return str(convert.datetime2milli(value)) + " /*" + value.format("yyNNNdd HHmmss") + "*/" # TIME if isinstance(value, Duration): return str(convert.timedelta2milli(value)) + " /*" + str(value) + "*/" # DURATION if Math.is_number(value): return str(value) return quote(value)
def to_es14_script(self, schema, not_null=False, boolean=False, many=True):
    """
    Compile this missing-test to an ES 1.4 script.  A variable is missing
    when all its leaf columns are empty; `_id` is never missing.
    """
    if isinstance(self.expr, Variable):
        if self.expr.var == "_id":
            return EsScript(type=BOOLEAN, expr="false", frum=self)
        else:
            columns = schema.leaves(self.expr.var)
            if len(columns) == 1:
                # single column: no AndOp needed
                return EsScript(type=BOOLEAN, expr="doc[" + quote(first(columns).es_column) + "].isEmpty()", frum=self)
            else:
                # all leaves must be empty for the variable to be missing
                return AndOp("and", [
                    EsScript(
                        type=BOOLEAN,
                        expr="doc[" + quote(c.es_column) + "].isEmpty()",
                        frum=self
                    )
                    for c in columns
                ]).partial_eval().to_es14_script(schema)
    elif isinstance(self.expr, Literal):
        return self.expr.missing().to_es14_script(schema)
    else:
        return self.expr.missing().partial_eval().to_es14_script(schema)
def to_es_script(self, schema, not_null=False, boolean=True):
    """
    Compile this missing-test to an ES script.  A variable is missing
    when all its leaf columns are empty; `_id` is never missing.
    """
    if isinstance(self.expr, Variable):
        if self.expr.var == "_id":
            return EsScript(type=BOOLEAN, expr="false", frum=self)
        else:
            columns = schema.leaves(self.expr.var)
            if len(columns) == 1:
                # single column: no AndOp needed
                return EsScript(type=BOOLEAN, expr="doc[" + quote(columns[0].es_column) + "].isEmpty()", frum=self)
            else:
                # all leaves must be empty for the variable to be missing
                return AndOp("and", [
                    EsScript(
                        type=BOOLEAN,
                        expr="doc[" + quote(c.es_column) + "].isEmpty()",
                        frum=self
                    )
                    for c in columns
                ]).partial_eval().to_es_script(schema)
    elif isinstance(self.expr, Literal):
        return self.expr.missing().to_es_script(schema)
    else:
        return self.expr.missing().partial_eval().to_es_script(schema)
def value2MVEL(value): """ FROM PYTHON VALUE TO MVEL EQUIVALENT """ if isinstance(value, datetime): return str(convert.datetime2milli(value)) + " /*" + value.format("yyNNNdd HHmmss") + "*/" # TIME if isinstance(value, Duration): return str(convert.timedelta2milli(value)) + " /*" + str(value) + "*/" # DURATION if Math.is_number(value): return str(value) return quote(value)
def append_query(self, es_query, start):
    """
    Wrap es_query in one terms-aggregation per value; each script emits
    1/0 for whether the multivalued field contains the value.
    NOTE(review): each iteration nests the previous es_query one level
    deeper — confirm this layering is intended for multiple values.
    """
    self.start = start
    # resolve the variable to its first backing ES column
    es_field = self.query.frum.schema.leaves(self.var)[0].es_column

    for i, v in enumerate(self.values):
        es_query = wrap({"aggs": {
            "_match": set_default({"terms": {
                # value2json makes the value a valid script literal for any type
                "script": 'doc['+quote(es_field)+'].values.contains(' + value2json(v) + ') ? 1 : 0'
            }}, es_query)
        }})
    return es_query
def md5(source, chunk_size=CHUNK_SIZE):
    """
    Compute an S3/ETag-style quoted MD5 for `source`, hashing in chunks.
    Multi-chunk inputs use the multipart convention (md5-of-md5s + "-" + count).

    NOTE(review): Python-2-only idioms here — .encode("hex") and the
    str/bytes concatenation in the multi-chunk branch would fail on Python 3.
    """
    md5s = []
    for g, data in jx.chunk(source.read_bytes(), size=chunk_size):
        md5s.append(hashlib.md5(data).digest())

    if len(md5s) == 0:
        # MD5 of the empty string, pre-quoted
        return '"d41d8cd98f00b204e9800998ecf8427e"'
    elif len(md5s) == 1:
        return quote(md5s[0].encode("hex"))
    else:
        # multipart: hash the concatenated digests; admittedly untested
        Log.warning("not known to work")
        new_md5 = hashlib.md5(b"".join(md5s))
        return unicode(new_md5.hexdigest() + b"-" + str(len(md5s)))
def to_es_script(self, schema, not_null=False, boolean=False, many=True):
    """
    Compile a Variable reference to an ES painless script; "." is the
    whole document, "_id" is extracted from _uid, anything else is
    resolved through the schema to one script per backing column
    (coalesced when there are several).
    """
    if self.var == ".":
        return EsScript(type=OBJECT, expr="_source", frum=self)
    else:
        if self.var == "_id":
            # _uid is "<type>#<id>"; strip everything through the '#'
            return EsScript(
                type=STRING,
                expr='doc["_uid"].value.substring(doc["_uid"].value.indexOf(\'#\')+1)',
                frum=self,
                schema=schema,
            )

        columns = schema.values(self.var)
        acc = []
        for c in columns:
            varname = c.es_column
            frum = Variable(c.es_column)
            q = quote(varname)
            if many:
                acc.append(
                    EsScript(
                        miss=frum.missing(),
                        type=c.jx_type,
                        # booleans expose .value only; other types expose .values (plural)
                        expr="doc[" + q + "].values"
                        if c.jx_type != BOOLEAN
                        else "doc[" + q + "].value",
                        frum=frum,
                        schema=schema,
                        many=c.jx_type != BOOLEAN,
                    )
                )
            else:
                acc.append(
                    EsScript(
                        miss=frum.missing(),
                        type=c.jx_type,
                        # NOTE(review): both arms of this conditional are identical
                        # (".value" either way) — possibly a leftover; confirm intent
                        expr="doc[" + q + "].value"
                        if c.jx_type != BOOLEAN
                        else "doc[" + q + "].value",
                        frum=frum,
                        schema=schema,
                        many=True,
                    )
                )

        if len(acc) == 0:
            return NULL.to_es_script(schema)
        elif len(acc) == 1:
            return acc[0]
        else:
            return CoalesceOp(acc).to_es_script(schema)
def to_es_script(self, schema, not_null=False, boolean=False, many=True):
    """
    Compile a Variable reference to an ES painless script; "." is the
    whole document, "_id" is extracted from _uid, anything else is
    resolved through the schema to one script per backing column
    (coalesced when there are several).
    """
    if self.var == ".":
        return EsScript(type=OBJECT, expr="_source", frum=self)

    if self.var == "_id":
        # _uid is "<type>#<id>"; strip everything through the '#'
        return EsScript(
            type=STRING,
            expr='doc["_uid"].value.substring(doc["_uid"].value.indexOf(\'#\')+1)',
            frum=self,
            schema=schema,
        )

    scripts = []
    for col in schema.values(self.var):
        source = Variable(col.es_column)
        accessor = quote(col.es_column)
        if many and col.jx_type != BOOLEAN:
            # multivalued access for non-boolean columns
            scripts.append(EsScript(
                miss=source.missing(),
                type=col.jx_type,
                expr="doc[" + accessor + "].values",
                frum=source,
                schema=schema,
                many=True,
            ))
        elif many:
            # boolean columns only expose a single .value
            scripts.append(EsScript(
                miss=source.missing(),
                type=col.jx_type,
                expr="doc[" + accessor + "].value",
                frum=source,
                schema=schema,
                many=False,
            ))
        else:
            scripts.append(EsScript(
                miss=source.missing(),
                type=col.jx_type,
                expr="doc[" + accessor + "].value",
                frum=source,
                schema=schema,
                many=True,
            ))

    if not scripts:
        return NULL.to_es_script(schema)
    if len(scripts) == 1:
        return scripts[0]
    return CoalesceOp(scripts).to_es_script(schema)
def _convert(v):
    """
    Convert a Python literal to an EsScript (painless) expression.

    Uses `self` and `schema` from the enclosing scope.  Dispatches on the
    exact class to keep bool/int distinct.  Returns None for unhandled types.
    """
    if v is None:
        return null_script
    if v is True:
        return true_script
    if v is False:
        return false_script

    class_ = v.__class__
    if class_ is text_type:
        return EsScript(type=STRING, expr=quote(v), frum=self, schema=schema)
    if class_ is int:
        return EsScript(type=INTEGER, expr=text_type(v), frum=self, schema=schema)
    if class_ is float:
        return EsScript(type=NUMBER, expr=text_type(v), frum=self, schema=schema)
    # BUGFIX: data_types is a tuple of classes, so membership (`in`) is the
    # correct test — `class_ is data_types` could never be true (cf. the
    # FlatList/list/tuple branch below, and the sibling implementation)
    if class_ in data_types:
        return EsScript(
            type=OBJECT,
            # BUGFIX: use _convert(vv).expr — _convert returns an EsScript object,
            # which can not be concatenated to str (the list branch already does this)
            expr="[" + ", ".join(quote(k) + ": " + _convert(vv).expr for k, vv in v.items()) + "]",
            frum=self,
            schema=schema,
        )
    if class_ in (FlatList, list, tuple):
        return EsScript(
            type=OBJECT,
            expr="[" + ", ".join(_convert(vv).expr for vv in v) + "]",
            frum=self,
            schema=schema,
        )
    if class_ is Date:
        # dates are emitted as unix-epoch numbers
        return EsScript(type=NUMBER, expr=text_type(v.unix), frum=self, schema=schema)
def compileString2Term(edge):
    """
    Compile a string-valued edge to a (toTerm, fromTerm) pair: toTerm is
    the MVEL expression pulling the doc value, fromTerm maps a term back
    to its domain partition.
    """
    if edge.esscript:
        Log.error("edge script not supported yet")

    value = edge.value
    if is_variable_name(value):
        value = strings.expand_template("getDocValue({{path}})", {"path": quote(value)})
    else:
        Log.error("not handled")

    def fromTerm(value):
        # resolve the raw term back to its domain part
        return edge.domain.getPartByKey(value)

    return Data(toTerm={"head": "", "body": value}, fromTerm=fromTerm)
def quote_column(*path):
    """
    Quote each segment of a dotted column path and join with '.',
    padded with spaces, as a ConcatSQL.

    :param path: one or more plain-text name segments
    """
    if DEBUG:
        # validation is debug-only; production trusts its callers
        if not path:
            Log.error("expecting a name")
        for p in path:
            if not is_text(p):
                Log.error("expecting strings, not {{type}}", type=p.__class__.__name__)
    try:
        output = ConcatSQL(SQL_SPACE, JoinSQL(SQL_DOT, [SQL(quote(p)) for p in path]), SQL_SPACE)
        return output
    except Exception as e:
        # BUGFIX: corrected message typo ("expacted" -> "expected")
        Log.error("Not expected", cause=e)
def compileString2Term(edge):
    """
    Compile a string-valued edge to a (toTerm, fromTerm) pair: toTerm is
    the MVEL expression pulling the doc value, fromTerm maps a term back
    to its domain partition.
    """
    if edge.esscript:
        Log.error("edge script not supported yet")

    value = edge.value
    if is_variable_name(value):
        # wrap the field name in a getDocValue() call
        value = strings.expand_template("getDocValue({{path}})", {"path": quote(value)})
    else:
        Log.error("not handled")

    def fromTerm(raw_term):
        return edge.domain.getPartByKey(raw_term)

    return Data(
        toTerm={"head": "", "body": value},
        fromTerm=fromTerm
    )
def format(self):
    """
    Yield the source-text representation of this literal node, followed
    by any trailing line comment.
    """
    value = self.node.value
    if isinstance(value, str):
        if self.is_multiline_string:
            yield '"""' + value + '"""'
        else:
            yield quote(value)
    elif isinstance(value, (float, int)):
        # NOTE: bool is a subclass of int, so True/False render via str() here too
        yield str(value)
    elif value is None:
        yield "None"
    elif isinstance(value, type(...)):
        yield "..."
    elif isinstance(value, bytes):
        yield repr(value)
    else:
        Log.error("do not know how to handle {{type}}", type=value.__class__.__name__)
    # NOTE(review): format_comment() result is not yielded — presumably it
    # emits elsewhere or is a generator whose output is dropped; confirm
    format_comment(self.line_comment)
def compile_expression(source): """ THIS FUNCTION IS ON ITS OWN FOR MINIMAL GLOBAL NAMESPACE :param source: PYTHON SOURCE CODE :return: PYTHON FUNCTION """ fake_locals = {} try: exec( """ def output(row, rownum=None, rows=None): _source = """ + strings.quote(source) + """ try: return """ + source + """ except Exception as e: Log.error("Problem with dynamic function {{func|quote}}", func=_source, cause=e) """, globals(), fake_locals) except Exception as e: Log.error("Bad source: {{source}}", source=source, cause=e) return fake_locals['output']
def to_es_script(self, schema, not_null=False, boolean=False, many=True):
    """
    Compile this missing-test expression to an ES painless script.
    A variable is missing when every leaf column backing it is empty;
    `_id` is never missing.
    """
    if is_op(self.expr, Variable_):
        if self.expr.var == "_id":
            return EsScript(type=BOOLEAN, expr="false", frum=self, schema=schema)

        # AND together the per-column emptiness tests, then simplify
        empties = [
            EsScript(
                type=BOOLEAN,
                expr="doc[" + quote(leaf.es_column) + "].empty",
                frum=self,
                schema=schema,
            )
            for leaf in schema.leaves(self.expr.var)
        ]
        return AndOp(empties).partial_eval().to_es_script(schema)
    elif is_literal(self.expr):
        return self.expr.missing().to_es_script(schema)
    else:
        return self.expr.missing().partial_eval().to_es_script(schema)
def compile_expression(source, function_name="output"):
    """
    THIS FUNCTION IS ON ITS OWN FOR MINIMAL GLOBAL NAMESPACE

    :param source: PYTHON SOURCE CODE
    :param function_name: OPTIONAL NAME TO GIVE TO OUTPUT FUNCTION
    :return: PYTHON FUNCTION
    """
    # SECURITY NOTE: exec() of caller-supplied source — never pass untrusted input
    fake_locals = {}
    try:
        exec(
            (
                "def " + function_name + "(row, rownum=None, rows=None):\n" +
                "    _source = " + strings.quote(source) + "\n" +
                "    try:\n" +
                "        return " + source + "\n" +
                "    except Exception as e:\n" +
                "        Log.error(u'Problem with dynamic function {{func|quote}}', func=_source, cause=e)\n"
            ),
            GLOBALS,
            fake_locals,
        )
    except Exception as e:
        Log.error(u"Bad source: {{source}}", source=source, cause=e)
    # BUGFIX: look up by function_name — the original hard-coded "output",
    # which raised KeyError whenever a caller supplied a different name
    return fake_locals[function_name]
def to_python(self, not_null=False, boolean=False, many=False):
    """
    Emit Python source building a leaf-structured dict from self.terms.
    """
    inner = ",".join(
        quote(term['name']) + ":" + term['value'].to_python()
        for term in self.terms
    )
    return "wrap_leaves({" + inner + "})"
def to_python(self, not_null=False, boolean=False, many=False):
    """
    Emit Python source testing this regex against the variable.
    The appended "$" forces a full-string match (re.match anchors the start).
    """
    return "re.match(" + quote(json2value(self.pattern.json) + "$") + ", " + self.var.to_python() + ")"
def Parts2Term(self, domain):
    """
    TERMS ARE ALWAYS ESCAPED SO THEY CAN BE COMPOUNDED WITH PIPE (|)

    CONVERT AN ARRAY OF PARTS{name, esfilter} TO AN MVEL EXPRESSION
    RETURN expression, function PAIR, WHERE
        expression - MVEL EXPRESSION
        function - TAKES RESULT OF expression AND RETURNS PART
    """
    fields = domain.dimension.fields

    term = []
    if len(split_field(self.fromData.name)) == 1 and fields:
        if isinstance(fields, Mapping):
            # CONVERT UNORDERED FIELD DEFS
            jx_fields, es_fields = transpose(*[(k, fields[k]) for k in sorted(fields.keys())])
        else:
            jx_fields, es_fields = transpose(*[(i, e) for i, e in enumerate(fields)])

        # NO LOOPS BECAUSE QUERY IS SHALLOW
        # DOMAIN IS FROM A DIMENSION, USE IT'S FIELD DEFS TO PULL
        if len(es_fields) == 1:
            def fromTerm(term):
                return domain.getPartByKey(term)

            return Data(
                head="",
                body='getDocValue('+quote(domain.dimension.fields[0])+')'
            ), fromTerm
        else:
            def fromTerm(term):
                # split the pipe-compounded term back into per-field values
                terms = [convert.pipe2value(t) for t in convert.pipe2value(term).split("|")]

                candidate = dict(zip(jx_fields, terms))
                # linear scan for a partition matching every field value
                for p in domain.partitions:
                    for k, t in candidate.items():
                        if p.value[k] != t:
                            break
                    else:
                        return p
                if domain.type in ["uid", "default"]:
                    # open domains grow a new partition on demand
                    part = {"value": candidate}
                    domain.partitions.append(part)
                    return part
                else:
                    return Null

            for f in es_fields:
                term.append('Value2Pipe(getDocValue('+quote(f)+'))')

            return Data(
                head="",
                body='Value2Pipe('+('+"|"+'.join(term))+')'
            ), fromTerm
    else:
        # deep/filtered case: compile an if/else chain over partition filters
        for v in domain.partitions:
            term.append("if (" + _where(v.esfilter, lambda x: self._translate(x)) + ") " + value2MVEL(domain.getKey(v)) + "; else ")
        term.append(value2MVEL(domain.getKey(domain.NULL)))

        # NOTE(review): func_name is computed but never used, and this branch
        # returns a single value rather than the (expression, function) pair
        # documented above — confirm callers of this path
        func_name = "_temp"+UID()
        return self.register_function("+\"|\"+".join(term))
def DataClass(name, columns, constraint=None):
    """
    Use the DataClass to define a class, but with some extra features:
    1. restrict the datatype of property
    2. restrict if `required`, or if `nulls` are allowed
    3. generic constraints on object properties

    It is expected that this class become a real class (or be removed) in the
    long term because it is expensive to use and should only be good for
    verifying program correctness, not user input.

    :param name: Name of the class we are creating
    :param columns: Each columns[i] has properties {
            "name",     - (required) name of the property
            "required", - False if it must be defined (even if None)
            "nulls",    - True if property can be None, or missing
            "default",  - A default value, if none is provided
            "type"      - a Python datatype
        }
    :param constraint: a JSON query Expression for extra constraints (return true if all constraints are met)
    :return: The class that has been created
    """
    # normalize shorthand string columns into full column descriptors
    columns = wrap(
        [
            {"name": c, "required": True, "nulls": False, "type": object}
            if is_text(c)
            else c
            for c in columns
        ]
    )
    slots = columns.name
    required = wrap(
        filter(lambda c: c.required and not c.nulls and not c.default, columns)
    ).name
    nulls = wrap(filter(lambda c: c.nulls, columns)).name
    defaults = {c.name: coalesce(c.default, None) for c in columns}
    types = {c.name: coalesce(c.jx_type, object) for c in columns}

    # NOTE: the "{"+"{...}}" splicing below keeps literal moustaches out of
    # expand_template's reach so they survive into the generated source
    code = expand_template(
        """
from __future__ import unicode_literals
from mo_future import is_text, is_binary
from collections import Mapping

meta = None
types_ = {{types}}
defaults_ = {{defaults}}

class {{class_name}}(Mapping):
    __slots__ = {{slots}}

    def _constraint(row, rownum, rows):
        try:
            return {{constraint_expr}}
        except Exception as e:
            return False

    def __init__(self, **kwargs):
        if not kwargs:
            return
        for s in {{slots}}:
            object.__setattr__(self, s, kwargs.get(s, {{defaults}}.get(s, None)))
        missed = {{required}}-set(kwargs.keys())
        if missed:
            Log.error("Expecting properties {"+"{missed}}", missed=missed)
        illegal = set(kwargs.keys())-set({{slots}})
        if illegal:
            Log.error("{"+"{names}} are not a valid properties", names=illegal)
        if not self._constraint(0, [self]):
            Log.error("constraint not satisfied {"+"{expect}}\\n{"+"{value|indent}}", expect={{constraint}}, value=self)

    def __getitem__(self, item):
        return getattr(self, item)

    def __setitem__(self, item, value):
        setattr(self, item, value)
        return self

    def __setattr__(self, item, value):
        if item not in {{slots}}:
            Log.error("{"+"{item|quote}} not valid attribute", item=item)
        object.__setattr__(self, item, value)
        if not self._constraint(0, [self]):
            Log.error("constraint not satisfied {"+"{expect}}\\n{"+"{value|indent}}", expect={{constraint}}, value=self)

    def __getattr__(self, item):
        Log.error("{"+"{item|quote}} not valid attribute", item=item)

    def __hash__(self):
        return object.__hash__(self)

    def __eq__(self, other):
        if isinstance(other, {{class_name}}) and dict(self)==dict(other) and self is not other:
            Log.error("expecting to be same object")
        return self is other

    def __dict__(self):
        return {k: getattr(self, k) for k in {{slots}}}

    def items(self):
        return ((k, getattr(self, k)) for k in {{slots}})

    def __copy__(self):
        _set = object.__setattr__
        output = object.__new__({{class_name}})
        {{assign}}
        return output

    def __iter__(self):
        return {{slots}}.__iter__()

    def __len__(self):
        return {{len_slots}}

    def __str__(self):
        return str({{dict}})
""",
        {
            "class_name": name,
            "slots": "(" + (", ".join(quote(s) for s in slots)) + ")",
            "required": "{" + (", ".join(quote(s) for s in required)) + "}",
            "nulls": "{" + (", ".join(quote(s) for s in nulls)) + "}",
            "defaults": Literal(defaults).to_python(),
            "len_slots": len(slots),
            "dict": "{" + (", ".join(quote(s) + ": self." + s for s in slots)) + "}",
            "assign": "; ".join(
                "_set(output, " + quote(s) + ", self." + s + ")" for s in slots
            ),
            "types": "{" + (",".join(quote(k) + ": " + v.__name__ for k, v in types.items())) + "}",
            "constraint_expr": Python[jx_expression(constraint)].to_python(),
            "constraint": value2json(constraint),
        },
    )

    output = _exec(code, name)
    register_data(output)
    return output
def DataClass(name, columns, constraint=None):
    """
    Use the DataClass to define a class, but with some extra features:
    1. restrict the datatype of property
    2. restrict if `required`, or if `nulls` are allowed
    3. generic constraints on object properties

    It is expected that this class become a real class (or be removed) in the
    long term because it is expensive to use and should only be good for
    verifying program correctness, not user input.

    :param name: Name of the class we are creating
    :param columns: Each columns[i] has properties {
            "name",     - (required) name of the property
            "required", - False if it must be defined (even if None)
            "nulls",    - True if property can be None, or missing
            "default",  - A default value, if none is provided
            "type"      - a Python datatype
        }
    :param constraint: a JSON query Expression for extra constraints (return true if all constraints are met)
    :return: The class that has been created
    """
    # normalize shorthand string columns into full column descriptors
    columns = wrap([{
        "name": c,
        "required": True,
        "nulls": False,
        "type": object
    } if is_text(c) else c for c in columns])
    slots = columns.name
    required = wrap(
        filter(lambda c: c.required and not c.nulls and not c.default, columns)).name
    nulls = wrap(filter(lambda c: c.nulls, columns)).name
    defaults = {c.name: coalesce(c.default, None) for c in columns}
    types = {c.name: coalesce(c.jx_type, object) for c in columns}

    # NOTE: the "{"+"{...}}" splicing below keeps literal moustaches out of
    # expand_template's reach so they survive into the generated source
    code = expand_template(
        """
from __future__ import unicode_literals
from mo_future import is_text, is_binary
from collections import Mapping

meta = None
types_ = {{types}}
defaults_ = {{defaults}}

class {{class_name}}(Mapping):
    __slots__ = {{slots}}

    def _constraint(row, rownum, rows):
        try:
            return {{constraint_expr}}
        except Exception as e:
            Log.error(
                "constraint\\n{" + "{code}}\\nnot satisfied {" + "{expect}}\\n{" + "{value|indent}}",
                code={{constraint_expr|quote}},
                expect={{constraint}},
                value=row,
                cause=e
            )

    def __init__(self, **kwargs):
        if not kwargs:
            return
        for s in {{slots}}:
            object.__setattr__(self, s, kwargs.get(s, {{defaults}}.get(s, None)))
        missed = {{required}}-set(kwargs.keys())
        if missed:
            Log.error("Expecting properties {"+"{missed}}", missed=missed)
        illegal = set(kwargs.keys())-set({{slots}})
        if illegal:
            Log.error("{"+"{names}} are not a valid properties", names=illegal)
        self._constraint(0, [self])

    def __getitem__(self, item):
        return getattr(self, item)

    def __setitem__(self, item, value):
        setattr(self, item, value)
        return self

    def __setattr__(self, item, value):
        if item not in {{slots}}:
            Log.error("{"+"{item|quote}} not valid attribute", item=item)
        object.__setattr__(self, item, value)
        self._constraint(0, [self])

    def __getattr__(self, item):
        Log.error("{"+"{item|quote}} not valid attribute", item=item)

    def __hash__(self):
        return object.__hash__(self)

    def __eq__(self, other):
        if isinstance(other, {{class_name}}) and dict(self)==dict(other) and self is not other:
            Log.error("expecting to be same object")
        return self is other

    def __dict__(self):
        return {k: getattr(self, k) for k in {{slots}}}

    def items(self):
        return ((k, getattr(self, k)) for k in {{slots}})

    def __copy__(self):
        _set = object.__setattr__
        output = object.__new__({{class_name}})
        {{assign}}
        return output

    def __iter__(self):
        return {{slots}}.__iter__()

    def __len__(self):
        return {{len_slots}}

    def __str__(self):
        return str({{dict}})
""",
        {
            "class_name": name,
            "slots": "(" + (", ".join(quote(s) for s in slots)) + ")",
            "required": "{" + (", ".join(quote(s) for s in required)) + "}",
            "nulls": "{" + (", ".join(quote(s) for s in nulls)) + "}",
            "defaults": Literal(defaults).to_python(),
            "len_slots": len(slots),
            "dict": "{" + (", ".join(quote(s) + ": self." + s for s in slots)) + "}",
            "assign": "; ".join("_set(output, " + quote(s) + ", self." + s + ")" for s in slots),
            "types": "{" + (",".join(quote(k) + ": " + v.__name__ for k, v in types.items())) + "}",
            # constraints can be disabled wholesale via ENABLE_CONSTRAINTS
            "constraint_expr": Python[jx_expression(not ENABLE_CONSTRAINTS or constraint)].to_python(),
            "constraint": value2json(constraint),
        },
    )

    output = _exec(code, name)
    register_data(output)
    return output
def pretty_json(value):
    """
    Serialize `value` as human-readable JSON: dicts sorted by key, small
    arrays packed into aligned columns, with best-effort recovery for
    strings and objects that resist normal encoding.
    """
    try:
        if value is False:
            return "false"
        elif value is True:
            return "true"
        elif is_data(value):
            try:
                items = sort_using_key(value.items(), lambda r: r[0])
                # drop null-valued properties; `!= None` is mo-dots Null-aware
                values = [encode_basestring(k) + PRETTY_COLON + pretty_json(v) for k, v in items if v != None]
                if not values:
                    return "{}"
                elif len(values) == 1:
                    return "{" + values[0] + "}"
                else:
                    return "{\n" + ",\n".join(indent(v) for v in values) + "\n}"
            except Exception as e:
                from mo_logs import Log
                from mo_math import OR

                if OR(not is_text(k) for k in value.keys()):
                    Log.error(
                        "JSON must have string keys: {{keys}}:",
                        keys=[k for k in value.keys()],
                        cause=e
                    )

                Log.error(
                    "problem making dict pretty: keys={{keys}}:",
                    keys=[k for k in value.keys()],
                    cause=e
                )
        elif value in (None, Null):
            return "null"
        elif value.__class__ in (binary_type, text_type):
            if is_binary(value):
                value = utf82unicode(value)
            try:
                return quote(value)
            except Exception as e:
                from mo_logs import Log

                try:
                    # fallback: escape character-by-character, skipping anything hopeless
                    Log.note("try explicit convert of string with length {{length}}", length=len(value))
                    acc = [QUOTE]
                    for c in value:
                        try:
                            try:
                                c2 = ESCAPE_DCT[c]
                            except Exception:
                                c2 = c
                            c3 = text_type(c2)
                            acc.append(c3)
                        except BaseException:
                            pass
                            # Log.warning("odd character {{ord}} found in string.  Ignored.", ord= ord(c)}, cause=g)
                    acc.append(QUOTE)
                    output = u"".join(acc)
                    Log.note("return value of length {{length}}", length=len(output))
                    return output
                except BaseException as f:
                    Log.warning("can not convert {{type}} to json", type=f.__class__.__name__, cause=f)
                    return "null"
        elif is_list(value):
            if not value:
                return "[]"

            if ARRAY_MAX_COLUMNS == 1:
                return "[\n" + ",\n".join([indent(pretty_json(v)) for v in value]) + "\n]"

            if len(value) == 1:
                j = pretty_json(value[0])
                if j.find("\n") >= 0:
                    return "[\n" + indent(j) + "\n]"
                else:
                    return "[" + j + "]"

            js = [pretty_json(v) for v in value]
            max_len = max(*[len(j) for j in js])
            if max_len <= ARRAY_ITEM_MAX_LENGTH and max(*[j.find("\n") for j in js]) == -1:
                # ALL TINY VALUES
                num_columns = max(1, min(ARRAY_MAX_COLUMNS, int(floor((ARRAY_ROW_LENGTH + 2.0) / float(max_len + 2)))))  # +2 TO COMPENSATE FOR COMMAS
                if len(js) <= num_columns:  # DO NOT ADD \n IF ONLY ONE ROW
                    return "[" + PRETTY_COMMA.join(js) + "]"
                if num_columns == 1:  # DO NOT rjust IF THERE IS ONLY ONE COLUMN
                    return "[\n" + ",\n".join([indent(pretty_json(v)) for v in value]) + "\n]"

                content = ",\n".join(
                    PRETTY_COMMA.join(j.rjust(max_len) for j in js[r:r + num_columns])
                    for r in xrange(0, len(js), num_columns)
                )
                return "[\n" + indent(content) + "\n]"

            pretty_list = js

            output = ["[\n"]
            for i, p in enumerate(pretty_list):
                try:
                    if i > 0:
                        output.append(",\n")
                    output.append(indent(p))
                except Exception:
                    from mo_logs import Log

                    Log.warning("problem concatenating string of length {{len1}} and {{len2}}",
                        len1=len("".join(output)),
                        len2=len(p)
                    )
            output.append("\n]")
            try:
                return "".join(output)
            except Exception as e:
                from mo_logs import Log

                Log.error("not expected", cause=e)
        elif hasattr(value, '__data__'):
            d = value.__data__()
            return pretty_json(d)
        elif hasattr(value, '__json__'):
            j = value.__json__()
            if j == None:
                return " null " # TODO: FIND OUT WHAT CAUSES THIS
            return pretty_json(json_decoder(j))
        elif scrub(value) is None:
            return "null"
        elif hasattr(value, '__iter__'):
            return pretty_json(list(value))
        elif hasattr(value, '__call__'):
            return "null"
        else:
            try:
                if int(value) == value:
                    return text_type(int(value))
            except Exception:
                pass

            try:
                if float(value) == value:
                    return text_type(float(value))
            except Exception:
                pass

            return pypy_json_encode(value)
    except Exception as e:
        problem_serializing(value, e)
def typed_encode(value, sub_schema, path, net_new_properties, buffer):
    """
    TYPED JSON ENCODER: WRITE value TO buffer, WRAPPING EVERY PRIMITIVE IN A
    SINGLE-PROPERTY OBJECT KEYED BY ITS INSERTER TYPE (~b~, ~n~, ~s~, ~N~, ~e~)

    :param value: THE DATA STRUCTURE TO ENCODE
    :param sub_schema: dict FROM PATH TO Column DESCRIBING THE TYPE
    :param path: list OF CURRENT PATH
    :param net_new_properties: list FOR ADDING NEW PROPERTIES NOT FOUND IN sub_schema
    :param buffer: UnicodeBuilder OBJECT
    :return: None (OUTPUT ACCUMULATES IN buffer; sub_schema AND net_new_properties ARE MUTATED)
    """
    try:
        # from jx_base import Column
        # A Column PINS A CONCRETE TYPE: VERIFY THE VALUE MATCHES, THEN RE-WRAP AS A
        # ONE-ENTRY dict SO THE GENERIC dict-BASED LOGIC BELOW CAN BE REUSED
        if sub_schema.__class__.__name__=='Column':
            value_json_type = python_type_to_json_type[value.__class__]
            column_json_type = es_type_to_json_type[sub_schema.es_type]

            if value_json_type == column_json_type:
                pass  # ok
            elif value_json_type == NESTED and all(python_type_to_json_type[v.__class__] == column_json_type for v in value if v != None):
                pass  # empty arrays can be anything
            else:
                from mo_logs import Log

                Log.error("Can not store {{value}} in {{column|quote}}", value=value, column=sub_schema.name)

            sub_schema = {json_type_to_inserter_type[value_json_type]: sub_schema}

        if value == None:
            # MISSING VALUES ARE NOT ENCODED; CALLER MUST OMIT THEM
            from mo_logs import Log

            Log.error("can not encode null (missing) values")
        elif value is True:
            if BOOLEAN_TYPE not in sub_schema:
                sub_schema[BOOLEAN_TYPE] = {}
                net_new_properties.append(path + [BOOLEAN_TYPE])
            append(buffer, '{')
            append(buffer, QUOTED_BOOLEAN_TYPE)
            append(buffer, 'true}')
            return
        elif value is False:
            if BOOLEAN_TYPE not in sub_schema:
                sub_schema[BOOLEAN_TYPE] = {}
                net_new_properties.append(path + [BOOLEAN_TYPE])
            append(buffer, '{')
            append(buffer, QUOTED_BOOLEAN_TYPE)
            append(buffer, 'false}')
            return

        _type = value.__class__
        if _type in (dict, Data):
            if sub_schema.__class__.__name__ == 'Column':
                from mo_logs import Log

                Log.error("Can not handle {{column|json}}", column=sub_schema)

            if NESTED_TYPE in sub_schema:
                # PREFER NESTED, WHEN SEEN BEFORE
                if value:
                    append(buffer, '{')
                    append(buffer, QUOTED_NESTED_TYPE)
                    append(buffer, '[')
                    _dict2json(value, sub_schema[NESTED_TYPE], path + [NESTED_TYPE], net_new_properties, buffer)
                    append(buffer, ']' + COMMA)
                    append(buffer, QUOTED_EXISTS_TYPE)
                    # EXISTS COUNT IS len(value) -- NOTE(review): FOR A dict THIS IS THE
                    # NUMBER OF KEYS, PRESUMABLY INTENTIONAL; CONFIRM AGAINST READER
                    append(buffer, text_type(len(value)))
                    append(buffer, '}')
                else:
                    # SINGLETON LIST
                    append(buffer, '{')
                    append(buffer, QUOTED_NESTED_TYPE)
                    append(buffer, '[{')
                    append(buffer, QUOTED_EXISTS_TYPE)
                    append(buffer, '1}]')
                    append(buffer, COMMA)
                    append(buffer, QUOTED_EXISTS_TYPE)
                    append(buffer, '1}')
            else:
                if EXISTS_TYPE not in sub_schema:
                    sub_schema[EXISTS_TYPE] = {}
                    net_new_properties.append(path + [EXISTS_TYPE])

                if value:
                    _dict2json(value, sub_schema, path, net_new_properties, buffer)
                else:
                    # EMPTY OBJECT STILL MARKS EXISTENCE
                    append(buffer, '{')
                    append(buffer, QUOTED_EXISTS_TYPE)
                    append(buffer, '1}')
        elif _type is binary_type:
            if STRING_TYPE not in sub_schema:
                sub_schema[STRING_TYPE] = True
                net_new_properties.append(path + [STRING_TYPE])
            append(buffer, '{')
            append(buffer, QUOTED_STRING_TYPE)
            append(buffer, '"')
            try:
                v = utf82unicode(value)
            except Exception as e:
                raise problem_serializing(value, e)

            # ESCAPE EACH CHARACTER PER JSON RULES
            for c in v:
                append(buffer, ESCAPE_DCT.get(c, c))
            append(buffer, '"}')
        elif _type is text_type:
            if STRING_TYPE not in sub_schema:
                sub_schema[STRING_TYPE] = True
                net_new_properties.append(path + [STRING_TYPE])
            append(buffer, '{')
            append(buffer, QUOTED_STRING_TYPE)
            append(buffer, '"')
            for c in value:
                append(buffer, ESCAPE_DCT.get(c, c))
            append(buffer, '"}')
        elif _type in (int, long):
            if NUMBER_TYPE not in sub_schema:
                sub_schema[NUMBER_TYPE] = True
                net_new_properties.append(path + [NUMBER_TYPE])
            append(buffer, '{')
            append(buffer, QUOTED_NUMBER_TYPE)
            append(buffer, text_type(value))
            append(buffer, '}')
        elif _type in (float, Decimal):
            if NUMBER_TYPE not in sub_schema:
                sub_schema[NUMBER_TYPE] = True
                net_new_properties.append(path + [NUMBER_TYPE])
            append(buffer, '{')
            append(buffer, QUOTED_NUMBER_TYPE)
            append(buffer, float2json(value))
            append(buffer, '}')
        elif _type in (set, list, tuple, FlatList):
            if len(value) == 0:
                append(buffer, '{')
                append(buffer, QUOTED_EXISTS_TYPE)
                append(buffer, '0}')
            elif any(v.__class__ in (Data, dict, set, list, tuple, FlatList) for v in value):
                # THIS IS NOT DONE BECAUSE -- (original comment truncated; branch handles
                # lists that contain structured values, which must be nested)
                if len(value) == 1:
                    if NESTED_TYPE in sub_schema:
                        append(buffer, '{')
                        append(buffer, QUOTED_NESTED_TYPE)
                        _list2json(value, sub_schema[NESTED_TYPE], path + [NESTED_TYPE], net_new_properties, buffer)
                        append(buffer, '}')
                    else:
                        # NO NEED TO NEST, SO DO NOT DO IT
                        typed_encode(value[0], sub_schema, path, net_new_properties, buffer)
                else:
                    if NESTED_TYPE not in sub_schema:
                        sub_schema[NESTED_TYPE] = {}
                        net_new_properties.append(path + [NESTED_TYPE])
                    append(buffer, '{')
                    append(buffer, QUOTED_NESTED_TYPE)
                    _list2json(value, sub_schema[NESTED_TYPE], path + [NESTED_TYPE], net_new_properties, buffer)
                    append(buffer, '}')
            else:
                # ALLOW PRIMITIVE MULTIVALUES
                value = [v for v in value if v != None]
                types = list(set(json_type_to_inserter_type[python_type_to_json_type[v.__class__]] for v in value))
                if len(types) == 0:  # HANDLE LISTS WITH Nones IN THEM
                    append(buffer, '{')
                    append(buffer, QUOTED_NESTED_TYPE)
                    append(buffer, '[]}')
                elif len(types) > 1:
                    # MIXED PRIMITIVE TYPES MUST BE NESTED
                    _list2json(value, sub_schema, path + [NESTED_TYPE], net_new_properties, buffer)
                else:
                    element_type = types[0]
                    if element_type not in sub_schema:
                        sub_schema[element_type] = True
                        net_new_properties.append(path + [element_type])
                    append(buffer, '{')
                    append(buffer, quote(element_type))
                    append(buffer, COLON)
                    _multivalue2json(value, sub_schema[element_type], path + [element_type], net_new_properties, buffer)
                    append(buffer, '}')
        elif _type is date:
            # DATES/TIMES ARE STORED AS UNIX-EPOCH NUMBERS
            if NUMBER_TYPE not in sub_schema:
                sub_schema[NUMBER_TYPE] = True
                net_new_properties.append(path + [NUMBER_TYPE])
            append(buffer, '{')
            append(buffer, QUOTED_NUMBER_TYPE)
            append(buffer, float2json(time.mktime(value.timetuple())))
            append(buffer, '}')
        elif _type is datetime:
            if NUMBER_TYPE not in sub_schema:
                sub_schema[NUMBER_TYPE] = True
                net_new_properties.append(path + [NUMBER_TYPE])
            append(buffer, '{')
            append(buffer, QUOTED_NUMBER_TYPE)
            append(buffer, float2json(time.mktime(value.timetuple())))
            append(buffer, '}')
        elif _type is Date:
            if NUMBER_TYPE not in sub_schema:
                sub_schema[NUMBER_TYPE] = True
                net_new_properties.append(path + [NUMBER_TYPE])
            append(buffer, '{')
            append(buffer, QUOTED_NUMBER_TYPE)
            append(buffer, float2json(value.unix))
            append(buffer, '}')
        elif _type is timedelta:
            if NUMBER_TYPE not in sub_schema:
                sub_schema[NUMBER_TYPE] = True
                net_new_properties.append(path + [NUMBER_TYPE])
            append(buffer, '{')
            append(buffer, QUOTED_NUMBER_TYPE)
            append(buffer, float2json(value.total_seconds()))
            append(buffer, '}')
        elif _type is Duration:
            if NUMBER_TYPE not in sub_schema:
                sub_schema[NUMBER_TYPE] = True
                net_new_properties.append(path + [NUMBER_TYPE])
            append(buffer, '{')
            append(buffer, QUOTED_NUMBER_TYPE)
            append(buffer, float2json(value.seconds))
            append(buffer, '}')
        elif _type is NullType:
            append(buffer, 'null')
        elif hasattr(value, '__data__'):
            # PROJECT CONVENTION: __data__() RETURNS A PLAIN data STRUCTURE
            typed_encode(value.__data__(), sub_schema, path, net_new_properties, buffer)
        elif hasattr(value, '__iter__'):
            # GENERIC ITERABLES ARE TREATED AS NESTED ARRAYS
            if NESTED_TYPE not in sub_schema:
                sub_schema[NESTED_TYPE] = {}
                net_new_properties.append(path + [NESTED_TYPE])
            append(buffer, '{')
            append(buffer, QUOTED_NESTED_TYPE)
            _iter2json(value, sub_schema[NESTED_TYPE], path + [NESTED_TYPE], net_new_properties, buffer)
            append(buffer, '}')
        else:
            from mo_logs import Log

            Log.error(text_type(repr(value)) + " is not JSON serializable")
    except Exception as e:
        from mo_logs import Log

        Log.error(text_type(repr(value)) + " is not JSON serializable", cause=e)
def string2quote(value): if value == None: return "None" return quote(value)
def _update_cardinality(self, column):
    """
    QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN

    UPDATES self.meta.columns IN PLACE; NEVER RAISES (FAILURES ARE LOGGED AND THE
    COLUMN'S STATS ARE CLEARED INSTEAD)
    """
    # SKIP INDEXES ALREADY KNOWN TO BE GONE
    if column.es_index in self.index_does_not_exist:
        return

    if column.jx_type in STRUCT:
        Log.error("not supported")
    try:
        # THE META TABLES ARE LOCAL; COMPUTE PARTITIONS WITHOUT TOUCHING ES
        if column.es_index == "meta.columns":
            partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.columns, column.es_column) if g[column.es_column] != None])
            self.meta.columns.update({
                "set": {
                    "partitions": partitions,
                    "count": len(self.meta.columns),
                    "cardinality": len(partitions),
                    "multi": 1,
                    "last_updated": Date.now()
                },
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        if column.es_index == "meta.tables":
            partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.tables, column.es_column) if g[column.es_column] != None])
            self.meta.columns.update({
                "set": {
                    "partitions": partitions,
                    "count": len(self.meta.tables),
                    "cardinality": len(partitions),
                    "multi": 1,
                    "last_updated": Date.now()
                },
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return

        # ONLY THE ALIAS (BEFORE THE FIRST DOT) IS A REAL ES INDEX NAME
        es_index = column.es_index.split(".")[0]

        is_text = [cc for cc in self.meta.columns if cc.es_column == column.es_column and cc.es_type == "text"]
        if is_text:
            # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
            result = self.es_cluster.post("/" + es_index + "/_search", data={
                "aggs": {
                    "count": {"filter": {"match_all": {}}}
                },
                "size": 0
            })
            count = result.hits.total
            # FORCE cardinality/multi ABOVE THE 1000 THRESHOLD SO PARTITIONS ARE NEVER ENUMERATED
            cardinality = max(1001, count)
            multi = 1001
        elif column.es_column == "_id":
            result = self.es_cluster.post("/" + es_index + "/_search", data={
                "query": {"match_all": {}},
                "size": 0
            })
            # EVERY DOCUMENT HAS A UNIQUE _id
            count = cardinality = result.hits.total
            multi = 1
        elif column.es_type == BOOLEAN:
            result = self.es_cluster.post("/" + es_index + "/_search", data={
                "aggs": {
                    "count": _counting_query(column)
                },
                "size": 0
            })
            count = result.hits.total
            cardinality = 2
            multi = 1
        else:
            # GENERAL CASE: ASK ES FOR DISTINCT-VALUE COUNT AND MAX VALUES-PER-DOC
            result = self.es_cluster.post("/" + es_index + "/_search", data={
                "aggs": {
                    "count": _counting_query(column),
                    "multi": {"max": {"script": "doc[" + quote(column.es_column) + "].values.size()"}}
                },
                "size": 0
            })
            agg_results = result.aggregations
            count = result.hits.total
            # THE COUNT MAY BE NESTED ONE LEVEL DOWN, DEPENDING ON THE COLUMN'S nested_path
            cardinality = coalesce(agg_results.count.value, agg_results.count._nested.value, agg_results.count.doc_count)
            multi = int(coalesce(agg_results.multi.value, 1))
            if cardinality == None:
                Log.error("logic error")

        query = Data(size=0)

        if column.es_column == "_id":
            self.meta.columns.update({
                "set": {
                    "count": cardinality,
                    "cardinality": cardinality,
                    "multi": 1,
                    "last_updated": Date.now()
                },
                "clear": ["partitions"],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
            # TOO MANY DISTINCT VALUES TO BE USEFUL AS PARTITIONS
            DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "last_updated": Date.now()
                },
                "clear": ["partitions"],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
            # NUMERIC COLUMNS WITH MANY VALUES ARE RANGES, NOT CATEGORIES
            DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "last_updated": Date.now()
                },
                "clear": ["partitions"],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        elif len(column.nested_path) != 1:
            # NESTED COLUMNS NEED A nested AGG WRAPPER TO ENUMERATE TERMS
            query.aggs["_"] = {
                "nested": {"path": column.nested_path[0]},
                "aggs": {"_nested": {"terms": {"field": column.es_column}}}
            }
        elif cardinality == 0:
            query.aggs["_"] = {"terms": {"field": column.es_column}}
        else:
            query.aggs["_"] = {"terms": {"field": column.es_column, "size": cardinality}}

        # SECOND QUERY: FETCH THE ACTUAL PARTITION VALUES
        result = self.es_cluster.post("/" + es_index + "/_search", data=query)
        aggs = result.aggregations._
        if aggs._nested:
            parts = jx.sort(aggs._nested.buckets.key)
        else:
            parts = jx.sort(aggs.buckets.key)

        self.meta.columns.update({
            "set": {
                "count": count,
                "cardinality": cardinality,
                "multi": multi,
                "partitions": parts,
                "last_updated": Date.now()
            },
            "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
        })
    except Exception as e:
        # CAN NOT IMPORT: THE TEST MODULES SETS UP LOGGING
        # from tests.test_jx import TEST_TABLE
        e = Except.wrap(e)
        TEST_TABLE = "testdata"
        is_missing_index = any(w in e for w in ["IndexMissingException", "index_not_found_exception"])
        is_test_table = column.es_index.startswith((TEST_TABLE_PREFIX, TEST_TABLE))
        if is_missing_index and is_test_table:
            # WE EXPECT TEST TABLES TO DISAPPEAR
            self.meta.columns.update({
                "clear": ".",
                "where": {"eq": {"es_index": column.es_index}}
            })
            self.index_does_not_exist.add(column.es_index)
        else:
            # UNKNOWN FAILURE: CLEAR STALE STATS, KEEP THE COLUMN RECORD
            self.meta.columns.update({
                "set": {
                    "last_updated": Date.now()
                },
                "clear": [
                    "count",
                    "cardinality",
                    "multi",
                    "partitions",
                ],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)
append(buffer, QUOTED_EXISTS_TYPE) append(buffer, '1}') TYPE_PREFIX = "~" # u'\u0442\u0443\u0440\u0435-' # "туре" BOOLEAN_TYPE = TYPE_PREFIX + "b~" NUMBER_TYPE = TYPE_PREFIX + "n~" STRING_TYPE = TYPE_PREFIX + "s~" NESTED_TYPE = TYPE_PREFIX + "N~" EXISTS_TYPE = TYPE_PREFIX + "e~" append = UnicodeBuilder.append QUOTED_BOOLEAN_TYPE = quote(BOOLEAN_TYPE) + COLON QUOTED_NUMBER_TYPE = quote(NUMBER_TYPE) + COLON QUOTED_STRING_TYPE = quote(STRING_TYPE) + COLON QUOTED_NESTED_TYPE = quote(NESTED_TYPE) + COLON QUOTED_EXISTS_TYPE = quote(EXISTS_TYPE) + COLON inserter_type_to_json_type = { BOOLEAN_TYPE: BOOLEAN, NUMBER_TYPE: NUMBER, STRING_TYPE: STRING } json_type_to_inserter_type = { BOOLEAN: BOOLEAN_TYPE, INTEGER: NUMBER_TYPE, NUMBER: NUMBER_TYPE,
def pretty_json(value):
    """
    RETURN A HUMAN-FRIENDLY JSON RENDERING OF value

    SMALL ARRAYS ARE PACKED INTO ALIGNED COLUMNS; DICT KEYS ARE SORTED; STRINGS
    THAT FAIL NORMAL QUOTING ARE ESCAPED CHARACTER-BY-CHARACTER AS A FALLBACK
    """
    try:
        if value is False:
            return "false"
        elif value is True:
            return "true"
        elif isinstance(value, Mapping):
            try:
                items = sort_using_key(list(value.items()), lambda r: r[0])
                # None-VALUED PROPERTIES ARE OMITTED
                values = [encode_basestring(k) + PRETTY_COLON + indent(pretty_json(v)).strip() for k, v in items if v != None]
                if not values:
                    return "{}"
                elif len(values) == 1:
                    return "{" + values[0] + "}"
                else:
                    return "{\n" + INDENT + (",\n" + INDENT).join(values) + "\n}"
            except Exception as e:
                from mo_logs import Log
                from mo_math import OR

                if OR(not isinstance(k, text_type) for k in value.keys()):
                    Log.error("JSON must have string keys: {{keys}}:", keys=[k for k in value.keys()], cause=e)

                Log.error("problem making dict pretty: keys={{keys}}:", keys=[k for k in value.keys()], cause=e)
        elif value in (None, Null):
            return "null"
        elif isinstance(value, (text_type, binary_type)):
            if isinstance(value, binary_type):
                value = utf82unicode(value)
            try:
                return quote(value)
            except Exception as e:
                from mo_logs import Log

                # FALLBACK: ESCAPE ONE CHARACTER AT A TIME, DROPPING ANY THAT STILL FAIL
                try:
                    Log.note("try explicit convert of string with length {{length}}", length=len(value))
                    acc = [QUOTE]
                    for c in value:
                        try:
                            try:
                                c2 = ESCAPE_DCT[c]
                            except Exception:
                                c2 = c
                            c3 = text_type(c2)
                            acc.append(c3)
                        except BaseException:
                            pass
                            # Log.warning("odd character {{ord}} found in string.  Ignored.", ord= ord(c)}, cause=g)
                    acc.append(QUOTE)
                    output = u"".join(acc)
                    Log.note("return value of length {{length}}", length=len(output))
                    return output
                except BaseException as f:
                    Log.warning("can not even explicit convert {{type}}", type=f.__class__.__name__, cause=f)
                    return "null"
        elif isinstance(value, list):
            if not value:
                return "[]"

            if ARRAY_MAX_COLUMNS == 1:
                return "[\n" + ",\n".join([indent(pretty_json(v)) for v in value]) + "\n]"

            if len(value) == 1:
                j = pretty_json(value[0])
                if j.find("\n") >= 0:
                    return "[\n" + indent(j) + "\n]"
                else:
                    return "[" + j + "]"

            # len(value) >= 2 HERE, SO max(*...) ALWAYS HAS AT LEAST TWO ARGUMENTS
            js = [pretty_json(v) for v in value]
            max_len = max(*[len(j) for j in js])
            if max_len <= ARRAY_ITEM_MAX_LENGTH and max(*[j.find("\n") for j in js]) == -1:
                # ALL TINY VALUES
                num_columns = max(1, min(ARRAY_MAX_COLUMNS, int(floor((ARRAY_ROW_LENGTH + 2.0) / float(max_len + 2)))))  # +2 TO COMPENSATE FOR COMMAS
                if len(js) <= num_columns:  # DO NOT ADD \n IF ONLY ONE ROW
                    return "[" + PRETTY_COMMA.join(js) + "]"
                if num_columns == 1:  # DO NOT rjust IF THERE IS ONLY ONE COLUMN
                    return "[\n" + ",\n".join([indent(pretty_json(v)) for v in value]) + "\n]"

                # LAY OUT THE ITEMS IN A RIGHT-JUSTIFIED GRID, num_columns PER ROW
                content = ",\n".join(
                    PRETTY_COMMA.join(j.rjust(max_len) for j in js[r:r + num_columns])
                    for r in xrange(0, len(js), num_columns)
                )
                return "[\n" + indent(content) + "\n]"

            # ITEMS TOO BIG FOR COLUMNS: ONE ITEM PER LINE
            pretty_list = js
            output = ["[\n"]
            for i, p in enumerate(pretty_list):
                try:
                    if i > 0:
                        output.append(",\n")
                    output.append(indent(p))
                except Exception:
                    from mo_logs import Log

                    Log.warning("problem concatenating string of length {{len1}} and {{len2}}", len1=len("".join(output)), len2=len(p))
            output.append("\n]")
            try:
                return "".join(output)
            except Exception as e:
                from mo_logs import Log

                Log.error("not expected", cause=e)
        elif hasattr(value, '__data__'):
            d = value.__data__()
            return pretty_json(d)
        elif hasattr(value, '__json__'):
            j = value.__json__()
            if j == None:
                return " null "  # TODO: FIND OUT WHAT CAUSES THIS
            return pretty_json(json_decoder(j))
        elif scrub(value) is None:
            return "null"
        elif hasattr(value, '__iter__'):
            return pretty_json(list(value))
        elif hasattr(value, '__call__'):
            # CALLABLES ARE NOT SERIALIZABLE; RENDER AS null
            return "null"
        else:
            # NUMERIC FALLBACKS: PREFER EXACT int, THEN float, THEN THE RAW ENCODER
            try:
                if int(value) == value:
                    return text_type(int(value))
            except Exception:
                pass

            try:
                if float(value) == value:
                    return text_type(float(value))
            except Exception:
                pass

            return pypy_json_encode(value)
    except Exception as e:
        problem_serializing(value, e)
def assertAlmostEqual(test, expected, digits=None, places=None, msg=None, delta=None):
    """
    RECURSIVE STRUCTURAL COMPARISON, TOLERANT OF FLOAT IMPRECISION

    :param test: THE VALUE UNDER TEST
    :param expected: THE EXPECTED VALUE (dict/list/set/scalar, OR A PREDICATE FUNCTION)
    :param digits: NUMBER OF SIGNIFICANT DIGITS FOR NUMERIC COMPARISON
    :param places: NUMBER OF DECIMAL PLACES FOR NUMERIC COMPARISON
    :param msg: PREFIX FOR ERROR MESSAGES
    :param delta: MAXIMUM ABSOLUTE DIFFERENCE ALLOWED
    :raises: Log.error WHEN THE VALUES DO NOT MATCH
    """
    show_detail = True
    test = unwrap(test)
    expected = unwrap(expected)
    try:
        if test is None and (is_null_op(expected) or expected is None):
            return
        elif test is expected:
            return
        elif is_text(expected):
            assertAlmostEqualValue(test, expected, msg=msg, digits=digits, places=places, delta=delta)
        elif isinstance(test, UniqueIndex):
            # ^ IS SYMMETRIC DIFFERENCE: NON-EMPTY MEANS THE SETS DIFFER
            if test ^ expected:
                Log.error("Sets do not match")
        elif is_data(expected) and is_data(test):
            # COMPARE ONLY THE KEYS PRESENT IN expected
            for k, e in unwrap(expected).items():
                t = test.get(k)
                assertAlmostEqual(t, e, msg=coalesce(msg, "") + "key " + quote(k) + ": ", digits=digits, places=places, delta=delta)
        elif is_data(expected):
            # expected IS A dict BUT test IS NOT: ACCEPT A SINGLETON LIST WRAPPER
            if is_many(test):
                test = list(test)
                if len(test) != 1:
                    Log.error("Expecting data, not a list")
                test = test[0]

            for k, e in expected.items():
                if is_text(k):
                    t = mo_dots.get_attr(test, literal_field(k))
                else:
                    t = test[k]
                assertAlmostEqual(t, e, msg=msg, digits=digits, places=places, delta=delta)
        elif is_container(test) and isinstance(expected, set):
            test = set(to_data(t) for t in test)
            if len(test) != len(expected):
                # BUGFIX: TEMPLATE REFERRED TO {{expectedtest}}, WHICH IS NOT A PARAMETER,
                # SO THE EXPECTED VALUE WAS NEVER RENDERED
                Log.error(
                    "Sets do not match, element count different:\n{{test|json|indent}}\nexpecting {{expected|json|indent}}",
                    test=test,
                    expected=expected
                )

            # EACH EXPECTED ELEMENT MUST MATCH AT LEAST ONE TEST ELEMENT
            for e in expected:
                for t in test:
                    try:
                        assertAlmostEqual(t, e, msg=msg, digits=digits, places=places, delta=delta)
                        break
                    except Exception as _:
                        pass
                else:
                    Log.error("Sets do not match. {{value|json}} not found in {{test|json}}", value=e, test=test)
        elif isinstance(expected, types.FunctionType):
            # expected MAY BE A PREDICATE; DELEGATE THE CHECK TO IT
            return expected(test)
        elif hasattr(test, "__iter__") and hasattr(expected, "__iter__"):
            # DUCK-TYPE COMMON SCIENTIFIC CONTAINERS INTO PLAIN LISTS
            if test.__class__.__name__ == "ndarray":  # numpy
                test = test.tolist()
            elif test.__class__.__name__ == "DataFrame":  # pandas
                test = test[test.columns[0]].values.tolist()
            elif test.__class__.__name__ == "Series":  # pandas
                test = test.values.tolist()

            if not expected and test == None:
                return
            if expected == None:
                expected = []  # REPRESENT NOTHING
            for t, e in zip_longest(test, expected):
                assertAlmostEqual(t, e, msg=msg, digits=digits, places=places, delta=delta)
        else:
            assertAlmostEqualValue(test, expected, msg=msg, digits=digits, places=places, delta=delta)
    except Exception as e:
        Log.error(
            "{{test|json|limit(10000)}} does not match expected {{expected|json|limit(10000)}}",
            test=test if show_detail else "[can not show]",
            expected=expected if show_detail else "[can not show]",
            cause=e
        )
def append_query(self, query_path, es_query): es_field = first(self.query.frum.schema.leaves(self.var)).es_column return Aggs().add(TermsAggs("_match", { "script": expand_template(LIST_TO_PIPE, {"expr": 'doc[' + quote(es_field) + '].values'}) }, self).add(es_query))
def to_python(self, not_null=False, boolean=False, many=False): return "((" + quote(self.substring) + " in " + self.var.to_python( ) + ") if " + self.var.to_python() + "!=None else False)"
def _where(esFilter, _translate): if not esFilter or esFilter is True: return "true" keys = esFilter.keys() if len(keys) != 1: Log.error("Expecting only one filter aggregate") op = keys[0] if op == "and": list = esFilter[op] if not (list): return "true" if len(list) == 1: return _where(list[0], _translate) output = "(" + " && ".join(_where(l, _translate) for l in list) + ")" return output elif op == "or": list = esFilter[op] if not list: return "false" if len(list) == 1: return _where(list[0], _translate) output = "(" + " || ".join(_where(l, _translate) for l in list) + ")" return output elif op == "not": return "!(" + _where(esFilter[op, _translate]) + ")" elif op == "term": pair = esFilter[op] if len(pair.keys()) == 1: return [_translate(k) + "==" + value2MVEL(v) for k, v in pair.items()][0] else: return "(" + " && ".join(_translate(k) + "==" + value2MVEL(v) for k, v in pair.items()) + ")" elif op == "terms": output = [] for variableName, valueList in esFilter[op].items(): if not valueList: Log.error("Expecting something in 'terms' array") if len(valueList) == 1: output.append(_translate(variableName) + "==" + value2MVEL(valueList[0])) else: output.append("(" + " || ".join(_translate(variableName) + "==" + value2MVEL(v) for v in valueList) + ")") return " && ".join(output) elif op == "exists": # "exists":{"field":"myField"} pair = esFilter[op] variableName = pair.field return "(" + _translate(variableName) + "!=null)" elif op == "missing": fieldName = _translate(esFilter[op].field) testExistence = coalesce(esFilter[op].existence, True) testNull = coalesce(esFilter[op].null_value, True) output = [] if testExistence and not testNull: output.append("(" + fieldName.replace(".?", ".") + " == empty)") # REMOVE THE .? 
SO WE REFER TO THE FIELD, NOT GET THE VALUE if testNull: output.append("(" + fieldName + "==null)") return " || ".join(output) elif op == "range": pair = esFilter[op] ranges = [] for variableName, r in pair.items(): if r.gte: ranges.append(value2MVEL(r.gte) + "<=" + _translate(variableName)) elif r.gt: ranges.append(value2MVEL(r.gt) + "<" + _translate(variableName)) elif r["from"]: if r.include_lower == None or r.include_lower: ranges.append(value2MVEL(r["from"]) + "<=" + _translate(variableName)) else: ranges.append(value2MVEL(r["from"]) + "<" + _translate(variableName)) if r.lte: ranges.append(value2MVEL(r.lte) + ">=" + _translate(variableName)) elif r.lt: ranges.append(value2MVEL(r.lt) + ">" + _translate(variableName)) elif r["from"]: if r.include_lower == None or r.include_lower: ranges.append(value2MVEL(r["from"]) + ">=" + _translate(variableName)) else: ranges.append(value2MVEL(r["from"]) + ">" + _translate(variableName)) return "("+" && ".join(ranges)+")" elif op == "script": script = esFilter[op].script return _translate(script) elif op == "prefix": pair = esFilter[op] variableName, value = pair.items()[0] return _translate(variableName) + ".startsWith(" + quote(value) + ")" elif op == "match_all": return "true" else: Log.error("'" + op + "' is an unknown aggregate") return ""
def es_aggsop(es, frum, query):
    """
    TRANSLATE A JX AGGREGATION QUERY TO AN ES AGGS REQUEST, RUN IT, AND FORMAT THE RESULT

    :param es: THE ES CLUSTER/INDEX HANDLE PASSED TO es_post
    :param frum: THE TABLE (PROVIDES schema AND name)
    :param query: THE JX QUERY; A COPY IS MARKED UP WITH .pull FUNCTIONS PER SELECT
    :return: FORMATTED QUERY OUTPUT, WITH TIMING METADATA ATTACHED
    """
    query = query.copy()  # WE WILL MARK UP THIS QUERY
    schema = frum.schema
    select = listwrap(query.select)

    es_query = Data()
    new_select = Data()  # MAP FROM canonical_name (USED FOR NAMES IN QUERY) TO SELECT MAPPING
    formula = []

    # SPLIT SELECTS INTO SIMPLE-VARIABLE AGGREGATES (new_select) AND SCRIPTED ONES (formula)
    for s in select:
        if s.aggregate == "count" and isinstance(s.value, Variable) and s.value.var == ".":
            if schema.query_path == ".":
                s.pull = jx_expression_to_function("doc_count")
            else:
                s.pull = jx_expression_to_function({"coalesce": ["_nested.doc_count", "doc_count", 0]})
        elif isinstance(s.value, Variable):
            if s.aggregate == "count":
                new_select["count_" + literal_field(s.value.var)] += [s]
            else:
                new_select[literal_field(s.value.var)] += [s]
        elif s.aggregate:
            formula.append(s)

    for canonical_name, many in new_select.items():
        for s in many:
            columns = frum.schema.values(s.value.var)

            if s.aggregate == "count":
                canonical_names = []
                for column in columns:
                    cn = literal_field(column.es_column + "_count")
                    if column.jx_type == EXISTS:
                        canonical_names.append(cn + ".doc_count")
                        es_query.aggs[cn].filter.range = {column.es_column: {"gt": 0}}
                    else:
                        canonical_names.append(cn + ".value")
                        es_query.aggs[cn].value_count.field = column.es_column
                if len(canonical_names) == 1:
                    s.pull = jx_expression_to_function(canonical_names[0])
                else:
                    s.pull = jx_expression_to_function({"add": canonical_names})
            elif s.aggregate == "median":
                if len(columns) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")
                # ES USES DIFFERENT METHOD FOR PERCENTILES
                key = literal_field(canonical_name + " percentile")
                es_query.aggs[key].percentiles.field = columns[0].es_column
                es_query.aggs[key].percentiles.percents += [50]
                s.pull = jx_expression_to_function(key + ".values.50\\.0")
            elif s.aggregate == "percentile":
                if len(columns) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")
                # ES USES DIFFERENT METHOD FOR PERCENTILES
                key = literal_field(canonical_name + " percentile")
                # BUGFIX: WAS s.percetile (TYPO), WHICH SILENTLY DISABLED THE NEGATIVE-VALUE CHECK
                if isinstance(s.percentile, text_type) or s.percentile < 0 or 1 < s.percentile:
                    Log.error("Expecting percentile to be a float from 0.0 to 1.0")
                percent = Math.round(s.percentile * 100, decimal=6)
                es_query.aggs[key].percentiles.field = columns[0].es_column
                es_query.aggs[key].percentiles.percents += [percent]
                es_query.aggs[key].percentiles.tdigest.compression = 2
                s.pull = jx_expression_to_function(key + ".values." + literal_field(text_type(percent)))
            elif s.aggregate == "cardinality":
                canonical_names = []
                for column in columns:
                    cn = literal_field(column.es_column + "_cardinality")
                    canonical_names.append(cn)
                    es_query.aggs[cn].cardinality.field = column.es_column
                if len(columns) == 1:
                    s.pull = jx_expression_to_function(canonical_names[0] + ".value")
                else:
                    s.pull = jx_expression_to_function({"add": [cn + ".value" for cn in canonical_names], "default": 0})
            elif s.aggregate == "stats":
                if len(columns) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")
                # REGULAR STATS
                stats_name = literal_field(canonical_name)
                es_query.aggs[stats_name].extended_stats.field = columns[0].es_column
                # GET MEDIAN TOO!
                median_name = literal_field(canonical_name + "_percentile")
                es_query.aggs[median_name].percentiles.field = columns[0].es_column
                es_query.aggs[median_name].percentiles.percents += [50]
                s.pull = get_pull_stats(stats_name, median_name)
            elif s.aggregate == "union":
                # SIMULATE union WITH A scripted_metric THAT COLLECTS DISTINCT VALUES
                pulls = []
                for column in columns:
                    script = {"scripted_metric": {
                        'init_script': 'params._agg.terms = new HashSet()',
                        'map_script': 'for (v in doc[' + quote(column.es_column) + '].values) params._agg.terms.add(v);',
                        'combine_script': 'return params._agg.terms.toArray()',
                        'reduce_script': 'HashSet output = new HashSet(); for (a in params._aggs) { if (a!=null) for (v in a) {output.add(v)} } return output.toArray()',
                    }}
                    stats_name = encode_property(column.es_column)
                    if column.nested_path[0] == ".":
                        es_query.aggs[stats_name] = script
                        pulls.append(jx_expression_to_function(stats_name + ".value"))
                    else:
                        es_query.aggs[stats_name] = {
                            "nested": {"path": column.nested_path[0]},
                            "aggs": {"_nested": script}
                        }
                        pulls.append(jx_expression_to_function(stats_name + "._nested.value"))

                if len(pulls) == 0:
                    s.pull = NULL
                elif len(pulls) == 1:
                    s.pull = pulls[0]
                else:
                    s.pull = lambda row: UNION(p(row) for p in pulls)
            else:
                if len(columns) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")
                elif len(columns) < 1:
                    # PULL VALUE OUT OF THE stats AGGREGATE
                    s.pull = jx_expression_to_function({"null": {}})
                else:
                    # PULL VALUE OUT OF THE stats AGGREGATE
                    es_query.aggs[literal_field(canonical_name)].extended_stats.field = columns[0].es_column
                    s.pull = jx_expression_to_function({"coalesce": [literal_field(canonical_name) + "." + aggregates[s.aggregate], s.default]})

    # SCRIPTED (EXPRESSION-VALUED) AGGREGATES
    for s in formula:
        canonical_name = literal_field(s.name)

        if isinstance(s.value, TupleOp):
            if s.aggregate == "count":
                # TUPLES ALWAYS EXIST, SO COUNTING THEM IS EASY
                # NOTE(review): s.pull IS A PLAIN STRING HERE, NOT A FUNCTION -- CONFIRM CONSUMERS ACCEPT THAT
                s.pull = "doc_count"
            elif s.aggregate in ('max', 'maximum', 'min', 'minimum'):
                if s.aggregate in ('max', 'maximum'):
                    direction = 1  # RENAMED FROM `dir`: DO NOT SHADOW THE BUILTIN
                    op = "max"
                else:
                    direction = -1
                    op = 'min'

                nully = TupleOp("tuple", [NULL] * len(s.value.terms)).partial_eval().to_es_script(schema).expr
                selfy = s.value.partial_eval().to_es_script(schema).expr

                script = {"scripted_metric": {
                    'init_script': 'params._agg.best = ' + nully + ';',
                    'map_script': 'params._agg.best = ' + expand_template(MAX_OF_TUPLE, {"expr1": "params._agg.best", "expr2": selfy, "dir": direction, "op": op}) + ";",
                    'combine_script': 'return params._agg.best',
                    'reduce_script': 'return params._aggs.stream().max(' + expand_template(COMPARE_TUPLE, {"dir": direction, "op": op}) + ').get()',
                }}
                if schema.query_path[0] == ".":
                    es_query.aggs[canonical_name] = script
                    s.pull = jx_expression_to_function(literal_field(canonical_name) + ".value")
                else:
                    es_query.aggs[canonical_name] = {
                        "nested": {"path": schema.query_path[0]},
                        "aggs": {"_nested": script}
                    }
                    s.pull = jx_expression_to_function(literal_field(canonical_name) + "._nested.value")
            else:
                Log.error("{{agg}} is not a supported aggregate over a tuple", agg=s.aggregate)
        elif s.aggregate == "count":
            es_query.aggs[literal_field(canonical_name)].value_count.script = s.value.partial_eval().to_es_script(schema).script(schema)
            s.pull = jx_expression_to_function(literal_field(canonical_name) + ".value")
        elif s.aggregate == "median":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")
            es_query.aggs[key].percentiles.script = s.value.to_es_script(schema).script(schema)
            es_query.aggs[key].percentiles.percents += [50]
            s.pull = jx_expression_to_function(key + ".values.50\\.0")
        elif s.aggregate == "percentile":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")
            percent = Math.round(s.percentile * 100, decimal=6)
            es_query.aggs[key].percentiles.script = s.value.to_es_script(schema).script(schema)
            es_query.aggs[key].percentiles.percents += [percent]
            s.pull = jx_expression_to_function(key + ".values." + literal_field(text_type(percent)))
        elif s.aggregate == "cardinality":
            # ES USES DIFFERENT METHOD FOR CARDINALITY
            key = canonical_name + " cardinality"
            es_query.aggs[key].cardinality.script = s.value.to_es_script(schema).script(schema)
            s.pull = jx_expression_to_function(key + ".value")
        elif s.aggregate == "stats":
            # REGULAR STATS
            stats_name = literal_field(canonical_name)
            es_query.aggs[stats_name].extended_stats.script = s.value.to_es_script(schema).script(schema)
            # GET MEDIAN TOO!
            median_name = literal_field(canonical_name + " percentile")
            es_query.aggs[median_name].percentiles.script = s.value.to_es_script(schema).script(schema)
            es_query.aggs[median_name].percentiles.percents += [50]
            s.pull = get_pull_stats(stats_name, median_name)
        elif s.aggregate == "union":
            # USE TERMS AGGREGATE TO SIMULATE union
            stats_name = literal_field(canonical_name)
            es_query.aggs[stats_name].terms.script_field = s.value.to_es_script(schema).script(schema)
            s.pull = jx_expression_to_function(stats_name + ".buckets.key")
        else:
            # PULL VALUE OUT OF THE stats AGGREGATE
            s.pull = jx_expression_to_function(canonical_name + "." + aggregates[s.aggregate])
            es_query.aggs[canonical_name].extended_stats.script = s.value.to_es_script(schema).script(schema)

    decoders = get_decoders_by_depth(query)
    start = 0

    # <TERRIBLE SECTION> THIS IS WHERE WE WEAVE THE where CLAUSE WITH nested
    split_where = split_expression_by_depth(query.where, schema=frum.schema)

    if len(split_field(frum.name)) > 1:
        if any(split_where[2::]):
            Log.error("Where clause is too deep")

        for d in decoders[1]:
            es_query = d.append_query(es_query, start)
            start += d.num_columns

        if split_where[1]:
            # TODO: INCLUDE FILTERS ON EDGES
            filter_ = AndOp("and", split_where[1]).to_esfilter(schema)
            es_query = Data(
                aggs={"_filter": set_default({"filter": filter_}, es_query)}
            )

        es_query = wrap({
            "aggs": {"_nested": set_default(
                {"nested": {"path": schema.query_path[0]}},
                es_query
            )}
        })
    else:
        if any(split_where[1::]):
            Log.error("Where clause is too deep")

        if decoders:
            for d in jx.reverse(decoders[0]):
                es_query = d.append_query(es_query, start)
                start += d.num_columns

        if split_where[0]:
            # TODO: INCLUDE FILTERS ON EDGES
            filter_ = AndOp("and", split_where[0]).to_esfilter(schema)  # RENAMED FROM `filter`: DO NOT SHADOW THE BUILTIN
            es_query = Data(
                aggs={"_filter": set_default({"filter": filter_}, es_query)}
            )
    # </TERRIBLE SECTION>

    if not es_query:
        es_query = wrap({"query": {"match_all": {}}})

    es_query.size = 0

    with Timer("ES query time") as es_duration:
        result = es_post(es, es_query, query.limit)

    try:
        format_time = Timer("formatting")
        with format_time:
            # FLATTEN DECODERS FOR THE FORMATTERS
            decoders = [d for ds in decoders for d in ds]
            result.aggregations.doc_count = coalesce(result.aggregations.doc_count, result.hits.total)  # IT APPEARS THE OLD doc_count IS GONE

            formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[query.format]
            if query.edges:
                output = formatter(decoders, result.aggregations, start, query, select)
            elif query.groupby:
                output = groupby_formatter(decoders, result.aggregations, start, query, select)
            else:
                output = aggop_formatter(decoders, result.aggregations, start, query, select)

        output.meta.timing.formatting = format_time.duration
        output.meta.timing.es_search = es_duration.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        if query.format not in format_dispatch:
            Log.error("Format {{format|quote}} not supported yet", format=query.format, cause=e)
        Log.error("Some problem", cause=e)
def es_aggsop(es, frum, query):
    """
    Translate a JX aggregation query into an Elasticsearch aggs request,
    POST it, and format the response.

    :param es: the ES cluster/index endpoint passed through to es_post()
    :param frum: the source container; provides .schema and .name
    :param query: the JX query (copied here, then marked up with .pull
        accessors per select clause)
    :return: the formatted output (format chosen by query.format)
    :raises Exception: via Log.error on unsupported aggregates, bad
        percentile values, too-deep where clauses, or formatting problems
    """
    query = query.copy()  # WE WILL MARK UP THIS QUERY
    schema = frum.schema
    select = listwrap(query.select)

    es_query = Data()
    new_select = Data()  # MAP FROM canonical_name (USED FOR NAMES IN QUERY) TO SELECT MAPPING
    formula = []
    # SPLIT THE SELECTS: SIMPLE VARIABLES GO TO new_select (GROUPED BY NAME),
    # SCRIPTED/EXPRESSION AGGREGATES GO TO formula
    for s in select:
        if s.aggregate == "count" and isinstance(s.value, Variable) and s.value.var == ".":
            if schema.query_path == ".":
                s.pull = jx_expression_to_function("doc_count")
            else:
                s.pull = jx_expression_to_function({"coalesce": ["_nested.doc_count", "doc_count", 0]})
        elif isinstance(s.value, Variable):
            if s.aggregate == "count":
                new_select["count_" + literal_field(s.value.var)] += [s]
            else:
                new_select[literal_field(s.value.var)] += [s]
        elif s.aggregate:
            formula.append(s)

    # BUILD AGGS FOR SIMPLE-VARIABLE SELECTS
    for canonical_name, many in new_select.items():
        for s in many:
            columns = frum.schema.values(s.value.var)

            if s.aggregate == "count":
                canonical_names = []
                for column in columns:
                    cn = literal_field(column.es_column + "_count")
                    if column.jx_type == EXISTS:
                        canonical_names.append(cn + ".doc_count")
                        es_query.aggs[cn].filter.range = {column.es_column: {"gt": 0}}
                    else:
                        canonical_names.append(cn + ".value")
                        es_query.aggs[cn].value_count.field = column.es_column
                if len(canonical_names) == 1:
                    s.pull = jx_expression_to_function(canonical_names[0])
                else:
                    s.pull = jx_expression_to_function({"add": canonical_names})
            elif s.aggregate == "median":
                if len(columns) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")
                # ES USES DIFFERENT METHOD FOR PERCENTILES
                key = literal_field(canonical_name + " percentile")
                es_query.aggs[key].percentiles.field = columns[0].es_column
                es_query.aggs[key].percentiles.percents += [50]
                s.pull = jx_expression_to_function(key + ".values.50\\.0")
            elif s.aggregate == "percentile":
                if len(columns) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")
                # ES USES DIFFERENT METHOD FOR PERCENTILES
                key = literal_field(canonical_name + " percentile")
                # BUG FIX: was `s.percetile < 0` (typo); on a wrapped select the
                # missing attribute is Null, so the lower-bound check never fired
                if isinstance(s.percentile, text_type) or s.percentile < 0 or 1 < s.percentile:
                    Log.error("Expecting percentile to be a float from 0.0 to 1.0")
                percent = Math.round(s.percentile * 100, decimal=6)
                es_query.aggs[key].percentiles.field = columns[0].es_column
                es_query.aggs[key].percentiles.percents += [percent]
                s.pull = jx_expression_to_function(key + ".values." + literal_field(text_type(percent)))
            elif s.aggregate == "cardinality":
                canonical_names = []
                for column in columns:
                    cn = literal_field(column.es_column + "_cardinality")
                    canonical_names.append(cn)
                    es_query.aggs[cn].cardinality.field = column.es_column
                if len(columns) == 1:
                    s.pull = jx_expression_to_function(canonical_names[0] + ".value")
                else:
                    s.pull = jx_expression_to_function({"add": [cn + ".value" for cn in canonical_names], "default": 0})
            elif s.aggregate == "stats":
                if len(columns) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")
                # REGULAR STATS
                stats_name = literal_field(canonical_name)
                es_query.aggs[stats_name].extended_stats.field = columns[0].es_column
                # GET MEDIAN TOO!
                median_name = literal_field(canonical_name + "_percentile")
                es_query.aggs[median_name].percentiles.field = columns[0].es_column
                es_query.aggs[median_name].percentiles.percents += [50]
                s.pull = get_pull_stats(stats_name, median_name)
            elif s.aggregate == "union":
                pulls = []
                for column in columns:
                    # scripted_metric COLLECTS DISTINCT VALUES ACROSS SHARDS
                    script = {"scripted_metric": {
                        'init_script': 'params._agg.terms = new HashSet()',
                        'map_script': 'for (v in doc[' + quote(column.es_column) + '].values) params._agg.terms.add(v)',
                        'combine_script': 'return params._agg.terms.toArray()',
                        'reduce_script': 'HashSet output = new HashSet(); for (a in params._aggs) { if (a!=null) for (v in a) {output.add(v)} } return output.toArray()',
                    }}
                    stats_name = encode_property(column.es_column)
                    if column.nested_path[0] == ".":
                        es_query.aggs[stats_name] = script
                        pulls.append(jx_expression_to_function(stats_name + ".value"))
                    else:
                        es_query.aggs[stats_name] = {
                            "nested": {"path": column.nested_path[0]},
                            "aggs": {"_nested": script}
                        }
                        pulls.append(jx_expression_to_function(stats_name + "._nested.value"))

                if len(pulls) == 0:
                    s.pull = NULL
                elif len(pulls) == 1:
                    s.pull = pulls[0]
                else:
                    s.pull = lambda row: UNION(p(row) for p in pulls)
            else:
                if len(columns) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")
                elif len(columns) < 1:
                    # PULL VALUE OUT OF THE stats AGGREGATE
                    s.pull = jx_expression_to_function({"null": {}})
                else:
                    # PULL VALUE OUT OF THE stats AGGREGATE
                    es_query.aggs[literal_field(canonical_name)].extended_stats.field = columns[0].es_column
                    s.pull = jx_expression_to_function({"coalesce": [literal_field(canonical_name) + "." + aggregates[s.aggregate], s.default]})

    # BUILD AGGS FOR EXPRESSION (SCRIPTED) SELECTS
    for i, s in enumerate(formula):
        canonical_name = literal_field(s.name)

        if isinstance(s.value, TupleOp):
            if s.aggregate == "count":
                # TUPLES ALWAYS EXIST, SO COUNTING THEM IS EASY
                s.pull = "doc_count"
            elif s.aggregate in ('max', 'maximum', 'min', 'minimum'):
                if s.aggregate in ('max', 'maximum'):
                    dir = 1
                    op = "max"
                else:
                    dir = -1
                    op = 'min'

                nully = TupleOp("tuple", [NULL] * len(s.value.terms)).partial_eval().to_es_script(schema).expr
                selfy = s.value.partial_eval().to_es_script(schema).expr

                script = {"scripted_metric": {
                    'init_script': 'params._agg.best = ' + nully + ';',
                    'map_script': 'params._agg.best = ' + expand_template(MAX_OF_TUPLE, {"expr1": "params._agg.best", "expr2": selfy, "dir": dir, "op": op}) + ";",
                    'combine_script': 'return params._agg.best',
                    'reduce_script': 'return params._aggs.stream().max(' + expand_template(COMPARE_TUPLE, {"dir": dir, "op": op}) + ').get()',
                }}
                if schema.query_path[0] == ".":
                    es_query.aggs[canonical_name] = script
                    s.pull = jx_expression_to_function(literal_field(canonical_name) + ".value")
                else:
                    es_query.aggs[canonical_name] = {
                        "nested": {"path": schema.query_path[0]},
                        "aggs": {"_nested": script}
                    }
                    s.pull = jx_expression_to_function(literal_field(canonical_name) + "._nested.value")
            else:
                Log.error("{{agg}} is not a supported aggregate over a tuple", agg=s.aggregate)
        elif s.aggregate == "count":
            es_query.aggs[literal_field(canonical_name)].value_count.script = s.value.partial_eval().to_es_script(schema).script(schema)
            s.pull = jx_expression_to_function(literal_field(canonical_name) + ".value")
        elif s.aggregate == "median":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")
            es_query.aggs[key].percentiles.script = s.value.to_es_script(schema).script(schema)
            es_query.aggs[key].percentiles.percents += [50]
            s.pull = jx_expression_to_function(key + ".values.50\\.0")
        elif s.aggregate == "percentile":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")
            percent = Math.round(s.percentile * 100, decimal=6)
            es_query.aggs[key].percentiles.script = s.value.to_es_script(schema).script(schema)
            es_query.aggs[key].percentiles.percents += [percent]
            s.pull = jx_expression_to_function(key + ".values." + literal_field(text_type(percent)))
        elif s.aggregate == "cardinality":
            # ES USES DIFFERENT METHOD FOR CARDINALITY
            key = canonical_name + " cardinality"
            es_query.aggs[key].cardinality.script = s.value.to_es_script(schema).script(schema)
            s.pull = jx_expression_to_function(key + ".value")
        elif s.aggregate == "stats":
            # REGULAR STATS
            stats_name = literal_field(canonical_name)
            es_query.aggs[stats_name].extended_stats.script = s.value.to_es_script(schema).script(schema)
            # GET MEDIAN TOO!
            median_name = literal_field(canonical_name + " percentile")
            es_query.aggs[median_name].percentiles.script = s.value.to_es_script(schema).script(schema)
            es_query.aggs[median_name].percentiles.percents += [50]
            s.pull = get_pull_stats(stats_name, median_name)
        elif s.aggregate == "union":
            # USE TERMS AGGREGATE TO SIMULATE union
            stats_name = literal_field(canonical_name)
            es_query.aggs[stats_name].terms.script_field = s.value.to_es_script(schema).script(schema)
            s.pull = jx_expression_to_function(stats_name + ".buckets.key")
        else:
            # PULL VALUE OUT OF THE stats AGGREGATE
            s.pull = jx_expression_to_function(canonical_name + "." + aggregates[s.aggregate])
            es_query.aggs[canonical_name].extended_stats.script = s.value.to_es_script(schema).script(schema)

    decoders = get_decoders_by_depth(query)
    start = 0

    # <TERRIBLE SECTION> THIS IS WHERE WE WEAVE THE where CLAUSE WITH nested
    split_where = split_expression_by_depth(query.where, schema=frum.schema)

    if len(split_field(frum.name)) > 1:
        if any(split_where[2::]):
            Log.error("Where clause is too deep")

        for d in decoders[1]:
            es_query = d.append_query(es_query, start)
            start += d.num_columns

        if split_where[1]:
            # TODO: INCLUDE FILTERS ON EDGES
            filter_ = AndOp("and", split_where[1]).to_esfilter(schema)
            es_query = Data(
                aggs={"_filter": set_default({"filter": filter_}, es_query)}
            )

        es_query = wrap({
            "aggs": {"_nested": set_default(
                {"nested": {"path": schema.query_path[0]}},
                es_query
            )}
        })
    else:
        if any(split_where[1::]):
            Log.error("Where clause is too deep")

    if decoders:
        for d in jx.reverse(decoders[0]):
            es_query = d.append_query(es_query, start)
            start += d.num_columns

    if split_where[0]:
        # TODO: INCLUDE FILTERS ON EDGES
        filter = AndOp("and", split_where[0]).to_esfilter(schema)
        es_query = Data(
            aggs={"_filter": set_default({"filter": filter}, es_query)}
        )
    # </TERRIBLE SECTION>

    if not es_query:
        es_query = wrap({"query": {"match_all": {}}})

    es_query.size = 0

    with Timer("ES query time") as es_duration:
        result = es_post(es, es_query, query.limit)

    try:
        format_time = Timer("formatting")
        with format_time:
            decoders = [d for ds in decoders for d in ds]
            result.aggregations.doc_count = coalesce(result.aggregations.doc_count, result.hits.total)  # IT APPEARS THE OLD doc_count IS GONE

            formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[query.format]
            if query.edges:
                output = formatter(decoders, result.aggregations, start, query, select)
            elif query.groupby:
                output = groupby_formatter(decoders, result.aggregations, start, query, select)
            else:
                output = aggop_formatter(decoders, result.aggregations, start, query, select)

        output.meta.timing.formatting = format_time.duration
        output.meta.timing.es_search = es_duration.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        if query.format not in format_dispatch:
            Log.error("Format {{format|quote}} not supported yet", format=query.format, cause=e)
        Log.error("Some problem", cause=e)
def _update_cardinality(self, column):
    """
    QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN

    Updates self.meta.columns with count/cardinality/multi (and partitions
    when the column is low-cardinality enough to enumerate). Skips columns
    on indexes already known to be missing; errors on STRUCT columns.
    """
    # KNOWN-MISSING INDEX: NOTHING TO DO
    if column.es_index in self.index_does_not_exist:
        return

    if column.type in STRUCT:
        Log.error("not supported")
    try:
        # THE meta.columns TABLE IS LOCAL; COMPUTE PARTITIONS IN-PROCESS
        if column.es_index == "meta.columns":
            with self.meta.columns.locker:
                partitions = jx.sort([
                    g[column.es_column]
                    for g, _ in jx.groupby(self.meta.columns, column.es_column)
                    if g[column.es_column] != None
                ])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.columns),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
            return
        # SAME, FOR THE LOCAL meta.tables TABLE
        if column.es_index == "meta.tables":
            with self.meta.columns.locker:
                partitions = jx.sort([
                    g[column.es_column]
                    for g, _ in jx.groupby(self.meta.tables, column.es_column)
                    if g[column.es_column] != None
                ])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.tables),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
            return

        # REAL ES INDEX; STRIP ANY TYPE/NESTED SUFFIX FROM THE NAME
        es_index = column.es_index.split(".")[0]

        is_text = [
            cc
            for cc in self.abs_columns
            if cc.es_column == column.es_column and cc.type == "text"
        ]
        if is_text:
            # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
            result = self.default_es.post("/" + es_index + "/_search", data={
                "aggs": {
                    "count": {
                        "filter": {
                            "match_all": {}
                        }
                    }
                },
                "size": 0
            })
            count = result.hits.total
            # SENTINEL VALUES (>1000) — text COLUMNS NEVER GET PARTITIONS
            cardinality = 1001
            multi = 1001
        elif column.es_column == "_id":
            # EVERY DOCUMENT HAS A UNIQUE _id, SO count == cardinality
            result = self.default_es.post("/" + es_index + "/_search", data={
                "query": {
                    "match_all": {}
                },
                "size": 0
            })
            count = cardinality = result.hits.total
            multi = 1
        else:
            # ASK ES FOR CARDINALITY AND THE MAX VALUES-PER-DOC (multi)
            result = self.default_es.post(
                "/" + es_index + "/_search",
                data={
                    "aggs": {
                        "count": _counting_query(column),
                        "multi": {
                            "max": {
                                "script": "doc[" + quote(column.es_column) + "].values.size()"
                            }
                        }
                    },
                    "size": 0
                })
            r = result.aggregations.count
            count = result.hits.total
            # _counting_query MAY WRAP THE COUNT IN A nested AGG
            cardinality = coalesce(r.value, r._nested.value, r.doc_count)
            multi = coalesce(r.multi.value, 1)

        if cardinality == None:
            Log.error("logic error")

        query = Data(size=0)

        if column.es_column == "_id":
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": cardinality,
                        "cardinality": cardinality,
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
            return
        elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
            # TOO MANY DISTINCT VALUES (OR NEAR-UNIQUE): DO NOT ENUMERATE PARTITIONS
            if DEBUG:
                Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
            return
        elif column.type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
            # NUMERIC COLUMNS WITH MANY VALUES: PARTITIONS NOT USEFUL
            if DEBUG:
                Log.note("{{field}} has {{num}} parts", field=column.es_index, num=cardinality)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
            return
        elif len(column.nested_path) != 1:
            # NESTED COLUMN: WRAP THE terms AGG IN A nested AGG
            query.aggs["_"] = {
                "nested": {
                    "path": column.nested_path[0]
                },
                "aggs": {
                    "_nested": {
                        "terms": {
                            "field": column.es_column
                        }
                    }
                }
            }
        elif cardinality == 0:
            # NO KNOWN VALUES; LET ES USE ITS DEFAULT terms SIZE
            query.aggs["_"] = {"terms": {"field": column.es_column}}
        else:
            query.aggs["_"] = {
                "terms": {
                    "field": column.es_column,
                    "size": cardinality
                }
            }

        # SECOND QUERY: ENUMERATE THE PARTITIONS
        result = self.default_es.post("/" + es_index + "/_search", data=query)

        aggs = result.aggregations._
        if aggs._nested:
            parts = jx.sort(aggs._nested.buckets.key)
        else:
            parts = jx.sort(aggs.buckets.key)

        if DEBUG:
            Log.note("{{field}} has {{parts}}", field=column.names["."], parts=parts)
        with self.meta.columns.locker:
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "partitions": parts,
                    "last_updated": Date.now()
                },
                "where": {
                    "eq": {
                        "es_index": column.es_index,
                        "es_column": column.es_column
                    }
                }
            })
    except Exception as e:
        # CAN NOT IMPORT: THE TEST MODULES SETS UP LOGGING
        # from tests.test_jx import TEST_TABLE
        TEST_TABLE = "testdata"
        # NOTE(review): `w in e` relies on the project exception type
        # supporting substring containment — confirm against Log/Except
        is_missing_index = any(
            w in e
            for w in ["IndexMissingException", "index_not_found_exception"])
        is_test_table = any(
            column.es_index.startswith(t)
            for t in [TEST_TABLE_PREFIX, TEST_TABLE])
        if is_missing_index and is_test_table:
            # WE EXPECT TEST TABLES TO DISAPPEAR
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "clear": ".",
                    "where": {
                        "eq": {
                            "es_index": column.es_index
                        }
                    }
                })
            self.index_does_not_exist.add(column.es_index)
        else:
            # UNEXPECTED FAILURE: CLEAR STALE STATS AND WARN
            self.meta.columns.update({
                "set": {
                    "last_updated": Date.now()
                },
                "clear": [
                    "count",
                    "cardinality",
                    "multi",
                    "partitions",
                ],
                "where": {
                    "eq": {
                        "names.\\.": ".",
                        "es_index": column.es_index,
                        "es_column": column.es_column
                    }
                }
            })
            Log.warning(
                "Could not get {{col.es_index}}.{{col.es_column}} info",
                col=column,
                cause=e)
def quote_table(column):
    """
    Return `column` quoted for use as a table name; identifiers that
    match _no_need_to_quote are returned unchanged.
    """
    # simple identifiers pass through untouched
    return column if _no_need_to_quote.match(column) else quote(column)