def _normalize_edge(edge, schema=None):
    if not _Column:
        _late_import()

    if edge == None:
        Log.error("Edge has no value, or expression is empty")
    elif isinstance(edge, basestring):
        if schema:
            try:
                e = schema[edge]
            except Exception, e:
                e = None
            e = unwraplist(e)
            # THE ORIGINAL TEST EXCLUDED _Column, MAKING THE _Column BRANCH
            # BELOW UNREACHABLE; ONLY set AND list ARE EXCLUDED HERE
            if e and not isinstance(e, (set, list)):
                if isinstance(e, _Column):
                    return Dict(
                        name=edge,
                        value=jx_expression(edge),
                        allowNulls=True,
                        domain=_normalize_domain(domain=e, schema=schema)
                    )
                elif isinstance(e.fields, list) and len(e.fields) == 1:
                    return Dict(
                        name=e.name,
                        value=jx_expression(e.fields[0]),
                        allowNulls=True,
                        domain=e.getDomain()
                    )
                else:
                    return Dict(name=e.name, allowNulls=True, domain=e.getDomain())
        return Dict(
            name=edge,
            value=jx_expression(edge),
            allowNulls=True,
            domain=_normalize_domain(schema=schema)
        )
def _normalize_range(range):
    if range == None:
        return None

    return Dict(
        min=None if range.min == None else jx_expression(range.min),
        max=None if range.max == None else jx_expression(range.max),
        mode=range.mode
    )
def __init__(self, filename, host="fake", index="fake", settings=None):
    self.settings = settings
    self.filename = settings.filename
    try:
        self.data = convert.json2value(File(self.filename).read())
    except Exception:
        self.data = Dict()
def _convert_from(self, frum):
    if isinstance(frum, basestring):
        return Dict(name=frum)
    elif isinstance(frum, (Container, QueryOp)):
        return frum
    else:
        Log.error("Expecting from clause to be a name, or a container")
def _normalize_selects(selects, frum, schema=None):
    if frum == None or isinstance(frum, (list, set, unicode)):
        if isinstance(selects, list):
            if len(selects) == 0:
                return Dict()
            else:
                output = [_normalize_select_no_context(s, schema=schema) for s in selects]
        else:
            return _normalize_select_no_context(selects)
    elif isinstance(selects, list):
        output = [ss for s in selects for ss in _normalize_select(s, frum=frum, schema=schema)]
    else:
        output = _normalize_select(selects, frum, schema=schema)

    exists = set()
    for s in output:
        if s.name in exists:
            Log.error("{{name}} has already been defined", name=s.name)
        exists.add(s.name)

    return output
def compileDuration2Term(edge):
    if edge.esscript:
        Log.error("edge script not supported yet")

    # IS THERE A LIMIT ON THE DOMAIN?
    numPartitions = len(edge.domain.partitions)
    value = edge.value
    if isKeyword(value):
        value = "doc[\"" + value + "\"].value"
    ref = coalesce(edge.domain.min, edge.domain.max, durations.ZERO)
    nullTest = compileNullTest(edge)

    ms = edge.domain.interval.milli
    if edge.domain.interval.month > 0:
        ms = durations.YEAR.milli / 12 * edge.domain.interval.month

    # ms AND numPartitions ARE NUMBERS; CONVERT TO unicode BEFORE CONCATENATING
    partition2int = "Math.floor((" + value + "-" + value2MVEL(ref) + ")/" + unicode(ms) + ")"
    partition2int = "((" + nullTest + ") ? " + unicode(numPartitions) + " : " + partition2int + ")"

    def int2Partition(value):
        if Math.round(value) == numPartitions:
            return edge.domain.NULL
        return edge.domain.getPartByKey(ref.add(edge.domain.interval.multiply(value)))

    return Dict(toTerm={"head": "", "body": partition2int}, fromTerm=int2Partition)
def __init__(self, host, index, type=None, alias=None, name=None, port=9200, read_only=True, typed=None, settings=None):
    Container.__init__(self, None)
    if not containers.config.default:
        containers.config.default.settings = settings
    self.settings = settings
    self.name = coalesce(name, alias, index)
    if read_only:
        self._es = elasticsearch.Alias(alias=coalesce(alias, index), settings=settings)
    else:
        self._es = elasticsearch.Cluster(settings=settings).get_index(read_only=read_only, settings=settings)

    self.meta = FromESMetadata(settings=settings)
    self.settings.type = self._es.settings.type
    self.edges = Dict()
    self.worker = None

    if typed == None:
        self._columns = self.get_columns(table_name=index)
        # SWITCH ON TYPED MODE
        self.typed = any(c.name in ("$value", "$object") for c in self._columns)
    else:
        self.typed = typed
def _convert_select(self, select):
    if isinstance(select, basestring):
        return Dict(
            name=select.rstrip("."),  # TRAILING DOT INDICATES THE VALUE, BUT IS INVALID FOR THE NAME
            value=select,
            aggregate="none"
        )
    else:
        select = wrap(select)
        output = copy(select)
        if not select.value or isinstance(select.value, basestring):
            if select.value == ".":
                output.name = coalesce(select.name, select.aggregate)
            else:
                output.name = coalesce(select.name, select.value, select.aggregate)
        elif not output.name:
            Log.error("Must give name to each column in select clause")

        if not output.name:
            Log.error("expecting select to have a name: {{select}}", select=select)

        output.aggregate = coalesce(canonical_aggregates.get(select.aggregate), select.aggregate, "none")
        return output
def dominator(graph, head):
    # WE WOULD NEED DOMINATORS IF WE DO NOT KNOW THE TOPOLOGICAL ORDERING
    # DOMINATORS ALLOW US TO USE A REFERENCE TEST RESULT: EVERYTHING BETWEEN
    # dominator(node) AND node CAN BE TREATED AS PARALLEL-APPLIED CHANGESETS
    #
    # INSTEAD OF DOMINATORS, WE COULD USE MANY PERF RESULTS, FROM EACH OF THE
    # PARENT BRANCHES, AND AS LONG AS THEY ALL ARE PART OF A LONG LINE OF
    # STATISTICALLY IDENTICAL PERF RESULTS, WE CAN ASSUME THEY ARE A DOMINATOR
    visited = set()
    dom = Dict(output=None)

    def find_dominator(node, path, graph, todo):
        if dom.output:
            return False
        if not todo:
            dom.output = node
            return False
        if node in visited:
            common = INTERSECT(p[1::] for p in todo)  # DO NOT INCLUDE head
            if node in common:
                dom.output = node  # ALL REMAINING PATHS HAVE node IN COMMON TOO
            return False
        return True

    bfs(graph, find_dominator, head)
    return dom.output
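# A minimal, self-contained sketch (not part of the library) of the dominator
# idea described above, on a toy DAG: node "d" lies on every path from head to
# target, so everything between head and "d" can be treated as parallel-applied
# changesets. simple_dominator() and the dict-of-lists graph are hypothetical,
# for illustration only; the real code relies on bfs() and INTERSECT().
def simple_dominator(successors, head, target):
    # RETURN THE SET OF NODES FOUND ON *EVERY* PATH FROM head TO target
    def all_paths(node, path):
        if node == target:
            yield path
            return
        for child in successors.get(node, []):
            for p in all_paths(child, path + [child]):
                yield p

    paths = [set(p) for p in all_paths(head, [head])]
    return set.intersection(*paths) - {head, target}

toy_graph = {"head": ["a", "b"], "a": ["d"], "b": ["d"], "d": ["target"]}
assert simple_dominator(toy_graph, "head", "target") == {"d"}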
def __init__(self, settings):
    self.settings = wrap({"host": "fake", "index": "fake"})
    self.filename = settings.filename
    try:
        self.data = convert.json2value(File(self.filename).read())
    except IOError:
        self.data = Dict()
def _convert_window(self, window):
    return Dict(
        name=coalesce(window.name, window.value),
        value=window.value,
        edges=[self._convert_edge(e) for e in listwrap(window.edges)],
        sort=self._convert_sort(window.sort),
        aggregate=window.aggregate,
        range=self._convert_range(window.range),
        where=self._convert_where(window.where)
    )
def zip(keys, values):
    """
    ZIP TWO PARALLEL LISTS, keys AND values, INTO A Dict
    """
    output = Dict()
    for i, k in enumerate(keys):
        if i >= len(values):
            break
        output[k] = values[i]
    return output
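# A minimal usage sketch (illustrative, not from the library): keys with no
# matching value are dropped, mirroring the early `break` above.
d = zip(["a", "b", "c"], [1, 2])
assert d["a"] == 1 and d["b"] == 2  # PAIRED KEYS ARE KEPT
assert set(d.keys()) == {"a", "b"}  # "c" HAD NO VALUE, SO IT WAS SKIPPED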
def _parse_properties(index, properties):
    """
    ISOLATE THE DEALING WITH THE INDEX_CACHE,
    INDEX_CACHE IS REDUNDANT WHEN YOU HAVE metadata.columns
    """
    backup = INDEX_CACHE.get(index)
    INDEX_CACHE[index] = output = Dict()
    output.name = index
    columns = parse_columns(index, properties)
    INDEX_CACHE[index] = backup
    return columns
def wrap(query, schema=None):
    """
    NORMALIZE QUERY SO IT CAN STILL BE JSON
    """
    if isinstance(query, QueryOp) or query == None:
        return query

    query = wrap(query)

    output = QueryOp("from", None)
    output.format = query.format
    output.frum = wrap_from(query["from"], schema=schema)
    if not schema and isinstance(output.frum, Schema):
        schema = output.frum

    if query.select:
        output.select = _normalize_selects(query.select, query.frum, schema=schema)
    else:
        if query.edges or query.groupby:
            output.select = Dict(name="count", value=jx_expression("."), aggregate="count", default=0)
        else:
            output.select = _normalize_selects(".", query["from"])

    if query.groupby and query.edges:
        Log.error("You can not use both the `groupby` and `edges` clauses in the same query!")
    elif query.edges:
        output.edges = _normalize_edges(query.edges, schema=schema)
        output.groupby = Null
    elif query.groupby:
        output.edges = Null
        output.groupby = _normalize_groupby(query.groupby, schema=schema)
    else:
        output.edges = Null
        output.groupby = Null

    output.where = _normalize_where(query.where, schema=schema)
    output.window = [_normalize_window(w) for w in listwrap(query.window)]
    output.having = None
    output.sort = _normalize_sort(query.sort)
    output.limit = Math.min(MAX_LIMIT, coalesce(query.limit, DEFAULT_LIMIT))
    if not Math.is_integer(output.limit) or output.limit < 0:
        Log.error("Expecting limit >= 0")

    output.isLean = query.isLean
    return output
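# A hedged illustration (shape only, not executable against a live cluster):
# QueryOp.wrap() fills in every clause so downstream code never checks for
# missing keys. A terse query like
#     {"from": "unittest", "groupby": "build.platform", "limit": 10}
# normalizes to roughly:
#     output.select  -> Dict(name="count", value=jx_expression("."), aggregate="count", default=0)
#     output.edges   -> Null                      # groupby AND edges ARE MUTUALLY EXCLUSIVE
#     output.groupby -> [normalized edge for "build.platform"]
#     output.where   -> normalized expression (TRUE when `where` is missing)
#     output.limit   -> 10                        # CAPPED AT MAX_LIMIT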
def zip(keys, values):
    """
    CONVERT LIST OF KEY/VALUE PAIRS TO A Dict
    PLEASE `import dot`, AND CALL `dot.zip()`
    """
    output = Dict()
    for i, k in enumerate(keys):
        if i >= len(values):
            break
        output[k] = values[i]
    return output
def wrap(v):
    type_ = _get(v, "__class__")

    if type_ is dict:
        m = Dict(v)
        return m
    elif type_ is NoneType:
        return Null
    elif type_ is list:
        return DictList(v)
    elif type_ is GeneratorType:
        return (wrap(vv) for vv in v)
    else:
        return v
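# A minimal usage sketch (illustrative): wrap() promotes plain Python
# containers to their dot-accessible equivalents, and leaves scalars alone.
assert isinstance(wrap({"a": {"b": 1}}), Dict)   # dict -> Dict
assert wrap(None) is Null                        # None -> Null SINGLETON
assert isinstance(wrap([1, 2]), DictList)        # list -> DictList
assert wrap(42) == 42                            # SCALARS PASS THROUGH
assert wrap({"a": {"b": 1}}).a.b == 1            # DOT ACCESS ON NESTED DICTS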
def compileNumeric2Term(edge):
    if edge.script:
        Log.error("edge script not supported yet")

    if edge.domain.type != "numeric" and edge.domain.type != "count":
        Log.error("can only translate numeric domains")

    numPartitions = len(edge.domain.partitions)
    value = edge.value
    if isKeyword(value):
        value = "doc[\"" + value + "\"].value"

    if not edge.domain.max:
        if not edge.domain.min:
            ref = 0
            # BALANCED PARENS (THE ORIGINAL DROPPED THE OPENING PAREN OF THE QUOTIENT)
            partition2int = "Math.floor(" + value + "/" + value2MVEL(edge.domain.interval) + ")"
            nullTest = "false"
        else:
            ref = value2MVEL(edge.domain.min)
            partition2int = "Math.floor((" + value + "-" + ref + ")/" + value2MVEL(edge.domain.interval) + ")"
            nullTest = "" + value + "<" + ref
    elif not edge.domain.min:
        ref = value2MVEL(edge.domain.max)
        partition2int = "Math.floor((" + value + "-" + ref + ")/" + value2MVEL(edge.domain.interval) + ")"
        nullTest = "" + value + ">=" + ref
    else:
        top = value2MVEL(edge.domain.max)
        ref = value2MVEL(edge.domain.min)
        partition2int = "Math.floor((" + value + "-" + ref + ")/" + value2MVEL(edge.domain.interval) + ")"
        nullTest = "(" + value + "<" + ref + ") or (" + value + ">=" + top + ")"

    # numPartitions IS A NUMBER; CONVERT TO unicode BEFORE CONCATENATING
    partition2int = "((" + nullTest + ") ? " + unicode(numPartitions) + " : " + partition2int + ")"
    offset = convert.value2int(ref)

    def int2Partition(value):
        if Math.round(value) == numPartitions:
            return edge.domain.NULL
        return edge.domain.getPartByKey((value * edge.domain.interval) + offset)

    return Dict(toTerm={"head": "", "body": partition2int}, fromTerm=int2Partition)
def _convert_edge(self, edge):
    if isinstance(edge, basestring):
        return Dict(name=edge, value=edge, domain=self._convert_domain())
    else:
        edge = wrap(edge)
        if not edge.name and not isinstance(edge.value, basestring):
            Log.error("You must name compound edges: {{edge}}", edge=edge)

        if isinstance(edge.value, (Mapping, list)) and not edge.domain:
            # COMPLEX EDGE IS SHORT HAND
            domain = self._convert_domain()
            domain.dimension = Dict(fields=edge.value)
            return Dict(
                name=edge.name,
                allowNulls=False if edge.allowNulls is False else True,
                domain=domain
            )

        domain = self._convert_domain(edge.domain)
        return Dict(
            name=coalesce(edge.name, edge.value),
            value=edge.value,
            range=edge.range,
            allowNulls=False if edge.allowNulls is False else True,
            domain=domain
        )
def compileString2Term(edge):
    if edge.esscript:
        Log.error("edge script not supported yet")

    value = edge.value
    if isKeyword(value):
        value = strings.expand_template("getDocValue({{path}})", {"path": convert.string2quote(value)})
    else:
        Log.error("not handled")

    def fromTerm(value):
        return edge.domain.getPartByKey(value)

    return Dict(toTerm={"head": "", "body": value}, fromTerm=fromTerm)
def _normalize_window(window, schema=None):
    v = window.value
    try:
        expr = jx_expression(v)
    except Exception:
        expr = ScriptOp("script", v)

    return Dict(
        name=coalesce(window.name, window.value),
        value=expr,
        edges=[_normalize_edge(e, schema) for e in listwrap(window.edges)],
        sort=_normalize_sort(window.sort),
        aggregate=window.aggregate,
        range=_normalize_range(window.range),
        where=_normalize_where(window.where, schema=schema)
    )
def compileTime2Term(edge):
    """
    RETURN MVEL CODE THAT MAPS TIME AND DURATION DOMAINS DOWN TO AN INTEGER AND
    THE JAVASCRIPT THAT WILL TURN THAT INTEGER BACK INTO A PARTITION (INCLUDING NULLS)
    """
    if edge.esscript:
        Log.error("edge script not supported yet")

    # IS THERE A LIMIT ON THE DOMAIN?
    numPartitions = len(edge.domain.partitions)
    value = edge.value
    if isKeyword(value):
        value = "doc[\"" + value + "\"].value"

    nullTest = compileNullTest(edge)
    ref = coalesce(edge.domain.min, edge.domain.max, datetime(2000, 1, 1))

    if edge.domain.interval.month > 0:
        offset = ref.subtract(ref.floorMonth(), durations.DAY).milli
        if offset > durations.DAY.milli * 28:
            offset = ref.subtract(ref.ceilingMonth(), durations.DAY).milli
        partition2int = "milli2Month(" + value + ", " + value2MVEL(offset) + ")"
        partition2int = "((" + nullTest + ") ? 0 : " + partition2int + ")"

        def int2Partition(value):
            if Math.round(value) == 0:
                return edge.domain.NULL
            # value ENCODES yyyyMM; PULL YEAR AND MONTH BACK OUT AS INTEGERS
            # (THE ORIGINAL PASSED STRING SLICES, AND str HAS NO right() METHOD)
            d = datetime(int(str(value)[:4]), int(str(value)[-2:]), 1)
            d = d.addMilli(offset)
            return edge.domain.getPartByKey(d)
    else:
        # interval.milli AND numPartitions ARE NUMBERS; CONVERT BEFORE CONCATENATING
        partition2int = "Math.floor((" + value + "-" + value2MVEL(ref) + ")/" + unicode(edge.domain.interval.milli) + ")"
        partition2int = "((" + nullTest + ") ? " + unicode(numPartitions) + " : " + partition2int + ")"

        def int2Partition(value):
            if Math.round(value) == numPartitions:
                return edge.domain.NULL
            return edge.domain.getPartByKey(ref.add(edge.domain.interval.multiply(value)))

    return Dict(toTerm={"head": "", "body": partition2int}, fromTerm=int2Partition)
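# A hedged illustration of the strings this builds (the concrete values are
# hypothetical): for a time edge on "build.date" with a 1-day interval starting
# 2000-01-01 and 14 partitions, toTerm.body is MVEL resembling
#     ((<nullTest>) ? 14 : Math.floor((doc["build.date"].value-946684800000)/86400000))
# i.e. null/out-of-range docs map to the sentinel partition numPartitions, and
# fromTerm (int2Partition) maps each integer back to a domain partition.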
def _normalize_select_no_context(select, schema=None):
    """
    SAME NORMALIZE, BUT NO SOURCE OF COLUMNS
    """
    if not _Column:
        _late_import()

    if isinstance(select, basestring):
        select = Dict(value=select)
    else:
        select = wrap(select)

    output = select.copy()
    if not select.value:
        output.name = coalesce(select.name, select.aggregate)
        if output.name:
            output.value = jx_expression(".")
        else:
            return output
    elif isinstance(select.value, basestring):
        if select.value.endswith(".*"):
            output.name = coalesce(select.name, select.value[:-2], select.aggregate)
            output.value = LeavesOp("leaves", Variable(select.value[:-2]))
        else:
            if select.value == ".":
                output.name = coalesce(select.name, select.aggregate, ".")
                output.value = jx_expression(select.value)
            elif select.value == "*":
                output.name = coalesce(select.name, select.aggregate, ".")
                output.value = LeavesOp("leaves", Variable("."))
            else:
                output.name = coalesce(select.name, select.value, select.aggregate)
                output.value = jx_expression(select.value)
    else:
        output.value = jx_expression(select.value)

    if not output.name:
        Log.error("expecting select to have a name: {{select}}", select=select)
    if output.name.endswith(".*"):
        Log.error("{{name|quote}} is invalid select", name=output.name)

    output.aggregate = coalesce(canonical_aggregates[select.aggregate].name, select.aggregate, "none")
    output.default = coalesce(select.default, canonical_aggregates[output.aggregate].default)
    return output
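# A hedged sketch of how the select shorthands normalize (shape only):
#     "a.b"                  -> Dict(name="a.b", value=jx_expression("a.b"), aggregate="none")
#     "a.*"                  -> Dict(name="a",   value=LeavesOp("leaves", Variable("a")))
#     "*"                    -> Dict(name=".",   value=LeavesOp("leaves", Variable(".")))
#     {"aggregate": "count"} -> Dict(name="count", value=jx_expression("."), aggregate="count")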
def __init__(self, host, index, type=None, alias=None, name=None, port=9200, settings=None):
    self.settings = settings
    self.name = coalesce(name, alias, index)
    self._es = elasticsearch.Alias(alias=coalesce(alias, index), settings=settings)
    self.settings.type = self._es.settings.type  # Alias() WILL ASSIGN A TYPE IF IT WAS MISSING
    self.edges = Dict()
    self.worker = None
    self.ready = False
def _normalize_edge(edge, schema=None):
    if not _Column:
        _late_import()

    if isinstance(edge, basestring):
        if schema:
            e = schema[edge]
            if e:
                if isinstance(e, _Column):
                    return Dict(
                        name=edge,
                        value=jx_expression(edge),
                        allowNulls=True,
                        domain=_normalize_domain(schema=schema)
                    )
                elif isinstance(e.fields, list) and len(e.fields) == 1:
                    return Dict(
                        name=e.name,
                        value=jx_expression(e.fields[0]),
                        allowNulls=True,
                        domain=e.getDomain()
                    )
                else:
                    return Dict(name=e.name, allowNulls=True, domain=e.getDomain())
        return Dict(
            name=edge,
            value=jx_expression(edge),
            allowNulls=True,
            domain=_normalize_domain(schema=schema)
        )
    else:
        edge = wrap(edge)
        if not edge.name and not isinstance(edge.value, basestring):
            Log.error("You must name compound and complex edges: {{edge}}", edge=edge)

        if isinstance(edge.value, (list, set)) and not edge.domain:
            # COMPLEX EDGE IS SHORT HAND
            domain = _normalize_domain(schema=schema)
            domain.dimension = Dict(fields=edge.value)
            return Dict(
                name=edge.name,
                allowNulls=bool(coalesce(edge.allowNulls, True)),
                domain=domain
            )

        domain = _normalize_domain(edge.domain, schema=schema)
        return Dict(
            name=coalesce(edge.name, edge.value),
            value=jx_expression(edge.value),
            range=_normalize_range(edge.range),
            allowNulls=bool(coalesce(edge.allowNulls, True)),
            domain=domain
        )
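# A hedged sketch of the edge shorthands this accepts (shape only):
#     "build.platform"                    -> Dict(name="build.platform", value=jx_expression(...), allowNulls=True, domain=<default>)
#     {"name": "os", "value": "build.os"} -> Dict(name="os", value=jx_expression("build.os"), ...)
#     {"name": "k", "value": ["a", "b"]}  -> LIST value BECOMES domain.dimension.fields; A name IS REQUIRED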
def wrap(v):
    type_ = _get(v, "__class__")

    if type_ is dict:
        m = Dict(v)
        return m
        # m = object.__new__(Dict)
        # object.__setattr__(m, "_dict", v)
        # return m
    elif type_ is NoneType:
        return Null
    elif type_ is list:
        return DictList(v)
    elif type_ is GeneratorType:
        return (wrap(vv) for vv in v)
    else:
        return v
def encrypt(text, _key, salt=None):
    """
    RETURN JSON OF ENCRYPTED DATA   {"salt":s, "length":l, "data":d}
    """
    from pyLibrary.queries import jx

    if not isinstance(text, unicode):
        Log.error("only unicode is encrypted")
    if _key is None:
        Log.error("Expecting a key")
    if isinstance(_key, str):
        _key = bytearray(_key)
    if salt is None:
        salt = Random.bytes(16)

    data = bytearray(text.encode("utf8"))

    # Initialize encryption using key and iv
    key_expander_256 = key_expander.KeyExpander(256)
    expanded_key = key_expander_256.expand(_key)
    aes_cipher_256 = aes_cipher.AESCipher(expanded_key)
    aes_cbc_256 = cbc_mode.CBCMode(aes_cipher_256, 16)
    aes_cbc_256.set_iv(salt)

    output = Dict()
    output.type = "AES256"
    output.salt = convert.bytes2base64(salt)
    output.length = len(data)

    encrypted = bytearray()
    for _, d in jx.groupby(data, size=16):
        encrypted.extend(aes_cbc_256.encrypt_block(d))
    output.data = convert.bytes2base64(encrypted)
    json = convert.value2json(output)

    if DEBUG:
        test = decrypt(json, _key)
        if test != text:
            Log.error("problem with encryption")

    return json
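# A minimal round-trip sketch (the 32-byte key below is hypothetical): the
# return value is a JSON document carrying the salt, the plaintext length
# (AES-CBC pads to 16-byte blocks), and the base64 ciphertext, so decrypt()
# needs only the JSON and the key.
example_key = bytearray(b"0123456789abcdef0123456789abcdef")  # 256-BIT KEY
packet = encrypt(u"hello world", example_key)
# packet RESEMBLES {"type": "AES256", "salt": "...", "length": 11, "data": "..."}
assert decrypt(packet, example_key) == u"hello world"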
def __init__(self, host, index, alias=None, name=None, port=9200, settings=None):
    global _elasticsearch
    if hasattr(self, "settings"):
        return

    from pyLibrary.queries.containers.list_usingPythonList import ListContainer
    from pyLibrary.env import elasticsearch as _elasticsearch

    self.settings = settings
    self.default_name = coalesce(name, alias, index)
    self.default_es = _elasticsearch.Cluster(settings=settings)
    self.todo = Queue("refresh metadata", max=100000, unique=True)

    self.es_metadata = Null
    self.last_es_metadata = Date.now() - OLD_METADATA

    self.meta = Dict()
    table_columns = metadata_tables()
    column_columns = metadata_columns()
    self.meta.tables = ListContainer("meta.tables", [], wrap({c.name: c for c in table_columns}))
    self.meta.columns = ColumnList()
    self.meta.columns.insert(column_columns)
    self.meta.columns.insert(table_columns)
    # TODO: fix monitor so it does not bring down ES
    if ENABLE_META_SCAN:
        self.worker = Thread.run("refresh metadata", self.monitor)
    else:
        self.worker = Thread.run("refresh metadata", self.not_monitor)
    return
def get_columns(self, _from_name=None):
    """
    ENSURE COLUMNS FOR GIVEN INDEX/QUERY ARE LOADED, SCRIPT COMPILATION WILL WORK BETTER
    _from_name - NOT MEANT FOR EXTERNAL USE
    """
    if _from_name is None:
        _from_name = self.name

    if not isinstance(_from_name, basestring):
        Log.error("Expecting string")

    output = INDEX_CACHE.get(_from_name)
    if output:
        # VERIFY es IS CONSISTENT
        if self.url != output.url:
            Log.error(
                "Using {{name}} for two different containers\n\t{{existing}}\n\t{{new}}",
                name=_from_name,
                existing=output.url,
                new=self._es.url
            )
        return output.columns

    path = split_field(_from_name)
    if len(path) > 1:
        # LOAD THE PARENT (WHICH WILL FILL THE INDEX_CACHE WITH NESTED CHILDREN)
        self.get_columns(_from_name=path[0])
        return INDEX_CACHE[_from_name].columns

    schema = self._es.get_schema()
    properties = schema.properties
    INDEX_CACHE[_from_name] = output = Dict()
    output.name = _from_name
    output.url = self._es.url
    output.columns = parse_columns(_from_name, properties)
    return output.columns
def _update_cardinality(self, c):
    """
    QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
    """
    if c.type in ["object", "nested"]:
        Log.error("not supported")
    try:
        if c.table == "meta.columns":
            with self.meta.columns.locker:
                partitions = jx.sort([
                    g[c.es_column]
                    for g, _ in jx.groupby(self.meta.columns, c.es_column)
                    if g[c.es_column] != None
                ])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.columns),
                        "cardinality": len(partitions),
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"table": c.table, "es_column": c.es_column}}
                })
            return
        if c.table == "meta.tables":
            with self.meta.columns.locker:
                partitions = jx.sort([
                    g[c.es_column]
                    for g, _ in jx.groupby(self.meta.tables, c.es_column)
                    if g[c.es_column] != None
                ])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.tables),
                        "cardinality": len(partitions),
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"table": c.table, "name": c.name}}
                })
            return

        es_index = c.table.split(".")[0]
        result = self.default_es.post("/" + es_index + "/_search", data={
            "aggs": {c.name: _counting_query(c)},
            "size": 0
        })
        r = result.aggregations.values()[0]
        count = result.hits.total
        cardinality = coalesce(r.value, r._nested.value, 0 if r.doc_count == 0 else None)
        if cardinality == None:
            Log.error("logic error")

        query = Dict(size=0)
        if cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
            Log.note("{{table}}.{{field}} has {{num}} parts", table=c.table, field=c.es_column, num=cardinality)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
            return
        elif c.type in _elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
            Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
            return
        elif c.nested_path:
            query.aggs[literal_field(c.name)] = {
                "nested": {"path": listwrap(c.nested_path)[0]},
                "aggs": {"_nested": {"terms": {"field": c.es_column, "size": 0}}}
            }
        else:
            query.aggs[literal_field(c.name)] = {"terms": {"field": c.es_column, "size": 0}}

        result = self.default_es.post("/" + es_index + "/_search", data=query)
        aggs = result.aggregations.values()[0]
        if aggs._nested:
            parts = jx.sort(aggs._nested.buckets.key)
        else:
            parts = jx.sort(aggs.buckets.key)

        Log.note("{{field}} has {{parts}}", field=c.name, parts=parts)
        with self.meta.columns.locker:
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "partitions": parts,
                    "last_updated": Date.now()
                },
                "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
            })
    except Exception, e:
        if "IndexMissingException" in e and c.table.startswith(TEST_TABLE_PREFIX):
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": 0,
                        "cardinality": 0,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
        else:
            self.meta.columns.update({
                "set": {"last_updated": Date.now()},
                "clear": ["count", "cardinality", "partitions"],
                "where": {"eq": {"table": c.table, "es_column": c.es_column}}
            })
            Log.warning("Could not get {{col.table}}.{{col.es_column}} info", col=c, cause=e)
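# A hedged sketch of the two ES round trips above (request bodies abridged):
# 1) cardinality probe:  {"aggs": {<name>: _counting_query(c)}, "size": 0}
#    -> r.value (OR r._nested.value) GIVES THE DISTINCT-VALUE COUNT
# 2) if the column looks low-cardinality, a terms aggregation
#    {"terms": {"field": c.es_column, "size": 0}} FETCHES THE PARTITIONS,
#    WRAPPED IN {"nested": {"path": ...}} WHEN THE COLUMN IS NESTED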
def __getitem__(self, item):
    if item == "from":
        return self.frum
    return Dict.__getitem__(self, item)
from __future__ import division
from __future__ import unicode_literals

from collections import Mapping
from copy import copy
from types import GeneratorType

from pyLibrary.debugs.logs import Log
from pyLibrary.dot import set_default, split_field, wrap, join_field
from pyLibrary.dot.dicts import Dict

OBJECT = "object"
NESTED = "nested"
STRUCT = [OBJECT, NESTED]

type2container = Dict()
config = Dict()   # config.default IS EXPECTED TO BE SET BEFORE CALLS ARE MADE
_ListContainer = None
_Cube = None
_run = None
_Query = None
_Normal = None


def _delayed_imports():
    global type2container
    global _ListContainer
    global _Cube
    global _run
    global _Query
    global _Normal