コード例 #1
0
def _normalize_edge(edge, schema=None):
    """
    NORMALIZE AN EDGE THAT IS GIVEN AS A NAME (STRING) INTO A Dict

    :param edge: NAME OF THE EDGE; A MISSING VALUE IS REPORTED WITH Log.error
    :param schema: OPTIONAL LOOKUP FROM NAME TO A COLUMN OR DIMENSION
    :return: Dict WITH name, value, allowNulls AND domain
             (THE CODE SHOWN HERE RETURNS NOTHING FOR NON-STRING EDGES)
    """
    if not _Column:
        _late_import()

    if edge == None:
        Log.error("Edge has no value, or expression is empty")
    elif isinstance(edge, basestring):
        if schema:
            try:
                e = schema[edge]
            except Exception:
                # A FAILED LOOKUP IS TREATED THE SAME AS A MISSING NAME
                e = None
            e = unwraplist(e)
            if e and not isinstance(e, (_Column, set, list)):
                # NOTE(review): THE TEST ABOVE ALREADY EXCLUDES _Column, SO THE
                # FIRST BRANCH BELOW CAN NOT BE REACHED - CONFIRM THE INTENT
                if isinstance(e, _Column):
                    return Dict(name=edge,
                                value=jx_expression(edge),
                                allowNulls=True,
                                domain=_normalize_domain(domain=e,
                                                         schema=schema))
                elif isinstance(e.fields, list) and len(e.fields) == 1:
                    # DIMENSION BACKED BY EXACTLY ONE FIELD: USE THAT FIELD AS THE VALUE
                    return Dict(name=e.name,
                                value=jx_expression(e.fields[0]),
                                allowNulls=True,
                                domain=e.getDomain())
                else:
                    # NO SINGLE FIELD: ONLY THE DOMAIN DESCRIBES THE EDGE
                    return Dict(name=e.name,
                                allowNulls=True,
                                domain=e.getDomain())
        # NOTHING USABLE IN THE SCHEMA: THE NAME ITSELF IS THE EXPRESSION
        return Dict(name=edge,
                    value=jx_expression(edge),
                    allowNulls=True,
                    domain=_normalize_domain(schema=schema))
コード例 #2
0
def _normalize_range(range):
    """
    CONVERT THE min AND max OF A RANGE INTO EXPRESSIONS

    :param range: OBJECT WITH min, max AND mode ATTRIBUTES (MAY BE MISSING)
    :return: None WHEN range IS MISSING, OTHERWISE Dict(min, max, mode)
    """
    if range == None:
        return None

    # A MISSING BOUND STAYS None, ANYTHING ELSE IS PARSED AS AN EXPRESSION
    if range.min == None:
        lower = None
    else:
        lower = jx_expression(range.min)

    if range.max == None:
        upper = None
    else:
        upper = jx_expression(range.max)

    return Dict(min=lower, max=upper, mode=range.mode)
コード例 #3
0
ファイル: elasticsearch.py プロジェクト: davehunt/ActiveData
 def __init__(self, filename, host="fake", index="fake", settings=None):
     """
     LOAD THE JSON CONTENT OF settings.filename INTO self.data

     ANY FAILURE WHILE READING OR PARSING LEAVES self.data AS AN EMPTY Dict
     THE filename, host AND index ARGUMENTS ARE NOT READ DIRECTLY IN THIS BODY
     """
     self.settings = settings
     self.filename = settings.filename
     try:
         content = File(self.filename).read()
         self.data = convert.json2value(content)
     except Exception:
         self.data = Dict()
コード例 #4
0
 def _convert_from(self, frum):
     """
     NORMALIZE THE from CLAUSE

     :param frum: A NAME (STRING), A Container, OR A QueryOp
     :return: Dict(name=frum) FOR A NAME; Container AND QueryOp ARE RETURNED AS-IS
     ANYTHING ELSE IS REPORTED WITH Log.error
     """
     if isinstance(frum, basestring):
         return Dict(name=frum)
     if isinstance(frum, (Container, QueryOp)):
         return frum
     Log.error("Expecting from clause to be a name, or a container")
コード例 #5
0
def _normalize_selects(
    selects,
    frum,
    schema=None,
):
    """
    NORMALIZE ONE SELECT CLAUSE, OR A LIST OF THEM, AND REJECT DUPLICATE NAMES

    :param selects: A SINGLE SELECT, OR A list OF SELECTS
    :param frum: SOURCE OF THE QUERY; WHEN MISSING, OR A list/set/unicode, THE
                 SELECTS ARE NORMALIZED WITHOUT CONTEXT
    :param schema: PASSED ALONG TO THE PER-SELECT NORMALIZERS
    :return: THE NORMALIZED SELECT(S); AN EMPTY list OF SELECTS GIVES AN EMPTY Dict
    """
    without_context = frum == None or isinstance(frum, (list, set, unicode))
    many = isinstance(selects, list)

    if without_context:
        if not many:
            # SINGLE SELECT WITHOUT CONTEXT IS RETURNED DIRECTLY (NO DUPLICATE CHECK)
            return _normalize_select_no_context(selects)
        if len(selects) == 0:
            return Dict()
        output = [
            _normalize_select_no_context(one, schema=schema)
            for one in selects
        ]
    elif many:
        # EACH SELECT MAY EXPAND TO SEVERAL; FLATTEN THEM INTO ONE LIST
        output = []
        for one in selects:
            for expanded in _normalize_select(one, frum=frum, schema=schema):
                output.append(expanded)
    else:
        output = _normalize_select(selects, frum, schema=schema)

    seen = set()
    for s in output:
        if s.name in seen:
            Log.error("{{name}} has already been defined", name=s.name)
        seen.add(s.name)
    return output
コード例 #6
0
ファイル: util.py プロジェクト: davehunt/ActiveData
def compileDuration2Term(edge):
    """
    RETURN THE SCRIPT THAT MAPS A DURATION DOMAIN DOWN TO AN INTEGER, AND THE
    FUNCTION THAT TURNS THAT INTEGER BACK INTO A PARTITION (INCLUDING NULLS)

    :param edge: EDGE WITH value AND A domain HAVING partitions, interval, min AND max
    :return: Dict(toTerm={"head": ..., "body": ...}, fromTerm=<function>)
    """
    if edge.esscript:
        Log.error("edge script not supported yet")

    # IS THERE A LIMIT ON THE DOMAIN?
    numPartitions = len(edge.domain.partitions)
    value = edge.value
    if isKeyword(value):
        value = "doc[\"" + value + "\"].value"

    ref = coalesce(edge.domain.min, edge.domain.max, durations.ZERO)
    nullTest = compileNullTest(edge)

    ms = edge.domain.interval.milli
    if edge.domain.interval.month > 0:
        # MONTH-BASED INTERVALS USE ONE TWELFTH OF A YEAR PER MONTH
        ms = durations.YEAR.milli / 12 * edge.domain.interval.month

    # NUMBERS ARE CONVERTED EXPLICITLY: string + number RAISES TypeError
    partition2int = "Math.floor((" + value + "-" + value2MVEL(ref) + ")/" + str(ms) + ")"
    # THE NULL PART IS GIVEN THE INDEX numPartitions (ONE PAST THE REAL PARTITIONS)
    partition2int = "((" + nullTest + ") ? " + str(numPartitions) + " : " + partition2int + ")"

    def int2Partition(value):
        if Math.round(value) == numPartitions:
            return edge.domain.NULL
        return edge.domain.getPartByKey(ref.add(edge.domain.interval.multiply(value)))

    return Dict(toTerm={"head": "", "body": partition2int}, fromTerm=int2Partition)
コード例 #7
0
    def __init__(self,
                 host,
                 index,
                 type=None,
                 alias=None,
                 name=None,
                 port=9200,
                 read_only=True,
                 typed=None,
                 settings=None):
        """
        CONNECT THIS CONTAINER TO AN ES INDEX (OR ALIAS) AND LOAD ITS METADATA

        :param index: INDEX NAME; ALSO THE FALLBACK FOR THE ALIAS AND THE CONTAINER NAME
        :param alias: ALIAS USED WHEN read_only (DEFAULTS TO index)
        :param name: CONTAINER NAME (DEFAULTS TO alias, THEN index)
        :param read_only: True USES elasticsearch.Alias, OTHERWISE Cluster.get_index()
        :param typed: None MEANS DETECT TYPED MODE FROM THE COLUMN NAMES
        :param settings: KEPT ON self AND PASSED TO THE ES OBJECTS
        host, type AND port ARE NOT READ DIRECTLY IN THIS BODY
        (presumably they arrive through settings - TODO confirm)
        """
        Container.__init__(self, None)
        # REMEMBER THESE SETTINGS AS THE DEFAULT WHEN NO DEFAULT IS CONFIGURED YET
        if not containers.config.default:
            containers.config.default.settings = settings
        self.settings = settings
        self.name = coalesce(name, alias, index)
        if read_only:
            self._es = elasticsearch.Alias(alias=coalesce(alias, index),
                                           settings=settings)
        else:
            self._es = elasticsearch.Cluster(settings=settings).get_index(
                read_only=read_only, settings=settings)

        self.meta = FromESMetadata(settings=settings)
        # COPY THE TYPE FOUND ON THE ES OBJECT BACK INTO OUR SETTINGS
        self.settings.type = self._es.settings.type
        self.edges = Dict()
        self.worker = None
        if typed == None:
            self._columns = self.get_columns(table_name=index)
            # SWITCH ON TYPED MODE
            self.typed = any(c.name in ("$value", "$object")
                             for c in self._columns)
        else:
            self.typed = typed
コード例 #8
0
    def _convert_select(self, select):
        """
        NORMALIZE ONE SELECT CLAUSE

        :param select: A COLUMN NAME (STRING), OR A STRUCTURE WITH name, value AND aggregate
        :return: Dict WITH name, value AND aggregate FILLED IN
        A SELECT THAT ENDS UP WITHOUT A NAME IS REPORTED WITH Log.error
        """
        if isinstance(select, basestring):
            # TRAILING DOT INDICATES THE VALUE, BUT IS INVALID FOR THE NAME
            return Dict(name=select.rstrip("."), value=select, aggregate="none")

        select = wrap(select)
        output = copy(select)
        value = select.value
        if value and not isinstance(value, basestring):
            # COMPOUND VALUES CAN NOT SUPPLY THEIR OWN NAME
            if not output.name:
                Log.error("Must give name to each column in select clause")
        elif value == ".":
            output.name = coalesce(select.name, select.aggregate)
        else:
            output.name = coalesce(select.name, value, select.aggregate)

        if not output.name:
            Log.error("expecting select to have a name: {{select}}",
                      select=select)

        canonical = canonical_aggregates.get(select.aggregate)
        output.aggregate = coalesce(canonical, select.aggregate, "none")
        return output
コード例 #9
0
def dominator(graph, head):
    """
    WALK graph WITH bfs() AND RETURN THE NODE RECORDED AS THE DOMINATOR

    :param graph: PASSED UNCHANGED TO bfs()
    :param head: PASSED TO bfs() (presumably the start node - TODO confirm);
                 THE FIRST ELEMENT OF EACH PATH IS LEFT OUT OF THE COMPARISON
    :return: WHATEVER WAS STORED IN dom.output (INITIALLY None)
    """
    # WE WOULD NEED DOMINATORS IF WE DO NOT KNOW THE TOPOLOGICAL ORDERING
    # DOMINATORS ALLOW US TO USE A REFERENCE TEST RESULT: EVERYTHING BETWEEN
    # dominator(node) AND node CAN BE TREATED AS PARALLEL-APPLIED CHANGESETS
    #
    # INSTEAD OF DOMINATORS, WE COULD USE MANY PERF RESULTS, FROM EACH OF THE
    # PARENT BRANCHES, AND AS LONG AS THEY ALL ARE PART OF A LONG LINE OF
    # STATISTICALLY IDENTICAL PERF RESULTS, WE CAN ASSUME THEY ARE A DOMINATOR

    # NOTE(review): NOTHING IN THIS BLOCK ADDS TO visited, AND IT IS NOT HANDED
    # TO bfs() - CONFIRM HOW THE `node in visited` TEST IS MEANT TO BECOME TRUE
    visited = set()
    # MUTABLE HOLDER SO THE NESTED FUNCTION CAN RECORD ITS RESULT
    dom = Dict(output=None)

    # CALLBACK GIVEN TO bfs(); path AND graph ARE ACCEPTED BUT NOT READ HERE
    # RETURNS False IN EVERY TERMINAL CASE, True OTHERWISE
    # (presumably True tells bfs to keep expanding - TODO confirm)
    def find_dominator(node, path, graph, todo):
        if dom.output:
            return False
        if not todo:
            dom.output = node
            return False
        if node in visited:
            common = INTERSECT(p[1::] for p in todo)  # DO NOT INCLUDE head
            if node in common:
                dom.output = node  #ALL REMAINING PATHS HAVE node IN COMMON TOO
            return False
        return True

    bfs(graph, find_dominator, head)

    return dom.output
コード例 #10
0
 def __init__(self, settings):
     """
     LOAD THE JSON CONTENT OF settings.filename INTO self.data

     AN IOError WHILE READING LEAVES self.data AS AN EMPTY Dict
     self.settings IS ALWAYS THE FIXED {"host": "fake", "index": "fake"}
     """
     self.settings = wrap({"host": "fake", "index": "fake"})
     self.filename = settings.filename
     try:
         content = File(self.filename).read()
         self.data = convert.json2value(content)
     except IOError:
         self.data = Dict()
コード例 #11
0
 def _convert_window(self, window):
     """
     NORMALIZE ONE WINDOW CLAUSE

     :param window: STRUCTURE WITH name, value, edges, sort, aggregate, range AND where
     :return: Dict WITH THE SAME KEYS, EACH PART CONVERTED BY ITS OWN _convert_* METHOD
     """
     name = coalesce(window.name, window.value)
     edges = [self._convert_edge(e) for e in listwrap(window.edges)]
     sort = self._convert_sort(window.sort)
     frame = self._convert_range(window.range)
     where = self._convert_where(window.where)
     return Dict(
         name=name,
         value=window.value,
         edges=edges,
         sort=sort,
         aggregate=window.aggregate,
         range=frame,
         where=where)
コード例 #12
0
def zip(keys, values):
    """
    PAIR EACH KEY WITH THE VALUE AT THE SAME POSITION, AND RETURN THEM AS A Dict
    KEYS BEYOND THE END OF values ARE IGNORED
    """
    output = Dict()
    position = 0
    for key in keys:
        if position >= len(values):
            break
        output[key] = values[position]
        position += 1
    return output
コード例 #13
0
def _parse_properties(index, properties):
    """
    ISOLATE THE DEALING WITH THE INDEX_CACHE,
    INDEX_CACHE IS REDUNDANT WHEN YOU HAVE metadata.columns

    A TEMPORARY ENTRY IS PLACED IN INDEX_CACHE WHILE parse_columns() RUNS, AND
    THE PREVIOUS ENTRY IS PUT BACK AFTERWARD, EVEN WHEN PARSING FAILS

    :param index: NAME USED AS THE INDEX_CACHE KEY
    :param properties: PASSED TO parse_columns()
    :return: THE RESULT OF parse_columns(index, properties)
    """
    backup = INDEX_CACHE.get(index)
    INDEX_CACHE[index] = output = Dict()
    output.name = index
    try:
        return parse_columns(index, properties)
    finally:
        # DO NOT LEAVE THE TEMPORARY ENTRY BEHIND WHEN parse_columns() RAISES
        INDEX_CACHE[index] = backup
コード例 #14
0
ファイル: query.py プロジェクト: klahnakoski/MoTreeherder
    def wrap(query, schema=None):
        """
        NORMALIZE QUERY SO IT CAN STILL BE JSON

        :param query: A QueryOp (RETURNED AS-IS), A MISSING VALUE (RETURNED AS-IS),
                      OR A STRUCTURE WITH from, select, edges, groupby, where, ...
        :param schema: OPTIONAL; WHEN MISSING AND THE from CLAUSE IS A Schema,
                       THAT Schema IS USED
        :return: A QueryOp WITH EVERY CLAUSE NORMALIZED
        """
        if isinstance(query, QueryOp) or query == None:
            return query

        query = wrap(query)

        output = QueryOp("from", None)
        output.format = query.format
        output.frum = wrap_from(query["from"], schema=schema)
        if not schema and isinstance(output.frum, Schema):
            schema = output.frum

        if query.select:
            # NOTE(review): THIS BRANCH READS query.frum WHILE THE DEFAULT SELECT
            # BELOW READS query["from"] - CONFIRM WHICH ONE IS INTENDED
            output.select = _normalize_selects(query.select,
                                               query.frum,
                                               schema=schema)
        else:
            if query.edges or query.groupby:
                # AGGREGATING WITHOUT A SELECT DEFAULTS TO A COUNT
                output.select = Dict(name="count",
                                     value=jx_expression("."),
                                     aggregate="count",
                                     default=0)
            else:
                output.select = _normalize_selects(".", query["from"])

        if query.groupby and query.edges:
            Log.error(
                "You can not use both the `groupby` and `edges` clauses in the same query!"
            )
        elif query.edges:
            output.edges = _normalize_edges(query.edges, schema=schema)
            output.groupby = Null
        elif query.groupby:
            output.edges = Null
            output.groupby = _normalize_groupby(query.groupby, schema=schema)
        else:
            output.edges = Null
            output.groupby = Null

        output.where = _normalize_where(query.where, schema=schema)
        output.window = [_normalize_window(w) for w in listwrap(query.window)]
        output.having = None
        output.sort = _normalize_sort(query.sort)
        # THE LIMIT IS CAPPED AT MAX_LIMIT; A MISSING LIMIT BECOMES DEFAULT_LIMIT
        output.limit = Math.min(MAX_LIMIT, coalesce(query.limit,
                                                    DEFAULT_LIMIT))
        if not Math.is_integer(output.limit) or output.limit < 0:
            Log.error("Expecting limit >= 0")

        output.isLean = query.isLean

        return output
コード例 #15
0
def zip(keys, values):
    """
    PAIR EACH KEY WITH THE VALUE AT THE SAME POSITION, AND RETURN THEM AS A Dict
    KEYS BEYOND THE END OF values ARE IGNORED
    PLEASE `import dot`, AND CALL `dot.zip()`
    """
    output = Dict()
    position = 0
    for key in keys:
        if position >= len(values):
            break
        output[key] = values[position]
        position += 1
    return output
コード例 #16
0
def wrap(v):
    """
    WRAP A PLAIN VALUE BASED ON ITS EXACT CLASS

    dict -> Dict, None -> Null, list -> DictList,
    GENERATOR -> GENERATOR OF WRAPPED ITEMS, ANYTHING ELSE IS RETURNED AS-IS
    (SUBCLASSES DO NOT MATCH BECAUSE THE CLASS IS COMPARED WITH `is`)
    """
    kind = _get(v, "__class__")

    if kind is dict:
        return Dict(v)
    if kind is NoneType:
        return Null
    if kind is list:
        return DictList(v)
    if kind is GeneratorType:
        return (wrap(item) for item in v)
    return v
コード例 #17
0
def compileNumeric2Term(edge):
    """
    RETURN THE SCRIPT THAT MAPS A NUMERIC (OR COUNT) DOMAIN DOWN TO AN INTEGER,
    AND THE FUNCTION THAT TURNS THAT INTEGER BACK INTO A PARTITION (INCLUDING NULLS)

    :param edge: EDGE WITH value AND A domain HAVING type, partitions, interval, min AND max
    :return: Dict(toTerm={"head": ..., "body": ...}, fromTerm=<function>)
    """
    # NOTE(review): THE SIBLING compile*2Term FUNCTIONS TEST edge.esscript - CONFIRM edge.script
    if edge.script:
        Log.error("edge script not supported yet")

    if edge.domain.type != "numeric" and edge.domain.type != "count":
        Log.error("can only translate numeric domains")

    numPartitions = len(edge.domain.partitions)
    value = edge.value
    if isKeyword(value):
        value = "doc[\"" + value + "\"].value"

    interval = value2MVEL(edge.domain.interval)
    if not edge.domain.max:
        if not edge.domain.min:
            # NO BOUNDS AT ALL: NOTHING IS NULL, AND THE REFERENCE IS ZERO
            ref = 0
            # THE OPENING PARENTHESIS WAS MISSING, LEAVING THE SCRIPT UNBALANCED
            partition2int = "Math.floor((" + value + ")/" + interval + ")"
            nullTest = "false"
        else:
            ref = value2MVEL(edge.domain.min)
            partition2int = "Math.floor((" + value + "-" + ref + ")/" + interval + ")"
            nullTest = "" + value + "<" + ref
    elif not edge.domain.min:
        ref = value2MVEL(edge.domain.max)
        partition2int = "Math.floor((" + value + "-" + ref + ")/" + interval + ")"
        nullTest = "" + value + ">=" + ref
    else:
        top = value2MVEL(edge.domain.max)
        ref = value2MVEL(edge.domain.min)
        partition2int = "Math.floor((" + value + "-" + ref + ")/" + interval + ")"
        nullTest = "(" + value + "<" + ref + ") or (" + value + ">=" + top + ")"

    # THE NULL PART IS GIVEN THE INDEX numPartitions (ONE PAST THE REAL PARTITIONS)
    # THE COUNT IS CONVERTED EXPLICITLY: string + number RAISES TypeError
    partition2int = "((" + nullTest + ") ? " + str(numPartitions) + " : " + partition2int + ")"
    offset = convert.value2int(ref)

    def int2Partition(value):
        if Math.round(value) == numPartitions:
            return edge.domain.NULL
        return edge.domain.getPartByKey((value * edge.domain.interval) +
                                        offset)

    return Dict(toTerm={
        "head": "",
        "body": partition2int
    },
                fromTerm=int2Partition)
コード例 #18
0
    def _convert_edge(self, edge):
        """
        NORMALIZE ONE EDGE CLAUSE

        :param edge: A COLUMN NAME (STRING), OR A STRUCTURE WITH name, value, domain, range AND allowNulls
        :return: Dict DESCRIBING THE EDGE; allowNulls IS True UNLESS IT WAS EXPLICITLY False
        """
        if isinstance(edge, basestring):
            return Dict(name=edge, value=edge, domain=self._convert_domain())

        edge = wrap(edge)
        if not edge.name and not isinstance(edge.value, basestring):
            Log.error("You must name compound edges: {{edge}}", edge=edge)

        if isinstance(edge.value, (Mapping, list)) and not edge.domain:
            # COMPLEX EDGE IS SHORT HAND
            shorthand = self._convert_domain()
            shorthand.dimension = Dict(fields=edge.value)
            return Dict(
                name=edge.name,
                allowNulls=edge.allowNulls is not False,
                domain=shorthand)

        domain = self._convert_domain(edge.domain)
        return Dict(
            name=coalesce(edge.name, edge.value),
            value=edge.value,
            range=edge.range,
            allowNulls=edge.allowNulls is not False,
            domain=domain)
コード例 #19
0
def compileString2Term(edge):
    """
    RETURN THE SCRIPT THAT READS THE EDGE VALUE FROM A DOCUMENT, AND THE
    FUNCTION THAT TURNS A TERM BACK INTO A PARTITION

    :param edge: EDGE WITH value (MUST BE A KEYWORD) AND A domain
    :return: Dict(toTerm={"head": ..., "body": ...}, fromTerm=<function>)
    """
    if edge.esscript:
        Log.error("edge script not supported yet")

    value = edge.value
    if not isKeyword(value):
        Log.error("not handled")
    else:
        quoted = convert.string2quote(value)
        value = strings.expand_template("getDocValue({{path}})",
                                        {"path": quoted})

    def fromTerm(value):
        return edge.domain.getPartByKey(value)

    to_term = {"head": "", "body": value}
    return Dict(toTerm=to_term, fromTerm=fromTerm)
コード例 #20
0
ファイル: query.py プロジェクト: klahnakoski/MoTreeherder
def _normalize_window(window, schema=None):
    """
    NORMALIZE ONE WINDOW CLAUSE

    :param window: STRUCTURE WITH name, value, edges, sort, aggregate, range AND where
    :param schema: PASSED TO THE EDGE AND WHERE NORMALIZERS
    :return: Dict WITH EVERY PART NORMALIZED; A value THAT CAN NOT BE PARSED AS
             AN EXPRESSION IS WRAPPED IN A ScriptOp INSTEAD
    """
    raw = window.value
    try:
        value = jx_expression(raw)
    except Exception:
        value = ScriptOp("script", raw)

    name = coalesce(window.name, window.value)
    edges = [_normalize_edge(e, schema) for e in listwrap(window.edges)]
    sort = _normalize_sort(window.sort)
    frame = _normalize_range(window.range)
    where = _normalize_where(window.where, schema=schema)

    return Dict(
        name=name,
        value=value,
        edges=edges,
        sort=sort,
        aggregate=window.aggregate,
        range=frame,
        where=where)
コード例 #21
0
def compileTime2Term(edge):
    """
    RETURN MVEL CODE THAT MAPS TIME AND DURATION DOMAINS DOWN TO AN INTEGER,
    AND THE FUNCTION THAT WILL TURN THAT INTEGER BACK INTO A PARTITION (INCLUDING NULLS)

    :param edge: EDGE WITH value AND A domain HAVING partitions, interval, min AND max
    :return: Dict(toTerm={"head": ..., "body": ...}, fromTerm=<function>)
    """
    if edge.esscript:
        Log.error("edge script not supported yet")

    # IS THERE A LIMIT ON THE DOMAIN?
    numPartitions = len(edge.domain.partitions)
    value = edge.value
    if isKeyword(value):
        value = "doc[\"" + value + "\"].value"

    nullTest = compileNullTest(edge)
    ref = coalesce(edge.domain.min, edge.domain.max, datetime(2000, 1, 1))

    if edge.domain.interval.month > 0:
        offset = ref.subtract(ref.floorMonth(), durations.DAY).milli
        if offset > durations.DAY.milli * 28:
            offset = ref.subtract(ref.ceilingMonth(), durations.DAY).milli
        partition2int = "milli2Month(" + value + ", " + value2MVEL(
            offset) + ")"
        # IN THE MONTH CASE THE NULL PART IS ENCODED AS ZERO
        partition2int = "((" + nullTest + ") ? 0 : " + partition2int + ")"

        def int2Partition(value):
            if Math.round(value) == 0:
                return edge.domain.NULL

            # FIRST FOUR AND LAST TWO CHARACTERS (presumably yyyy AND mm - TODO confirm)
            # str HAS NO right() METHOD, SO THE TAIL IS TAKEN WITH A SLICE
            text = str(value)
            d = datetime(text[:4], text[-2:], 1)
            d = d.addMilli(offset)
            return edge.domain.getPartByKey(d)
    else:
        # NUMBERS ARE CONVERTED EXPLICITLY: string + number RAISES TypeError
        partition2int = "Math.floor((" + value + "-" + value2MVEL(
            ref) + ")/" + str(edge.domain.interval.milli) + ")"
        # THE NULL PART IS GIVEN THE INDEX numPartitions (ONE PAST THE REAL PARTITIONS)
        partition2int = "((" + nullTest + ") ? " + str(numPartitions) + " : " + partition2int + ")"

        def int2Partition(value):
            if Math.round(value) == numPartitions:
                return edge.domain.NULL
            return edge.domain.getPartByKey(
                ref.add(edge.domain.interval.multiply(value)))

    return Dict(toTerm={
        "head": "",
        "body": partition2int
    },
                fromTerm=int2Partition)
コード例 #22
0
def _normalize_select_no_context(select, schema=None):
    """
    SAME NORMALIZE, BUT NO SOURCE OF COLUMNS

    :param select: A COLUMN NAME/EXPRESSION (STRING), OR A STRUCTURE WITH name, value, aggregate AND default
    :param schema: ACCEPTED, BUT NOT READ IN THIS BODY
    :return: COPY OF THE SELECT WITH name, value, aggregate AND default FILLED IN
             (A SELECT WITH NO value AND NO NAME SOURCE IS RETURNED UNCHANGED)
    """
    if not _Column:
        _late_import()

    select = Dict(value=select) if isinstance(select, basestring) else wrap(select)
    output = select.copy()

    if not select.value:
        output.name = coalesce(select.name, select.aggregate)
        if not output.name:
            return output
        output.value = jx_expression(".")
    elif not isinstance(select.value, basestring):
        output.value = jx_expression(select.value)
    elif select.value.endswith(".*"):
        # TRAILING ".*" SELECTS THE LEAVES UNDER THE NAMED VARIABLE
        prefix = select.value[:-2]
        output.name = coalesce(select.name, prefix, select.aggregate)
        output.value = LeavesOp("leaves", Variable(select.value[:-2]))
    elif select.value == ".":
        output.name = coalesce(select.name, select.aggregate, ".")
        output.value = jx_expression(select.value)
    elif select.value == "*":
        output.name = coalesce(select.name, select.aggregate, ".")
        output.value = LeavesOp("leaves", Variable("."))
    else:
        output.name = coalesce(select.name, select.value, select.aggregate)
        output.value = jx_expression(select.value)

    if not output.name:
        Log.error("expecting select to have a name: {{select}}", select=select)
    if output.name.endswith(".*"):
        Log.error("{{name|quote}} is invalid select", name=output.name)

    canonical_name = canonical_aggregates[select.aggregate].name
    output.aggregate = coalesce(canonical_name, select.aggregate, "none")
    canonical_default = canonical_aggregates[output.aggregate].default
    output.default = coalesce(select.default, canonical_default)
    return output
コード例 #23
0
 def __init__(self,
              host,
              index,
              type=None,
              alias=None,
              name=None,
              port=9200,
              settings=None):
     """
     CONNECT THIS CONTAINER TO AN ES ALIAS

     :param index: INDEX NAME; ALSO THE FALLBACK FOR THE ALIAS AND THE CONTAINER NAME
     :param alias: ALIAS TO USE (DEFAULTS TO index)
     :param name: CONTAINER NAME (DEFAULTS TO alias, THEN index)
     :param settings: KEPT ON self AND PASSED TO elasticsearch.Alias
     host, type AND port ARE NOT READ DIRECTLY IN THIS BODY
     """
     self.settings = settings
     self.name = coalesce(name, alias, index)
     es_alias = coalesce(alias, index)
     self._es = elasticsearch.Alias(alias=es_alias, settings=settings)
     # Alias() WILL ASSIGN A TYPE IF IT WAS MISSING
     self.settings.type = self._es.settings.type
     self.edges = Dict()
     self.worker = None
     self.ready = False
コード例 #24
0
ファイル: query.py プロジェクト: klahnakoski/MoTreeherder
def _normalize_edge(edge, schema=None):
    """
    NORMALIZE ONE EDGE CLAUSE

    :param edge: A NAME (STRING), OR A STRUCTURE WITH name, value, domain, range AND allowNulls
    :param schema: OPTIONAL LOOKUP FROM NAME TO A COLUMN OR DIMENSION
    :return: Dict DESCRIBING THE EDGE; allowNulls DEFAULTS TO True
    """
    if not _Column:
        _late_import()

    if isinstance(edge, basestring):
        e = schema[edge] if schema else None
        if e:
            if isinstance(e, _Column):
                return Dict(name=edge,
                            value=jx_expression(edge),
                            allowNulls=True,
                            domain=_normalize_domain(schema=schema))
            if isinstance(e.fields, list) and len(e.fields) == 1:
                # DIMENSION BACKED BY EXACTLY ONE FIELD: USE THAT FIELD AS THE VALUE
                return Dict(name=e.name,
                            value=jx_expression(e.fields[0]),
                            allowNulls=True,
                            domain=e.getDomain())
            return Dict(name=e.name,
                        allowNulls=True,
                        domain=e.getDomain())
        # NOTHING USABLE IN THE SCHEMA: THE NAME ITSELF IS THE EXPRESSION
        return Dict(name=edge,
                    value=jx_expression(edge),
                    allowNulls=True,
                    domain=_normalize_domain(schema=schema))

    edge = wrap(edge)
    if not edge.name and not isinstance(edge.value, basestring):
        Log.error("You must name compound and complex edges: {{edge}}",
                  edge=edge)

    if isinstance(edge.value, (list, set)) and not edge.domain:
        # COMPLEX EDGE IS SHORT HAND
        shorthand = _normalize_domain(schema=schema)
        shorthand.dimension = Dict(fields=edge.value)
        return Dict(name=edge.name,
                    allowNulls=bool(coalesce(edge.allowNulls, True)),
                    domain=shorthand)

    domain = _normalize_domain(edge.domain, schema=schema)
    return Dict(name=coalesce(edge.name, edge.value),
                value=jx_expression(edge.value),
                range=_normalize_range(edge.range),
                allowNulls=bool(coalesce(edge.allowNulls, True)),
                domain=domain)
コード例 #25
0
def wrap(v):
    """
    WRAP A PLAIN VALUE BASED ON ITS EXACT CLASS

    dict -> Dict, None -> Null, list -> DictList,
    GENERATOR -> GENERATOR OF WRAPPED ITEMS, ANYTHING ELSE IS RETURNED AS-IS
    (SUBCLASSES DO NOT MATCH BECAUSE THE CLASS IS COMPARED WITH `is`)
    """
    kind = _get(v, "__class__")

    if kind is dict:
        # THE ORIGINAL KEPT A COMMENTED-OUT ALTERNATIVE THAT BUILT THE Dict WITH
        # object.__new__ AND SET ITS "_dict" ATTRIBUTE TO v DIRECTLY
        return Dict(v)
    if kind is NoneType:
        return Null
    if kind is list:
        return DictList(v)
    if kind is GeneratorType:
        return (wrap(item) for item in v)
    return v
コード例 #26
0
ファイル: crypto.py プロジェクト: davehunt/ActiveData
def encrypt(text, _key, salt=None):
    """
    RETURN JSON OF ENCRYPTED DATA   {"salt":s, "length":l, "data":d}

    :param text: unicode TO ENCRYPT (ANYTHING ELSE IS REPORTED WITH Log.error)
    :param _key: KEY; A str IS CONVERTED TO A bytearray, None IS REPORTED WITH Log.error
    :param salt: USED AS THE IV; 16 RANDOM BYTES ARE GENERATED WHEN MISSING
    :return: JSON STRING WITH type, salt, length AND data (salt AND data ARE BASE64)
    """
    from pyLibrary.queries import jx

    if not isinstance(text, unicode):
        Log.error("only unicode is encrypted")
    if _key is None:
        Log.error("Expecting a key")
    if isinstance(_key, str):
        _key = bytearray(_key)
    if salt is None:
        salt = Random.bytes(16)

    data = bytearray(text.encode("utf8"))

    # Initialize encryption using key and iv
    key_expander_256 = key_expander.KeyExpander(256)
    expanded_key = key_expander_256.expand(_key)
    aes_cipher_256 = aes_cipher.AESCipher(expanded_key)
    aes_cbc_256 = cbc_mode.CBCMode(aes_cipher_256, 16)
    aes_cbc_256.set_iv(salt)

    output = Dict()
    output.type = "AES256"
    output.salt = convert.bytes2base64(salt)
    # LENGTH OF THE UTF8 BYTES, RECORDED BEFORE ENCRYPTION
    output.length = len(data)

    encrypted = bytearray()
    # ENCRYPT IN GROUPS OF 16 BYTES
    # NOTE(review): THE LAST GROUP MAY BE SHORTER THAN 16 - CONFIRM encrypt_block() HANDLES IT
    for _, d in jx.groupby(data, size=16):
        encrypted.extend(aes_cbc_256.encrypt_block(d))
    output.data = convert.bytes2base64(encrypted)
    json = convert.value2json(output)

    if DEBUG:
        # ROUND-TRIP CHECK: DECRYPTING MUST GIVE BACK THE ORIGINAL TEXT
        test = decrypt(json, _key)
        if test != text:
            Log.error("problem with encryption")

    return json
コード例 #27
0
    def __init__(self,
                 host,
                 index,
                 alias=None,
                 name=None,
                 port=9200,
                 settings=None):
        """
        SET UP THE METADATA TABLES AND START THE "refresh metadata" THREAD

        :param index: FALLBACK FOR THE DEFAULT NAME
        :param alias: USED FOR THE DEFAULT NAME BEFORE index
        :param name: USED FOR THE DEFAULT NAME BEFORE alias
        :param settings: KEPT ON self AND PASSED TO THE ES Cluster
        host AND port ARE NOT READ DIRECTLY IN THIS BODY
        """
        global _elasticsearch
        # AN INSTANCE THAT ALREADY HAS settings IS NOT INITIALIZED AGAIN
        # (presumably instances are shared/reused - TODO confirm)
        if hasattr(self, "settings"):
            return

        from pyLibrary.queries.containers.list_usingPythonList import ListContainer
        from pyLibrary.env import elasticsearch as _elasticsearch

        self.settings = settings
        self.default_name = coalesce(name, alias, index)
        self.default_es = _elasticsearch.Cluster(settings=settings)
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.es_metadata = Null
        # START WITH A TIMESTAMP THAT IS ALREADY OLD_METADATA IN THE PAST
        self.last_es_metadata = Date.now() - OLD_METADATA

        self.meta = Dict()
        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.meta.tables = ListContainer(
            "meta.tables", [], wrap({c.name: c
                                     for c in table_columns}))
        self.meta.columns = ColumnList()
        # THE COLUMN LIST DESCRIBES BOTH META TABLES
        self.meta.columns.insert(column_columns)
        self.meta.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return
コード例 #28
0
    def get_columns(self, _from_name=None):
        """
        ENSURE COLUMNS FOR GIVEN INDEX/QUERY ARE LOADED, SCRIPT COMPILATION WILL WORK BETTER

        _from_name - NOT MEANT FOR EXTERNAL USE

        :return: THE COLUMNS KEPT IN INDEX_CACHE FOR THE NAME (LOADED FROM ES ON A CACHE MISS)
        """
        if _from_name is None:
            _from_name = self.name
        if not isinstance(_from_name, basestring):
            Log.error("Expecting string")

        cached = INDEX_CACHE.get(_from_name)
        if cached:
            # VERIFY es IS CONSISTENT
            if self.url != cached.url:
                Log.error(
                    "Using {{name}} for two different containers\n\t{{existing}}\n\t{{new}}",
                    name=_from_name,
                    existing=cached.url,
                    new=self._es.url)
            return cached.columns

        path = split_field(_from_name)
        if len(path) > 1:
            # LOAD THE PARENT (WHICH WILL FILL THE INDEX_CACHE WITH NESTED CHILDREN)
            self.get_columns(_from_name=path[0])
            return INDEX_CACHE[_from_name].columns

        # CACHE MISS ON A TOP-LEVEL NAME: READ THE SCHEMA, THEN REGISTER A NEW ENTRY
        properties = self._es.get_schema().properties
        entry = Dict()
        INDEX_CACHE[_from_name] = entry
        entry.name = _from_name
        entry.url = self._es.url
        entry.columns = parse_columns(_from_name, properties)
        return entry.columns
コード例 #29
0
ファイル: meta.py プロジェクト: klahnakoski/MoTreeherder
    def _update_cardinality(self, c):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN

        :param c: COLUMN RECORD; table, name, type, es_index, es_column AND nested_path ARE READ
        THE RESULT IS WRITTEN TO self.meta.columns
        COLUMNS OF TYPE object OR nested ARE REPORTED WITH Log.error
        """
        if c.type in ["object", "nested"]:
            Log.error("not supported")
        try:
            if c.table == "meta.columns":
                # THE META TABLES LIVE IN MEMORY: COUNT THEIR PARTS DIRECTLY
                with self.meta.columns.locker:
                    partitions = jx.sort([
                        g[c.es_column]
                        for g, _ in jx.groupby(self.meta.columns, c.es_column)
                        if g[c.es_column] != None
                    ])
                    self.meta.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.meta.columns),
                            "cardinality": len(partitions),
                            "last_updated": Date.now()
                        },
                        "where": {"eq": {"table": c.table, "es_column": c.es_column}}
                    })
                return
            if c.table == "meta.tables":
                with self.meta.columns.locker:
                    partitions = jx.sort([
                        g[c.es_column]
                        for g, _ in jx.groupby(self.meta.tables, c.es_column)
                        if g[c.es_column] != None
                    ])
                    # NOTE(review): THIS where MATCHES ON name, THE OTHERS MATCH ON es_column
                    self.meta.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.meta.tables),
                            "cardinality": len(partitions),
                            "last_updated": Date.now()
                        },
                        "where": {"eq": {"table": c.table, "name": c.name}}
                    })
                return

            # FIRST QUERY: ASK ES FOR THE CARDINALITY ONLY
            es_index = c.table.split(".")[0]
            result = self.default_es.post("/" + es_index + "/_search",
                                          data={
                                              "aggs": {
                                                  c.name: _counting_query(c)
                                              },
                                              "size": 0
                                          })
            r = result.aggregations.values()[0]
            count = result.hits.total
            cardinality = coalesce(r.value, r._nested.value,
                                   0 if r.doc_count == 0 else None)
            if cardinality == None:
                Log.error("logic error")

            query = Dict(size=0)
            if cardinality > 1000 or (count >= 30 and cardinality == count
                                      ) or (count >= 1000
                                            and cardinality / count > 0.99):
                # TOO MANY (OR NEARLY UNIQUE) VALUES: RECORD THE COUNTS, BUT NO PARTITIONS
                Log.note("{{table}}.{{field}} has {{num}} parts",
                         table=c.table,
                         field=c.es_column,
                         num=cardinality)
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return
            elif c.type in _elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                # NUMERIC COLUMNS WITH MORE THAN 30 VALUES DO NOT GET PARTITIONS EITHER
                Log.note("{{field}} has {{num}} parts",
                         field=c.name,
                         num=cardinality)
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return
            elif c.nested_path:
                query.aggs[literal_field(c.name)] = {
                    "nested": {"path": listwrap(c.nested_path)[0]},
                    "aggs": {
                        "_nested": {"terms": {"field": c.es_column, "size": 0}}
                    }
                }
            else:
                query.aggs[literal_field(c.name)] = {
                    "terms": {"field": c.es_column, "size": 0}
                }

            # SECOND QUERY: FETCH THE ACTUAL TERMS TO USE AS PARTITIONS
            result = self.default_es.post("/" + es_index + "/_search",
                                          data=query)

            aggs = result.aggregations.values()[0]
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            Log.note("{{field}} has {{parts}}", field=c.name, parts=parts)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "partitions": parts,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
        except Exception as e:
            if "IndexMissingException" in e and c.table.startswith(
                    TEST_TABLE_PREFIX):
                # A MISSING TEST INDEX IS RECORDED AS EMPTY
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": 0,
                            "cardinality": 0,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
            else:
                # NOTE(review): THIS IS THE ONLY UPDATE MADE WITHOUT HOLDING
                # self.meta.columns.locker - CONFIRM THAT IS INTENDED
                self.meta.columns.update({
                    "set": {"last_updated": Date.now()},
                    "clear": [
                        "count",
                        "cardinality",
                        "partitions",
                    ],
                    "where": {"eq": {"table": c.table, "es_column": c.es_column}}
                })
                Log.warning(
                    "Could not get {{col.table}}.{{col.es_column}} info",
                    col=c,
                    cause=e)
コード例 #30
0
ファイル: query.py プロジェクト: klahnakoski/MoDevETL
 def __getitem__(self, item):
     """
     "from" IS ANSWERED WITH self.frum; EVERY OTHER KEY IS LOOKED UP BY Dict
     """
     return self.frum if item == "from" else Dict.__getitem__(self, item)
コード例 #31
0
ファイル: __init__.py プロジェクト: davehunt/ActiveData
from __future__ import division
from __future__ import unicode_literals

from collections import Mapping
from copy import copy
from types import GeneratorType

from pyLibrary.debugs.logs import Log
from pyLibrary.dot import set_default, split_field, wrap, join_field
from pyLibrary.dot.dicts import Dict

# TYPE NAMES FOR STRUCTURED VALUES; STRUCT LISTS BOTH OF THEM
OBJECT = "object"
NESTED = "nested"
STRUCT = [OBJECT, NESTED]

# REGISTRY, INITIALLY EMPTY (presumably maps a type name to its container class - TODO confirm)
type2container = Dict()
config = Dict()  # config.default IS EXPECTED TO BE SET BEFORE CALLS ARE MADE
# PLACEHOLDERS; _delayed_imports() DECLARES THESE AS GLOBALS
# (presumably it fills them in on first use - TODO confirm)
_ListContainer = None
_Cube = None
_run = None
_Query = None
_Normal = None


def _delayed_imports():
    global type2container
    global _ListContainer
    global _Cube
    global _run
    global _Query
    global _Normal