Example #1
def value2url_param(value):
    """
    :param value:
    :return: ascii URL
    """
    from mo_json import value2json, json2value

    def _encode(value):
        return "".join(_map2url[c] for c in value.encode("utf8"))

    if value == None:
        return None

    if is_data(value):
        value_ = to_data(value)
        output = "&".join(
            kk + "=" + vv
            for k, v in sorted(value_.leaves(), key=lambda p: p[0])
            for kk, vv in [(value2url_param(k), value2url_param(v))]
            if vv or vv == 0
        )
    elif is_text(value):
        try:
            json2value(value)
            output = _encode(value2json(value))
        except Exception:
            output = _encode(value)
    elif is_binary(value):
        output = "".join(_map2url[c] for c in value)
    elif is_many(value):
        output = ",".join(
            vv for v in value for vv in [value2url_param(v)] if vv or vv == 0
        )
    else:
        output = _encode(value2json(value))
    return output
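A hedged usage sketch (the exact escaping depends on the module's _map2url table, so the outputs shown are indicative only):

    value2url_param("hello world")      # e.g. "hello+world"
    value2url_param({"b": 2, "a": 1})   # leaves are sorted by name: "a=1&b=2"
    value2url_param([1, 2, 3])          # sequences join with commas: "1,2,3"
    value2url_param(None)               # None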
Example #2
    def test_assign3(self):
        # IMPOTENT ASSIGNMENTS DO NOTHING
        a = {}
        b = to_data(a)

        b.c = None
        expected = {}
        self.assertEqual(a, expected)

        b.c.d = None
        expected = {}
        self.assertEqual(a, expected)

        b["c.d"] = None
        expected = {}
        self.assertEqual(a, expected)

        b.c.d.e = None
        expected = {}
        self.assertEqual(a, expected)

        b.c["d.e"] = None
        expected = {}
        self.assertEqual(a, expected)
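The same behavior outside the test harness, as a minimal sketch (mo_dots only materializes intermediate objects when a real value is assigned):

    from mo_dots import to_data

    a = {}
    b = to_data(a)
    b.c.d = None                 # impotent: nothing is written
    assert a == {}
    b.c.d = 1                    # a real value creates the whole path
    assert a == {"c": {"d": 1}}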
Example #3
 def define(cls, expr):
     expr = to_data(expr)
     term = expr.select
     terms = []
     if not is_container(term):
         raise Log.error("Expecting a list")
     for t in term:
         if is_text(t):
             if not is_variable_name(t):
                 Log.error(
                     "expecting {{value}} a simple dot-delimited path name",
                     value=t)
             terms.append({"name": t, "value": _jx_expression(t, cls.lang)})
         elif t.name == None:
             if t.value == None:
                 Log.error(
                     "expecting select parameters to have name and value properties"
                 )
             elif is_text(t.value):
                 if not is_variable_name(t.value):
                     Log.error(
                         "expecting {{value}} a simple dot-delimited path name",
                         value=t.value,
                     )
                 else:
                     terms.append({
                         "name": t.value,
                         "value": _jx_expression(t.value, cls.lang),
                     })
             else:
                 Log.error("expecting a name property")
         else:
             terms.append({"name": t.name, "value": jx_expression(t.value)})
     return SelectOp(terms)
Example #4
 def metas(self, prefix=None, limit=None, delimiter=None):
     """
     RETURN THE METADATA DESCRIPTORS FOR EACH KEY
     """
     limit = coalesce(limit, TOO_MANY_KEYS)
     keys = self.bucket.list(prefix=str(prefix), delimiter=str(delimiter))
     prefix_len = len(prefix)
     output = []
     for i, k in enumerate(
         k
         for k in keys
         if len(k.key) == prefix_len or k.key[prefix_len] in [".", ":"]
     ):
         output.append(
             {
                 "key": strip_extension(k.key),
                 "etag": convert.quote2string(k.etag),
                 "expiry_date": Date(k.expiry_date),
                 "last_modified": Date(k.last_modified),
             }
         )
         if i >= limit:
             break
     return to_data(output)
Example #5
def DataClass(name, columns, constraint=None):
    """
    Use the DataClass to define a class, but with some extra features:
    1. restrict the datatype of property
    2. restrict if `required`, or if `nulls` are allowed
    3. generic constraints on object properties

    It is expected that this class become a real class (or be removed) in the
    long term because it is expensive to use and should only be good for
    verifying program correctness, not user input.

    :param name: Name of the class we are creating
    :param columns: Each columns[i] has properties {
            "name",     - (required) name of the property
            "required", - False if it must be defined (even if None)
            "nulls",    - True if property can be None, or missing
            "default",  - A default value, if none is provided
            "type"      - a Python datatype
        }
    :param constraint: a JSON query Expression for extra constraints (return true if all constraints are met)
    :return: The class that has been created
    """

    columns = to_data([
        {"name": c, "required": True, "nulls": False, "type": object} if is_text(c) else c
        for c in columns
    ])
    constraint = {"and": [
        {"exists": c.name}
        for c in columns
        if not c.nulls and c.default == None
    ] + [constraint]}
    slots = columns.name
    required = to_data(filter(lambda c: c.required and c.default == None, columns)).name
    # nulls = to_data(filter(lambda c: c.nulls, columns)).name
    defaults = {c.name: coalesce(c.default, None) for c in columns}
    types = {c.name: coalesce(c.jx_type, object) for c in columns}

    code = expand_template(
        """
from __future__ import unicode_literals
from mo_future import is_text, is_binary
from collections.abc import Mapping

meta = None
types_ = {{types}}
defaults_ = {{defaults}}

class {{class_name}}(Mapping):
    __slots__ = {{slots}}


    def _constraint(row, rownum, rows):
        code = {{constraint_expr|quote}}
        if {{constraint_expr}}:
            return
        Log.error(
            "constraint\\n{" + "{code}}\\nnot satisfied {" + "{expect}}\\n{" + "{value|indent}}",
            code={{constraint_expr|quote}},
            expect={{constraint}},
            value=row
        )

    def __init__(self, **kwargs):
        if not kwargs:
            return

        for s in {{slots}}:
            object.__setattr__(self, s, kwargs.get(s, {{defaults}}.get(s, None)))

        missed = {{required}}-set(kwargs.keys())
        if missed:
            Log.error("Expecting properties {"+"{missed}}", missed=missed)

        illegal = set(kwargs.keys())-set({{slots}})
        if illegal:
            Log.error("{"+"{names}} are not a valid properties", names=illegal)

        self._constraint(0, [self])

    def __getitem__(self, item):
        return getattr(self, item)

    def __setitem__(self, item, value):
        setattr(self, item, value)
        return self

    def __setattr__(self, item, value):
        if item not in {{slots}}:
            Log.error("{"+"{item|quote}} not valid attribute", item=item)

        if value==None and item in {{required}}:
            Log.error("Expecting property {"+"{item}}", item=item)

        object.__setattr__(self, item, value)
        self._constraint(0, [self])

    def __getattr__(self, item):
        Log.error("{"+"{item|quote}} not valid attribute", item=item)

    def __hash__(self):
        return object.__hash__(self)

    def __eq__(self, other):
        if isinstance(other, {{class_name}}) and dict(self)==dict(other) and self is not other:
            Log.error("expecting to be same object")
        return self is other

    def __dict__(self):
        return {k: getattr(self, k) for k in {{slots}}}

    def items(self):
        return ((k, getattr(self, k)) for k in {{slots}})

    def __copy__(self):
        _set = object.__setattr__
        output = object.__new__({{class_name}})
        {{assign}}
        return output

    def __iter__(self):
        return {{slots}}.__iter__()

    def __len__(self):
        return {{len_slots}}

    def __str__(self):
        return str({{dict}})

""",
        {
            "class_name": name,
            "slots": "(" + ", ".join(quote(s) for s in slots) + ")",
            "required": "{" + ", ".join(quote(s) for s in required) + "}",
            "defaults": Literal(defaults).to_python(),
            "len_slots": len(slots),
            "dict": "{" + ", ".join(quote(s) + ": self." + s for s in slots) + "}",
            "assign": "; ".join("_set(output, " + quote(s) + ", self." + s + ")" for s in slots),
            "types": "{" + ",".join(quote(k) + ": " + v.__name__ for k, v in types.items()) + "}",
            "constraint_expr": jx_expression(not ENABLE_CONSTRAINTS or constraint).to_python(),
            "constraint": value2json(constraint),
        },
    )

    output = _exec(code, name)
    register_data(output)
    return output
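A hedged usage sketch of a generated class (names are illustrative; the checks come from the template above):

    Point = DataClass("Point", ["x", "y"])
    p = Point(x=1, y=2)
    p.x = 3        # allowed: "x" is a declared slot
    p.z = 4        # raises: "z" is not a valid attribute
    Point(x=1)     # raises: "y" is a required property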
Example #6
def set(constants):
    """
    REACH INTO THE MODULES AND OBJECTS TO SET CONSTANTS.
    THINK OF THIS AS PRIMITIVE DEPENDENCY INJECTION FOR MODULES.
    USEFUL FOR SETTING DEBUG FLAGS.
    """
    if not constants:
        return
    constants = to_data(constants)

    for full_path, new_value in constants.leaves():
        errors = []
        k_path = split_field(full_path)
        if len(k_path) < 2:
            from mo_logs import Log

            Log.error(
                "expecting <module>.<constant> format, not {{path|quote}}",
                path=k_path)
        name = k_path[-1]
        try:
            mo_dots_set_attr(sys.modules, k_path, new_value)
            continue
        except Exception as e:
            errors.append(e)

        # ONE MODULE IS MISSING, THE CALLING MODULE
        try:
            caller_globals = sys._getframe(1).f_globals
            caller_file = caller_globals["__file__"]
            if not caller_file.endswith(".py"):
                raise Exception("do not know how to handle non-python caller")
            caller_module = caller_file[:-3].replace("\\", "/")
            module_path = caller_module.split("/")

            # ENSURE THERE IS SOME EVIDENCE THE MODULE MATCHES THE PATH
            if k_path[-2] != module_path[-1]:
                continue

            old_value = mo_dots_set_attr(caller_globals, [name], new_value)
            if DEBUG:
                from mo_logs import Log

                Log.note(
                    "Changed {{module}}[{{attribute}}] from {{old_value}} to"
                    " {{new_value}}",
                    module=caller_module,
                    attribute=name,
                    old_value=old_value,
                    new_value=new_value,
                )
            break
        except Exception as e:
            errors.append(e)

        if errors:
            from mo_logs import Log

            Log.error("Can not set constant {{path}}",
                      path=full_path,
                      cause=errors)
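A usage sketch following the <module>.<constant> convention the function expects; the mo_http.http.DEBUG flag is one the examples below actually use:

    from mo_logs import constants

    constants.set({"mo_http": {"http": {"DEBUG": True}}})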
Example #7
 def add(self, message):
     message = to_data(message)
     m = Message()
     m.set_body(value2json(message))
     self.queue.write(m)
Example #8
from mo_dots import to_data
from mo_http import http
from mo_logs import Log
from mo_math import is_nan
from mo_times import Date, YEAR, WEEK, MONTH
from pandas import DataFrame

from utils import nice_ceiling

# NOTE: `mimetype` (used below) is imported elsewhere in the original file

# PROVINCE = 7  # Ontario
# PROVINCE = 10  # Alberta
PROVINCE = 11  # British Columbia

Log.start(trace=True)

http.DEBUG = True
http.default_headers = to_data({
    "From": "*****@*****.**",
    "Referer": "https://github.com/klahnakoski/mo-statcan",
    "User-Agent": "mo-statscan",
    "Accept": mimetype.ANY,
})

# LESS DETAILED CAUSES
CAUSE_OF_DEATH = (
    13_10_0394  # https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1310039401
)

# DETAILED CAUSES
GROUPED_CAUSE_DEATH = (
    13_10_0392  # https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1310039201
)

#
WEEKLY_DEATHS = (
Example #9
    def pe_filter(filter, data, depth):
        """
        PARTIAL EVALUATE THE filter BASED ON data GIVEN
        """
        if filter is TRUE:
            return True
        if filter is FALSE:
            return False

        filter = to_data(filter)

        if filter["and"]:
            result = True
            output = FlatList()
            for a in filter["and"]:
                f = pe_filter(a, data, depth)
                if f is False:
                    result = False
                elif f is not True:
                    output.append(f)
            if result and output:
                return {"and": output}
            else:
                return result
        elif filter["or"]:
            output = FlatList()
            for o in filter["or"]:
                f = pe_filter(o, data, depth)
                if f is True:
                    return True
                elif f is not False:
                    output.append(f)
            if output:
                return {"or": output}
            else:
                return False
        elif filter["not"]:
            f = pe_filter(filter["not"], data, depth)
            if f is True:
                return False
            elif f is False:
                return True
            else:
                return {"not": f}
        elif filter.term or filter.eq:
            eq = coalesce(filter.term, filter.eq)
            result = True
            output = {}
            for col, val in eq.items():
                first, rest = parse_field(col, data, depth)
                d = data[first]
                if not rest:
                    if d != val:
                        result = False
                else:
                    output[rest] = val

            if result and output:
                return {"term": output}
            else:
                return result
        elif filter.equal:
            a, b = filter["equal"]
            first_a, rest_a = parse_field(a, data, depth)
            first_b, rest_b = parse_field(b, data, depth)
            val_a = data[first_a]
            val_b = data[first_b]
            if not rest_a:
                if not rest_b:
                    if val_a != val_b:
                        return False
                    else:
                        return True
                else:
                    return {"term": {rest_b: val_a}}
            else:
                if not rest_b:
                    return {"term": {rest_a: val_b}}
                else:
                    return {"equal": [rest_a, rest_b]}

        elif filter.terms:
            result = True
            output = {}
            for col, vals in filter["terms"].items():
                first, rest = parse_field(col, data, depth)
                d = data[first]
                if not rest:
                    if d not in vals:
                        result = False
                else:
                    output[rest] = vals
            if result and output:
                return {"terms": output}
            else:
                return result

        elif filter.range:
            result = True
            output = {}
            for col, ranges in filter["range"].items():
                first, rest = parse_field(col, data, depth)
                d = data[first]
                if not rest:
                    for sign, val in ranges.items():
                        if sign in ("gt", ">") and d <= val:
                            result = False
                        if sign == "gte" and d < val:
                            result = False
                        if sign == "lte" and d > val:
                            result = False
                        if sign == "lt" and d >= val:
                            result = False
                else:
                    output[rest] = ranges
            if result and output:
                return {"range": output}
            else:
                return result
        elif filter.missing:
            if is_text(filter.missing):
                field = filter["missing"]
            else:
                field = filter["missing"]["field"]

            first, rest = parse_field(field, data, depth)
            d = data[first]
            if not rest:
                if d == None:
                    return True
                return False
            else:
                return {"missing": rest}
        elif filter.prefix:
            result = True
            output = {}
            for col, val in filter["prefix"].items():
                first, rest = parse_field(col, data, depth)
                d = data[first]
                if not rest:
                    if d == None or not d.startswith(val):
                        result = False
                else:
                    output[rest] = val
            if result and output:
                return {"prefix": output}
            else:
                return result

        elif filter.exists:
            if is_text(filter["exists"]):
                field = filter["exists"]
            else:
                field = filter["exists"]["field"]

            first, rest = parse_field(field, data, depth)
            d = data[first]
            if not rest:
                if d != None:
                    return True
                return False
            else:
                return {"exists": rest}
        else:
            Log.error("Can not interpret esfilter: {{esfilter}}",
                      {"esfilter": filter})
Example #10
def run(query, container=Null):
    """
    THIS FUNCTION IS SIMPLY SWITCHING BASED ON THE query["from"] CONTAINER,
    BUT IT IS ALSO PROCESSING A list CONTAINER; SEPARATE TO A ListContainer
    """
    if container == None:
        container = to_data(query)["from"]
        query_op = QueryOp.wrap(query,
                                container=container,
                                namespace=container.schema)
    else:
        query_op = QueryOp.wrap(query,
                                container=container,
                                namespace=container.namespace)

    if container == None:
        from jx_python.containers.list import DUAL

        return DUAL.query(query_op)
    elif isinstance(container, Container):
        return container.query(query_op)
    elif is_many(container):
        container = ListContainer(name=None, data=list(container))
    elif isinstance(container, Cube):
        if is_aggs(query_op):
            return cube_aggs(container, query_op)
    elif is_op(container, QueryOp):
        container = run(container)
    elif is_data(container):
        query = container
        container = query["from"]
        container = run(QueryOp.wrap(query, container, container.namespace),
                        container)
    else:
        Log.error("Do not know how to handle {{type}}",
                  type=container.__class__.__name__)

    if is_aggs(query_op):
        container = list_aggs(container, query_op)
    else:  # SETOP
        if query_op.where is not TRUE:
            container = filter(container, query_op.where)

        if query_op.sort:
            container = sort(container, query_op.sort, already_normalized=True)

        if query_op.select:
            container = select(container, query_op.select)

    if query_op.window:
        if isinstance(container, Cube):
            container = list(container.values())

        for param in query_op.window:
            window(container, param)

    # AT THIS POINT frum IS IN LIST FORMAT, NOW PACKAGE RESULT
    if query_op.format == "cube":
        container = list2cube(container)
    elif query_op.format == "table":
        container = list2table(container)
        container.meta.format = "table"
    else:
        container = dict_to_data({"meta": {"format": "list"}, "data": container})

    return container
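A hedged usage sketch (assuming the jx_python front end; the result shape depends on the "format" clause, defaulting to "list" as the last branch above shows):

    from jx_python import jx

    result = jx.run({
        "from": [{"a": 1}, {"a": 2}, {"a": 3}],
        "where": {"gt": {"a": 1}},
        "select": "a",
    })
    # result.meta.format == "list"; result.data holds the selected values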
Example #11
 def _output():
     for g, v in itertools.groupby(data, get_key):
         group = Data()
         for k, gg in zip(keys, g):
             group[k] = gg
         yield (group, to_data(list(v)))
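Since itertools.groupby only merges adjacent rows, this generator assumes data was already sorted (or at least grouped) by get_key; a hedged sketch of the caller's obligation:

    data = sorted(data, key=get_key)
    for group, rows in _output():
        ...  # each distinct key now yields exactly one (group, rows) pair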
Example #12
 def selector(d):
     output = Data()
     for n, p in push_and_pull:
         output[n] = unwraplist(p(to_data(d)))
     return unwrap(output)
Example #13
def format_table(aggs, es_query, query, decoders, all_selects):
    new_edges = to_data(count_dim(aggs, es_query, decoders))
    dims = tuple(len(e.domain.partitions) + (0 if e.allowNulls is False else 1) for e in new_edges)
    rank = len(dims)
    header = tuple(new_edges.name + all_selects.name)
    name2index = {s.name: i + rank for i, s in enumerate(all_selects)}

    def data():
        is_sent = Matrix(dims=dims)
        give_me_zeros = query.sort and not query.groupby
        if give_me_zeros:
            # WE REQUIRE THE ZEROS FOR SORTING
            all_coord = is_sent._all_combos()  # TRACK THE EXPECTED COMBINATIONS
            ordered_coord = next(all_coord)[::-1]
            output = None
            for row, coord, agg, ss in aggs_iterator(aggs, es_query, decoders):
                if coord != ordered_coord:
                    # output HAS BEEN YIELDED, BUT SET THE DEFAULT VALUES
                    if output is not None:
                        for s in all_selects:
                            i = name2index[s.name]
                            if output[i] is None:
                                output[i] = s.default
                        # WE CAN GET THE SAME coord MANY TIMES, SO ONLY ADVANCE WHEN NOT
                        ordered_coord = next(all_coord)[::-1]

                while coord != ordered_coord:
                    # HAPPENS WHEN THE coord IS AHEAD OF ordered_coord
                    record = [d.get_value(ordered_coord[i]) for i, d in enumerate(decoders)] + [s.default for s in all_selects]
                    yield record
                    ordered_coord = next(all_coord)[::-1]
                # coord == ordered_coord
                output = [d.get_value(c) for c, d in zip(coord, decoders)] + [None for s in all_selects]
                for select in ss:
                    v = select.pull(agg)
                    if v != None:
                        union(output, name2index[select.name], v, select.aggregate)
                yield output
        else:
            last_coord = None   # HANG ONTO THE output FOR A BIT WHILE WE FILL THE ELEMENTS
            output = None
            for row, coord, agg, ss in aggs_iterator(aggs, es_query, decoders):
                if coord != last_coord:
                    if output:
                        # SET DEFAULTS
                        for i, s in enumerate(all_selects):
                            v = output[rank+i]
                            if v == None:
                                output[rank+i] = s.default
                        yield output
                    output = is_sent[coord]
                    if output == None:
                        output = is_sent[coord] = [d.get_value(c) for c, d in zip(coord, decoders)] + [None for _ in all_selects]
                    last_coord = coord
                # THIS IS A TRICK!  WE WILL UPDATE A ROW THAT WAS ALREADY YIELDED
                for select in ss:
                    v = select.pull(agg)
                    if v != None:
                        union(output, name2index[select.name], v, select.aggregate)

            if output:
                # SET DEFAULTS ON LAST ROW
                for i, s in enumerate(all_selects):
                    v = output[rank+i]
                    if v == None:
                        output[rank+i] = s.default
                yield output

            # EMIT THE MISSING CELLS IN THE CUBE
            if not query.groupby:
                for coord, output in is_sent:
                    if output == None:
                        record = [d.get_value(c) for c, d in zip(coord, decoders)] + [s.default for s in all_selects]
                        yield record

    return Data(
        meta={"format": "table"},
        header=header,
        data=list(data())
    )
Example #14
 columns=to_data([
     Column(
         name=c,
         es_index=META_COLUMNS_NAME,
         es_column=c,
         es_type="keyword",
         jx_type=STRING,
         last_updated=Date.now(),
         nested_path=ROOT_PATH,
     ) for c in [
         "name",
         "es_type",
         "jx_type",
         "nested_path",
         "es_column",
         "es_index",
         "partitions",
     ]
 ] + [
     Column(
         name=c,
         es_index=META_COLUMNS_NAME,
         es_column=c,
         es_type="integer",
         jx_type=INTEGER,
         last_updated=Date.now(),
         nested_path=ROOT_PATH,
     ) for c in ["count", "cardinality", "multi"]
 ] + [
     Column(
         name="last_updated",
         es_index=META_COLUMNS_NAME,
         es_column="last_updated",
         es_type="double",
         jx_type=NUMBER,
         last_updated=Date.now(),
         nested_path=ROOT_PATH,
     )
 ]))
Example #15
def list_aggs(frum, query):
    frum = to_data(frum)
    select = listwrap(query.select)

    for e in query.edges:
        if isinstance(e.domain, DefaultDomain):
            accessor = jx_expression_to_function(e.value)
            unique_values = set(map(accessor, frum))
            if None in unique_values:
                e.allowNulls = coalesce(e.allowNulls, True)
                unique_values -= {None}
            e.domain = SimpleSetDomain(partitions=list(sorted(unique_values)))
        else:
            pass

    s_accessors = [(ss.name, jx_expression_to_function(ss.value))
                   for ss in select]

    result = {
        s.name: Matrix(
            dims=[
                len(e.domain.partitions) + (1 if e.allowNulls else 0)
                for e in query.edges
            ],
            zeros=lambda: windows.name2accumulator.get(s.aggregate)(**s),
        )
        for s in select
    }
    where = jx_expression_to_function(query.where)
    coord = [None] * len(query.edges)
    edge_accessor = [(i, make_accessor(e)) for i, e in enumerate(query.edges)]

    net_new_edge_names = set(to_data(query.edges).name) - UNION(
        e.value.vars() for e in query.edges)
    if net_new_edge_names & UNION(ss.value.vars() for ss in select):
        # s_accessor NEEDS THESE EDGES, SO WE PASS THEM ANYWAY
        for d in filter(where, frum):
            d = d.copy()
            for c, get_matches in edge_accessor:
                coord[c] = get_matches(d)

            for s_name, s_accessor in s_accessors:
                mat = result[s_name]
                for c in itertools.product(*coord):
                    acc = mat[c]
                    for e, cc in zip(query.edges, c):
                        d[e.name] = e.domain.partitions[cc]
                    val = s_accessor(d, c, frum)
                    acc.add(val)
    else:
        # FASTER
        for d in filter(where, frum):
            for c, get_matches in edge_accessor:
                coord[c] = get_matches(d)

            for s_name, s_accessor in s_accessors:
                mat = result[s_name]
                for c in itertools.product(*coord):
                    acc = mat[c]
                    val = s_accessor(d, c, frum)
                    acc.add(val)

    for s in select:
        # if s.aggregate == "count":
        #     continue
        m = result[s.name]
        for c, var in m.items():
            if var != None:
                m[c] = var.end()

    from jx_python.containers.cube import Cube

    output = Cube(select, query.edges, result)
    return output
Example #16
def doc_to_column(doc):
    now = Date.now()
    try:
        doc = to_data(untyped(doc))

        # I HAVE MANAGED TO MAKE MANY MISTAKES WRITING COLUMNS TO ES. HERE ARE THE FIXES

        # FIX
        if not doc.last_updated:
            doc.last_updated = Date.now() - YEAR

        # FIX
        if doc.es_type == None:
            if doc.jx_type == OBJECT:
                doc.es_type = "object"
            else:
                Log.warning("{{doc}} has no es_type", doc=doc)

        # FIX
        if doc.es_type == "nested":
            doc.multi = 1001
        if doc.multi == None:
            doc.multi = 1

        # FIX
        if doc.es_column.endswith("." + NESTED_TYPE):
            if doc.jx_type == OBJECT:
                doc.jx_type = NESTED
                doc.last_updated = now
            if doc.es_type == "nested":
                doc.es_type = "nested"
                doc.last_updated = now

        # FIX
        doc.nested_path = tuple(listwrap(doc.nested_path))
        if last(split_field(
                doc.es_column)) == NESTED_TYPE and doc.es_type != "nested":
            doc.es_type = "nested"
            doc.jx_type = NESTED
            doc.multi = 1001
            doc.last_updated = now

        # FIX
        expected_nested_path = get_nested_path(doc.es_column)
        if len(doc.nested_path) > 1 and doc.nested_path[-2] == '.':
            doc.nested_path = doc.nested_path[:-1]
            doc.last_updated = now

        # FIX
        if untype_path(doc.es_column) == doc.es_column:
            if doc.nested_path != (".", ):
                if doc.es_index in {"repo"}:
                    pass
                else:
                    Log.note("not expected")
                    doc.nested_path = expected_nested_path
                    doc.last_updated = now
        else:
            if doc.nested_path != expected_nested_path:
                doc.nested_path = expected_nested_path
                doc.last_updated = now

        # FIX
        if last(split_field(doc.es_column)) == EXISTS_TYPE:
            if doc.jx_type != EXISTS:
                doc.jx_type = EXISTS
                doc.last_updated = now

            if doc.cardinality == None:
                doc.cardinality = 1
                doc.last_updated = now

        # FIX
        if doc.jx_type in STRUCT:
            if doc.cardinality not in [0, 1]:
                doc.cardinality = 1  # DO NOT KNOW IF EXISTS OR NOT
                doc.last_updated = now

        return Column(**doc)
    except Exception as e:
        try:
            mark_as_deleted(Column(**doc), now)
        except Exception:
            pass
        return None
Example #17
 def assign(source, destination):
     destination[name] = field.value(to_data(source))
     return 0, None
Example #18
 def __iter__(self):
     return (to_data(d) for d in self.data)
Example #19
def drill_filter(esfilter, data):
    """
    PARTIAL EVALUATE THE FILTER BASED ON DATA GIVEN

    TODO:  FIX THIS MONUMENTALLY BAD IDEA
    """
    esfilter = unwrap(esfilter)
    primary_nested = []  # track if nested, changes if not
    primary_column = []  # only one path allowed
    primary_branch = []  # CONTAINS LISTS OF RECORDS TO ITERATE: constantly changing as we dfs the tree

    def parse_field(fieldname, data, depth):
        """
        RETURN (first, rest) OF fieldname
        """
        col = split_field(fieldname)
        d = data
        for i, c in enumerate(col):
            try:
                d = d[c]
            except Exception as e:
                Log.error("{{name}} does not exist", name=fieldname)
            if is_list(d) and len(col) > 1:
                if len(primary_column) <= depth + i:
                    primary_nested.append(True)
                    primary_column.append(c)
                    primary_branch.append(d)
                elif primary_nested[depth] and primary_column[depth + i] != c:
                    Log.error("only one branch of tree allowed")
                else:
                    primary_nested[depth + i] = True
                    primary_column[depth + i] = c
                    primary_branch[depth + i] = d

                return c, join_field(col[i + 1:])
            else:
                if len(primary_column) <= depth + i:
                    primary_nested.append(False)
                    primary_column.append(c)
                    primary_branch.append([d])
        return fieldname, None

    def pe_filter(filter, data, depth):
        """
        PARTIAL EVALUATE THE filter BASED ON data GIVEN
        """
        if filter is TRUE:
            return True
        if filter is FALSE:
            return False

        filter = to_data(filter)

        if filter["and"]:
            result = True
            output = FlatList()
            for a in filter["and"]:
                f = pe_filter(a, data, depth)
                if f is False:
                    result = False
                elif f is not True:
                    output.append(f)
            if result and output:
                return {"and": output}
            else:
                return result
        elif filter["or"]:
            output = FlatList()
            for o in filter["or"]:
                f = pe_filter(o, data, depth)
                if f is True:
                    return True
                elif f is not False:
                    output.append(f)
            if output:
                return {"or": output}
            else:
                return False
        elif filter["not"]:
            f = pe_filter(filter["not"], data, depth)
            if f is True:
                return False
            elif f is False:
                return True
            else:
                return {"not": f}
        elif filter.term or filter.eq:
            eq = coalesce(filter.term, filter.eq)
            result = True
            output = {}
            for col, val in eq.items():
                first, rest = parse_field(col, data, depth)
                d = data[first]
                if not rest:
                    if d != val:
                        result = False
                else:
                    output[rest] = val

            if result and output:
                return {"term": output}
            else:
                return result
        elif filter.equal:
            a, b = filter["equal"]
            first_a, rest_a = parse_field(a, data, depth)
            first_b, rest_b = parse_field(b, data, depth)
            val_a = data[first_a]
            val_b = data[first_b]
            if not rest_a:
                if not rest_b:
                    if val_a != val_b:
                        return False
                    else:
                        return True
                else:
                    return {"term": {rest_b: val_a}}
            else:
                if not rest_b:
                    return {"term": {rest_a: val_b}}
                else:
                    return {"equal": [rest_a, rest_b]}

        elif filter.terms:
            result = True
            output = {}
            for col, vals in filter["terms"].items():
                first, rest = parse_field(col, data, depth)
                d = data[first]
                if not rest:
                    if d not in vals:
                        result = False
                else:
                    output[rest] = vals
            if result and output:
                return {"terms": output}
            else:
                return result

        elif filter.range:
            result = True
            output = {}
            for col, ranges in filter["range"].items():
                first, rest = parse_field(col, data, depth)
                d = data[first]
                if not rest:
                    for sign, val in ranges.items():
                        if sign in ("gt", ">") and d <= val:
                            result = False
                        if sign == "gte" and d < val:
                            result = False
                        if sign == "lte" and d > val:
                            result = False
                        if sign == "lt" and d >= val:
                            result = False
                else:
                    output[rest] = ranges
            if result and output:
                return {"range": output}
            else:
                return result
        elif filter.missing:
            if is_text(filter.missing):
                field = filter["missing"]
            else:
                field = filter["missing"]["field"]

            first, rest = parse_field(field, data, depth)
            d = data[first]
            if not rest:
                if d == None:
                    return True
                return False
            else:
                return {"missing": rest}
        elif filter.prefix:
            result = True
            output = {}
            for col, val in filter["prefix"].items():
                first, rest = parse_field(col, data, depth)
                d = data[first]
                if not rest:
                    if d == None or not d.startswith(val):
                        result = False
                else:
                    output[rest] = val
            if result and output:
                return {"prefix": output}
            else:
                return result

        elif filter.exists:
            if is_text(filter["exists"]):
                field = filter["exists"]
            else:
                field = filter["exists"]["field"]

            first, rest = parse_field(field, data, depth)
            d = data[first]
            if not rest:
                if d != None:
                    return True
                return False
            else:
                return {"exists": rest}
        else:
            Log.error("Can not interpret esfilter: {{esfilter}}",
                      {"esfilter": filter})

    output = []  # A LIST OF OBJECTS MAKING THROUGH THE FILTER

    def main(sequence, esfilter, row, depth):
        """
        RETURN A SEQUENCE OF REFERENCES OF OBJECTS DOWN THE TREE
        SHORT SEQUENCES MEANS ALL NESTED OBJECTS ARE INCLUDED
        """
        new_filter = pe_filter(esfilter, row, depth)
        if new_filter is True:
            seq = list(sequence)
            seq.append(row)
            output.append(seq)
            return
        elif new_filter is False:
            return

        seq = list(sequence)
        seq.append(row)
        for d in primary_branch[depth]:
            main(seq, new_filter, d, depth + 1)

    # OUTPUT
    for i, d in enumerate(data):
        if is_data(d):
            main([], esfilter, to_data(d), 0)
        else:
            Log.error("filter is expecting a dict, not {{type}}",
                      type=d.__class__)

    # AT THIS POINT THE primary_column[] IS DETERMINED
    # USE IT TO EXPAND output TO ALL NESTED OBJECTS
    max = 0  # EVEN THOUGH A ROW CAN HAVE MANY VALUES, WE ONLY NEED UP TO max
    for i, n in enumerate(primary_nested):
        if n:
            max = i + 1

    # OUTPUT IS A LIST OF ROWS,
    # WHERE EACH ROW IS A LIST OF VALUES SEEN DURING A WALK DOWN A PATH IN THE HIERARCHY
    uniform_output = FlatList()

    def recurse(row, depth):
        if depth == max:
            uniform_output.append(row)
        else:
            nested = row[-1][primary_column[depth]]
            if not nested:
                # PASSED FILTER, BUT NO CHILDREN, SO ADD NULL CHILDREN
                for i in range(depth, max):
                    row.append(None)
                uniform_output.append(row)
            else:
                for d in nested:
                    r = list(row)
                    r.append(d)
                    recurse(r, depth + 1)

    for o in output:
        recurse(o, 0)

    if not max:
        # SIMPLE LIST AS RESULT
        return list_to_data([unwrap(u[0]) for u in uniform_output])

    return PartFlatList(primary_column[0:max], uniform_output)
Example #20
    def select(self, fields):
        if is_data(fields):
            fields = fields.value

        if is_text(fields):
            # RETURN LIST OF VALUES
            if len(split_field(fields)) == 1:
                if self.path[0] == fields:
                    return [d[1] for d in self.data]
                else:
                    return [d[0][fields] for d in self.data]
            else:
                keys = split_field(fields)
                depth = coalesce(
                    MIN([i for i, (k, p) in enumerate(zip(keys, self.path)) if k != p]),
                    len(self.path),
                )  # LENGTH OF COMMON PREFIX
                short_key = keys[depth:]

                output = FlatList()
                _select1((to_data(d[depth]) for d in self.data), short_key, 0,
                         output)
                return output

        if is_list(fields):
            output = FlatList()

            meta = []
            for f in fields:
                if hasattr(f.value, "__call__"):
                    meta.append((f.name, f.value))
                else:
                    meta.append(
                        (f.name, functools.partial(lambda v, d: d[v],
                                                   f.value)))

            for row in self._values():
                agg = Data()
                for name, f in meta:
                    agg[name] = f(row)

                output.append(agg)

            return output

            # meta = []
            # for f in fields:
            #     keys = split_field(f.value)
            #     depth = coalesce(MIN([i for i, (k, p) in enumerate(zip(keys, self.path)) if k != p]), len(self.path))  # LENGTH OF COMMON PREFIX
            #     short_key = join_field(keys[depth:])
            #
            #     meta.append((f.name, depth, short_key))
            #
            # for row in self._data:
            #     agg = Data()
            #     for name, depth, short_key in meta:
            #         if short_key:
            #             agg[name] = row[depth][short_key]
            #         else:
            #             agg[name] = row[depth]
            #     output.append(agg)
            # return output

        Log.error("multiselect over FlatList not supported")
Example #21
def request(method, url, headers=None, data=None, json=None, zip=None, retry=None, timeout=None, session=None, kwargs=None):
    """
    JUST LIKE requests.request() BUT WITH DEFAULT HEADERS AND FIXES
    DEMANDS data IS ONE OF:
    * A JSON-SERIALIZABLE STRUCTURE, OR
    * LIST OF JSON-SERIALIZABLE STRUCTURES, OR
    * None

    :param method: GET, POST, etc
    :param url: URL
    :param headers: dict OF HTTP REQUEST HEADERS
    :param data: BYTES (OR GENERATOR OF BYTES)
    :param json: JSON-SERIALIZABLE STRUCTURE
    :param zip: ZIP THE REQUEST BODY, IF BIG ENOUGH
    :param retry: {"times": x, "sleep": y} STRUCTURE
    :param timeout: SECONDS TO WAIT FOR RESPONSE
    :param session: Session OBJECT, IF YOU HAVE ONE
    :param kwargs: ALL PARAMETERS (DO NOT USE)
    :return:
    """
    global _warning_sent
    global request_count

    if not _warning_sent and not default_headers:
        Log.warning(text(
            "The mo_http.http module was meant to add extra " +
            "default headers to all requests, specifically the 'Referer' " +
            "header with a URL to the project. Use the `mo_logs.constants.set()` " +
            "function to set `mo_http.http.default_headers`"
        ))
    _warning_sent = True

    if is_list(url):
        # TRY MANY URLS
        failures = []
        for remaining, u in countdown(url):
            try:
                response = request(url=u, kwargs=kwargs)
                if mo_math.round(response.status_code, decimal=-2) not in [400, 500]:
                    return response
                if not remaining:
                    return response
            except Exception as e:
                e = Except.wrap(e)
                failures.append(e)
        Log.error(u"Tried {{num}} urls", num=len(url), cause=failures)

    if session:
        close_after_response = Null
    else:
        close_after_response = session = sessions.Session()

    with closing(close_after_response):
        if PY2 and is_text(url):
            # httplib.py WILL **FREAK OUT** IF IT SEES ANY UNICODE
            url = url.encode('ascii')

        try:
            set_default(kwargs, DEFAULTS)

            # HEADERS
            headers = unwrap(set_default(headers, session.headers, default_headers))
            _to_ascii_dict(headers)

            # RETRY
            retry = to_data(retry)
            if retry == None:
                retry = set_default({}, DEFAULTS['retry'])
            elif isinstance(retry, Number):
                retry = set_default({"times": retry}, DEFAULTS['retry'])
            elif isinstance(retry.sleep, Duration):
                retry.sleep = retry.sleep.seconds

            # JSON
            if json != None:
                data = value2json(json).encode('utf8')

            # ZIP
            zip = coalesce(zip, DEFAULTS['zip'])
            set_default(headers, {'Accept-Encoding': 'compress, gzip'})

            if zip:
                if is_sequence(data):
                    compressed = ibytes2icompressed(data)
                    headers['content-encoding'] = 'gzip'
                    data = compressed
                elif len(coalesce(data)) > 1000:
                    compressed = bytes2zip(data)
                    headers['content-encoding'] = 'gzip'
                    data = compressed
        except Exception as e:
            Log.error(u"Request setup failure on {{url}}", url=url, cause=e)

        errors = []
        for r in range(retry.times):
            if r:
                Till(seconds=retry.sleep).wait()

            try:
                request_count += 1
                with Timer(
                    "http {{method|upper}} to {{url}}",
                    param={"method": method, "url": text(url)},
                    verbose=DEBUG
                ):
                    return _session_request(session, url=str(url), headers=headers, data=data, json=None, kwargs=kwargs)
            except Exception as e:
                e = Except.wrap(e)
                if retry['http'] and str(url).startswith("https://") and "EOF occurred in violation of protocol" in e:
                    url = URL("http://" + str(url)[8:])
                    Log.note("Changed {{url}} to http due to SSL EOF violation.", url=str(url))
                errors.append(e)

        if " Read timed out." in errors[0]:
            Log.error(u"Tried {{times}} times: Timeout failure (timeout was {{timeout}}", timeout=timeout, times=retry.times, cause=errors[0])
        else:
            Log.error(u"Tried {{times}} times: Request failure of {{url}}", url=url, times=retry.times, cause=errors[0])
Example #22
def _normalize_select(select, frum, schema=None):
    """
    :param select: ONE SELECT COLUMN
    :param frum: TABLE TO get_columns()
    :param schema: SCHEMA TO LOOKUP NAMES FOR DEFINITIONS
    :return: AN ARRAY OF SELECT COLUMNS
    """
    if is_text(select):
        canonical = select = Data(value=select)
    else:
        select = to_data(select)
        canonical = select.copy()

    canonical.aggregate = coalesce(canonical_aggregates[select.aggregate].name,
                                   select.aggregate, "none")
    canonical.default = coalesce(
        select.default, canonical_aggregates[canonical.aggregate].default)

    if hasattr(unwrap(frum), "_normalize_select"):
        return frum._normalize_select(canonical)

    output = []

    if len(select) and not select.value:
        Log.error(BAD_SELECT, select=select)
    elif not select.value or select.value == ".":
        output.extend([
            set_default(
                {
                    "name": c.name,
                    "value": jx_expression(c.name, schema=schema)
                }, canonical) for c in schema.leaves('.')
            # TOP LEVEL COLUMNS ONLY
            if len(c.nested_path) == 1
        ])
    elif is_text(select.value):
        if select.value.endswith(".*"):
            canonical.name = coalesce(select.name, ".")
            value = jx_expression(select.value[:-2], schema=schema)
            if not is_op(value, Variable):
                Log.error("`*` over general expression not supported yet")
                output.append([
                    set_default(
                        {
                            "value": LeavesOp(value, prefix=select.prefix),
                            "format": "dict"  # MARKUP FOR DECODING
                        },
                        canonical) for c in frum.get_columns()
                    if c.jx_type not in INTERNAL
                ])
            else:
                Log.error("do not know what to do")
        else:
            canonical.name = coalesce(select.name, select.value,
                                      select.aggregate)
            canonical.value = jx_expression(select.value, schema=schema)
            output.append(canonical)

    output = to_data(output)
    if any(n == None for n in output.name):
        Log.error("expecting select to have a name: {{select}}", select=select)
    return output
Example #23
    def __init__(self, select, edges, data, frum=None):
        """
        data IS EXPECTED TO BE A dict TO MATRICES, BUT OTHER COLLECTIONS ARE
        ALLOWED, USING THE select AND edges TO DESCRIBE THE data
        """

        self.is_value = False if is_list(select) else True
        self.select = select
        self.meta = Data(format="cube")  # PUT EXTRA MARKUP HERE
        self.is_none = False

        if not all(data.values()):
            self.is_none = True

        # ENSURE frum IS PROPER FORM
        if is_list(select):
            if edges and OR(not isinstance(v, Matrix) for v in data.values()):
                Log.error("Expecting data to be a dict with Matrix values")

        if not edges:
            if not data:
                if is_list(select):
                    Log.error("not expecting a list of records")

                data = {select.name: Matrix.ZERO}
                self.edges = Null
            elif is_data(data):
                # EXPECTING NO MORE THAN ONE rownum EDGE IN THE DATA
                length = MAX([len(v) for v in data.values()])
                if length >= 1:
                    self.edges = list_to_data([{
                        "name": "rownum",
                        "domain": {
                            "type": "rownum"
                        }
                    }])
                else:
                    self.edges = Null
            elif is_list(data):
                if is_list(select):
                    Log.error("not expecting a list of records")

                data = {select.name: Matrix.wrap(data)}
                self.edges = list_to_data([{
                    "name": "rownum",
                    "domain": {
                        "type": "rownum",
                        "min": 0,
                        "max": len(data),
                        "interval": 1
                    }
                }])
            elif isinstance(data, Matrix):
                if is_list(select):
                    Log.error("not expecting a list of records")

                data = {select.name: data}
            else:
                if is_list(select):
                    Log.error("not expecting a list of records")

                data = {select.name: Matrix(value=data)}
                self.edges = Null
        else:
            self.edges = to_data(edges)

        self.data = data
Example #24
def _normalize_edge(edge, dim_index, limit, schema=None):
    """
    :param edge: Not normalized edge
    :param dim_index: Dimensions are ordered; this is this edge's index into that order
    :param schema: for context
    :return: a normalized edge
    """
    if not edge:
        Log.error("Edge has no value, or expression is empty")
    elif is_text(edge):
        if schema:
            leaves = unwraplist(list(schema.leaves(edge)))
            if not leaves or is_container(leaves):
                return [
                    Data(name=edge,
                         value=jx_expression(edge, schema=schema),
                         allowNulls=True,
                         dim=dim_index,
                         domain=_normalize_domain(None, limit))
                ]
            elif isinstance(leaves, Column):
                return [
                    Data(name=edge,
                         value=jx_expression(edge, schema=schema),
                         allowNulls=True,
                         dim=dim_index,
                         domain=_normalize_domain(domain=leaves,
                                                  limit=limit,
                                                  schema=schema))
                ]
            elif is_list(leaves.fields) and len(leaves.fields) == 1:
                return [
                    Data(name=leaves.name,
                         value=jx_expression(leaves.fields[0], schema=schema),
                         allowNulls=True,
                         dim=dim_index,
                         domain=leaves.getDomain())
                ]
            else:
                return [
                    Data(name=leaves.name,
                         allowNulls=True,
                         dim=dim_index,
                         domain=leaves.getDomain())
                ]
        else:
            return [
                Data(name=edge,
                     value=jx_expression(edge, schema=schema),
                     allowNulls=True,
                     dim=dim_index,
                     domain=DefaultDomain())
            ]
    else:
        edge = to_data(edge)
        if not edge.name and not is_text(edge.value):
            Log.error("You must name compound and complex edges: {{edge}}",
                      edge=edge)

        if is_container(edge.value) and not edge.domain:
            # COMPLEX EDGE IS SHORT HAND
            domain = _normalize_domain(schema=schema)
            domain.dimension = Data(fields=edge.value)

            return [
                Data(name=edge.name,
                     value=jx_expression(edge.value, schema=schema),
                     allowNulls=bool(coalesce(edge.allowNulls, True)),
                     dim=dim_index,
                     domain=domain)
            ]

        domain = _normalize_domain(edge.domain, schema=schema)

        return [
            Data(name=coalesce(edge.name, edge.value),
                 value=jx_expression(edge.value, schema=schema),
                 range=_normalize_range(edge.range),
                 allowNulls=bool(coalesce(edge.allowNulls, True)),
                 dim=dim_index,
                 domain=domain)
        ]
Example #25
def assertAlmostEqual(test, expected, digits=None, places=None, msg=None, delta=None):
    show_detail = True
    test = unwrap(test)
    expected = unwrap(expected)
    try:
        if test is None and (is_null_op(expected) or expected is None):
            return
        elif test is expected:
            return
        elif is_text(expected):
            assertAlmostEqualValue(test,
                                   expected,
                                   msg=msg,
                                   digits=digits,
                                   places=places,
                                   delta=delta)
        elif isinstance(test, UniqueIndex):
            if test ^ expected:
                Log.error("Sets do not match")
        elif is_data(expected) and is_data(test):
            for k, e in unwrap(expected).items():
                t = test.get(k)
                assertAlmostEqual(t,
                                  e,
                                  msg=coalesce(msg, "") + "key " + quote(k) +
                                  ": ",
                                  digits=digits,
                                  places=places,
                                  delta=delta)
        elif is_data(expected):
            if is_many(test):
                test = list(test)
                if len(test) != 1:
                    Log.error("Expecting data, not a list")
                test = test[0]
            for k, e in expected.items():
                if is_text(k):
                    t = mo_dots.get_attr(test, literal_field(k))
                else:
                    t = test[k]
                assertAlmostEqual(t,
                                  e,
                                  msg=msg,
                                  digits=digits,
                                  places=places,
                                  delta=delta)
        elif is_container(test) and isinstance(expected, set):
            test = set(to_data(t) for t in test)
            if len(test) != len(expected):
                Log.error(
                    "Sets do not match, element count different:\n{{test|json|indent}}\nexpecting{{expectedtest|json|indent}}",
                    test=test,
                    expected=expected)

            for e in expected:
                for t in test:
                    try:
                        assertAlmostEqual(t,
                                          e,
                                          msg=msg,
                                          digits=digits,
                                          places=places,
                                          delta=delta)
                        break
                    except Exception as _:
                        pass
                else:
                    Log.error(
                        "Sets do not match. {{value|json}} not found in {{test|json}}",
                        value=e,
                        test=test)

        elif isinstance(expected, types.FunctionType):
            return expected(test)
        elif hasattr(test, "__iter__") and hasattr(expected, "__iter__"):
            if test.__class__.__name__ == "ndarray":  # numpy
                test = test.tolist()
            elif test.__class__.__name__ == "DataFrame":  # pandas
                test = test[test.columns[0]].values.tolist()
            elif test.__class__.__name__ == "Series":  # pandas
                test = test.values.tolist()

            if not expected and test == None:
                return
            if expected == None:
                expected = []  # REPRESENT NOTHING
            for t, e in zip_longest(test, expected):
                assertAlmostEqual(t,
                                  e,
                                  msg=msg,
                                  digits=digits,
                                  places=places,
                                  delta=delta)
        else:
            assertAlmostEqualValue(test,
                                   expected,
                                   msg=msg,
                                   digits=digits,
                                   places=places,
                                   delta=delta)
    except Exception as e:
        Log.error(
            "{{test|json|limit(10000)}} does not match expected {{expected|json|limit(10000)}}",
            test=test if show_detail else "[can not show]",
            expected=expected if show_detail else "[can not show]",
            cause=e)
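A brief usage sketch of the comparison above, assuming assertAlmostEqual is in scope and that assertAlmostEqualValue (not shown) follows unittest-style places semantics; a mismatch raises through Log.error rather than returning False:

assertAlmostEqual({"a": {"b": 3.14159}}, {"a": {"b": 3.1416}}, places=4)  # recurses key by key
assertAlmostEqual([1, 2, 3], [1, 2, 3])  # pairs elements with zip_longest
assertAlmostEqual([1, 2], {1, 2})        # set branch: order-insensitive matching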
Ejemplo n.º 26
0
def _normalize_group(edge, dim_index, limit, schema=None):
    """
    :param edge: Not normalized groupby
    :param dim_index: Dimensions are ordered; this is this groupby's index into that order
    :param schema: for context
    :return: a normalized groupby
    """
    if is_text(edge):
        if edge.endswith(".*"):
            prefix = edge[:-2]
            if schema:
                output = list_to_data([
                    {  # BECAUSE THIS IS A GROUPBY, EARLY SPLIT INTO LEAVES WORKS JUST FINE
                        "name":
                        concat_field(
                            prefix,
                            literal_field(
                                relative_field(untype_path(c.name), prefix))),
                        "put": {
                            "name": literal_field(untype_path(c.name))
                        },
                        "value":
                        jx_expression(c.es_column, schema=schema),
                        "allowNulls":
                        True,
                        "domain": {
                            "type": "default"
                        }
                    } for c in schema.leaves(prefix)
                ])
                return output
            else:
                return list_to_data([{
                    "name": untype_path(prefix),
                    "put": {
                        "name": literal_field(untype_path(prefix))
                    },
                    "value": LeavesOp(Variable(prefix)),
                    "allowNulls": True,
                    "dim": dim_index,
                    "domain": {
                        "type": "default"
                    }
                }])

        return list_to_data([{
            "name": edge,
            "value": jx_expression(edge, schema=schema),
            "allowNulls": True,
            "dim": dim_index,
            "domain": Domain(type="default", limit=limit)
        }])
    else:
        edge = to_data(edge)
        if edge.domain and edge.domain.type != "default":
            Log.error("groupby does not accept complicated domains")

        if not edge.name and not is_text(edge.value):
            Log.error("You must name compound edges: {{edge}}", edge=edge)

        return list_to_data([{
            "name": coalesce(edge.name, edge.value),
            "value": jx_expression(edge.value, schema=schema),
            "allowNulls": True,
            "dim": dim_index,
            "domain": {
                "type": "default"
            }
        }])
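A hedged sketch of the two text paths above, called without a schema; the field names and arguments are illustrative:

# "person.*" is shorthand: without a schema it becomes a single LeavesOp edge.
normalized = _normalize_group("person.*", dim_index=0, limit=None)
print(normalized[0].name)         # "person"
print(normalized[0].domain.type)  # "default"

# A plain field name keeps its own name and gets a default domain with the limit.
normalized = _normalize_group("status", dim_index=1, limit=10)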
Ejemplo n.º 27
0
    def _add_pending(self, delta):
        delta = to_data(delta)
        self.pending.append(delta)
Ejemplo n.º 28
0
    def update(self, command):
        """
        EXPECTING command == {"set":term, "where":where}
        THE set CLAUSE IS A DICT MAPPING NAMES TO VALUES
        THE where CLAUSE IS AN ES FILTER
        """
        command = to_data(command)
        table = self.get_table(command['update'])

        es_index = self.es.cluster.get_index(read_only=False,
                                             alias=None,
                                             kwargs=self.es.settings)

        schema = table.schema

        # GET IDS OF DOCUMENTS
        query = {
            "from":
            command['update'],
            "select": [{
                "value": "_id"
            }] + [{
                "name": k,
                "value": v
            } for k, v in command.set.items()],
            "where":
            command.where,
            "format":
            "list",
            "limit":
            10000
        }

        results = self.query(query)

        if results.data:
            content = "".join(t for r in results.data
                              for _id, row in [(r._id, r)]
                              for _ in [row.__setitem__('_id', None)
                                        ]  # WARNING! DESTRUCTIVE TO row
                              for update in map(value2json, ({
                                  "update": {
                                      "_id": _id
                                  }
                              }, {
                                  "doc": row
                              })) for t in (update, "\n"))
            response = self.es.cluster.post(
                es_index.path + "/" + "_bulk",
                data=content,
                timeout=self.settings.timeout,
                params={
                    "wait_for_active_shards":
                    self.settings.wait_for_active_shards
                })
            if response.errors:
                Log.error("could not update: {{error}}",
                          error=[
                              e.error for i in response["items"]
                              for e in i.values() if e.status not in (200, 201)
                          ])

        # DELETE BY QUERY, IF NEEDED
        if "." in listwrap(command['clear']):
            es_filter = ES52Lang[jx_expression(
                command.where)].partial_eval().to_es(schema)
            self.es.delete_record(es_filter)
            return

        es_index.refresh()
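A hedged sketch of a command this method accepts, per the docstring contract above; container, the index name, and the fields are hypothetical:

container.update({
    "update": "my_index",               # table/alias to update (hypothetical)
    "set": {"status": "done"},          # names -> new values
    "where": {"eq": {"user": "kyle"}},  # filter selecting the documents
})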
Ejemplo n.º 29
0
def es_bulkaggsop(esq, frum, query):
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-terms-aggregation.html#_filtering_values_with_partitions
    query = query.copy()  # WE WILL MARK UP THIS QUERY

    chunk_size = min(coalesce(query.chunk_size, MAX_CHUNK_SIZE),
                     MAX_CHUNK_SIZE)
    schema = frum.schema
    query_path = first(schema.query_path)
    selects = listwrap(query.select)

    variable = first(query.groupby).value
    # FIND CARDINALITY

    cardinality_check = Timer("Get cardinality for {{column}}",
                              param={"column": variable.var})

    with cardinality_check:
        columns = schema.leaves(variable.var)
        if len(columns) != 1:
            Log.error("too many columns to bulk groupby:\n{{columns|json}}",
                      columns=columns)
        column = first(columns)

        if query.where is TRUE:
            cardinality = column.cardinality
            if cardinality == None:
                esq.namespace._update_cardinality(column)
                cardinality = column.cardinality
        else:
            cardinality = esq.query({
                "select": {
                    "name": "card",
                    "value": variable,
                    "aggregate": "cardinality",
                },
                "from": frum.name,
                "where": query.where,
                "format": "cube",
            }).card

        num_partitions = (cardinality + chunk_size - 1) // chunk_size

        if num_partitions > MAX_PARTITIONS:
            Log.error("Requesting more than {{num}} partitions",
                      num=num_partitions)
        if num_partitions == 0:
            num_partitions = 1

        acc, decoders, es_query = aggop_to_es_queries(selects, query_path,
                                                      schema, query)
        guid = randoms.base64(32, extra="-_")
        abs_limit = mo_math.MIN(
            (query.limit, first(query.groupby).domain.limit))
        formatter = formatters[query.format](abs_limit)

        Thread.run(
            "extract to " + guid + ".json",
            extractor,
            guid,
            num_partitions,
            esq,
            query,
            selects,
            query_path,
            schema,
            chunk_size,
            cardinality,
            abs_limit,
            formatter,
            parent_thread=Null,
        ).release()

    output = to_data({
        "url": URL_PREFIX / (guid + ".json"),
        "status": URL_PREFIX / (guid + ".status.json"),
        "meta": {
            "format": query.format,
            "timing": {
                "cardinality_check": cardinality_check.duration
            },
            "es_query": es_query,
            "num_partitions": num_partitions,
            "cardinality": cardinality,
        },
    })
    return output
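The partition count above is a ceiling division; a worked example with illustrative numbers:

cardinality, chunk_size = 25000, 10000
num_partitions = (cardinality + chunk_size - 1) // chunk_size
assert num_partitions == 3  # 25000 distinct values split into chunks of <= 10000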
Ejemplo n.º 30
0
    def update(self, command):
        self.dirty = True
        try:
            command = to_data(command)
            DEBUG and Log.note(
                "Update {{timestamp}}: {{command|json}}",
                command=command,
                timestamp=Date(command["set"].last_updated),
            )
            eq = command.where.eq
            if eq.es_index:
                if len(eq) == 1:
                    if unwraplist(command.clear) == ".":
                        d = self.data
                        i = eq.es_index
                        with self.locker:
                            cols = d[i]
                            del d[i]

                        for c in cols:
                            self.remove(c)
                        return

                    # FASTEST
                    all_columns = self.data.get(eq.es_index, {}).values()
                    with self.locker:
                        columns = [c for cs in all_columns for c in cs]
                elif eq.es_column and len(eq) == 2:
                    # FASTER
                    all_columns = self.data.get(eq.es_index, {}).values()
                    with self.locker:
                        columns = [
                            c for cs in all_columns for c in cs
                            if c.es_column == eq.es_column
                        ]

                else:
                    # SLOWER
                    all_columns = self.data.get(eq.es_index, {}).values()
                    with self.locker:
                        columns = [
                            c for cs in all_columns for c in cs
                            if all(c[k] == v for k, v in
                                   eq.items())  # THIS LINE IS VERY SLOW
                        ]
            else:
                columns = list(self)
                columns = jx.filter(columns, command.where)

            with self.locker:
                for col in columns:
                    DEBUG and Log.note(
                        "update column {{table}}.{{column}}",
                        table=col.es_index,
                        column=col.es_column,
                    )
                    for k in command["clear"]:
                        if k == ".":
                            mark_as_deleted(col, Date.now())
                            self.for_es_update.add(col)
                            lst = self.data[col.es_index]
                            cols = lst[col.name]
                            cols.remove(col)
                            if len(cols) == 0:
                                del lst[col.name]
                                if len(lst) == 0:
                                    del self.data[col.es_index]
                            break
                        else:
                            col[k] = None
                    else:
                        # DID NOT DELETE COLUMN ("."), CONTINUE TO SET PROPERTIES
                        for k, v in command.set.items():
                            col[k] = v
                        self.for_es_update.add(col)

        except Exception as e:
            Log.error("should not happen", cause=e)
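A hedged sketch of the command shapes the fast paths above expect; column_list and the index/column names are hypothetical:

# Drop every column record for one index (the len(eq) == 1 fast path with clear ".").
column_list.update({"clear": ".", "where": {"eq": {"es_index": "my_index"}}})

# Set a property on a single column (the es_column fast path).
column_list.update({
    "set": {"last_updated": Date.now()},
    "where": {"eq": {"es_index": "my_index", "es_column": "status"}},
})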