Example #1
def value_compare(left, right, ordering=1):
    """
    SORT VALUES, NULL IS THE LEAST VALUE
    :param left: LHS
    :param right: RHS
    :param ordering: -1 OR 1 TO AFFECT SORT ORDER
    :return: negative if left < right, zero if left == right, and strictly positive if left > right
    """

    try:
        if isinstance(left, list) or isinstance(right, list):
            if left == None:
                return ordering
            elif right == None:
            return -ordering

            left = listwrap(left)
            right = listwrap(right)
            for a, b in zip(left, right):
                c = value_compare(a, b) * ordering
                if c != 0:
                    return c

            if len(left) < len(right):
            return -ordering
            elif len(left) > len(right):
                return ordering
            else:
                return 0

        ltype = type(left)
        rtype = type(right)
        ltype_num = TYPE_ORDER.get(ltype, 10)
        rtype_num = TYPE_ORDER.get(rtype, 10)
        type_diff = ltype_num - rtype_num
        if type_diff != 0:
            return ordering if type_diff > 0 else -ordering

        if ltype_num == 9:
            return 0
        elif ltype is builtin_tuple:
            for a, b in zip(left, right):
                c = value_compare(a, b)
                if c != 0:
                    return c * ordering
            return 0
        elif ltype in (dict, Data):
            for k in sorted(set(left.keys()) | set(right.keys())):
                c = value_compare(left.get(k), right.get(k)) * ordering
                if c != 0:
                    return c
            return 0
        elif left > right:
            return ordering
        elif left < right:
            return -ordering
        else:
            return 0
    except Exception as e:
        Log.error("Can not compare values {{left}} to {{right}}", left=left, right=right, cause=e)
Example #2
def argparse(defs):
    parser = _argparse.ArgumentParser()
    for d in listwrap(defs):
        args = d.copy()
        name = args.name
        args.name = None
        parser.add_argument(*unwrap(listwrap(name)), **args)
    namespace = parser.parse_args()
    output = {k: getattr(namespace, k) for k in vars(namespace)}
    return wrap(output)
Example #3
def argparse(defs):
    parser = _ArgParser()
    for d in listwrap(defs):
        args = d.copy()
        name = args.name
        args.name = None
        parser.add_argument(*unwrap(listwrap(name)), **args)
    namespace, unknown = parser.parse_known_args()
    if unknown:
        Log.warning("Ignoring arguments: {{unknown|json}}", unknown=unknown)
    output = {k: getattr(namespace, k) for k in vars(namespace)}
    return wrap(output)
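A hedged usage sketch of this wrapper: each def carries the usual add_argument() keywords plus a name entry; defs must be dot-accessible, so plain dicts are wrapped with mo_dots first. The flag names are invented for illustration.

from mo_dots import wrap

settings = argparse(wrap([{
    "name": ["--config", "-c"],  # HYPOTHETICAL FLAGS
    "help": "path to JSON config",
    "type": str,
    "dest": "filename",
    "default": "./config.json",
    "required": False
}]))
print(settings.filename)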
Example #4
    def error(
        cls,
        template,  # human readable template
        default_params={},  # parameters for template
        cause=None,  # plausible cause
        stack_depth=0,
        **more_params
    ):
        """
        raise an exception with a trace for the cause too

        :param template: *string* human readable string with placeholders for parameters
        :param default_params: *dict* parameters to fill in template
        :param cause: *Exception* for chaining
        :param stack_depth:  *int* how many calls you want popped off the stack to report the *true* caller
        :param log_context: *dict* extra key:value pairs for your convenience
        :param more_params: *any* more parameters (which will overwrite default_params)
        :return:
        """
        if not isinstance(template, text_type):
            sys.stderr.write(str("Log.error was expecting a unicode template"))
            Log.error("Log.error was expecting a unicode template")

        if default_params and isinstance(listwrap(default_params)[0], BaseException):
            cause = default_params
            default_params = {}

        params = dict(unwrap(default_params), **more_params)

        add_to_trace = False
        if cause == None:
            causes = None
        elif isinstance(cause, list):
            causes = []
            for c in listwrap(cause):  # CAN NOT USE LIST-COMPREHENSION IN PYTHON3 (EXTRA STACK DEPTH FROM THE IN-LINED GENERATOR)
                causes.append(Except.wrap(c, stack_depth=1))
            causes = FlatList(causes)
        elif isinstance(cause, BaseException):
            causes = Except.wrap(cause, stack_depth=1)
        else:
            causes = None
            Log.error("can only accept Exception, or list of exceptions")

        trace = exceptions.extract_stack(stack_depth + 1)

        if add_to_trace:
            cause[0].trace.extend(trace[1:])

        e = Except(type=exceptions.ERROR, template=template, params=params, cause=causes, trace=trace)
        raise_from_none(e)
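A small usage sketch of the moustache-style templates above, assuming mo_logs is installed: keyword parameters fill the {{...}} placeholders and cause chains the original exception.

from mo_logs import Log

try:
    1 / 0
except Exception as cause:
    Log.error("can not process {{name}}", name="my_task", cause=cause)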
Example #5
def value_compare(l, r, ordering=1):
    """
    SORT VALUES, NULL IS THE LEAST VALUE
    :param l: LHS
    :param r: RHS
    :param ordering: -1 OR 1 TO AFFECT SORT ORDER
    :return: negative if l < r, zero if l == r, and strictly positive if l > r
    """

    if l == None:
        if r == None:
            return 0
        else:
            return ordering
    elif r == None:
        return -ordering

    if isinstance(l, list) or isinstance(r, list):
        for a, b in zip(listwrap(l), listwrap(r)):
            c = value_compare(a, b) * ordering
            if c != 0:
                return c

        if len(l) < len(r):
            return -ordering
        elif len(l) > len(r):
            return ordering
        else:
            return 0
    elif isinstance(l, builtin_tuple) and isinstance(r, builtin_tuple):
        for a, b in zip(l, r):
            c = value_compare(a, b) * ordering
            if c != 0:
                return c
        return 0
    elif isinstance(l, Mapping):
        if isinstance(r, Mapping):
            for k in sorted(set(l.keys()) | set(r.keys())):
                c = value_compare(l.get(k), r.get(k)) * ordering
                if c != 0:
                    return c
            return 0
        else:
            return 1
    elif isinstance(r, Mapping):
        return -1
    else:
        return ((l > r) - (l < r)) * ordering  # cmp() WAS REMOVED IN PYTHON 3
Example #6
    def insert_new(self, table_name, candidate_key, new_record):
        candidate_key = listwrap(candidate_key)

        condition = SQL_AND.join([
            quote_column(k) + "=" + quote_value(new_record[k])
            if new_record[k] != None
            else quote_column(k) + SQL_IS_NULL
            for k in candidate_key
        ])
        command = (
            "INSERT INTO " + quote_column(table_name) + sql_iso(sql_list(
                quote_column(k) for k in new_record.keys()
            )) +
            SQL_SELECT + "a.*" + SQL_FROM + sql_iso(
                SQL_SELECT + sql_list([quote_value(v) + " " + quote_column(k) for k, v in new_record.items()]) +
                SQL_FROM + "DUAL"
            ) + " a" +
            SQL_LEFT_JOIN + sql_iso(
                SQL_SELECT + "'dummy' exist " +
                SQL_FROM + quote_column(table_name) +
                SQL_WHERE + condition +
                SQL_LIMIT + SQL_ONE
            ) + " b ON " + SQL_TRUE + SQL_WHERE + " exist " + SQL_IS_NULL
        )
        self.execute(command, {})
Example #7
    def select(self, select):
        selects = listwrap(select)

        if len(selects) == 1 and is_op(selects[0].value, Variable) and selects[0].value.var == ".":
            new_schema = self.schema
            if selects[0].name == ".":
                return self
        else:
            new_schema = None

        if is_list(select):
            if all(
                is_op(s.value, Variable) and s.name == s.value.var
                for s in select
            ):
                names = set(s.value.var for s in select)
                new_schema = Schema(".", [c for c in self.schema.columns if c.name in names])

            push_and_pull = [(s.name, jx_expression_to_function(s.value)) for s in selects]
            def selector(d):
                output = Data()
                for n, p in push_and_pull:
                    output[n] = unwraplist(p(wrap(d)))
                return unwrap(output)

            new_data = map(selector, self.data)
        else:
            select_value = jx_expression_to_function(select.value)
            new_data = map(select_value, self.data)
            if is_op(select.value, Variable):
                column = copy(first(c for c in self.schema.columns if c.name == select.value.var))
                column.name = '.'
                new_schema = Schema("from " + self.name, [column])

        return ListContainer("from "+self.name, data=new_data, schema=new_schema)
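A hedged sketch of the select clause this method expects; container stands in for an existing ListContainer, and the field names are invented.

result = container.select([
    {"name": "who", "value": "user.name"},  # RENAME user.name TO who
    {"name": "count", "value": "count"}
])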
Example #8
    def _send_email(self):
        try:
            if not self.accumulation:
                return
            with Emailer(self.settings) as emailer:
                # WHO ARE WE SENDING TO
                emails = Data()
                for template, params in self.accumulation:
                    content = expand_template(template, params)
                    emails[literal_field(self.settings.to_address)] += [content]
                    for c in self.cc:
                        if any(d in params.params.error for d in c.contains):
                            emails[literal_field(c.to_address)] += [content]

                # SEND TO EACH
                for to_address, content in emails.items():
                    emailer.send_email(
                        from_address=self.settings.from_address,
                        to_address=listwrap(to_address),
                        subject=self.settings.subject,
                        text_data="\n\n".join(content)
                    )

            self.accumulation = []
        except Exception as e:
            Log.warning("Could not send", e)
        finally:
            self.next_send = Date.now() + self.settings.average_interval * (2 * Random.float())
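The accumulation above leans on a mo-dots idiom used throughout these examples: a missing key reads as Null, and += [x] materializes a list. A small demonstration, assuming mo_dots is installed:

from mo_dots import Data

emails = Data()
emails["alice"] += ["first"]
emails["alice"] += ["second"]
print(emails["alice"])  # ['first', 'second']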
Example #9
def _normalize_groupby(groupby, limit, schema=None):
    if groupby == None:
        return None
    output = wrap([n for ie, e in enumerate(listwrap(groupby)) for n in _normalize_group(e, ie, limit, schema=schema)])
    if any(o == None for o in output):
        Log.error("not expected")
    return output
Example #10
    def wrap(cls, e, stack_depth=0):
        """
        ENSURE THE STACKTRACE AND CAUSAL CHAIN IS CAPTURED, PLUS ADD FEATURES OF Except

        :param e: AN EXCEPTION OF ANY TYPE
        :param stack_depth: HOW MANY CALLS TO TAKE OFF THE TOP OF THE STACK TRACE
        :return: A Except OBJECT OF THE SAME
        """
        if e == None:
            return Null
        elif isinstance(e, (list, Except)):
            return e
        elif is_data(e):
            e.cause = unwraplist([Except.wrap(c) for c in listwrap(e.cause)])
            return Except(**e)
        else:
            tb = getattr(e, '__traceback__', None)
            if tb is not None:
                trace = _parse_traceback(tb)
            else:
                trace = _extract_traceback(0)

            cause = Except.wrap(getattr(e, '__cause__', None))
            if hasattr(e, "message") and e.message:
                output = Except(context=ERROR, template=text_type(e.message), trace=trace, cause=cause)
            else:
                output = Except(context=ERROR, template=text_type(e), trace=trace, cause=cause)

            trace = extract_stack(stack_depth + 2)  # +2 = to remove the caller, and its call to this Except.wrap()
            output.trace.extend(trace)
            return output
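A quick sketch of wrapping a standard exception with the method above:

try:
    {}["missing"]
except Exception as e:
    err = Except.wrap(e)
    print(err.template)  # THE ORIGINAL MESSAGE
    print(err.trace[0])  # INNERMOST FRAME OF THE CAPTURED STACK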
Example #11
    def query(self, _query):
        try:
            query = QueryOp.wrap(_query, container=self, namespace=self.namespace)

            for s in listwrap(query.select):
                if s.aggregate != None and not aggregates.get(s.aggregate):
                    Log.error(
                        "ES can not aggregate {{name}} because {{aggregate|quote}} is not a recognized aggregate",
                        name=s.name,
                        aggregate=s.aggregate
                    )

            frum = query["from"]
            if isinstance(frum, QueryOp):
                result = self.query(frum)
                q2 = query.copy()
                q2.frum = result
                return jx.run(q2)

            if is_deepop(self.es, query):
                return es_deepop(self.es, query)
            if is_aggsop(self.es, query):
                return es_aggsop(self.es, frum, query)
            if is_setop(self.es, query):
                return es_setop(self.es, query)
            Log.error("Can not handle")
        except Exception as e:
            e = Except.wrap(e)
            if "Data too large, data for" in e:
                http.post(self.es.cluster.url / "_cache/clear")
                Log.error("Problem (Tried to clear Elasticsearch cache)", e)
            Log.error("problem", e)
Example #12
    def __init__(self, **desc):
        Domain.__init__(self, **desc)
        self.type = "range"
        self.NULL = Null

        if self.partitions:
            # IGNORE THE min, max, interval
            if not self.key:
                Log.error("Must have a key value")

            parts = listwrap(self.partitions)
            for i, p in enumerate(parts):
                self.min = MIN([self.min, p.min])
                self.max = MAX([self.max, p.max])
                if p.dataIndex != None and p.dataIndex != i:
                    Log.error("Expecting `dataIndex` to agree with the order of the parts")
                if p[self.key] == None:
                    Log.error("Expecting all parts to have {{key}} as a property", key=self.key)
                p.dataIndex = i

            # VERIFY PARTITIONS DO NOT OVERLAP, HOLES ARE FINE
            for p, q in itertools.product(parts, parts):
                if p is not q and p.min <= q.min and q.min < p.max:
                    Log.error("partitions overlap!")

            self.partitions = parts
            return
        elif any([self.min == None, self.max == None, self.interval == None]):
            Log.error("Can not handle missing parameter")

        self.key = "min"
        self.partitions = wrap([{"min": v, "max": v + self.interval, "dataIndex": i} for i, v in enumerate(frange(self.min, self.max, self.interval))])
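A hedged sketch of the min/max/interval form (the class name is assumed from context); partitions are generated when none are given:

domain = RangeDomain(min=0, max=10, interval=2)
# EXPECTING partitions LIKE
#   {"min": 0, "max": 2, "dataIndex": 0}, ..., {"min": 8, "max": 10, "dataIndex": 4}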
Example #13
def read_settings(filename=None, defs=None):
    # READ SETTINGS
    if filename:
        settings_file = File(filename)
        if not settings_file.exists:
            Log.error("Can not file settings file {{filename}}", {
                "filename": settings_file.abspath
            })
        settings = mo_json_config.get("file:///" + settings_file.abspath)
        if defs:
            settings.args = argparse(defs)
        return settings
    else:
        defs = listwrap(defs)
        defs.append({
            "name": ["--settings", "--settings-file", "--settings_file"],
            "help": "path to JSON file with settings",
            "type": str,
            "dest": "filename",
            "default": "./settings.json",
            "required": False
        })
        args = argparse(defs)
        settings = mo_json_config.get("file://" + args.filename.replace(os.sep, "/"))
        settings.args = args
        return settings
Example #14
def read_settings(filename=None, defs=None):
    """
    :param filename: Force load a file
    :param defs: arguments you want to accept
    :param default_filename: A config file from an environment variable (a fallback config file, if no other provided)
    :return:
    """
    # READ SETTINGS
    defs = listwrap(defs)
    defs.append({
        "name": ["--config", "--settings", "--settings-file", "--settings_file"],
        "help": "path to JSON file with settings",
        "type": str,
        "dest": "filename",
        "default": None,
        "required": False
    })
    args = argparse(defs)

    args.filename = coalesce(filename, args.filename, "./config.json")
    settings_file = File(args.filename)
    if not settings_file.exists:
        Log.error("Can not read configuration file {{filename}}", {
            "filename": settings_file.abspath
        })
    settings = mo_json_config.get_file(settings_file)
    settings.args = args
    return settings
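Typical startup wiring, as a hedged sketch: run with --config path/to/config.json or rely on the ./config.json default. The debug key is an assumption about the config layout.

settings = read_settings()
Log.start(settings.debug)  # ASSUMES THE CONFIG FILE HAS A debug SECTION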
Example #15
    def __getitem__(self, item):
        for s in listwrap(self.cube.select):
            if s.name == item:
                return self.cube.data[item]
        for i, e in enumerate(self.cube.edges):
            if e.name == item:
                return e.domain.partition[self.coord[i]]
Example #16
    def _send_email(self):
        try:
            if not self.accumulation:
                return
            with Closer(connect_to_region(
                self.settings.region,
                aws_access_key_id=unwrap(self.settings.aws_access_key_id),
                aws_secret_access_key=unwrap(self.settings.aws_secret_access_key)
            )) as conn:

                # WHO ARE WE SENDING TO
                emails = Data()
                for template, params in self.accumulation:
                    content = expand_template(template, params)
                    emails[literal_field(self.settings.to_address)] += [content]
                    for c in self.cc:
                        if any(d in params.params.error for d in c.contains):
                            emails[literal_field(c.to_address)] += [content]

                # SEND TO EACH
                for to_address, content in emails.items():
                    conn.send_email(
                        source=self.settings.from_address,
                        to_addresses=listwrap(to_address),
                        subject=self.settings.subject,
                        body="\n\n".join(content),
                        format="text"
                    )

            self.next_send = Date.now() + self.settings.max_interval
            self.accumulation = []
        except Exception as e:
            self.next_send = Date.now() + self.settings.max_interval
            Log.warning("Could not send", e)
Example #17
def is_terms(query):
    select = listwrap(query.select)

    isSimple = not query.select or AND(aggregates[s.aggregate] in ("none", "count") for s in select)
    if isSimple:
        return True
    return False
Example #18
    def __init__(
        self,
        host,
        index,
        port=9200,
        type="log",
        queue_size=1000,
        batch_size=100,
        kwargs=None,
    ):
        """
        settings ARE FOR THE ELASTICSEARCH INDEX
        """
        kwargs.timeout = Duration(coalesce(kwargs.timeout, "30second")).seconds
        kwargs.retry.times = coalesce(kwargs.retry.times, 3)
        kwargs.retry.sleep = Duration(coalesce(kwargs.retry.sleep, MINUTE)).seconds
        kwargs.host = Random.sample(listwrap(host), 1)[0]

        schema = json2value(value2json(SCHEMA), leaves=True)
        schema.mappings[type].properties["~N~"].type = "nested"
        self.es = Cluster(kwargs).get_or_create_index(
            schema=schema,
            limit_replicas=True,
            typed=True,
            kwargs=kwargs,
        )
        self.batch_size = batch_size
        self.es.add_alias(coalesce(kwargs.alias, kwargs.index))
        self.queue = Queue("debug logs to es", max=queue_size, silent=True)

        self.worker = Thread.run("add debug logs to es", self._insert_loop)
Example #19
    def vars(self, exclude_where=False, exclude_select=False):
        """
        :return: variables in query
        """
        def edges_get_all_vars(e):
            output = set()
            if isinstance(e.value, text_type):
                output.add(e.value)
            if isinstance(e.value, Expression):
                output |= e.value.vars()
            if e.domain.key:
                output.add(e.domain.key)
            if e.domain.where:
                output |= e.domain.where.vars()
            if e.range:
                output |= e.range.min.vars()
                output |= e.range.max.vars()
            if e.domain.partitions:
                for p in e.domain.partitions:
                    if p.where:
                        output |= p.where.vars()
            return output

        output = set()
        try:
            output |= self.frum.vars()
        except Exception:
            pass

        if not exclude_select:
            for s in listwrap(self.select):
                output |= s.value.vars()
        for s in listwrap(self.edges):
            output |= edges_get_all_vars(s)
        for s in listwrap(self.groupby):
            output |= edges_get_all_vars(s)
        if not exclude_where:
            output |= self.where.vars()
        for s in listwrap(self.sort):
            output |= s.value.vars()

        try:
            output |= UNION(e.vars() for e in self.window)
        except Exception:
            pass

        return output
Example #20
    def new_instance(desc):
        return Except(
            context=desc.context,
            template=desc.template,
            params=desc.params,
            cause=[Except.new_instance(c) for c in listwrap(desc.cause)],
            trace=desc.trace
        )
Example #21
def _get_schema_from_list(frum, table_name, prefix_path, nested_path, columns):
    """
    :param frum: The list
    :param table_name: Name of the table this list holds records for
    :param prefix_path: parent path
    :param nested_path: each nested array, in reverse order
    :param columns: map from full name to column definition
    :return:
    """

    for d in frum:
        row_type = _type_to_name[d.__class__]
        if row_type != "object":
            full_name = join_field(prefix_path)
            column = columns[full_name]
            if not column:
                column = Column(
                    names={table_name: full_name},
                    es_column=full_name,
                    es_index=".",
                    type="undefined",
                    nested_path=nested_path
                )
                columns.add(column)
            column.type = _merge_type[column.type][row_type]
        else:
            for name, value in d.items():
                full_name = join_field(prefix_path + [name])
                column = columns[full_name]
                if not column:
                    column = Column(
                        names={table_name: full_name},
                        es_column=full_name,
                        es_index=".",
                        type="undefined",
                        nested_path=nested_path
                    )
                    columns.add(column)
                if isinstance(value, list):
                    if len(value) == 0:
                        this_type = "undefined"
                    elif len(value) == 1:
                        this_type = _type_to_name[value[0].__class__]
                    else:
                        this_type = _type_to_name[value[0].__class__]
                        if this_type == "object":
                            this_type = "nested"
                else:
                    this_type = _type_to_name[value.__class__]
                new_type = _merge_type[column.type][this_type]
                column.type = new_type

                if this_type == "object":
                    _get_schema_from_list([value], table_name, prefix_path + [name], nested_path, columns)
                elif this_type == "nested":
                    np = listwrap(nested_path)
                    newpath = unwraplist([join_field(split_field(np[0])+[name])]+np)
                    _get_schema_from_list(value, table_name, prefix_path + [name], newpath, columns)
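The nested-path arithmetic above uses the mo-dots path helpers; a quick illustration, assuming mo_dots is installed:

from mo_dots import join_field, split_field

print(split_field("a.b.c"))         # ['a', 'b', 'c']
print(join_field(["a", "b", "c"]))  # a.b.c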
Example #22
    def fatal(
        cls,
        template,  # human readable template
        default_params={},  # parameters for template
        cause=None,  # plausible cause
        stack_depth=0,
        log_context=None,
        **more_params
    ):
        """
        SEND TO STDERR

        :param template: *string* human readable string with placeholders for parameters
        :param default_params: *dict* parameters to fill in template
        :param cause: *Exception* for chaining
        :param stack_depth:  *int* how many calls you want popped off the stack to report the *true* caller
        :param log_context: *dict* extra key:value pairs for your convenience
        :param more_params: *any* more parameters (which will overwrite default_params)
        :return:
        """
        if default_params and isinstance(listwrap(default_params)[0], BaseException):
            cause = default_params
            default_params = {}

        params = dict(unwrap(default_params), **more_params)

        cause = unwraplist([Except.wrap(c) for c in listwrap(cause)])
        trace = exceptions.extract_stack(stack_depth + 1)

        e = Except(exceptions.ERROR, template, params, cause, trace)
        str_e = text_type(e)

        error_mode = cls.error_mode
        with suppress_exception:
            if not error_mode:
                cls.error_mode = True
                Log.note(
                    "{{error|unicode}}",
                    error=e,
                    log_context=set_default({"context": exceptions.FATAL}, log_context),
                    stack_depth=stack_depth + 1
                )
        cls.error_mode = error_mode

        sys.stderr.write(str_e.encode('utf8'))
Example #23
    def start(cls, settings=None):
        """
        RUN ME FIRST TO SETUP THE THREADED LOGGING
        http://victorlin.me/2012/08/good-logging-practice-in-python/

        log       - LIST OF PARAMETERS FOR LOGGER(S)
        trace     - SHOW MORE DETAILS IN EVERY LOG LINE (default False)
        cprofile  - True==ENABLE THE C-PROFILER THAT COMES WITH PYTHON (default False)
                    USE THE LONG FORM TO SET THE FILENAME {"enabled": True, "filename": "cprofile.tab"}
        profile   - True==ENABLE pyLibrary SIMPLE PROFILING (default False) (eg with Profiler("some description"):)
                    USE THE LONG FORM TO SET FILENAME {"enabled": True, "filename": "profile.tab"}
        constants - UPDATE MODULE CONSTANTS AT STARTUP (PRIMARILY INTENDED TO CHANGE DEBUG STATE)
        """
        global _Thread
        if not settings:
            return
        settings = wrap(settings)

        Log.stop()

        cls.settings = settings
        cls.trace = coalesce(settings.trace, False)
        if cls.trace:
            from mo_threads import Thread as _Thread
            _ = _Thread

        # ENABLE CPROFILE
        if settings.cprofile is False:
            settings.cprofile = {"enabled": False}
        elif settings.cprofile is True:
            if isinstance(settings.cprofile, bool):
                settings.cprofile = {"enabled": True, "filename": "cprofile.tab"}
        if settings.cprofile.enabled:
            from mo_threads import profiles
            profiles.enable_profilers(settings.cprofile.filename)

        if settings.profile is True or (isinstance(settings.profile, Mapping) and settings.profile.enabled):
            Log.error("REMOVED 2018-09-02, Activedata revision 3f30ff46f5971776f8ba18")
            # from mo_logs import profiles
            #
            # if isinstance(settings.profile, bool):
            #     profiles.ON = True
            #     settings.profile = {"enabled": True, "filename": "profile.tab"}
            #
            # if settings.profile.enabled:
            #     profiles.ON = True

        if settings.constants:
            constants.set(settings.constants)

        if settings.log:
            cls.logging_multi = StructuredLogger_usingMulti()
            for log in listwrap(settings.log):
                Log.add_log(Log.new_instance(log))

            from mo_logs.log_usingThread import StructuredLogger_usingThread
            cls.main_log = StructuredLogger_usingThread(cls.logging_multi)
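A hedged sketch of a settings object Log.start() accepts, following the docstring above; the available log_type values depend on the loggers installed:

Log.start({
    "trace": True,
    "cprofile": False,
    "log": [{"log_type": "console"}]
})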
Example #24
    def _select(self, select):
        selects = listwrap(select)
        is_aggregate = OR(s.aggregate != None and s.aggregate != "none" for s in selects)
        if is_aggregate:
            values = {s.name: Matrix(value=self.data[s.value].aggregate(s.aggregate)) for s in selects}
            return Cube(select, [], values)
        else:
            values = {s.name: self.data[s.value] for s in selects}
            return Cube(select, self.edges, values)
Example #25
        def errors(e, _buffer):  # HANDLE ERRORS FROM extend()
            if e.cause.cause:
                not_possible = [f for f in listwrap(e.cause.cause) if any(h in f for h in HOPELESS)]
                still_have_hope = [f for f in listwrap(e.cause.cause) if all(h not in f for h in HOPELESS)]
            else:
                not_possible = [e]
                still_have_hope = []

            if still_have_hope:
                if "429 EsRejectedExecutionException[rejected execution (queue capacity" in e:
                    Log.note("waiting for ES to be free ({{num}} pending)", num=len(_buffer))
                elif "503 UnavailableShardsException" in e:
                    Log.note("waiting for ES to initialize shards ({{num}} pending)", num=len(_buffer))
                else:
                    Log.warning("Problem with sending to ES ({{num}} pending)", num=len(_buffer), cause=still_have_hope)
            elif not_possible:
                # THERE IS NOTHING WE CAN DO
                Log.warning("Not inserted, will not try again", cause=not_possible[0:10:])
                del _buffer[:]
Example #26
 def _convert_window(self, window):
     return Data(
         name=coalesce(window.name, window.value),
         value=window.value,
         edges=[self._convert_edge(e) for e in listwrap(window.edges)],
         sort=self._convert_sort(window.sort),
         aggregate=window.aggregate,
         range=self._convert_range(window.range),
         where=self._convert_where(window.where)
     )
Example #27
def groupby(data, keys=None, size=None, min_size=None, max_size=None, contiguous=False):
    """
    :param data:
    :param keys:
    :param size:
    :param min_size:
    :param max_size:
    :param contiguous: MAINTAIN THE ORDER OF THE DATA, STARTING THE NEW GROUP WHEN THE SELECTOR CHANGES
    :return: return list of (keys, values) PAIRS, WHERE
                 keys IS IN LEAF FORM (FOR USE WITH {"eq": terms} OPERATOR
                 values IS GENERATOR OF ALL VALUE THAT MATCH keys
        contiguous -
    """
    if isinstance(data, Container):
        return data.groupby(keys)

    if size != None or min_size != None or max_size != None:
        if size != None:
            max_size = size
        return groupby_min_max_size(data, min_size=min_size, max_size=max_size)

    try:
        keys = listwrap(keys)
        if not contiguous:
            from jx_python import jx
            data = jx.sort(data, keys)

        if not data:
            return Null

        if any(isinstance(k, Expression) for k in keys):
            Log.error("can not handle expressions")
        else:
            accessor = jx_expression_to_function(jx_expression({"tuple": keys}))  # CAN RETURN Null, WHICH DOES NOT PLAY WELL WITH __cmp__

        def _output():
            start = 0
            prev = accessor(data[0])
            for i, d in enumerate(data):
                curr = accessor(d)
                if curr != prev:
                    group = {}
                    for k, gg in zip(keys, prev):
                        group[k] = gg
                    yield Data(group), data[start:i:]
                    start = i
                    prev = curr
            group = {}
            for k, gg in zip(keys, prev):
                group[k] = gg
            yield Data(group), data[start::]

        return _output()
    except Exception as e:
        Log.error("Problem grouping", cause=e)
Example #28
    def _request_spot_instances(self, price, availability_zone_group, instance_type, kwargs):
        kwargs.kwargs = None

        # m3 INSTANCES ARE NOT ALLOWED PLACEMENT GROUP
        if instance_type.startswith("m3."):
            kwargs.placement_group = None

        kwargs.network_interfaces = NetworkInterfaceCollection(*(
            NetworkInterfaceSpecification(**i)
            for i in listwrap(kwargs.network_interfaces)
            if self.vpc_conn.get_all_subnets(subnet_ids=i.subnet_id, filters={"availabilityZone": availability_zone_group})
        ))

        if len(kwargs.network_interfaces) == 0:
            Log.error("No network interface specifications found for {{availability_zone}}!", availability_zone=kwargs.availability_zone_group)

        block_device_map = BlockDeviceMapping()

        # GENERIC BLOCK DEVICE MAPPING
        for dev, dev_settings in kwargs.block_device_map.items():
            block_device_map[dev] = BlockDeviceType(
                delete_on_termination=True,
                **dev_settings
            )

        kwargs.block_device_map = block_device_map

        # INCLUDE EPHEMERAL STORAGE IN BlockDeviceMapping
        num_ephemeral_volumes = ephemeral_storage[instance_type]["num"]
        for i in range(num_ephemeral_volumes):
            letter = convert.ascii2char(98 + i)  # START AT "b"
            kwargs.block_device_map["/dev/sd" + letter] = BlockDeviceType(
                ephemeral_name='ephemeral' + text_type(i),
                delete_on_termination=True
            )

        if kwargs.expiration:
            kwargs.valid_until = (Date.now() + Duration(kwargs.expiration)).format(ISO8601)
            kwargs.expiration = None

        # ATTACH NEW EBS VOLUMES
        for i, drive in enumerate(self.settings.utility[instance_type].drives):
            letter = convert.ascii2char(98 + i + num_ephemeral_volumes)
            device = drive.device = coalesce(drive.device, "/dev/sd" + letter)
            d = drive.copy()
            d.path = None  # path AND device PROPERTY IS NOT ALLOWED IN THE BlockDeviceType
            d.device = None
            if d.size:
                kwargs.block_device_map[device] = BlockDeviceType(
                    delete_on_termination=True,
                    **d
                )

        output = list(self.ec2_conn.request_spot_instances(**kwargs))
        return output
Example #29
    def __contains__(self, value):
        if isinstance(value, text_type):
            if self.template.find(value) >= 0 or self.message.find(value) >= 0:
                return True

        if self.type == value:
            return True
        for c in listwrap(self.cause):
            if value in c:
                return True
        return False
Example #30
    def __contains__(self, value):
        if is_text(value):
            if self.template.find(value) >= 0 or self.message.find(value) >= 0:
                return True

        if self.context == value:
            return True
        for c in listwrap(self.cause):
            if value in c:
                return True
        return False
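This is what powers the substring tests against whole exception trees elsewhere on this page (see the Elasticsearch error handling in Example #11); a small sketch:

err = Except.wrap(Exception("429 EsRejectedExecutionException[rejected execution]"))
if "429 EsRejectedExecutionException" in err:
    print("back off and retry")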
Example #31
def _get_schema_from_list(
    frum,  # The list
    table_name,  # Name of the table this list holds records for
    parent,  # parent path
    nested_path,  # each nested array, in reverse order
    columns,  # map from full name to column definition
    native_type_to_json_type,  # dict from storage type name to json type name
):
    for d in frum:
        row_type = python_type_to_json_type[d.__class__]

        if row_type != "object":
            # EXPECTING PRIMITIVE VALUE
            full_name = parent
            column = columns[full_name]
            if not column:
                es_type = d.__class__
                jx_type = native_type_to_json_type[es_type]

                column = Column(
                    name=concat_field(table_name, json_type_to_inserter_type[jx_type]),
                    es_column=full_name,
                    es_index=".",
                    es_type=es_type,
                    jx_type=jx_type,
                    last_updated=Date.now(),
                    nested_path=nested_path,
                    multi=1,
                )
                columns.add(column)
            else:
                column.es_type = _merge_python_type(column.es_type, d.__class__)
                column.jx_type = native_type_to_json_type[column.es_type]
        else:
            for name, value in d.items():
                full_name = concat_field(parent, name)
                column = columns[full_name]

                if is_container(value):  # GET TYPE OF MULTIVALUE
                    v = list(value)
                    if len(v) == 0:
                        es_type = none_type.__name__
                    elif len(v) == 1:
                        es_type = v[0].__class__.__name__
                    else:
                        es_type = reduce(
                            _merge_python_type, (vi.__class__.__name__ for vi in value)
                        )
                else:
                    es_type = value.__class__.__name__

                if not column:
                    jx_type = native_type_to_json_type[es_type]
                    column = Column(
                        name=concat_field(table_name, full_name),
                        es_column=full_name,
                        es_index=".",
                        es_type=es_type,
                        jx_type=jx_type,
                        last_updated=Date.now(),
                        nested_path=nested_path,
                        cardinality=1 if jx_type == OBJECT else None,
                        multi=1
                    )
                    columns.add(column)
                else:
                    column.es_type = _merge_python_type(column.es_type, es_type)
                    try:
                        column.jx_type = native_type_to_json_type[column.es_type]
                    except Exception as e:
                        raise e

                if es_type in {"object", "dict", "Mapping", "Data"}:
                    _get_schema_from_list(
                        [value],
                        table_name,
                        full_name,
                        nested_path,
                        columns,
                        native_type_to_json_type,
                    )
                elif es_type in {"list", "FlatList"}:
                    np = listwrap(nested_path)
                    newpath = unwraplist([join_field(split_field(np[0]) + [name])] + np)
                    _get_schema_from_list(
                        value, table_name, full_name, newpath, columns,
                        native_type_to_json_type,
                    )
Example #32
    def update(self, command):
        """
        EXPECTING command == {"set":term, "where":where}
        THE set CLAUSE IS A DICT MAPPING NAMES TO VALUES
        THE where CLAUSE IS AN ES FILTER
        """
        command = wrap(command)
        schema = self._es.get_properties()

        # GET IDS OF DOCUMENTS
        results = self._es.search({
            "fields": listwrap(schema._routing.path),
            "query": {
                "filtered": {
                    "filter": jx_expression(command.where).to_esfilter(Null)
                }
            },
            "size": 10000
        })

        # SCRIPT IS SAME FOR ALL (CAN ONLY HANDLE ASSIGNMENT TO CONSTANT)
        scripts = FlatList()
        for k, v in command.set.items():
            if not is_variable_name(k):
                Log.error("Only support simple paths for now")
            if isinstance(v, Mapping) and v.doc:
                scripts.append({"doc": v.doc})
            else:
                v = scrub(v)
                scripts.append({
                    "script":
                    "ctx._source." + k + " = " +
                    jx_expression(v).to_ruby(schema).script(schema)
                })

        if results.hits.hits:
            updates = []
            for h in results.hits.hits:
                for s in scripts:
                    updates.append({
                        "update": {
                            "_id":
                            h._id,
                            "_routing":
                            unwraplist(h.fields[literal_field(
                                schema._routing.path)])
                        }
                    })
                    updates.append(s)
            content = ("\n".join(convert.value2json(c)
                                 for c in updates) + "\n").encode('utf-8')
            response = self._es.cluster.post(
                self._es.path + "/_bulk",
                data=content,
                headers={"Content-Type": "application/json"},
                timeout=self.settings.timeout,
                params={
                    "wait_for_active_shards":
                    self.settings.wait_for_active_shards
                })
            if response.errors:
                Log.error("could not update: {{error}}",
                          error=[
                              e.error for i in response["items"]
                              for e in i.values() if e.status not in (200, 201)
                          ])
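The expected command shape, per the docstring; the index, field names, and values here are invented for illustration:

index.update({
    "set": {"build.status": "done"},      # NAMES TO CONSTANT VALUES
    "where": {"eq": {"build.id": 12345}}  # jx EXPRESSION, COMPILED TO AN ES FILTER
})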
Example #33
def to_sql(self, schema, not_null=False, boolean=False):
    if self.var == GUID:
        return wrap([{
            "name": ".",
            "sql": {
                "s": quoted_GUID
            },
            "nested_path": ROOT_PATH
        }])
    vars = schema[self.var]
    if not vars:
        # DOES NOT EXIST
        return wrap([{
            "name": ".",
            "sql": {
                "0": SQL_NULL
            },
            "nested_path": ROOT_PATH
        }])
    var_name = list(set(listwrap(vars).names.get('\\.')))
    if len(var_name) > 1:
        Log.error("do not know how to handle")
    var_name = var_name[0]
    cols = schema.leaves(self.var)
    acc = {}
    if boolean:
        for col in cols:
            cname = relative_field(col.names['.'], var_name)
            nested_path = col.nested_path[0]
            if col.type == OBJECT:
                value = SQL_TRUE
            elif col.type == BOOLEAN:
                value = quote_column(col.es_column)
            else:
                value = quote_column(col.es_column) + SQL_IS_NOT_NULL
            tempa = acc.setdefault(nested_path, {})
            tempb = tempa.setdefault(get_property_name(cname), {})
            tempb['b'] = value
    else:
        for col in cols:
            cname = relative_field(col.names['.'], var_name)
            if col.type == OBJECT:
                prefix = self.var + "."
                for cn, cs in schema.items():
                    if cn.startswith(prefix):
                        for child_col in cs:
                            tempa = acc.setdefault(child_col.nested_path[0],
                                                   {})
                            tempb = tempa.setdefault(get_property_name(cname),
                                                     {})
                            tempb[json_type_to_sql_type[
                                col.type]] = quote_column(child_col.es_column)
            else:
                nested_path = col.nested_path[0]
                tempa = acc.setdefault(nested_path, {})
                tempb = tempa.setdefault(get_property_name(cname), {})
                tempb[json_type_to_sql_type[col.type]] = quote_column(
                    col.es_column)

    return wrap([{
        "name": cname,
        "sql": types,
        "nested_path": nested_path
    } for nested_path, pairs in acc.items() for cname, types in pairs.items()])
Example #34
    def columns(self):
        return listwrap(self.select) + coalesce(self.edges, self.groupby)
Example #35
    def start(cls, settings=None):
        """
        RUN ME FIRST TO SETUP THE THREADED LOGGING
        http://victorlin.me/2012/08/good-logging-practice-in-python/

        log       - LIST OF PARAMETERS FOR LOGGER(S)
        trace     - SHOW MORE DETAILS IN EVERY LOG LINE (default False)
        cprofile  - True==ENABLE THE C-PROFILER THAT COMES WITH PYTHON (default False)
                    USE THE LONG FORM TO SET THE FILENAME {"enabled": True, "filename": "cprofile.tab"}
        profile   - True==ENABLE pyLibrary SIMPLE PROFILING (default False) (eg with Profiler("some description"):)
                    USE THE LONG FORM TO SET FILENAME {"enabled": True, "filename": "profile.tab"}
        constants - UPDATE MODULE CONSTANTS AT STARTUP (PRIMARILY INTENDED TO CHANGE DEBUG STATE)
        """
        global _Thread
        if not settings:
            return
        settings = wrap(settings)

        Log.stop()

        cls.settings = settings
        cls.trace = coalesce(settings.trace, False)
        if cls.trace:
            from mo_threads import Thread as _Thread
            _ = _Thread

        # ENABLE CPROFILE
        if settings.cprofile is False:
            settings.cprofile = {"enabled": False}
        elif settings.cprofile is True:
            if isinstance(settings.cprofile, bool):
                settings.cprofile = {
                    "enabled": True,
                    "filename": "cprofile.tab"
                }
        if settings.cprofile.enabled:
            from mo_threads import profiles
            profiles.enable_profilers(settings.cprofile.filename)

        if settings.profile is True or (is_data(settings.profile)
                                        and settings.profile.enabled):
            Log.error(
                "REMOVED 2018-09-02, Activedata revision 3f30ff46f5971776f8ba18"
            )
            # from mo_logs import profiles
            #
            # if isinstance(settings.profile, bool):
            #     profiles.ON = True
            #     settings.profile = {"enabled": True, "filename": "profile.tab"}
            #
            # if settings.profile.enabled:
            #     profiles.ON = True

        if settings.constants:
            constants.set(settings.constants)

        if settings.log:
            cls.logging_multi = StructuredLogger_usingMulti()
            for log in listwrap(settings.log):
                Log.add_log(Log.new_instance(log))

            from mo_logs.log_usingThread import StructuredLogger_usingThread
            cls.main_log = StructuredLogger_usingThread(cls.logging_multi)
Example #36
    def __data__(self):
        output = Data({k: getattr(self, k) for k in vars(self)})
        output.cause = unwraplist([c.__data__() for c in listwrap(output.cause)])
        return output
Example #37
def command_loop(local):
    STDOUT.write(b'{"out":"ok"}\n')
    DEBUG and Log.note("python process running")

    file = File
    while not please_stop:
        line = STDIN.readline()
        try:
            command = json2value(line.decode('utf8'))
            DEBUG and Log.note("got {{command}}", command=command)

            if "import" in command:
                dummy={}
                if is_text(command['import']):
                    exec ("from " + command['import'] + " import *", dummy, context)
                else:
                    exec ("from " + command['import']['from'] + " import " + ",".join(listwrap(command['import']['vars'])), dummy, context)
                STDOUT.write(DONE)
            elif "set" in command:
                for k, v in command.set.items():
                    context[k] = v
                STDOUT.write(DONE)
            elif "get" in command:
                STDOUT.write(value2json({"out": coalesce(local.get(command['get']), context.get(command['get']))}).encode('utf8'))
                STDOUT.write(b'\n')
            elif "stop" in command:
                STDOUT.write(DONE)
                please_stop.go()
            elif "exec" in command:
                if not is_text(command['exec']):
                    Log.error("exec expects only text")
                exec (command['exec'], context, local)
                STDOUT.write(DONE)
            else:
                for k, v in command.items():
                    if is_list(v):
                        exec ("_return = " + k + "(" + ",".join(map(value2json, v)) + ")", context, local)
                    else:
                        exec ("_return = " + k + "(" + ",".join(kk + "=" + value2json(vv) for kk, vv in v.items()) + ")", context, local)
                    STDOUT.write(value2json({"out": local['_return']}).encode('utf8'))
                    STDOUT.write(b'\n')
        except Exception as e:
            e = Except.wrap(e)
            STDOUT.write(value2json({"err": e}).encode('utf8'))
            STDOUT.write(b'\n')
        finally:
            STDOUT.flush()
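A hedged sketch of driving this loop from a parent process; worker.py is a hypothetical entry point that calls command_loop():

import json
import subprocess

proc = subprocess.Popen(
    ["python", "worker.py"],  # HYPOTHETICAL SCRIPT THAT CALLS command_loop()
    stdin=subprocess.PIPE,
    stdout=subprocess.PIPE
)
print(proc.stdout.readline())  # b'{"out":"ok"}\n' HANDSHAKE
proc.stdin.write(json.dumps({"set": {"x": 2}}).encode("utf8") + b"\n")
proc.stdin.flush()
print(proc.stdout.readline())  # b'{"out":"ok"}\n'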
Example #38
    def values(self):
        return listwrap(dict.values(self))
Example #39
def value_compare(left, right, ordering=1):
    """
    SORT VALUES, NULL IS THE LEAST VALUE
    :param left: LHS
    :param right: RHS
    :param ordering: -1 OR 1 TO AFFECT SORT ORDER
    :return: negative if left < right, zero if left == right, and strictly positive if left > right
    """

    try:
        if isinstance(left, list) or isinstance(right, list):
            if left == None:
                return ordering
            elif right == None:
                return -ordering

            left = listwrap(left)
            right = listwrap(right)
            for a, b in zip(left, right):
                c = value_compare(a, b) * ordering
                if c != 0:
                    return c

            if len(left) < len(right):
                return -ordering
            elif len(left) > len(right):
                return ordering
            else:
                return 0

        ltype = type(left)
        rtype = type(right)
        ltype_num = TYPE_ORDER.get(ltype, 10)
        rtype_num = TYPE_ORDER.get(rtype, 10)
        type_diff = ltype_num - rtype_num
        if type_diff != 0:
            return ordering if type_diff > 0 else -ordering

        if ltype_num == 9:
            return 0
        elif ltype is builtin_tuple:
            for a, b in zip(left, right):
                c = value_compare(a, b)
                if c != 0:
                    return c * ordering
            return 0
        elif ltype in (dict, Data):
            for k in sorted(set(left.keys()) | set(right.keys())):
                c = value_compare(left.get(k), right.get(k)) * ordering
                if c != 0:
                    return c
            return 0
        elif left > right:
            return ordering
        elif left < right:
            return -ordering
        else:
            return 0
    except Exception as e:
        Log.error("Can not compare values {{left}} to {{right}}",
                  left=left,
                  right=right,
                  cause=e)
Example #40
def es_aggsop(es, frum, query):
    query = query.copy()  # WE WILL MARK UP THIS QUERY
    schema = frum.schema
    select = listwrap(query.select)

    es_query = Data()
    new_select = Data()  # MAP FROM canonical_name (USED FOR NAMES IN QUERY) TO SELECT MAPPING
    formula = []
    for s in select:
        if s.aggregate == "count" and isinstance(s.value, Variable) and s.value.var == ".":
            if schema.query_path == ".":
                s.pull = jx_expression_to_function("doc_count")
            else:
                s.pull = jx_expression_to_function({"coalesce": ["_nested.doc_count", "doc_count", 0]})
        elif isinstance(s.value, Variable):
            if s.aggregate == "count":
                new_select["count_"+literal_field(s.value.var)] += [s]
            else:
                new_select[literal_field(s.value.var)] += [s]
        elif s.aggregate:
            formula.append(s)

    for canonical_name, many in new_select.items():
        for s in many:
            columns = frum.schema.values(s.value.var)

            if s.aggregate == "count":
                canonical_names = []
                for column in columns:
                    cn = literal_field(column.es_column + "_count")
                    if column.jx_type == EXISTS:
                        canonical_names.append(cn + ".doc_count")
                        es_query.aggs[cn].filter.range = {column.es_column: {"gt": 0}}
                    else:
                        canonical_names.append(cn + ".value")
                    es_query.aggs[cn].value_count.field = column.es_column
                if len(canonical_names) == 1:
                    s.pull = jx_expression_to_function(canonical_names[0])
                else:
                    s.pull = jx_expression_to_function({"add": canonical_names})
            elif s.aggregate == "median":
                if len(columns) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")
                # ES USES DIFFERENT METHOD FOR PERCENTILES
                key = literal_field(canonical_name + " percentile")

                es_query.aggs[key].percentiles.field = columns[0].es_column
                es_query.aggs[key].percentiles.percents += [50]
                s.pull = jx_expression_to_function(key + ".values.50\\.0")
            elif s.aggregate == "percentile":
                if len(columns) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")
                # ES USES DIFFERENT METHOD FOR PERCENTILES
                key = literal_field(canonical_name + " percentile")
                if isinstance(s.percentile, text_type) or s.percentile < 0 or 1 < s.percentile:
                    Log.error("Expecting percentile to be a float from 0.0 to 1.0")
                percent = Math.round(s.percentile * 100, decimal=6)

                es_query.aggs[key].percentiles.field = columns[0].es_column
                es_query.aggs[key].percentiles.percents += [percent]
                es_query.aggs[key].percentiles.compression = 2
                s.pull = jx_expression_to_function(key + ".values." + literal_field(text_type(percent)))
            elif s.aggregate == "cardinality":
                canonical_names = []
                for column in columns:
                    cn = literal_field(column.es_column + "_cardinality")
                    canonical_names.append(cn)
                    es_query.aggs[cn].cardinality.field = column.es_column
                if len(columns) == 1:
                    s.pull = jx_expression_to_function(canonical_names[0] + ".value")
                else:
                    s.pull = jx_expression_to_function({"add": [cn + ".value" for cn in canonical_names], "default": 0})
            elif s.aggregate == "stats":
                if len(columns) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")
                # REGULAR STATS
                stats_name = literal_field(canonical_name)
                es_query.aggs[stats_name].extended_stats.field = columns[0].es_column

                # GET MEDIAN TOO!
                median_name = literal_field(canonical_name + "_percentile")
                es_query.aggs[median_name].percentiles.field = columns[0].es_column
                es_query.aggs[median_name].percentiles.percents += [50]

                s.pull = get_pull_stats(stats_name, median_name)
            elif s.aggregate == "union":
                pulls = []
                for column in columns:
                    stats_name = encode_property(column.es_column)

                    if column.nested_path[0] == ".":
                        es_query.aggs[stats_name] = {"terms": {
                            "field": column.es_column,
                            "size": Math.min(s.limit, MAX_LIMIT)
                        }}
                        pulls.append(get_bucket_keys(stats_name))

                    else:
                        es_query.aggs[stats_name] = {
                            "nested": {"path": column.nested_path[0]},
                            "aggs": {"_nested": {"terms": {
                                "field": column.es_column,
                                "size": Math.min(s.limit, MAX_LIMIT)
                            }}}
                        }
                        pulls.append(get_bucket_keys(stats_name+"._nested"))
                if len(pulls) == 0:
                    s.pull = NULL
                elif len(pulls) == 1:
                    s.pull = pulls[0]
                else:
                    s.pull = lambda row: UNION(
                        p(row)
                        for p in pulls
                    )
            else:
                if len(columns) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")

                # PULL VALUE OUT OF THE stats AGGREGATE
                es_query.aggs[literal_field(canonical_name)].extended_stats.field = columns[0].es_column
                s.pull = jx_expression_to_function({"coalesce": [literal_field(canonical_name) + "." + aggregates[s.aggregate], s.default]})

    for i, s in enumerate(formula):
        canonical_name = literal_field(s.name)

        if isinstance(s.value, TupleOp):
            if s.aggregate == "count":
                # TUPLES ALWAYS EXIST, SO COUNTING THEM IS EASY
                s.pull = "doc_count"
            else:
                Log.error("{{agg}} is not a supported aggregate over a tuple", agg=s.aggregate)
        elif s.aggregate == "count":
            es_query.aggs[literal_field(canonical_name)].value_count.script = s.value.partial_eval().to_es_script(schema).script(schema)
            s.pull = jx_expression_to_function(literal_field(canonical_name) + ".value")
        elif s.aggregate == "median":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")

            es_query.aggs[key].percentiles.script = s.value.to_es_script(schema).script(schema)
            es_query.aggs[key].percentiles.percents += [50]
            s.pull = jx_expression_to_function(key + ".values.50\\.0")
        elif s.aggregate == "percentile":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")
            percent = Math.round(s.percentile * 100, decimal=6)

            es_query.aggs[key].percentiles.script = s.value.to_es_script(schema).script(schema)
            es_query.aggs[key].percentiles.percents += [percent]
            s.pull = jx_expression_to_function(key + ".values." + literal_field(text_type(percent)))
        elif s.aggregate == "cardinality":
            # ES USES DIFFERENT METHOD FOR CARDINALITY
            key = canonical_name + " cardinality"

            es_query.aggs[key].cardinality.script = s.value.to_es_script(schema).script(schema)
            s.pull = jx_expression_to_function(key + ".value")
        elif s.aggregate == "stats":
            # REGULAR STATS
            stats_name = literal_field(canonical_name)
            es_query.aggs[stats_name].extended_stats.script = s.value.to_es_script(schema).script(schema)

            # GET MEDIAN TOO!
            median_name = literal_field(canonical_name + " percentile")
            es_query.aggs[median_name].percentiles.script = s.value.to_es_script(schema).script(schema)
            es_query.aggs[median_name].percentiles.percents += [50]

            s.pull = get_pull_stats(stats_name, median_name)
        elif s.aggregate=="union":
            # USE TERMS AGGREGATE TO SIMULATE union
            stats_name = literal_field(canonical_name)
            es_query.aggs[stats_name].terms.script_field = s.value.to_es_script(schema).script(schema)
            s.pull = jx_expression_to_function(stats_name + ".buckets.key")
        else:
            # PULL VALUE OUT OF THE stats AGGREGATE
            s.pull = jx_expression_to_function(canonical_name + "." + aggregates[s.aggregate])
            es_query.aggs[canonical_name].extended_stats.script = s.value.to_es_script(schema).script(schema)

    decoders = get_decoders_by_depth(query)
    start = 0

    #<TERRIBLE SECTION> THIS IS WHERE WE WEAVE THE where CLAUSE WITH nested
    split_where = split_expression_by_depth(query.where, schema=frum.schema)

    if len(split_field(frum.name)) > 1:
        if any(split_where[2::]):
            Log.error("Where clause is too deep")

        for d in decoders[1]:
            es_query = d.append_query(es_query, start)
            start += d.num_columns

        if split_where[1]:
            #TODO: INCLUDE FILTERS ON EDGES
            filter_ = AndOp("and", split_where[1]).to_esfilter(schema)
            es_query = Data(
                aggs={"_filter": set_default({"filter": filter_}, es_query)}
            )

        es_query = wrap({
            "aggs": {"_nested": set_default(
                {"nested": {"path": schema.query_path[0]}},
                es_query
            )}
        })
    else:
        if any(split_where[1::]):
            Log.error("Where clause is too deep")

    if decoders:
        for d in jx.reverse(decoders[0]):
            es_query = d.append_query(es_query, start)
            start += d.num_columns

    if split_where[0]:
        #TODO: INCLUDE FILTERS ON EDGES
        filter = AndOp("and", split_where[0]).to_esfilter(schema)
        es_query = Data(
            aggs={"_filter": set_default({"filter": filter}, es_query)}
        )
    # </TERRIBLE SECTION>

    if not es_query:
        es_query = wrap({"query": {"match_all": {}}})

    es_query.size = 0

    with Timer("ES query time") as es_duration:
        result = es_post(es, es_query, query.limit)

    try:
        format_time = Timer("formatting")
        with format_time:
            decoders = [d for ds in decoders for d in ds]
            result.aggregations.doc_count = coalesce(result.aggregations.doc_count, result.hits.total)  # IT APPEARS THE OLD doc_count IS GONE

            formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[query.format]
            if query.edges:
                output = formatter(decoders, result.aggregations, start, query, select)
            elif query.groupby:
                output = groupby_formatter(decoders, result.aggregations, start, query, select)
            else:
                output = aggop_formatter(decoders, result.aggregations, start, query, select)

        output.meta.timing.formatting = format_time.duration
        output.meta.timing.es_search = es_duration.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        if query.format not in format_dispatch:
            Log.error("Format {{format|quote}} not supported yet", format=query.format, cause=e)
        Log.error("Some problem", cause=e)
Exemple #41
0
def es_setop(es, query):
    schema = query.frum.schema

    es_query, filters = es_query_template(schema.query_path[0])
    nested_filter = None
    set_default(filters[0], query.where.partial_eval().to_es14_filter(schema))
    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)
    es_query.fields = FlatList()

    selects = wrap([s.copy() for s in listwrap(query.select)])
    new_select = FlatList()
    schema = query.frum.schema
    # columns = schema.columns
    # nested_columns = set(c.name for c in columns if c.nested_path[0] != ".")

    es_query.sort = jx_sort_to_es_sort(query.sort, schema)

    put_index = 0
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if isinstance(select.value, LeavesOp) and isinstance(
                select.value.term, Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                full_name = concat_field(
                    select.name, relative_field(untype_path(c.name), term.var))
                if c.jx_type == NESTED:
                    es_query.fields = ["_source"]
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {
                            "name": literal_field(full_name),
                            "index": put_index,
                            "child": "."
                        },
                        "pull": get_pull_source(c.es_column)
                    })
                    put_index += 1
                elif c.nested_path[0] != ".":
                    pass  # THE NESTED PARENT WILL CAPTURE THIS
                else:
                    es_query.fields += [c.es_column]
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {
                            "name": literal_field(full_name),
                            "index": put_index,
                            "child": "."
                        }
                    })
                    put_index += 1
        elif isinstance(select.value, Variable):
            s_column = select.value.var
            # LEAVES OF OBJECT
            leaves = schema.leaves(s_column)
            nested_selects = {}
            if leaves:
                if s_column == "." or any(c.jx_type == NESTED for c in leaves):
                    # PULL WHOLE NESTED ARRAYS
                    es_query.fields = ["_source"]
                    for c in leaves:
                        if len(c.nested_path) == 1:
                            jx_name = untype_path(c.name)
                            new_select.append({
                                "name":
                                select.name,
                                "value":
                                Variable(c.es_column),
                                "put": {
                                    "name": select.name,
                                    "index": put_index,
                                    "child": relative_field(jx_name, s_column)
                                },
                                "pull":
                                get_pull_source(c.es_column)
                            })
                else:
                    # PULL ONLY WHAT'S NEEDED
                    for c in leaves:
                        if len(c.nested_path) == 1:
                            jx_name = untype_path(c.name)
                            if c.jx_type == NESTED:
                                es_query.fields = ["_source"]
                                new_select.append({
                                    "name":
                                    select.name,
                                    "value":
                                    Variable(c.es_column),
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child":
                                        relative_field(jx_name, s_column)
                                    },
                                    "pull":
                                    get_pull_source(c.es_column)
                                })

                            else:
                                es_query.fields += [c.es_column]
                                new_select.append({
                                    "name":
                                    select.name,
                                    "value":
                                    Variable(c.es_column),
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child":
                                        relative_field(jx_name, s_column)
                                    }
                                })
                        else:
                            if not nested_filter:
                                where = filters[0].copy()
                                nested_filter = [where]
                                for k in filters[0].keys():
                                    filters[0][k] = None
                                set_default(
                                    filters[0],
                                    es_and([where, es_or(nested_filter)]))

                            nested_path = c.nested_path[0]
                            if nested_path not in nested_selects:
                                where = nested_selects[nested_path] = Data()
                                nested_filter += [where]
                                where.nested.path = nested_path
                                where.nested.query.match_all = {}
                                where.nested.inner_hits._source = False
                                where.nested.inner_hits.fields += [c.es_column]

                                child = relative_field(
                                    untype_path(
                                        relative_field(c.name,
                                                       schema.query_path[0])),
                                    s_column)
                                pull = accumulate_nested_doc(
                                    nested_path,
                                    Variable(
                                        relative_field(
                                            s_column,
                                            unnest_path(nested_path))))
                                new_select.append({
                                    "name": select.name,
                                    "value": select.value,
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": child
                                    },
                                    "pull": pull
                                })
                            else:
                                nested_selects[
                                    nested_path].nested.inner_hits.fields += [
                                        c.es_column
                                    ]
            else:
                new_select.append({
                    "name": select.name,
                    "value": Variable("$dummy"),
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    }
                })
            put_index += 1
        else:
            painless = select.value.partial_eval().to_es14_script(schema)
            es_query.script_fields[literal_field(select.name)] = es_script(
                painless.script(schema))
            new_select.append({
                "name":
                select.name,
                "pull":
                jx_expression_to_function("fields." +
                                          literal_field(select.name)),
                "put": {
                    "name": select.name,
                    "index": put_index,
                    "child": "."
                }
            })
            put_index += 1

    for n in new_select:
        if n.pull:
            continue
        elif isinstance(n.value, Variable):
            if es_query.fields[0] == "_source":
                es_query.fields = ["_source"]
                n.pull = get_pull_source(n.value.var)
            if n.value.var == "_id":
                n.pull = jx_expression_to_function("_id")
            else:
                n.pull = jx_expression_to_function(
                    concat_field("fields", literal_field(n.value.var)))
        else:
            Log.error("Do not know what to do")

    with Timer("call to ES", silent=not DEBUG) as call_timer:
        data = es_post(es, es_query, query.limit)

    T = data.hits.hits

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
    def get_revision(self,
                     revision,
                     locale=None,
                     get_diff=False,
                     get_moves=True):
        """
        EXPECTING INCOMPLETE revision OBJECT
        RETURNS revision
        """
        rev = revision.changeset.id
        if not rev:
            return Null
        elif rev == "None":
            return Null
        elif revision.branch.name == None:
            return Null
        locale = coalesce(locale, revision.branch.locale, DEFAULT_LOCALE)
        output = self._get_from_elasticsearch(revision,
                                              locale=locale,
                                              get_diff=get_diff)
        if output:
            if not get_diff:  # DIFF IS BIG, DO NOT KEEP IT IF NOT NEEDED
                output.changeset.diff = None
            if not get_moves:
                output.changeset.moves = None
            DEBUG and Log.note(
                "Got hg ({{branch}}, {{locale}}, {{revision}}) from ES",
                branch=output.branch.name,
                locale=locale,
                revision=output.changeset.id)
            if output.push.date >= Date.now() - MAX_TODO_AGE:
                self.todo.add((output.branch, listwrap(output.parents)))
                self.todo.add((output.branch, listwrap(output.children)))
            if output.push.date:
                return output

        # RATE LIMIT CALLS TO HG (CACHE MISSES)
        next_cache_miss = self.last_cache_miss + (
            Random.float(WAIT_AFTER_CACHE_MISS.seconds * 2) * SECOND)
        self.last_cache_miss = Date.now()
        if next_cache_miss > self.last_cache_miss:
            Log.note("delaying next hg call for {{seconds|round(decimal=1)}}",
                     seconds=next_cache_miss - self.last_cache_miss)
            Till(till=next_cache_miss.unix).wait()

        found_revision = copy(revision)
        if isinstance(found_revision.branch, (text_type, binary_type)):
            lower_name = found_revision.branch.lower()
        else:
            lower_name = found_revision.branch.name.lower()

        if not lower_name:
            Log.error("Defective revision? {{rev|json}}",
                      rev=found_revision.branch)

        b = found_revision.branch = self.branches[(lower_name, locale)]
        if not b:
            b = found_revision.branch = self.branches[(lower_name,
                                                       DEFAULT_LOCALE)]
            if not b:
                Log.warning("can not find branch ({{branch}}, {{locale}})",
                            branch=lower_name,
                            locale=locale)
                return Null

        if Date.now() - Date(b.etl.timestamp) > _OLD_BRANCH:
            self.branches = _hg_branches.get_branches(kwargs=self.settings)

        push = self._get_push(found_revision.branch,
                              found_revision.changeset.id)

        url1 = found_revision.branch.url.rstrip(
            "/") + "/json-info?node=" + found_revision.changeset.id[0:12]
        url2 = found_revision.branch.url.rstrip(
            "/") + "/json-rev/" + found_revision.changeset.id[0:12]
        with Explanation("get revision from {{url}}", url=url1, debug=DEBUG):
            raw_rev2 = Null
            try:
                raw_rev1 = self._get_raw_json_info(url1, found_revision.branch)
                raw_rev2 = self._get_raw_json_rev(url2, found_revision.branch)
            except Exception as e:
                if "Hg denies it exists" in e:
                    raw_rev1 = Data(node=revision.changeset.id)
                else:
                    raise e
            output = self._normalize_revision(set_default(raw_rev1, raw_rev2),
                                              found_revision, push, get_diff,
                                              get_moves)
            if output.push.date >= Date.now() - MAX_TODO_AGE:
                self.todo.add((output.branch, listwrap(output.parents)))
                self.todo.add((output.branch, listwrap(output.children)))

            if not get_diff:  # DIFF IS BIG, DO NOT KEEP IT IF NOT NEEDED
                output.changeset.diff = None
            if not get_moves:
                output.changeset.moves = None
            return output
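The cache-miss throttle above delays the next hg call by a random interval of up to twice WAIT_AFTER_CACHE_MISS. A standalone sketch of the same idea, using plain time/random in place of the mo-times helpers (the constant value is an assumption):

import random
import time

WAIT_AFTER_CACHE_MISS = 1.0  # SECONDS (ASSUMED VALUE)
last_cache_miss = 0.0

def throttle_cache_miss():
    # DELAY BY A RANDOM AMOUNT, UP TO 2x THE NOMINAL WAIT, SINCE THE LAST MISS
    global last_cache_miss
    next_allowed = last_cache_miss + random.uniform(0, WAIT_AFTER_CACHE_MISS * 2)
    last_cache_miss = time.time()
    if next_allowed > last_cache_miss:
        time.sleep(next_allowed - last_cache_miss)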
Exemple #43
0
    def create_table(
        self,
        table,
        schema=None,
        typed=True,
        read_only=True,  # TO PREVENT ACCIDENTAL WRITING
        sharded=False,
        partition=Null,  # PARTITION RULES
        cluster=None,  # TUPLE OF FIELDS TO SORT DATA
        top_level_fields=Null,
        kwargs=None,
    ):
        if kwargs.lookup != None or kwargs.flake != None:
            Log.error("expecting schema, not lookup")
        full_name = self.full_name + escape_name(table)
        if not schema:
            # WE MUST HAVE SOMETHING
            if typed:
                schema = copy(DEFAULT_TYPED_SCHEMA)
            else:
                schema = copy(DEFAULT_SCHEMA)

        flake = Snowflake(text(full_name),
                          top_level_fields,
                          partition,
                          schema=schema)

        if read_only:
            Log.error("Can not create a table for read-only use")

        if sharded:
            shard_name = escape_name(table + "_" +
                                     "".join(Random.sample(ALLOWED, 20)))
            shard_api_name = self.full_name + shard_name
            _shard = bigquery.Table(text(shard_api_name),
                                    schema=flake.to_bq_schema())
            _shard.time_partitioning = unwrap(
                flake._partition.bq_time_partitioning)
            _shard.clustering_fields = [
                c.es_column for f in listwrap(cluster)
                for c in [first(flake.leaves(f))] if c
            ] or None
            self.shard = self.client.create_table(_shard)
            self.create_view(full_name, shard_api_name)
        else:
            _table = bigquery.Table(text(full_name),
                                    schema=flake.to_bq_schema())
            _table.time_partitioning = unwrap(
                flake._partition.bq_time_partitioning)
            _table.clustering_fields = [
                l.es_column for f in listwrap(cluster) for l in flake.leaves(f)
            ] or None
            self.client.create_table(_table)
            DEBUG and Log.note("created table {{table}}",
                               table=_table.table_id)

        return Table(
            table=table,
            typed=typed,
            read_only=read_only,
            sharded=sharded,
            partition=partition,
            top_level_fields=top_level_fields,
            kwargs=kwargs,
            container=self,
        )
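When sharded=True, the method above writes to a randomly suffixed shard table and exposes the requested name as a view over it. A sketch of the naming scheme, assuming ALLOWED is an alphanumeric alphabet:

import string
from random import choice

ALLOWED = string.ascii_lowercase + string.digits  # ASSUMED ALPHABET

def shard_name(table):
    # THE STABLE TABLE NAME BECOMES A VIEW OVER THIS RANDOMLY SUFFIXED SHARD
    return table + "_" + "".join(choice(ALLOWED) for _ in range(20))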
Exemple #44
0
def es_setop(es, mvel, query):
    FromES = es09.util.build_es_query(query)
    select = listwrap(query.select)

    isDeep = len(split_field(query.frum.name)) > 1  # LOOKING INTO NESTED WILL REQUIRE A SCRIPT
    isComplex = OR([
        s.value == None and s.aggregate not in ("count", "none")
        for s in select
    ])  # CONVERTING esfilter DEFINED PARTS WILL REQUIRE SCRIPT

    if not isDeep and not isComplex:
        if (len(select) == 1 and not select[0].value) or select[0].value == "*":
            FromES = wrap({
                "query": {
                    "filtered": {
                        "query": {
                            "match_all": {}
                        },
                        "filter":
                        simplify_esfilter(
                            jx_expression(query.where).to_esfilter())
                    }
                },
                "sort": query.sort,
                "size": 1
            })
        elif all(isinstance(v, Variable) for v in select.value):
            FromES = wrap({
                "query": {
                    "filtered": {
                        "query": {
                            "match_all": {}
                        },
                        "filter": simplify_esfilter(query.where.to_esfilter())
                    }
                },
                "fields": select.value,
                "sort": query.sort,
                "size": coalesce(query.limit, 200000)
            })
    elif not isDeep:
        simple_query = query.copy()
        simple_query.where = TRUE_FILTER  # THE FACET FILTER IS FASTER
        FromES.facets.mvel = {
            "terms": {
                "script_field": mvel.code(simple_query),
                "size": coalesce(simple_query.limit, 200000)
            },
            "facet_filter":
            simplify_esfilter(jx_expression(query.where).to_esfilter())
        }
    else:
        FromES.facets.mvel = {
            "terms": {
                "script_field": mvel.code(query),
                "size": coalesce(query.limit, 200000)
            },
            "facet_filter":
            simplify_esfilter(jx_expression(query.where).to_esfilter())
        }

    data = es09.util.post(es, FromES, query.limit)

    if (len(select) == 1 and not select[0].value) or select[0].value == "*":
        # SPECIAL CASE FOR SINGLE COUNT
        cube = wrap(data).hits.hits._source
    elif isinstance(select[0].value, Variable):
        # SPECIAL CASE FOR SINGLE TERM
        cube = wrap(data).hits.hits.fields
    else:
        data_list = unpack_terms(data.facets.mvel, select)
        if not data_list:
            cube = Cube(select, [], {s.name: Matrix.wrap([]) for s in select})
        else:
            output = list(zip(*data_list))  # MATERIALIZE SO IT CAN BE INDEXED (PY3-SAFE)
            cube = Cube(
                select, [],
                {s.name: Matrix(list=output[i])
                 for i, s in enumerate(select)})

    return Data(meta={"esquery": FromES}, data=cube)
Exemple #45
0
 def new_instance(desc):
     return Except(desc.type, desc.template, desc.params,
                   [Except.new_instance(c) for c in listwrap(desc.cause)],
                   desc.trace)
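new_instance rebuilds an exception chain recursively: each serialized cause becomes an Except of its own. A round-trip sketch, assuming mo-dots wrap is in scope and using a hypothetical serialized form:

# HYPOTHETICAL SERIALIZED EXCEPTION, AS IT MIGHT ARRIVE FROM A LOG STREAM
desc = wrap({
    "type": "ERROR",
    "template": "Can not reach {{url}}",
    "params": {"url": "http://example.com"},
    "cause": [{"type": "ERROR", "template": "timeout", "params": {}, "cause": [], "trace": []}],
    "trace": []
})
e = Except.new_instance(desc)  # e.cause IS A LIST OF Except INSTANCES, REBUILT RECURSIVELY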
Exemple #46
0
    def _setop(self, query):
        """
        NO AGGREGATION, SIMPLE LIST COMPREHENSION
        """
        if isinstance(query.select, list):
            # RETURN BORING RESULT SET
            selects = FlatList()
            for s in listwrap(query.select):
                if isinstance(s.value, Mapping):
                    for k, v in s.value.items():
                        selects.append(v + " AS " + self.db.quote_column(s.name + "." + k))
                elif isinstance(s.value, list):
                    for i, ss in enumerate(s.value):
                        selects.append(ss + " AS " + self.db.quote_column(s.name + "," + str(i)))
                else:
                    selects.append(s.value + " AS " + self.db.quote_column(s.name))

            sql = expand_template("""
                SELECT
                    {{selects}}
                FROM
                    {{table}}
                {{where}}
                {{sort}}
                {{limit}}
            """, {
                "selects": SQL(",\n".join(selects)),
                "table": self._subquery(query["from"])[0],
                "where": self._where2sql(query.where),
                "limit": self._limit2sql(query.limit),
                "sort": self._sort2sql(query.sort)
            })

            def post_process(sql):
                result = self.db.query(sql)
                for s in listwrap(query.select):
                    if isinstance(s.value, Mapping):
                        for r in result:
                            r[s.name] = {}
                            for k, v in s.value.items():
                                r[s.name][k] = r[s.name + "." + k]
                                r[s.name + "." + k] = None

                    if isinstance(s.value, list):
                        # REWRITE AS TUPLE
                        for r in result:
                            r[s.name] = tuple(r[s.name + "," + str(i)] for i, ss in enumerate(s.value))
                            for i, ss in enumerate(s.value):
                                r[s.name + "," + str(i)] = None

                expand_json(result)
                return result

            return sql, post_process  # RETURN BORING RESULT SET
        else:
            # RETURN LIST OF VALUES
            if query.select.value == ".":
                select = "*"
            else:
                name = query.select.name
                select = query.select.value + " AS " + self.db.quote_column(name)

            sql = expand_template("""
                SELECT
                    {{selects}}
                FROM
                    {{table}}
                {{where}}
                {{sort}}
                {{limit}}
            """, {
                "selects": SQL(select),
                "table": self._subquery(query["from"])[0],
                "where": self._where2sql(query.where),
                "limit": self._limit2sql(query.limit),
                "sort": self._sort2sql(query.sort)
            })

            if query.select.value == ".":
                def post(sql):
                    result = self.db.query(sql)
                    expand_json(result)
                    return result

                return sql, post
            else:
                return sql, lambda sql: [r[name] for r in self.db.query(sql)]  # RETURNING LIST OF VALUES
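The Mapping branch above flattens each sub-field into a "name.k" column, and post_process folds the flat columns back into a nested object. A self-contained sketch of that fold, assuming plain dict rows:

def fold_row(row, name, keys):
    # MOVE FLAT "name.k" COLUMNS INTO A NESTED row[name] OBJECT (SKETCH)
    row[name] = {k: row.pop(name + "." + k, None) for k in keys}
    return row

fold_row({"a.x": 1, "a.y": 2}, "a", ["x", "y"])  # -> {"a": {"x": 1, "y": 2}}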
Exemple #47
0
def cube_aggs(frum, query):
    select = listwrap(query.select)

    #MATCH EDGES IN QUERY TO ONES IN frum
    for e in query.edges:
        for fs in frum.select:
            if fs.name == e.value:
                Log.error("Not implemented yet")
        if isinstance(e.domain, DefaultDomain):
            # DEFAULT DOMAINS CAN EASILY BE LOOKED UP FROM frum
            for fe in frum.edges:
                if fe.name == e.value:
                    e.domain = SimpleSetDomain(**fe.domain.__data__())
                    e.value = e.value + "." + fe.domain.key
                    break
        else:
            for fe in frum.edges:
                if fe.name == e.value:
                    e.value = e.value + "." + fe.domain.key
                    break

    result = {
        s.name: Matrix(dims=[
            len(e.domain.partitions) + (1 if e.allowNulls else 0)
            for e in query.edges
        ],
                       zeros=s.default)
        for s in select
    }
    where = jx_expression_to_function(query.where)
    for d in filter(where, frum.values()):
        coord = []  # LIST OF MATCHING COORDINATE FAMILIES, USUALLY ONLY ONE PER FAMILY, BUT JOINS WITH EDGES CAN CAUSE MORE
        for e in query.edges:
            matches = get_matches(e, d)
            coord.append(matches)
            if len(matches) == 1 and d[e.name] == None:
                d[e.name] = e.domain.partitions[matches[0]]

        for s in select:
            mat = result[s.name]
            agg = s.aggregate
            var = s.value
            expr = jx_expression_to_function(var)
            val = expr(d)
            if agg == "count":
                if var == "." or var == None:
                    for c in itertools.product(*coord):
                        mat[c] += 1
                    continue

                if val != None:
                    for c in itertools.product(*coord):
                        mat[c] += 1
            else:
                for c in itertools.product(*coord):
                    acc = mat[c]
                    if acc == None:
                        acc = windows.name2accumulator.get(agg)
                        if acc == None:
                            Log.error(
                                "select aggregate {{agg}} is not recognized",
                                agg=agg)
                        acc = acc(**s)
                        mat[c] = acc
                    acc.add(val)

    for s in select:
        if s.aggregate == "count":
            continue
        m = result[s.name]
        for c, var in m.items():
            if var != None:
                m[c] = var.end()

    from pyLibrary.queries.containers.cube import Cube

    return Cube(select, query.edges, result)
Exemple #48
0
def _normalize_edges(edges, limit, schema=None):
    return list_to_data([
        n for ie, e in enumerate(listwrap(edges))
        for n in _normalize_edge(e, ie, limit=limit, schema=schema)
    ])
Exemple #49
0
def is_aggsop(es, query):
    if query.edges or query.groupby or any(
            a != None and a != "none"
            for a in listwrap(query.select).aggregate):
        return True
    return False
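The test above leans on FlatList broadcasting attribute access over its members, so a single select and a list of selects are handled uniformly. A sketch, assuming mo-dots semantics:

# ASSUMING mo-dots SEMANTICS: listwrap(None) == [], listwrap(x) == [x], AND
# FlatList BROADCASTS ATTRIBUTE ACCESS OVER ITS MEMBERS
selects = listwrap(wrap([{"value": "a", "aggregate": "sum"}, {"value": "b"}]))
aggs = selects.aggregate  # ROUGHLY ["sum", None]; ANY VALUE BESIDES None/"none" TRIGGERS THE AGG PATH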
Exemple #50
0
def value_compare(left, right, ordering=1):
    """
    SORT VALUES, NULL IS THE LEAST VALUE
    :param left: LHS
    :param right: RHS
    :param ordering: (-1, 0, 1) TO AFFECT SORT ORDER
    :return: The return value is negative if x < y, zero if x == y and strictly positive if x > y.
    """

    try:
        ltype = left.__class__
        rtype = right.__class__

        if ltype in list_types or rtype in list_types:
            if left == None:
                return ordering
            elif right == None:
                return -ordering

            left = listwrap(left)
            right = listwrap(right)
            for a, b in zip(left, right):
                c = value_compare(a, b) * ordering
                if c != 0:
                    return c

            if len(left) < len(right):
                return -ordering
            elif len(left) > len(right):
                return ordering
            else:
                return 0

        if ltype is float and isnan(left):
            left = None
            ltype = none_type
        if rtype is float and isnan(right):
            right = None
            rtype = none_type

        null_order = ordering * 10
        ltype_num = TYPE_ORDER.get(ltype, null_order)
        rtype_num = TYPE_ORDER.get(rtype, null_order)

        type_diff = ltype_num - rtype_num
        if type_diff != 0:
            return ordering if type_diff > 0 else -ordering

        if ltype_num == null_order:
            return 0
        elif ltype is builtin_tuple:
            for a, b in zip(left, right):
                c = value_compare(a, b)
                if c != 0:
                    return c * ordering
            return 0
        elif ltype in data_types:
            for k in sorted(set(left.keys()) | set(right.keys())):
                c = value_compare(left.get(k), right.get(k)) * ordering
                if c != 0:
                    return c
            return 0
        elif left > right:
            return ordering
        elif left < right:
            return -ordering
        else:
            return 0
    except Exception as e:
        Log.error("Can not compare values {{left}} to {{right}}",
                  left=left,
                  right=right,
                  cause=e)
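Because value_compare is a three-way comparator, it plugs directly into sorting via functools.cmp_to_key; nulls sort least, and NaN is treated as null. A usage sketch:

from functools import cmp_to_key

data = [3, None, 1, [1, 2], "a"]
ordered = sorted(data, key=cmp_to_key(value_compare))
# None SORTS LEAST, THEN VALUES GROUP BY TYPE_ORDER; LISTS COMPARE ELEMENT-WISE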
Exemple #51
0
    def _set_op(self, query, frum):
        # GET LIST OF COLUMNS
        base_name, primary_nested_path = tail_field(frum)
        vars_ = UNION([
            v.var for select in listwrap(query.select)
            for v in select.value.vars()
        ])
        schema = self.sf.tables[primary_nested_path].schema

        active_columns = {".": set()}
        for v in vars_:
            for c in schema.leaves(v):
                nest = c.nested_path[0]
                active_columns.setdefault(nest, set()).add(c)

        # ANY VARS MENTIONED WITH NO COLUMNS?
        for v in vars_:
            if not any(startswith_field(cname, v) for cname in schema.keys()):
                active_columns["."].add(
                    Column(name=v,
                           jx_type=IS_NULL,
                           es_column=".",
                           es_index=".",
                           nested_path=["."]))

        # EVERY COLUMN, AND THE INDEX IT TAKES UP
        index_to_column = {}  # MAP FROM INDEX TO COLUMN (OR SELECT CLAUSE)
        index_to_uid = {}  # FROM NESTED PATH TO THE INDEX OF UID
        sql_selects = []  # EVERY SELECT CLAUSE (NOT TO BE USED ON ALL TABLES, OF COURSE)
        nest_to_alias = {
            nested_path: "__" + unichr(ord('a') + i) + "__"
            for i, (nested_path,
                    sub_table) in enumerate(self.sf.tables.items())
        }

        sorts = []
        if query.sort:
            for select in query.sort:
                col = select.value.to_sql(schema)[0]
                for t, sql in col.sql.items():
                    json_type = sql_type_to_json_type[t]
                    if json_type in STRUCT:
                        continue
                    column_number = len(sql_selects)
                    # SQL HAS ABS TABLE REFERENCE
                    column_alias = _make_column_name(column_number)
                    sql_selects.append(sql_alias(sql, column_alias))
                    if select.sort == -1:
                        sorts.append(column_alias + SQL_IS_NOT_NULL)
                        sorts.append(column_alias + " DESC")
                    else:
                        sorts.append(column_alias + SQL_IS_NULL)
                        sorts.append(column_alias)

        primary_doc_details = Data()
        # EVERY SELECT STATEMENT THAT WILL BE REQUIRED, NO MATTER THE DEPTH
        # WE WILL CREATE THEM ACCORDING TO THE DEPTH REQUIRED
        nested_path = []
        for step, sub_table in self.sf.tables.items():
            nested_path.insert(0, step)
            nested_doc_details = {
                "sub_table": sub_table,
                "children": [],
                "index_to_column": {},
                "nested_path": nested_path
            }

            # INSERT INTO TREE
            if not primary_doc_details:
                primary_doc_details = nested_doc_details
            else:

                def place(parent_doc_details):
                    if startswith_field(step,
                                        parent_doc_details['nested_path'][0]):
                        for c in parent_doc_details['children']:
                            if place(c):
                                return True
                        parent_doc_details['children'].append(
                            nested_doc_details)

                place(primary_doc_details)

            alias = nested_doc_details['alias'] = nest_to_alias[step]

            # WE ALWAYS ADD THE UID
            column_number = index_to_uid[step] = nested_doc_details[
                'id_coord'] = len(sql_selects)
            sql_select = join_column(alias, quoted_UID)
            sql_selects.append(
                sql_alias(sql_select, _make_column_name(column_number)))
            if step != ".":
                # ID AND ORDER FOR CHILD TABLES
                index_to_column[column_number] = ColumnMapping(
                    sql=sql_select,
                    type="number",
                    nested_path=nested_path,
                    column_alias=_make_column_name(column_number))
                column_number = len(sql_selects)
                sql_select = join_column(alias, quoted_ORDER)
                sql_selects.append(
                    sql_alias(sql_select, _make_column_name(column_number)))
                index_to_column[column_number] = ColumnMapping(
                    sql=sql_select,
                    type="number",
                    nested_path=nested_path,
                    column_alias=_make_column_name(column_number))

            # WE DO NOT NEED DATA FROM TABLES WE REQUEST NOTHING FROM
            if step not in active_columns:
                continue

            # ADD SQL SELECT COLUMNS FOR EACH jx SELECT CLAUSE
            si = 0
            for select in listwrap(query.select):
                try:
                    column_number = len(sql_selects)
                    select.pull = get_column(column_number)
                    db_columns = select.value.partial_eval().to_sql(schema)

                    for column in db_columns:
                        if is_list(column.nested_path):
                            column.nested_path = column.nested_path[
                                0]  # IN THE EVENT THIS "column" IS MULTIVALUED
                        for t, unsorted_sql in column.sql.items():
                            json_type = sql_type_to_json_type[t]
                            if json_type in STRUCT:
                                continue
                            column_number = len(sql_selects)
                            column_alias = _make_column_name(column_number)
                            sql_selects.append(
                                sql_alias(unsorted_sql, column_alias))
                            if startswith_field(primary_nested_path,
                                                step) and is_op(
                                                    select.value, LeavesOp):
                                # ONLY FLATTEN primary_nested_path AND PARENTS, NOT CHILDREN
                                index_to_column[
                                    column_number] = nested_doc_details[
                                        'index_to_column'][
                                            column_number] = ColumnMapping(
                                                push_name=literal_field(
                                                    get_property_name(
                                                        concat_field(
                                                            select.name,
                                                            column.name))),
                                                push_child=".",
                                                push_column_name=
                                                get_property_name(
                                                    concat_field(
                                                        select.name,
                                                        column.name)),
                                                push_column=si,
                                                pull=get_column(column_number),
                                                sql=unsorted_sql,
                                                type=json_type,
                                                column_alias=column_alias,
                                                nested_path=nested_path)
                                si += 1
                            else:
                                index_to_column[
                                    column_number] = nested_doc_details[
                                        'index_to_column'][
                                            column_number] = ColumnMapping(
                                                push_name=select.name,
                                                push_child=column.name,
                                                push_column_name=select.name,
                                                push_column=si,
                                                pull=get_column(column_number),
                                                sql=unsorted_sql,
                                                type=json_type,
                                                column_alias=column_alias,
                                                nested_path=nested_path)
                finally:
                    si += 1

        where_clause = BooleanOp(query.where).partial_eval().to_sql(
            schema, boolean=True)[0].sql.b
        unsorted_sql = self._make_sql_for_one_nest_in_set_op(
            ".", sql_selects, where_clause, active_columns, index_to_column)

        for n, _ in self.sf.tables.items():
            sorts.append(quote_column(COLUMN + text_type(index_to_uid[n])))

        ordered_sql = (SQL_SELECT + "*" + SQL_FROM + sql_iso(unsorted_sql) +
                       SQL_ORDERBY + sql_list(sorts) + SQL_LIMIT +
                       quote_value(query.limit))
        self.db.create_new_functions()  # REGISTER THE regexp FUNCTION
        result = self.db.query(ordered_sql)

        def _accumulate_nested(rows, row, nested_doc_details, parent_doc_id,
                               parent_id_coord):
            """
            :param rows: REVERSED STACK OF ROWS (WITH push() AND pop())
            :param row: CURRENT ROW BEING EXTRACTED
            :param nested_doc_details: {
                    "nested_path": wrap_nested_path(nested_path),
                    "index_to_column": map from column number to column details
                    "children": all possible direct decedents' nested_doc_details
                 }
            :param parent_doc_id: the id of the parent doc (for detecting when to step out of loop)
            :param parent_id_coord: the column number for the parent id (so we can extract from each row)
            :return: the nested property (usually an array)
            """
            previous_doc_id = None
            doc = Data()
            output = []
            id_coord = nested_doc_details['id_coord']

            while True:
                doc_id = row[id_coord]

                if doc_id == None or (parent_id_coord is not None and
                                      row[parent_id_coord] != parent_doc_id):
                    rows.append(row)  # UNDO PREVIOUS POP (RECORD IS NOT A NESTED RECORD OF parent_doc)
                    return output

                if doc_id != previous_doc_id:
                    previous_doc_id = doc_id
                    doc = Data()
                    curr_nested_path = nested_doc_details['nested_path'][0]
                    index_to_column = nested_doc_details[
                        'index_to_column'].items()
                    if index_to_column:
                        for i, c in index_to_column:
                            value = row[i]
                            if is_list(query.select) or is_op(
                                    query.select.value, LeavesOp):
                                # ASSIGN INNER PROPERTIES
                                rel_field = concat_field(
                                    c.push_name, c.push_child)
                            else:  # FACT IS EXPECTED TO BE A SINGLE VALUE, NOT AN OBJECT
                                rel_field = c.push_child

                            if rel_field == ".":
                                if value == '':
                                    doc = Null
                                else:
                                    doc = value
                            elif value != None and value != '':
                                doc[rel_field] = value

                for child_details in nested_doc_details['children']:
                    # EACH NESTED TABLE MUST BE ASSEMBLED INTO A LIST OF OBJECTS
                    child_id = row[child_details['id_coord']]
                    if child_id is not None:
                        nested_value = _accumulate_nested(
                            rows, row, child_details, doc_id, id_coord)
                        if nested_value:
                            push_name = child_details['nested_path'][0]
                            if is_list(query.select) or is_op(
                                    query.select.value, LeavesOp):
                                # ASSIGN INNER PROPERTIES
                                # rel_field (NOT relative_field) SO THE relative_field() FUNCTION IS NOT SHADOWED
                                rel_field = relative_field(
                                    push_name, curr_nested_path)
                            else:  # FACT IS EXPECTED TO BE A SINGLE VALUE, NOT AN OBJECT
                                rel_field = "."

                            if rel_field == "." and doc is Null:
                                doc = nested_value
                            elif rel_field == ".":
                                doc = unwraplist(nested_value)
                            else:
                                doc[rel_field] = unwraplist(nested_value)

                output.append(doc)

                try:
                    row = rows.pop()
                except IndexError:
                    return output

        cols = tuple(
            [i for i in index_to_column.values() if i.push_name != None])
        rows = list(reversed(unwrap(result.data)))
        if rows:
            row = rows.pop()
            data = _accumulate_nested(rows, row, primary_doc_details, None,
                                      None)
        else:
            data = result.data

        if query.format == "cube":
            for f, _ in self.sf.tables.items():
                if frum.endswith(f) or (test_dots(cols)
                                        and is_list(query.select)):
                    num_rows = len(result.data)
                    num_cols = MAX([c.push_column
                                    for c in cols]) + 1 if len(cols) else 0
                    map_index_to_name = {
                        c.push_column: c.push_column_name
                        for c in cols
                    }
                    temp_data = [[None] * num_rows for _ in range(num_cols)]
                    for rownum, d in enumerate(result.data):
                        for c in cols:
                            if c.push_child == ".":
                                temp_data[c.push_column][rownum] = c.pull(d)
                            else:
                                column = temp_data[c.push_column][rownum]
                                if column is None:
                                    column = temp_data[
                                        c.push_column][rownum] = {}
                                column[c.push_child] = c.pull(d)
                    output = Data(meta={"format": "cube"},
                                  data={
                                      n: temp_data[c]
                                      for c, n in map_index_to_name.items()
                                  },
                                  edges=[{
                                      "name": "rownum",
                                      "domain": {
                                          "type": "rownum",
                                          "min": 0,
                                          "max": num_rows,
                                          "interval": 1
                                      }
                                  }])
                    return output

            if is_list(query.select) or is_op(query.select.value, LeavesOp):
                num_rows = len(data)
                temp_data = {
                    c.push_column_name: [None] * num_rows
                    for c in cols
                }
                for rownum, d in enumerate(data):
                    for c in cols:
                        temp_data[c.push_column_name][rownum] = d[c.push_name]
                return Data(meta={"format": "cube"},
                            data=temp_data,
                            edges=[{
                                "name": "rownum",
                                "domain": {
                                    "type": "rownum",
                                    "min": 0,
                                    "max": num_rows,
                                    "interval": 1
                                }
                            }])
            else:
                num_rows = len(data)
                map_index_to_name = {
                    c.push_column: c.push_column_name
                    for c in cols
                }
                temp_data = [data]

                return Data(meta={"format": "cube"},
                            data={
                                n: temp_data[c]
                                for c, n in map_index_to_name.items()
                            },
                            edges=[{
                                "name": "rownum",
                                "domain": {
                                    "type": "rownum",
                                    "min": 0,
                                    "max": num_rows,
                                    "interval": 1
                                }
                            }])

        elif query.format == "table":
            for f, _ in self.sf.tables.items():
                if frum.endswith(f):
                    num_column = MAX([c.push_column for c in cols]) + 1
                    header = [None] * num_column
                    for c in cols:
                        header[c.push_column] = c.push_column_name

                    output_data = []
                    for d in result.data:
                        row = [None] * num_column
                        for c in cols:
                            set_column(row, c.push_column, c.push_child,
                                       c.pull(d))
                        output_data.append(row)

                    return Data(meta={"format": "table"},
                                header=header,
                                data=output_data)
            if is_list(query.select) or is_op(query.select.value, LeavesOp):
                column_names = [None] * (max(c.push_column for c in cols) + 1)
                for c in cols:
                    column_names[c.push_column] = c.push_column_name

                temp_data = []
                for rownum, d in enumerate(data):
                    row = [None] * len(column_names)
                    for c in cols:
                        row[c.push_column] = d[c.push_name]
                    temp_data.append(row)

                return Data(meta={"format": "table"},
                            header=column_names,
                            data=temp_data)
            else:
                column_names = listwrap(query.select).name
                return Data(meta={"format": "table"},
                            header=column_names,
                            data=[[d] for d in data])

        else:
            for f, _ in self.sf.tables.items():
                if frum.endswith(f) or (test_dots(cols)
                                        and is_list(query.select)):
                    data = []
                    for d in result.data:
                        row = Data()
                        for c in cols:
                            if c.push_child == ".":
                                row[c.push_name] = c.pull(d)
                            elif c.num_push_columns:
                                tuple_value = row[c.push_name]
                                if not tuple_value:
                                    tuple_value = row[c.push_name] = [
                                        None
                                    ] * c.num_push_columns
                                tuple_value[c.push_child] = c.pull(d)
                            else:
                                row[c.push_name][c.push_child] = c.pull(d)

                        data.append(row)

                    return Data(meta={"format": "list"}, data=data)

            if is_list(query.select) or is_op(query.select.value, LeavesOp):
                temp_data = []
                for rownum, d in enumerate(data):
                    row = {}
                    for c in cols:
                        row[c.push_column_name] = d[c.push_name]
                    temp_data.append(row)
                return Data(meta={"format": "list"}, data=temp_data)
            else:
                return Data(meta={"format": "list"}, data=data)
Exemple #52
0
def es_aggsop(es, frum, query):
    query = query.copy()  # WE WILL MARK UP THIS QUERY
    schema = frum.schema
    query_path = schema.query_path[0]
    select = listwrap(query.select)

    new_select = Data()  # MAP FROM canonical_name (USED FOR NAMES IN QUERY) TO SELECT MAPPING
    formula = []
    for s in select:
        if is_op(s.value, Variable_):
            s.query_path = query_path
            if s.aggregate == "count":
                new_select["count_" + literal_field(s.value.var)] += [s]
            else:
                new_select[literal_field(s.value.var)] += [s]
        elif s.aggregate:
            split_select = split_expression_by_path(s.value,
                                                    schema,
                                                    lang=Painless)
            for si_key, si_value in split_select.items():
                if si_value:
                    if s.query_path:
                        Log.error(
                            "can not handle more than one depth per select")
                    s.query_path = si_key
            formula.append(s)

    acc = Aggs()
    for _, many in new_select.items():
        for s in many:
            canonical_name = s.name
            if s.aggregate in ("value_count", "count"):
                columns = frum.schema.values(s.value.var,
                                             exclude_type=(OBJECT, NESTED))
            else:
                columns = frum.schema.values(s.value.var)

            if s.aggregate == "count":
                canonical_names = []
                for column in columns:
                    es_name = column.es_column + "_count"
                    if column.jx_type == EXISTS:
                        if column.nested_path[0] == query_path:
                            canonical_names.append("doc_count")
                            acc.add(
                                NestedAggs(column.nested_path[0]).add(
                                    CountAggs(s)))
                    else:
                        canonical_names.append("value")
                        acc.add(
                            NestedAggs(column.nested_path[0]).add(
                                ExprAggs(es_name, {
                                    "value_count": {
                                        "field": column.es_column
                                    }
                                }, s)))
                if len(canonical_names) == 1:
                    s.pull = jx_expression_to_function(canonical_names[0])
                else:
                    s.pull = jx_expression_to_function(
                        {"add": canonical_names})
            elif s.aggregate == "median":
                columns = [
                    c for c in columns if c.jx_type in (NUMBER, INTEGER)
                ]
                if len(columns) != 1:
                    Log.error(
                        "Do not know how to perform median on columns with more than one type (script probably)"
                    )
                # ES USES DIFFERENT METHOD FOR PERCENTILES
                key = canonical_name + " percentile"
                acc.add(
                    ExprAggs(
                        key, {
                            "percentiles": {
                                "field": first(columns).es_column,
                                "percents": [50]
                            }
                        }, s))
                s.pull = jx_expression_to_function("values.50\\.0")
            elif s.aggregate == "percentile":
                columns = [
                    c for c in columns if c.jx_type in (NUMBER, INTEGER)
                ]
                if len(columns) != 1:
                    Log.error(
                        "Do not know how to perform percentile on columns with more than one type (script probably)"
                    )
                # ES USES DIFFERENT METHOD FOR PERCENTILES
                key = canonical_name + " percentile"
                if is_text(s.percentile) or s.percentile < 0 or 1 < s.percentile:
                    Log.error(
                        "Expecting percentile to be a float from 0.0 to 1.0")
                percent = mo_math.round(s.percentile * 100, decimal=6)
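                # e.g. s.percentile == 0.95 BECOMES "percents": [95.0] IN THE ES REQUEST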

                acc.add(
                    ExprAggs(
                        key, {
                            "percentiles": {
                                "field": first(columns).es_column,
                                "percents": [percent],
                                "tdigest": {
                                    "compression": 2
                                }
                            }
                        }, s))
                s.pull = jx_expression_to_function(
                    join_field(["values", text(percent)]))
            elif s.aggregate == "cardinality":
                for column in columns:
                    path = column.es_column + "_cardinality"
                    acc.add(
                        ExprAggs(path,
                                 {"cardinality": {
                                     "field": column.es_column
                                 }}, s))
                s.pull = jx_expression_to_function("value")
            elif s.aggregate == "stats":
                columns = [
                    c for c in columns if c.jx_type in (NUMBER, INTEGER)
                ]
                if len(columns) != 1:
                    Log.error(
                        "Do not know how to perform stats on columns with more than one type (script probably)"
                    )
                # REGULAR STATS
                acc.add(
                    ExprAggs(canonical_name, {
                        "extended_stats": {
                            "field": first(columns).es_column
                        }
                    }, s))
                s.pull = get_pull_stats()

                # GET MEDIAN TOO!
                select_median = s.copy()
                select_median.pull = jx_expression_to_function(
                    {"select": [{
                        "name": "median",
                        "value": "values.50\\.0"
                    }]})

                acc.add(
                    ExprAggs(
                        canonical_name + "_percentile", {
                            "percentiles": {
                                "field": first(columns).es_column,
                                "percents": [50]
                            }
                        }, select_median))

            elif s.aggregate == "union":
                for column in columns:
                    script = {
                        "scripted_metric": {
                            'init_script': 'params._agg.terms = new HashSet()',
                            'map_script': 'for (v in doc[' + quote(column.es_column) + '].values) params._agg.terms.add(v);',
                            'combine_script': 'return params._agg.terms.toArray()',
                            'reduce_script': 'HashSet output = new HashSet(); for (a in params._aggs) { if (a!=null) for (v in a) {output.add(v)} } return output.toArray()',
                        }
                    }
                    stats_name = column.es_column
                    acc.add(
                        NestedAggs(column.nested_path[0]).add(
                            ExprAggs(stats_name, script, s)))
                s.pull = jx_expression_to_function("value")
            elif s.aggregate == "count_values":
                # RETURN MAP FROM VALUE TO THE NUMBER OF TIMES FOUND IN THE DOCUMENTS
                # NOT A NESTED DOC, RATHER A MULTIVALUE FIELD
                for column in columns:
                    script = {
                        "scripted_metric": {
                            'params': {"_agg": {}},
                            'init_script': 'params._agg.terms = new HashMap()',
                            'map_script': 'for (v in doc[' + quote(column.es_column) + '].values) params._agg.terms.put(v, Optional.ofNullable(params._agg.terms.get(v)).orElse(0)+1);',
                            'combine_script': 'return params._agg.terms',
                            'reduce_script': '''
                            HashMap output = new HashMap();
                            for (agg in params._aggs) {
                                if (agg!=null){
                                    for (e in agg.entrySet()) {
                                        String key = String.valueOf(e.getKey());
                                        output.put(key, e.getValue() + Optional.ofNullable(output.get(key)).orElse(0));
                                    }
                                }
                            }
                            return output;
                        '''
                        }
                    }
                    stats_name = encode_property(column.es_column)
                    acc.add(
                        NestedAggs(column.nested_path[0]).add(
                            ExprAggs(stats_name, script, s)))
                s.pull = jx_expression_to_function("value")
            else:
                if not columns:
                    s.pull = jx_expression_to_function(NULL)
                else:
                    for c in columns:
                        acc.add(
                            NestedAggs(c.nested_path[0]).add(
                                ExprAggs(
                                    canonical_name,
                                    {"extended_stats": {
                                        "field": c.es_column
                                    }}, s)))
                    s.pull = jx_expression_to_function(aggregates[s.aggregate])

    for i, s in enumerate(formula):
        s_path = [
            k for k, v in split_expression_by_path(
                s.value, schema=schema, lang=Painless).items() if v
        ]
        if len(s_path) == 0:
            # FOR CONSTANTS
            nest = NestedAggs(query_path)
            acc.add(nest)
        elif len(s_path) == 1:
            nest = NestedAggs(first(s_path))
            acc.add(nest)
        else:
            Log.error("do not know how to handle")

        canonical_name = s.name
        if is_op(s.value, TupleOp):
            if s.aggregate == "count":
                # TUPLES ALWAYS EXIST, SO COUNTING THEM IS EASY
                s.pull = jx_expression_to_function("doc_count")
            elif s.aggregate in ('max', 'maximum', 'min', 'minimum'):
                if s.aggregate in ('max', 'maximum'):
                    dir = 1
                    op = "max"
                else:
                    dir = -1
                    op = 'min'

                nully = Painless[TupleOp(
                    [NULL] *
                    len(s.value.terms))].partial_eval().to_es_script(schema)
                selfy = text(
                    Painless[s.value].partial_eval().to_es_script(schema))

                script = {
                    "scripted_metric": {
                        'init_script': 'params._agg.best = ' + nully + '.toArray();',
                        'map_script': 'params._agg.best = ' + expand_template(
                            MAX_OF_TUPLE,
                            {"expr1": "params._agg.best", "expr2": selfy, "dir": dir, "op": op}
                        ) + ";",
                        'combine_script': 'return params._agg.best',
                        'reduce_script': 'return params._aggs.stream().' + op + '(' + expand_template(
                            COMPARE_TUPLE,
                            {"dir": dir, "op": op}
                        ) + ').get()',
                    }
                }
                nest.add(
                    NestedAggs(query_path).add(
                        ExprAggs(canonical_name, script, s)))
                s.pull = jx_expression_to_function("value")
            else:
                Log.error("{{agg}} is not a supported aggregate over a tuple",
                          agg=s.aggregate)
        elif s.aggregate == "count":
            nest.add(
                ExprAggs(
                    canonical_name, {
                        "value_count": {
                            "script":
                            text(Painless[s.value].partial_eval().to_es_script(
                                schema))
                        }
                    }, s))
            s.pull = jx_expression_to_function("value")
        elif s.aggregate == "median":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")
            nest.add(
                ExprAggs(
                    key, {
                        "percentiles": {
                            "script": text(
                                Painless[s.value].to_es_script(schema)),
                            "percents": [50]
                        }
                    }, s))
            s.pull = jx_expression_to_function(join_field(["50.0"]))
        elif s.aggregate == "percentile":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")
            percent = mo_math.round(s.percentile * 100, decimal=6)
            nest.add(
                ExprAggs(
                    key, {
                        "percentiles": {
                            "script": text(
                                Painless[s.value].to_es_script(schema)),
                            "percents": [percent]
                        }
                    }, s))
            s.pull = jx_expression_to_function(
                join_field(["values", text(percent)]))
        elif s.aggregate == "cardinality":
            # ES USES DIFFERENT METHOD FOR CARDINALITY
            key = canonical_name + " cardinality"
            nest.add(
                ExprAggs(
                    key, {
                        "cardinality": {
                            "script": text(
                                Painless[s.value].to_es_script(schema))
                        }
                    }, s))
            s.pull = jx_expression_to_function("value")
        elif s.aggregate == "stats":
            # REGULAR STATS
            nest.add(
                ExprAggs(
                    canonical_name, {
                        "extended_stats": {
                            "script": text(
                                Painless[s.value].to_es_script(schema))
                        }
                    }, s))
            s.pull = get_pull_stats()

            # GET MEDIAN TOO!
            select_median = s.copy()
            select_median.pull = jx_expression_to_function(
                {"select": [{
                    "name": "median",
                    "value": "values.50\\.0"
                }]})

            nest.add(
                ExprAggs(
                    canonical_name + "_percentile", {
                        "percentiles": {
                            "script": text(
                                Painless[s.value].to_es_script(schema)),
                            "percents": [50]
                        }
                    }, select_median))
            s.pull = get_pull_stats()
        elif s.aggregate == "union":
            # USE TERMS AGGREGATE TO SIMULATE union
            nest.add(
                TermsAggs(canonical_name, {
                    "script_field":
                    text(Painless[s.value].to_es_script(schema))
                }, s))
            s.pull = jx_expression_to_function("key")
        else:
            # PULL VALUE OUT OF THE stats AGGREGATE
            s.pull = jx_expression_to_function(aggregates[s.aggregate])
            nest.add(
                ExprAggs(
                    canonical_name, {
                        "extended_stats": {
                            "script":
                            text(
                                NumberOp(s.value).partial_eval().to_es_script(
                                    schema))
                        }
                    }, s))

    acc = NestedAggs(query_path).add(acc)
    split_decoders = get_decoders_by_path(query)
    split_wheres = split_expression_by_path(query.where,
                                            schema=frum.schema,
                                            lang=ES52)

    start = 0
    decoders = [None] * (len(query.edges) + len(query.groupby))
    paths = list(reversed(sorted(split_wheres.keys() | split_decoders.keys())))
    for path in paths:
        literal_path = literal_field(path)
        decoder = split_decoders[literal_path]
        where = split_wheres[literal_path]

        for d in decoder:
            decoders[d.edge.dim] = d
            acc = d.append_query(path, acc)
            start += d.num_columns

        if where:
            acc = FilterAggs("_filter", AndOp(where), None).add(acc)
        acc = NestedAggs(path).add(acc)

    acc = NestedAggs('.').add(acc)
    acc = simplify(acc)
    es_query = wrap(acc.to_es(schema))

    es_query.size = 0

    with Timer("ES query time", silent=not DEBUG) as es_duration:
        result = es_post(es, es_query, query.limit)

    try:
        format_time = Timer("formatting", silent=not DEBUG)
        with format_time:
            # result.aggregations.doc_count = coalesce(result.aggregations.doc_count, result.hits.total)  # IT APPEARS THE OLD doc_count IS GONE
            aggs = unwrap(result.aggregations)

            formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[
                query.format]
            if query.edges:
                output = formatter(aggs, acc, query, decoders, select)
            elif query.groupby:
                output = groupby_formatter(aggs, acc, query, decoders, select)
            else:
                output = aggop_formatter(aggs, acc, query, decoders, select)

        output.meta.timing.formatting = format_time.duration
        output.meta.timing.es_search = es_duration.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        if query.format not in format_dispatch:
            Log.error("Format {{format|quote}} not supported yet",
                      format=query.format,
                      cause=e)
        Log.error("Some problem", cause=e)
Example #53
    def _grouped(self, query, stacked=False):
        select = listwrap(query.select)

        # RETURN SINGLE OBJECT WITH AGGREGATES
        for s in select:
            if s.aggregate not in aggregates:
                Log.error("Expecting all columns to have an aggregate: {{select}}", select=s)

        selects = FlatList()
        groups = FlatList()
        edges = query.edges
        for e in edges:
            if e.domain.type != "default":
                Log.error("domain of type {{type}} not supported, yet", type=e.domain.type)
            groups.append(e.value)
            selects.append(e.value + " AS " + self.db.quote_column(e.name))

        for s in select:
            selects.append(aggregates[s.aggregate].replace("{{code}}", s.value) + " AS " + self.db.quote_column(s.name))

        sql = expand_template("""
            SELECT
                {{selects}}
            FROM
                {{table}}
            {{where}}
            GROUP BY
                {{groups}}
        """, {
            "selects": SQL(",\n".join(selects)),
            "groups": SQL(",\n".join(groups)),
            "table": self._subquery(query["from"])[0],
            "where": self._where2sql(query.where)
        })

        def post_stacked(sql):
            # RETURN IN THE USUAL DATABASE RESULT SET FORMAT
            return self.db.query(sql)

        def post(sql):
            # FIND OUT THE default DOMAIN SIZES
            result = self.db.column_query(sql)
            num_edges = len(edges)
            for e, edge in enumerate(edges):
                domain = edge.domain
                if domain.type == "default":
                    domain.type = "set"
                    parts = set(result[e])
                    domain.partitions = [{"index": i, "value": p} for i, p in enumerate(parts)]
                    domain.map = {p: i for i, p in enumerate(parts)}
                else:
                    Log.error("Do not know what to do here, yet")

            # FILL THE DATA CUBE
            maps = [(unwrap(e.domain.map), result[i]) for i, e in enumerate(edges)]
            cubes = FlatList()
            for c, s in enumerate(select):
                data = Matrix(*[len(e.domain.partitions) + (1 if e.allow_nulls else 0) for e in edges])
                for rownum, value in enumerate(result[c + num_edges]):
                    coord = [m[r[rownum]] for m, r in maps]
                    data[coord] = value
                cubes.append(data)

            if isinstance(query.select, list):
                return cubes
            else:
                return cubes[0]

        return sql, post if not stacked else post_stacked
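
For orientation, the SQL that _grouped's template expands to for a hypothetical query grouping on platform and summing duration (table, column, and output names invented):

example_sql = """
    SELECT
        platform AS "platform",
        SUM(duration) AS "total_duration"
    FROM
        (SELECT * FROM task)
    WHERE machine_count > 0
    GROUP BY
        platform
"""
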
Example #54
def groupby(data,
            keys=None,
            size=None,
            min_size=None,
            max_size=None,
            contiguous=False):
    """
    :param data:
    :param keys:
    :param size:
    :param min_size:
    :param max_size:
    :param contiguous: MAINTAIN THE ORDER OF THE DATA, STARTING THE NEW GROUP WHEN THE SELECTOR CHANGES
    :return: return list of (keys, values) PAIRS, WHERE
                 keys IS IN LEAF FORM (FOR USE WITH {"eq": terms} OPERATOR
                 values IS GENERATOR OF ALL VALUE THAT MATCH keys
        contiguous -
    """
    if isinstance(data, Container):
        return data.groupby(keys)

    if size != None or min_size != None or max_size != None:
        if size != None:
            max_size = size
        return groupby_min_max_size(data, min_size=min_size, max_size=max_size)

    try:
        keys = listwrap(keys)
        if not contiguous:
            from jx_python import jx
            data = jx.sort(data, keys)

        if not data:
            return Null

        if any(is_expression(k) for k in keys):
            Log.error("can not handle expressions")
        else:
            accessor = jx_expression_to_function(jx_expression({
                "tuple": keys
            }))  # CAN RETURN Null, WHICH DOES NOT PLAY WELL WITH __cmp__

        def _output():
            start = 0
            prev = accessor(data[0])
            for i, d in enumerate(data):
                curr = accessor(d)
                if curr != prev:
                    group = {}
                    for k, gg in zip(keys, prev):
                        group[k] = gg
                    yield Data(group), data[start:i:]
                    start = i
                    prev = curr
            group = {}
            for k, gg in zip(keys, prev):
                group[k] = gg
            yield Data(group), data[start::]

        return _output()
    except Exception as e:
        Log.error("Problem grouping", cause=e)
Example #55
def es_deepop(es, query):
    schema = query.frum.schema
    query_path = schema.query_path[0]

    # TODO: FIX THE GREAT SADNESS CAUSED BY EXECUTING post_expressions
    # THE EXPRESSIONS SHOULD BE PUSHED TO THE CONTAINER:  ES ALLOWS
    # {"inner_hit":{"script_fields":[{"script":""}...]}}, BUT THEN YOU
    # LOSE "_source" BUT GAIN "fields", FORCING ALL FIELDS TO BE EXPLICIT
    post_expressions = {}
    es_query, es_filters = es_query_template(query_path)

    # SPLIT WHERE CLAUSE BY DEPTH
    wheres = split_expression_by_depth(query.where, schema)
    for f, w in zip_longest(es_filters, wheres):
        script = ES52[AndOp(w)].partial_eval().to_esfilter(schema)
        set_default(f, script)

    if not wheres[1]:
        # INCLUDE DOCS WITH NO NESTED DOCS
        more_filter = {
            "bool": {
                "filter":
                [AndOp(wheres[0]).partial_eval().to_esfilter(schema)],
                "must_not": {
                    "nested": {
                        "path": query_path,
                        "query": MATCH_ALL
                    }
                }
            }
        }
    else:
        more_filter = None

    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)

    # es_query.sort = jx_sort_to_es_sort(query.sort)
    map_to_es_columns = schema.map_to_es()
    # {c.name: c.es_column for c in schema.leaves(".")}
    query_for_es = query.map(map_to_es_columns)
    es_query.sort = jx_sort_to_es_sort(query_for_es.sort, schema)

    es_query.stored_fields = []

    is_list = is_list_(query.select)
    selects = wrap([unwrap(s.copy()) for s in listwrap(query.select)])
    new_select = FlatList()

    put_index = 0
    for select in selects:
        if is_op(select.value, LeavesOp) and is_op(select.value.term,
                                                   Variable):
            # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
            leaves = schema.leaves(select.value.term.var)
            col_names = set()
            for c in leaves:
                if c.nested_path[0] == ".":
                    if c.jx_type == NESTED:
                        continue
                    es_query.stored_fields += [c.es_column]
                c_name = untype_path(relative_field(c.name, query_path))
                col_names.add(c_name)
                new_select.append({
                    "name": concat_field(select.name, c_name),
                    "nested_path": c.nested_path[0],
                    "put": {
                        "name": concat_field(select.name,
                                             literal_field(c_name)),
                        "index": put_index,
                        "child": "."
                    },
                    "pull": get_pull_function(c)
                })
                put_index += 1

            # REMOVE DOTS IN PREFIX IF NAME NOT AMBIGUOUS
            for n in new_select:
                if n.name.startswith("..") and n.name.lstrip(".") not in col_names:
                    n.put.name = n.name = n.name.lstrip(".")
                    col_names.add(n.name)
        elif is_op(select.value, Variable):
            net_columns = schema.leaves(select.value.var)
            if not net_columns:
                new_select.append({
                    "name": select.name,
                    "nested_path": ".",
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    },
                    "pull": NULL
                })
            else:
                for n in net_columns:
                    pull = get_pull_function(n)
                    if n.nested_path[0] == ".":
                        if n.jx_type == NESTED:
                            continue
                        es_query.stored_fields += [n.es_column]

                    # WE MUST FIGURE OUT WHICH NAMESPACE select.value.var IS USING SO WE CAN EXTRACT THE child
                    for np in n.nested_path:
                        c_name = untype_path(relative_field(n.name, np))
                        if startswith_field(c_name, select.value.var):
                            child = relative_field(c_name, select.value.var)
                            break
                    else:
                        continue
                        # REMOVED BECAUSE SELECTING INNER PROPERTIES IS NOT ALLOWED
                        # child = relative_field(untype_path(relative_field(n.name, n.nested_path[0])), s.value.var)

                    new_select.append({
                        "name": select.name,
                        "pull": pull,
                        "nested_path": n.nested_path[0],
                        "put": {
                            "name": select.name,
                            "index": put_index,
                            "child": child
                        }
                    })
            put_index += 1
        else:
            expr = select.value
            for v in expr.vars():
                for c in schema[v.var]:
                    if c.nested_path[0] == ".":
                        es_query.stored_fields += [c.es_column]
                    # else:
                    #     Log.error("deep field not expected")

            pull_name = EXPRESSION_PREFIX + select.name
            map_to_local = MapToLocal(schema)
            pull = jx_expression_to_function(pull_name)
            post_expressions[pull_name] = jx_expression_to_function(
                expr.map(map_to_local))

            new_select.append({
                "name": select.name if is_list else ".",
                "pull": pull,
                "value": expr.__data__(),
                "put": {
                    "name": select.name,
                    "index": put_index,
                    "child": "."
                }
            })
            put_index += 1

    # <COMPLICATED> ES needs two calls to get all documents
    more = []

    def get_more(please_stop):
        more.append(
            es_post(
                es,
                Data(query=more_filter, stored_fields=es_query.stored_fields),
                query.limit))

    if more_filter:
        need_more = Thread.run("get more", target=get_more)

    with Timer("call to ES") as call_timer:
        data = es_post(es, es_query, query.limit)

    # EACH HIT IS RETURNED MULTIPLE TIMES, ONCE FOR EACH INNER HIT, WITH THE INNER HIT INCLUDED
    def inners():
        for t in data.hits.hits:
            for i in t.inner_hits[literal_field(query_path)].hits.hits:
                t._inner = i._source
                for k, e in post_expressions.items():
                    t[k] = e(t)
                yield t
        if more_filter:
            Thread.join(need_more)
            for t in more[0].hits.hits:
                yield t

    # </COMPLICATED>

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(inners(), new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
Example #56
    def query(self, query):
        """
        :param query:  JSON Query Expression, SET `format="container"` TO MAKE NEW TABLE OF RESULT
        :return:
        """
        if not query.get('from'):
            query['from'] = self.name
        elif not startswith_field(query['from'], self.name):
            Log.error("Expecting table, or some nested table")
        query = QueryOp.wrap(query, self.container, self.namespace)
        new_table = "temp_" + unique_name()

        if query.format == "container":
            create_table = SQL_CREATE + quote_column(new_table) + SQL_AS
        else:
            create_table = ""

        if query.groupby and query.format != "cube":
            op, index_to_columns = self._groupby_op(query, self.schema)
            command = create_table + op
        elif query.groupby:
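            # TREAT groupby AS edges SO _edges_op CAN BE REUSED; SWAPPED BACK BELOW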
            query.edges, query.groupby = query.groupby, query.edges
            op, index_to_columns = self._edges_op(query, self.schema)
            command = create_table + op
            query.edges, query.groupby = query.groupby, query.edges
        elif query.edges or any(a != "none"
                                for a in listwrap(query.select).aggregate):
            op, index_to_columns = self._edges_op(query, query.frum.schema)
            command = create_table + op
        else:
            op = self._set_op(query)
            return op

        result = self.db.query(command)

        if query.format == "container":
            output = QueryTable(new_table,
                                db=self.db,
                                uid=self.uid,
                                exists=True)
        elif query.format == "cube" or (not query.format and query.edges):
            column_names = [None] * (max(c.push_column for c in index_to_columns.values()) + 1)
            for c in index_to_columns.values():
                column_names[c.push_column] = c.push_column_name

            if len(query.edges) == 0 and len(query.groupby) == 0:
                data = {n: Data() for n in column_names}
                for s in index_to_columns.values():
                    data[s.push_name][s.push_child] = unwrap(
                        s.pull(result.data[0]))
                if is_list(query.select):
                    select = [{"name": s.name} for s in query.select]
                else:
                    select = {"name": query.select.name}

                return Data(data=unwrap(data),
                            select=select,
                            meta={"format": "cube"})

            if not result.data:
                edges = []
                dims = []
                for i, e in enumerate(query.edges + query.groupby):
                    allowNulls = coalesce(e.allowNulls, True)

                    if e.domain.type == "set" and e.domain.partitions:
                        domain = SimpleSetDomain(
                            partitions=e.domain.partitions.name)
                    elif e.domain.type == "range":
                        domain = e.domain
                    elif is_op(e.value, TupleOp):
                        pulls = jx.sort([
                            c for c in index_to_columns.values()
                            if c.push_name == e.name
                        ], "push_child").pull
                        parts = [
                            tuple(p(d) for p in pulls) for d in result.data
                        ]
                        domain = SimpleSetDomain(
                            partitions=jx.sort(set(parts)))
                    else:
                        domain = SimpleSetDomain(partitions=[])

                    dims.append(1 if allowNulls else 0)
                    edges.append(
                        Data(name=e.name, allowNulls=allowNulls,
                             domain=domain))

                data = {}
                for si, s in enumerate(listwrap(query.select)):
                    if s.aggregate == "count":
                        data[s.name] = Matrix(dims=dims, zeros=0)
                    else:
                        data[s.name] = Matrix(dims=dims)

                if is_list(query.select):
                    select = [{"name": s.name} for s in query.select]
                else:
                    select = {"name": query.select.name}

                return Data(meta={"format": "cube"},
                            edges=edges,
                            select=select,
                            data={k: v.cube
                                  for k, v in data.items()})

            columns = None

            edges = []
            dims = []
            for g in query.groupby:
                g.is_groupby = True

            for i, e in enumerate(query.edges + query.groupby):
                allowNulls = coalesce(e.allowNulls, True)

                if e.domain.type == "set" and e.domain.partitions:
                    domain = SimpleSetDomain(
                        partitions=e.domain.partitions.name)
                elif e.domain.type == "range":
                    domain = e.domain
                elif e.domain.type == "time":
                    domain = wrap(mo_json.scrub(e.domain))
                elif e.domain.type == "duration":
                    domain = wrap(mo_json.scrub(e.domain))
                elif is_op(e.value, TupleOp):
                    pulls = jx.sort([
                        c for c in index_to_columns.values()
                        if c.push_name == e.name
                    ], "push_child").pull
                    parts = [tuple(p(d) for p in pulls) for d in result.data]
                    domain = SimpleSetDomain(partitions=jx.sort(set(parts)))
                else:
                    if not columns:
                        columns = transpose(*result.data)
                    parts = set(columns[i])
                    if e.is_groupby and None in parts:
                        allowNulls = True
                    parts -= {None}

                    if query.sort[i].sort == -1:
                        domain = SimpleSetDomain(
                            partitions=wrap(sorted(parts, reverse=True)))
                    else:
                        domain = SimpleSetDomain(partitions=jx.sort(parts))

                dims.append(len(domain.partitions) + (1 if allowNulls else 0))
                edges.append(
                    Data(name=e.name, allowNulls=allowNulls, domain=domain))

            data_cubes = {}
            for si, s in enumerate(listwrap(query.select)):
                if s.aggregate == "count":
                    data_cubes[s.name] = Matrix(dims=dims, zeros=0)
                else:
                    data_cubes[s.name] = Matrix(dims=dims)

            r2c = index_to_coordinate(dims)  # WORKS BECAUSE THE DATABASE SORTED THE EDGES TO CONFORM
            for rownum, row in enumerate(result.data):
                coord = r2c(rownum)

                for i, s in enumerate(index_to_columns.values()):
                    if s.is_edge:
                        continue
                    if s.push_child == ".":
                        data_cubes[s.push_name][coord] = s.pull(row)
                    else:
                        data_cubes[s.push_name][coord][s.push_child] = s.pull(
                            row)

            if query.select == None:
                select = Null
            elif is_list(query.select):
                select = [{"name": s.name} for s in query.select]
            else:
                select = {"name": query.select.name}

            return Data(meta={"format": "cube"},
                        edges=edges,
                        select=select,
                        data={k: v.cube
                              for k, v in data_cubes.items()})
        elif query.format == "table" or (not query.format and query.groupby):
            column_names = [None] * (max(c.push_column for c in index_to_columns.values()) + 1)
            for c in index_to_columns.values():
                column_names[c.push_column] = c.push_column_name
            data = []
            for d in result.data:
                row = [None for _ in column_names]
                for s in index_to_columns.values():
                    if s.push_child == ".":
                        row[s.push_column] = s.pull(d)
                    elif s.num_push_columns:
                        tuple_value = row[s.push_column]
                        if tuple_value == None:
                            tuple_value = row[s.push_column] = [None] * s.num_push_columns
                        tuple_value[s.push_child] = s.pull(d)
                    elif row[s.push_column] == None:
                        row[s.push_column] = Data()
                        row[s.push_column][s.push_child] = s.pull(d)
                    else:
                        row[s.push_column][s.push_child] = s.pull(d)
                data.append(tuple(unwrap(r) for r in row))

            output = Data(meta={"format": "table"},
                          header=column_names,
                          data=data)
        elif query.format == "list" or (not query.edges and not query.groupby):
            if not query.edges and not query.groupby and any(
                    listwrap(query.select).aggregate):
                if is_list(query.select):
                    data = Data()
                    for c in index_to_columns.values():
                        if c.push_child == ".":
                            if data[c.push_name] == None:
                                data[c.push_name] = c.pull(result.data[0])
                            elif is_list(data[c.push_name]):
                                data[c.push_name].append(c.pull(
                                    result.data[0]))
                            else:
                                data[c.push_name] = [
                                    data[c.push_name],
                                    c.pull(result.data[0])
                                ]
                        else:
                            data[c.push_name][c.push_child] = c.pull(
                                result.data[0])

                    output = Data(meta={"format": "value"}, data=data)
                else:
                    data = Data()
                    for s in index_to_columns.values():
                        if not data[s.push_child]:
                            data[s.push_child] = s.pull(result.data[0])
                        else:
                            data[s.push_child] += [s.pull(result.data[0])]
                    output = Data(meta={"format": "value"}, data=unwrap(data))
            else:
                data = []
                for d in result.data:
                    row = Data()
                    for c in index_to_columns.values():
                        if c.push_child == ".":
                            row[c.push_name] = c.pull(d)
                        elif c.num_push_columns:
                            tuple_value = row[c.push_name]
                            if not tuple_value:
                                tuple_value = row[c.push_name] = [None] * c.num_push_columns
                            tuple_value[c.push_child] = c.pull(d)
                        else:
                            row[c.push_name][c.push_child] = c.pull(d)

                    data.append(row)

                output = Data(meta={"format": "list"}, data=data)
        else:
            Log.error("unknown format {{format}}", format=query.format)

        return output
    def column_names(self):
        return listwrap(self.select).name + self.edges.name + self.groupby.name
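
A usage sketch for format="container" (the QueryTable instance and field names are invented): the result is materialized into a temp_* table and handed back as a new QueryTable, so it can itself be queried.

result_table = table.query({
    "from": table.name,
    "groupby": "status",
    "select": {"name": "total", "value": "duration", "aggregate": "sum"},
    "format": "container",
})
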
Example #58
def get_selects(query):
    schema = query.frum.schema
    split_select = {".": ESSelect(".")}

    def get_select(path):
        es_select = split_select.get(path)
        if not es_select:
            es_select = split_select[path] = ESSelect(path)
        return es_select

    selects = wrap([unwrap(s.copy()) for s in listwrap(query.select)])
    new_select = FlatList()
    put_index = 0
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if is_op(select.value, LeavesOp) and is_op(select.value.term,
                                                   Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                full_name = concat_field(
                    select.name, relative_field(untype_path(c.name), term.var))
                if c.jx_type == NESTED:
                    get_select(".").set_op = True
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {
                            "name": literal_field(full_name),
                            "index": put_index,
                            "child": ".",
                        },
                        "pull": get_pull_source(c.es_column),
                    })
                    put_index += 1
                else:
                    get_select(c.nested_path[0]).fields.append(c.es_column)
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {
                            "name": literal_field(full_name),
                            "index": put_index,
                            "child": ".",
                        },
                    })
                    put_index += 1
        elif is_op(select.value, Variable):
            s_column = select.value.var

            if s_column == ".":
                # PULL ALL SOURCE
                get_select(".").set_op = True
                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    },
                    "pull": get_pull_source("."),
                })
                continue

            leaves = schema.leaves(s_column)  # LEAVES OF OBJECT
            # nested_selects = {}
            if leaves:
                if any(c.jx_type == NESTED for c in leaves):
                    # PULL WHOLE NESTED ARRAYS
                    get_select(".").set_op = True
                    for c in leaves:
                        if len(c.nested_path) == 1:
                            # NESTED PROPERTIES ARE IGNORED, CAPTURED BY THESE FIRST LEVEL PROPERTIES
                            pre_child = join_field(
                                decode_property(n)
                                for n in split_field(c.name))
                            new_select.append({
                                "name": select.name,
                                "value": Variable(c.es_column),
                                "put": {
                                    "name": select.name,
                                    "index": put_index,
                                    "child": untype_path(relative_field(pre_child, s_column)),
                                },
                                "pull": get_pull_source(c.es_column),
                            })
                else:
                    # PULL ONLY WHAT'S NEEDED
                    for c in leaves:
                        c_nested_path = c.nested_path[0]
                        if c_nested_path == ".":
                            if c.es_column == "_id":
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": ".",
                                    },
                                    "pull": lambda row: row._id,
                                })
                            elif c.jx_type == NESTED:
                                get_select(".").set_op = True
                                pre_child = join_field(
                                    decode_property(n)
                                    for n in split_field(c.name))
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": untype_path(relative_field(pre_child, s_column)),
                                    },
                                    "pull": get_pull_source(c.es_column),
                                })
                            else:
                                get_select(c_nested_path).fields.append(
                                    c.es_column)
                                pre_child = join_field(
                                    decode_property(n)
                                    for n in split_field(c.name))
                                new_select.append({
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": untype_path(relative_field(pre_child, s_column)),
                                    },
                                })
                        else:
                            es_select = get_select(c_nested_path)
                            es_select.fields.append(c.es_column)

                            child = relative_field(
                                untype_path(
                                    relative_field(c.name,
                                                   schema.query_path[0])),
                                s_column,
                            )
                            pull = accumulate_nested_doc(
                                c_nested_path,
                                Variable(
                                    relative_field(
                                        s_column, unnest_path(c_nested_path))),
                            )
                            new_select.append({
                                "name": select.name,
                                "value": select.value,
                                "put": {
                                    "name": select.name,
                                    "index": put_index,
                                    "child": child,
                                },
                                "pull": pull,
                            })
            else:
                new_select.append({
                    "name": select.name,
                    "value": Variable("$dummy"),
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    },
                })
            put_index += 1
        else:
            split_scripts = split_expression_by_path(select.value,
                                                     schema,
                                                     lang=Painless)
            for p, script in split_scripts.items():
                es_select = get_select(p)
                es_select.scripts[select.name] = {
                    "script": text(Painless[first(script)].partial_eval().to_es_script(schema))
                }
                new_select.append({
                    "name": select.name,
                    "pull": jx_expression_to_function("fields." + literal_field(select.name)),
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    },
                })
                put_index += 1
    for n in new_select:
        if n.pull:
            continue
        elif is_op(n.value, Variable):
            if get_select(".").set_op:
                n.pull = get_pull_source(n.value.var)
            elif n.value == "_id":
                n.pull = jx_expression_to_function("_id")
            else:
                n.pull = jx_expression_to_function(
                    concat_field("fields", literal_field(n.value.var)))
        else:
            Log.error("Do not know what to do")
    return new_select, split_select
    def values(self):
        d = self._internal_dict
        return listwrap(list(d.values()))
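
The shape of what get_selects hands back, in sketch form (query is an assumed QueryOp instance): new_select describes how each output column is named and pulled from the response, and split_select maps each nested path to the ESSelect that fetches its fields.

new_select, split_select = get_selects(query)
for n in new_select:
    print(n.name, n.put, n.pull)      # OUTPUT COLUMN, DESTINATION, ACCESSOR
for path, es_select in split_select.items():
    print(path, es_select.fields)     # NESTED PATH -> FIELDS TO FETCH
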
Example #60
    def update(self, command):
        """
        :param command:  EXPECTING dict WITH {"set": s, "clear": c, "where": w} FORMAT
        """
        command = wrap(command)

        # REJECT DEEP UPDATES
        touched_columns = command.set.keys() | set(listwrap(command['clear']))
        for c in self.schema.columns:
            if c.name in touched_columns and len(c.nested_path) > 1:
                Log.error("Deep update not supported")

        # ADD NEW COLUMNS
        where = jx_expression(command.where)
        _vars = where.vars()
        _map = {
            v: c.es_column
            for v in _vars for c in self.columns.get(v, Null)
            if c.jx_type not in STRUCT
        }
        where_sql = where.map(_map).to_sql(self.schema)
        new_columns = set(command.set.keys()) - set(self.columns.keys())
        for new_column_name in new_columns:
            nested_value = command.set[new_column_name]
            ctype = get_jx_type(nested_value)
            column = Column(name=new_column_name,
                            jx_type=ctype,
                            es_index=self.name,
                            es_type=json_type_to_sqlite_type(ctype),
                            es_column=typed_column(new_column_name, ctype),
                            last_updated=Date.now())
            self.add_column(column)

        # UPDATE THE NESTED VALUES
        for nested_column_name, nested_value in command.set.items():
            if get_jx_type(nested_value) == "nested":
                nested_table_name = concat_field(self.name, nested_column_name)
                nested_table = nested_tables[nested_column_name]
                self_primary_key = sql_list(
                    quote_column(c.es_column) for u in self.uid
                    for c in self.columns[u])
                extra_key_name = UID + text(len(self.uid))
                extra_key = [e for e in nested_table.columns[extra_key_name]][0]

                sql_command = (
                    SQL_DELETE + SQL_FROM + quote_column(nested_table.name) +
                    SQL_WHERE + "EXISTS" +
                    sql_iso(SQL_SELECT + SQL_ONE + SQL_FROM +
                            sql_alias(quote_column(nested_table.name), "n") +
                            SQL_INNER_JOIN +
                            sql_iso(SQL_SELECT + self_primary_key + SQL_FROM +
                                    quote_column(abs_schema.fact) + SQL_WHERE +
                                    where_sql) + " t ON " +
                            SQL_AND.join(
                                quote_column("t", c.es_column) + SQL_EQ +
                                quote_column("n", c.es_column)
                                for u in self.uid for c in self.columns[u])))
                self.db.execute(sql_command)

                # INSERT NEW RECORDS
                if not nested_value:
                    continue

                doc_collection = {}
                for d in listwrap(nested_value):
                    nested_table.flatten(d,
                                         Data(),
                                         doc_collection,
                                         path=nested_column_name)

                prefix = SQL_INSERT + quote_column(nested_table.name) + sql_iso(
                    sql_list([self_primary_key] + [quote_column(extra_key)] + [
                        quote_column(c.es_column)
                        for c in doc_collection.get(".", Null).active_columns
                    ]))

                # BUILD THE PARENT TABLES
                parent = (SQL_SELECT + self_primary_key + SQL_FROM +
                          quote_column(abs_schema.fact) + SQL_WHERE +
                          jx_expression(command.where).to_sql(schema))

                # BUILD THE RECORDS
                children = SQL_UNION_ALL.join(
                    SQL_SELECT +
                    sql_alias(quote_value(i), extra_key.es_column) +
                    SQL_COMMA + sql_list(
                        sql_alias(quote_value(row[c.name]),
                                  quote_column(c.es_column))
                        for c in doc_collection.get(".", Null).active_columns)
                    for i, row in enumerate(
                        doc_collection.get(".", Null).rows))

                sql_command = (prefix + SQL_SELECT + sql_list([
                    quote_column("p", c.es_column) for u in self.uid
                    for c in self.columns[u]
                ] + [quote_column("c", extra_key)] + [
                    quote_column("c", c.es_column)
                    for c in doc_collection.get(".", Null).active_columns
                ]) + SQL_FROM + sql_iso(parent) + " p" + SQL_INNER_JOIN +
                               sql_iso(children) + " c" + SQL_ON + SQL_TRUE)

                self.db.execute(sql_command)

                # THE CHILD COLUMNS COULD HAVE EXPANDED
                # ADD COLUMNS TO SELF
                for n, cs in nested_table.columns.items():
                    for c in cs:
                        column = Column(name=c.name,
                                        jx_type=c.jx_type,
                                        es_type=c.es_type,
                                        es_index=c.es_index,
                                        es_column=c.es_column,
                                        nested_path=[nested_column_name] +
                                        c.nested_path,
                                        last_updated=Date.now())
                        if c.name not in self.columns:
                            self.columns[column.name] = {column}
                        elif c.jx_type not in [
                                c.jx_type for c in self.columns[c.name]
                        ]:
                            self.columns[column.name].add(column)

        command = (
            SQL_UPDATE + quote_column(abs_schema.fact) + SQL_SET + sql_list([
                quote_column(c) + SQL_EQ +
                quote_value(get_if_type(v, c.jx_type))
                for k, v in command.set.items() if get_jx_type(v) != "nested"
                for c in self.columns[k]
                if c.jx_type != "nested" and len(c.nested_path) == 1
            ] + [
                quote_column(c) + SQL_EQ + SQL_NULL
                for k in listwrap(command['clear']) if k in self.columns
                for c in self.columns[k]
                if c.jx_type != "nested" and len(c.nested_path) == 1
            ]) + SQL_WHERE + where_sql)

        self.db.execute(command)
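
A usage sketch of the command format update() expects, per its docstring (the table instance and field names are invented):

table.update({
    "set": {"status": "done"},         # SIMPLE VALUES BECOME SQL UPDATE ... SET
    "clear": ["error"],                # CLEARED COLUMNS ARE SET TO NULL
    "where": {"eq": {"task_id": 42}},
})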