def append_query(self, es_query, start):
    """
    Wrap `es_query` in a terms aggregation limited to the partitions of this decoder's domain.

    :param es_query: aggregation built so far; used as the defaults for every aggregation made here
    :param start: recorded on `self.start`
    :return: wrapped query holding a `_match` filter aggregation (value exists and is one of
             the partition keys) and a `_missing` aggregation, which is None unless the
             edge allows nulls
    """
    self.start = start
    domain = self.domain
    key_name = domain.key

    # PAIR EACH PARTITION KEY WITH ITS TEXT FORM; NUMBERS ARE CONVERTED TO float FIRST
    pairs = []
    for partition in domain.partitions:
        key = partition[key_name]
        if isinstance(key, (int, float)):
            pairs.append((float(key), text_type(float(key))))
        else:
            pairs.append((key, key))
    include, text_include = transpose(*pairs)

    value = self.edge.value
    has_value = value.exists()
    in_domain = InOp("in", [value, Literal("literal", include)])
    exists = AndOp("and", [has_value, in_domain]).partial_eval()

    limit = coalesce(self.limit, len(domain.partitions))

    if isinstance(value, Variable):
        # ALREADY CHECKED THERE IS ONLY ONE
        es_field = first(self.query.frum.schema.leaves(value.var)).es_column
        terms_body = {
            "field": es_field,
            "size": limit,
            "order": {"_term": self.sorted} if self.sorted else None
        }
    else:
        terms_body = {
            "script": value.to_es14_script(self.schema).script(self.schema),
            "size": limit
        }
    terms = set_default({"terms": terms_body}, es_query)

    missing = None
    if self.edge.allowNulls:
        not_exists = NotOp("not", exists).to_es14_filter(self.schema)
        missing = set_default({"filter": not_exists}, es_query)

    matched = {
        "filter": exists.to_es14_filter(self.schema),
        "aggs": {"_filter": terms}
    }
    return wrap({"aggs": {"_match": matched, "_missing": missing}})
def to_es14_filter(self, schema):
    """
    Translate this suffix test into an ES filter.

    :param schema: used to resolve the variable name to its ES column
    :return: `match_all` when there is no suffix, a `regexp` filter when testing a
             variable against a literal suffix, otherwise a script-based filter
    """
    if not self.suffix:
        return {"match_all": {}}

    if isinstance(self.expr, Variable) and isinstance(self.suffix, Literal):
        es_column = first(schema.leaves(self.expr.var)).es_column
        # ANYTHING, FOLLOWED BY THE (ESCAPED) SUFFIX
        pattern = ".*" + string2regexp(self.suffix.value)
        return {"regexp": {es_column: pattern}}

    script = self.to_es14_script(schema).script(schema)
    return ScriptOp("script", script).to_es14_filter(schema)
def to_es14_filter(self, schema):
    """
    Translate this prefix test into an ES filter.

    :param schema: used to resolve the variable name to its ES column
    :return: `match_all` when `self.expr` is falsy, a `prefix` filter when testing a
             variable against a literal prefix, otherwise a script-based filter
    """
    if not self.expr:
        return {"match_all": {}}

    if isinstance(self.expr, Variable) and isinstance(self.prefix, Literal):
        es_column = first(schema.leaves(self.expr.var)).es_column
        return {"prefix": {es_column: self.prefix.value}}

    script = self.to_es14_script(schema).script(schema)
    return ScriptOp("script", script).to_es14_filter(schema)
def to_es14_filter(self, schema):
    """
    Translate this membership test into an ES filter.

    :param schema: used to resolve the variable name to its ES column
    :return: a `terms` filter when the tested value is a variable, otherwise a
             script-based filter
    """
    if not isinstance(self.value, Variable):
        script = self.to_es14_script(schema).script(schema)
        return ScriptOp("script", script).to_es14_filter(schema)

    field = self.value.var
    leaves = schema.leaves(field)
    if leaves:
        # PREFER THE ES COLUMN NAME; FALL BACK TO THE VARIABLE NAME WHEN THE SCHEMA HAS NONE
        field = first(leaves).es_column
    return {"terms": {field: self.superset.value}}
def to_es14_filter(self, schema):
    """
    Translate this negation into an ES filter.

    :param schema: used to resolve the variable name to its ES column
    :return: an `exists` filter when negating "variable is missing", otherwise the
             negation (via es_not) of the operand's own filter
    """
    term = self.term
    if not (isinstance(term, MissingOp) and isinstance(term.expr, Variable)):
        return es_not(term.to_es14_filter(schema))

    # not(missing(variable)) IS SIMPLY exists
    field = term.expr.var
    leaves = schema.leaves(field)
    if leaves:
        field = first(leaves).es_column
    return {"exists": {"field": field}}
def to_es14_filter(self, schema):
    """
    Translate this regular-expression test into an ES `regexp` filter.

    :param schema: used to resolve the variable name to its ES column(s)
    :return: MATCH_NONE when the variable has no columns, a `regexp` filter when it
             has exactly one; every other case is reported through Log.error
    """
    if not (isinstance(self.pattern, Literal) and isinstance(self.var, Variable)):
        Log.error("regex only accepts a variable and literal pattern")
        return None

    cols = schema.leaves(self.var.var)
    num_cols = len(cols)
    if num_cols == 0:
        return MATCH_NONE
    if num_cols == 1:
        return {"regexp": {first(cols).es_column: self.pattern.value}}
    Log.error("regex on not supported ")
def to_es14_filter(self, schema):
    """
    Translate this "is missing" test into an ES filter.

    :param schema: used to resolve the variable name to its ES column(s)
    :return: for a variable: `match_all` when the schema has no column for it, a single
             es_missing filter for one column, or the es_and of es_missing over every
             column; for any other expression a script-based filter
    """
    if not isinstance(self.expr, Variable):
        script = self.to_es14_script(schema).script(schema)
        return ScriptOp("script", script).to_es14_filter(schema)

    cols = schema.leaves(self.expr.var)
    if not cols:
        return {"match_all": {}}
    if len(cols) == 1:
        return es_missing(first(cols).es_column)
    # MISSING ONLY WHEN EVERY COLUMN IS MISSING
    return es_and([es_missing(col.es_column) for col in cols])
def to_es14_filter(self, schema):
    """
    Translate this inequality into an ES filter.

    :param schema: used to resolve the variable name to its ES column
    :return: a `range` filter keyed by `self.op` when comparing a variable to a literal,
             otherwise a `script` filter; problems are reported through Log.error
    """
    simple = isinstance(self.lhs, Variable) and isinstance(self.rhs, Literal)
    if not simple:
        script = self.to_es14_script(schema)
        if script.miss is not FALSE:
            Log.error("inequality must be decisive")
        return {"script": es_script(script.expr)}

    cols = schema.leaves(self.lhs.var)
    if not cols:
        # HAPPENS DURING DEBUGGING, AND MAYBE IN REAL LIFE TOO
        lhs = self.lhs.var
    elif len(cols) == 1:
        lhs = first(cols).es_column
    else:
        Log.error("operator {{op|quote}} does not work on objects", op=self.op)
    bounds = {self.op: self.rhs.value}
    return {"range": {lhs: bounds}}
def to_es14_filter(self, schema):
    """
    Translate this equality into an ES filter.

    :param schema: used to resolve the variable name to its ES column
    :return: a `term` filter (scalar literal, or list with exactly one item) or a `terms`
             filter (any other list) when comparing a variable to a literal; otherwise
             the filter produced from this expression's script
    """
    if not (isinstance(self.lhs, Variable) and isinstance(self.rhs, Literal)):
        return self.to_es14_script(schema).to_es14_filter(schema)

    field = self.lhs.var
    leaves = schema.leaves(field)
    if leaves:
        field = first(leaves).es_column

    expected = self.rhs.value
    if not isinstance(expected, list):
        return {"term": {field: expected}}
    if len(expected) == 1:
        return {"term": {field: expected[0]}}
    return {"terms": {field: expected}}
def _range_composer(edge, domain, es_query, to_float, schema):
    """
    Build a `range` aggregation with one bucket per partition of `domain`.

    :param edge: the edge being decoded; `edge.value` is the expression being bucketed
    :param domain: supplies `min`, `max` and the `partitions` (each with `min`/`max`)
    :param es_query: aggregation built so far; used as defaults for the new aggregations
    :param to_float: converts domain/partition boundaries to the numbers sent to ES
    :param schema: used to resolve variables and to render scripts/filters
    :return: wrapped query with a `_match` range aggregation and a `_missing` filter
             aggregation (None unless `edge.allowNulls`)
    """
    # USE RANGES
    lower = coalesce(domain.min, MIN(domain.partitions.min))
    upper = coalesce(domain.max, MAX(domain.partitions.max))

    missing_filter = None
    if edge.allowNulls:
        # "MISSING" IS EVERYTHING THAT IS NOT AN EXISTING VALUE INSIDE [lower, upper)
        in_range = AndOp("and", [
            edge.value.exists(),
            InequalityOp("gte", [edge.value, Literal(None, to_float(lower))]),
            InequalityOp("lt", [edge.value, Literal(None, to_float(upper))])
        ]).partial_eval()
        out_of_range = NotOp("not", in_range).to_es14_filter(schema)
        missing_filter = set_default({"filter": out_of_range}, es_query)

    if isinstance(edge.value, Variable):
        calc = {"field": first(schema.leaves(edge.value.var)).es_column}
    else:
        calc = {"script": edge.value.to_es14_script(schema).script(schema)}

    ranges = [
        {"from": to_float(part.min), "to": to_float(part.max)}
        for part in domain.partitions
    ]
    match = set_default({"range": calc}, {"range": {"ranges": ranges}}, es_query)

    return wrap({"aggs": {"_match": match, "_missing": missing_filter}})
    def append_query(self, es_query, start):
        """
        Wrap `es_query` in a terms aggregation over a script built from LIST_TO_PIPE.

        :param es_query: aggregation built so far; used as defaults for the new aggregation
        :param start: recorded on `self.start`
        :return: wrapped query with a single `_match` terms aggregation
        """
        self.start = start

        es_field = first(self.query.frum.schema.leaves(self.var)).es_column
        # THE TEMPLATE IS GIVEN THE EXPRESSION FOR ALL VALUES OF THE FIELD
        values_expr = 'doc[' + quote(es_field) + '].values'
        script = expand_template(LIST_TO_PIPE, {"expr": values_expr})
        match = set_default({"terms": {"script": script}}, es_query)

        return wrap({"aggs": {"_match": match}})
def to_es14_script(self, schema, not_null=False, boolean=False, many=True):
    """
    Translate this "is missing" test into an ES script.

    :param schema: used to resolve the variable name to its ES column(s)
    :param not_null: not used by this implementation
    :param boolean: not used by this implementation
    :param many: not used by this implementation
    :return: for a variable, a BOOLEAN EsScript (constant "false" for `_id`, an
             `isEmpty()` test for one column) or the script of the AND of the
             `isEmpty()` tests over all columns; for other expressions, the script of
             the expression's own `missing()`
    """
    if not isinstance(self.expr, Variable):
        missing = self.expr.missing()
        if not isinstance(self.expr, Literal):
            # ONLY NON-LITERALS ARE SIMPLIFIED BEFORE TRANSLATION
            missing = missing.partial_eval()
        return missing.to_es14_script(schema)

    if self.expr.var == "_id":
        return EsScript(type=BOOLEAN, expr="false", frum=self)

    def is_empty(column):
        return EsScript(
            type=BOOLEAN,
            expr="doc[" + quote(column.es_column) + "].isEmpty()",
            frum=self
        )

    columns = schema.leaves(self.expr.var)
    if len(columns) == 1:
        return is_empty(first(columns))
    return AndOp("and", [is_empty(c) for c in columns]).partial_eval().to_es14_script(schema)
def to_es14_filter(self, schema):
    """
    Translate this equality into an ES filter.

    :param schema: used to resolve the variable name to its ES column
    :return: a `term`/`terms` filter when comparing a variable to a literal; otherwise
             the filter of a case expression: when lhs is missing the result is
             "rhs is missing", when rhs is missing the result is FALSE, else plain equality
    """
    if not (isinstance(self.lhs, Variable) and isinstance(self.rhs, Literal)):
        null_aware = CaseOp("case", [
            WhenOp("when", self.lhs.missing(), **{"then": self.rhs.missing()}),
            WhenOp("when", self.rhs.missing(), **{"then": FALSE}),
            BasicEqOp("eq", [self.lhs, self.rhs])
        ])
        return null_aware.partial_eval().to_es14_filter(schema)

    expected = self.rhs.value
    field = self.lhs.var
    leaves = schema.leaves(field)
    if leaves:
        field = first(leaves).es_column

    if not isinstance(expected, list):
        return {"term": {field: expected}}
    if len(expected) == 1:
        return {"term": {field: expected[0]}}
    return {"terms": {field: expected}}
    def append_query(self, es_query, start):
        """
        Nest `es_query` inside one terms aggregation per field of this decoder.

        Each pass wraps the query built so far, so the last field in `self.fields` ends
        up outermost. When the domain has a `where`, the result is wrapped once more in
        a `_filter` aggregation.

        :param es_query: aggregation built so far
        :param start: recorded on `self.start`
        :return: the nested aggregation query
        """
        # TODO: USE "reverse_nested" QUERY TO PULL THESE
        self.start = start
        for field in self.fields:
            exists = field.exists().partial_eval()
            exists_filter = exists.to_es14_filter(self.schema)
            terms = set_default(
                {
                    "terms": {
                        "field": first(self.schema.leaves(field.var)).es_column,
                        "size": self.domain.limit
                    }
                },
                es_query
            )
            nest = wrap({
                "aggs": {
                    "_match": {
                        "filter": exists_filter,
                        "aggs": {"_filter": terms}
                    }
                }
            })
            # DOCUMENTS WITHOUT THIS FIELD GET THE SAME INNER QUERY
            not_exists = NotOp("not", exists).to_es14_filter(self.schema)
            nest.aggs._missing = set_default({"filter": not_exists}, es_query)
            es_query = nest

        if self.domain.where:
            filter_ = self.domain.where.partial_eval().to_es14_filter(self.schema)
            filtered = set_default({"filter": filter_}, es_query)
            es_query = {"aggs": {"_filter": filtered}}

        return es_query
def to_es14_filter(self, schema):
    """
    Translate this "not equal" test into an ES filter.

    Every branch is the negation (es_not) of an equality-style test.

    :param schema: used to resolve the variable name to its ES column and to render scripts
    :return: for variable-vs-literal: `match_all` when the schema has no column for the
             variable, the negated `term` filter for one column (more than one column is
             reported through Log.error); otherwise the negation of a script that tests
             equality/containment, chosen by whether each side is multi-valued (`many`)
    """
    if isinstance(self.lhs, Variable) and isinstance(self.rhs, Literal):
        columns = schema.values(self.lhs.var)
        if len(columns) == 0:
            return {"match_all": {}}
        elif len(columns) == 1:
            return es_not({"term": {first(columns).es_column: self.rhs.value}})
        else:
            Log.error("column split to multiple, not handled")
    else:
        lhs = self.lhs.partial_eval().to_es14_script(schema)
        rhs = self.rhs.partial_eval().to_es14_script(schema)

        if lhs.many and rhs.many:
            # SAME SIZE, AND ONE CONTAINS THE OTHER
            is_equal = (
                "(" + lhs.expr + ").size()==(" + rhs.expr + ").size() && " +
                "(" + rhs.expr + ").containsAll(" + lhs.expr + ")"
            )
        elif lhs.many:
            is_equal = "(" + lhs.expr + ").contains(" + rhs.expr + ")"
        elif rhs.many:
            is_equal = "(" + rhs.expr + ").contains(" + lhs.expr + ")"
        else:
            # FIX: THIS USED "!=", WHICH es_not TURNED BACK INTO AN EQUALITY TEST
            is_equal = "(" + lhs.expr + ") == (" + rhs.expr + ")"

        return es_not(ScriptOp("script", is_equal).to_es14_filter(schema))
# Example #16
# 0
def es_aggsop(es, frum, query):
    """
    Build an ES aggregation request for `query`, send it, and format the response.

    The select clauses are split three ways: a count of "." (pulled straight from
    doc_count), aggregates over a plain variable (grouped by canonical name), and
    aggregates over any other expression (translated to scripts). The decoders for
    the query then wrap the aggregations, and the where clause is woven in as
    `_filter` aggregations.

    :param es: passed through to es_post to execute the request
    :param frum: container being queried; `frum.schema` and `frum.name` are read
    :param query: the query; a copy is made and its select clauses get a `pull`
                  function assigned
    :return: the formatted output, with timing, content type and the ES query
             recorded under `output.meta`
    Problems (unsupported aggregates, a too-deep where clause, unsupported format,
    formatting failures) are reported through Log.error.
    """
    query = query.copy()  # WE WILL MARK UP THIS QUERY
    schema = frum.schema
    select = listwrap(query.select)

    es_query = Data()
    new_select = Data()  # MAP FROM canonical_name (USED FOR NAMES IN QUERY) TO SELECT MAPPING
    formula = []
    for s in select:
        if s.aggregate == "count" and isinstance(s.value, Variable) and s.value.var == ".":
            if schema.query_path == ".":
                s.pull = jx_expression_to_function("doc_count")
            else:
                s.pull = jx_expression_to_function({"coalesce": ["_nested.doc_count", "doc_count", 0]})
        elif isinstance(s.value, Variable):
            if s.aggregate == "count":
                new_select["count_"+literal_field(s.value.var)] += [s]
            else:
                new_select[literal_field(s.value.var)] += [s]
        elif s.aggregate:
            formula.append(s)

    # AGGREGATES OVER PLAIN VARIABLES
    for canonical_name, many in new_select.items():
        for s in many:
            columns = frum.schema.values(s.value.var)

            if s.aggregate == "count":
                canonical_names = []
                for column in columns:
                    cn = literal_field(column.es_column + "_count")
                    if column.jx_type == EXISTS:
                        canonical_names.append(cn + ".doc_count")
                        es_query.aggs[cn].filter.range = {column.es_column: {"gt": 0}}
                    else:
                        canonical_names.append(cn+ ".value")
                        # FIX: ONLY HERE; AN AGGREGATION CAN NOT BE BOTH A filter AND A value_count
                        es_query.aggs[cn].value_count.field = column.es_column
                if len(canonical_names) == 1:
                    s.pull = jx_expression_to_function(canonical_names[0])
                else:
                    s.pull = jx_expression_to_function({"add": canonical_names})
            elif s.aggregate == "median":
                if len(columns) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")
                # ES USES DIFFERENT METHOD FOR PERCENTILES
                key = literal_field(canonical_name + " percentile")

                es_query.aggs[key].percentiles.field = first(columns).es_column
                es_query.aggs[key].percentiles.percents += [50]
                s.pull = jx_expression_to_function(key + ".values.50\\.0")
            elif s.aggregate == "percentile":
                if len(columns) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")
                # ES USES DIFFERENT METHOD FOR PERCENTILES
                key = literal_field(canonical_name + " percentile")
                # FIX: WAS s.percetile, SO THE LOWER BOUND WAS NEVER CHECKED
                if isinstance(s.percentile, text_type) or s.percentile < 0 or 1 < s.percentile:
                    Log.error("Expecting percentile to be a float from 0.0 to 1.0")
                percent = Math.round(s.percentile * 100, decimal=6)

                es_query.aggs[key].percentiles.field = first(columns).es_column
                es_query.aggs[key].percentiles.percents += [percent]
                es_query.aggs[key].percentiles.compression = 2
                s.pull = jx_expression_to_function(key + ".values." + literal_field(text_type(percent)))
            elif s.aggregate == "cardinality":
                canonical_names = []
                for column in columns:
                    cn = literal_field(column.es_column + "_cardinality")
                    canonical_names.append(cn)
                    es_query.aggs[cn].cardinality.field = column.es_column
                if len(columns) == 1:
                    s.pull = jx_expression_to_function(canonical_names[0] + ".value")
                else:
                    s.pull = jx_expression_to_function({"add": [cn + ".value" for cn in canonical_names], "default": 0})
            elif s.aggregate == "stats":
                if len(columns) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")
                # REGULAR STATS
                stats_name = literal_field(canonical_name)
                es_query.aggs[stats_name].extended_stats.field = first(columns).es_column

                # GET MEDIAN TOO!
                median_name = literal_field(canonical_name + "_percentile")
                es_query.aggs[median_name].percentiles.field = first(columns).es_column
                es_query.aggs[median_name].percentiles.percents += [50]

                s.pull = get_pull_stats(stats_name, median_name)
            elif s.aggregate == "union":
                pulls = []
                for column in columns:
                    stats_name = encode_property(column.es_column)

                    if column.nested_path[0] == ".":
                        es_query.aggs[stats_name] = {"terms": {
                            "field": column.es_column,
                            "size": Math.min(s.limit, MAX_LIMIT)
                        }}
                        pulls.append(get_bucket_keys(stats_name))

                    else:
                        es_query.aggs[stats_name] = {
                            "nested": {"path": column.nested_path[0]},
                            "aggs": {"_nested": {"terms": {
                                "field": column.es_column,
                                "size": Math.min(s.limit, MAX_LIMIT)
                            }}}
                        }
                        pulls.append(get_bucket_keys(stats_name+"._nested"))
                if len(pulls) == 0:
                    s.pull = NULL
                elif len(pulls) == 1:
                    s.pull = pulls[0]
                else:
                    s.pull = lambda row: UNION(
                        p(row)
                        for p in pulls
                    )
            else:
                if len(columns) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")

                # PULL VALUE OUT OF THE stats AGGREGATE
                es_query.aggs[literal_field(canonical_name)].extended_stats.field = first(columns).es_column
                s.pull = jx_expression_to_function({"coalesce": [literal_field(canonical_name) + "." + aggregates[s.aggregate], s.default]})

    # AGGREGATES OVER EXPRESSIONS (TRANSLATED TO SCRIPTS)
    for i, s in enumerate(formula):
        canonical_name = literal_field(s.name)

        if isinstance(s.value, TupleOp):
            if s.aggregate == "count":
                # TUPLES ALWAYS EXIST, SO COUNTING THEM IS EASY
                s.pull = "doc_count"
            else:
                Log.error("{{agg}} is not a supported aggregate over a tuple", agg=s.aggregate)
        elif s.aggregate == "count":
            es_query.aggs[literal_field(canonical_name)].value_count.script = s.value.partial_eval().to_es14_script(schema).script(schema)
            s.pull = jx_expression_to_function(literal_field(canonical_name) + ".value")
        elif s.aggregate == "median":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")

            es_query.aggs[key].percentiles.script = s.value.to_es14_script(schema).script(schema)
            es_query.aggs[key].percentiles.percents += [50]
            s.pull = jx_expression_to_function(key + ".values.50\\.0")
        elif s.aggregate == "percentile":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")
            percent = Math.round(s.percentile * 100, decimal=6)

            es_query.aggs[key].percentiles.script = s.value.to_es14_script(schema).script(schema)
            es_query.aggs[key].percentiles.percents += [percent]
            s.pull = jx_expression_to_function(key + ".values." + literal_field(text_type(percent)))
        elif s.aggregate == "cardinality":
            # ES USES DIFFERENT METHOD FOR CARDINALITY
            key = canonical_name + " cardinality"

            es_query.aggs[key].cardinality.script = s.value.to_es14_script(schema).script(schema)
            s.pull = jx_expression_to_function(key + ".value")
        elif s.aggregate == "stats":
            # REGULAR STATS
            stats_name = literal_field(canonical_name)
            es_query.aggs[stats_name].extended_stats.script = s.value.to_es14_script(schema).script(schema)

            # GET MEDIAN TOO!
            median_name = literal_field(canonical_name + " percentile")
            es_query.aggs[median_name].percentiles.script = s.value.to_es14_script(schema).script(schema)
            es_query.aggs[median_name].percentiles.percents += [50]

            s.pull = get_pull_stats(stats_name, median_name)
        elif s.aggregate=="union":
            # USE TERMS AGGREGATE TO SIMULATE union
            stats_name = literal_field(canonical_name)
            es_query.aggs[stats_name].terms.script_field = s.value.to_es14_script(schema).script(schema)
            s.pull = jx_expression_to_function(stats_name + ".buckets.key")
        else:
            # PULL VALUE OUT OF THE stats AGGREGATE
            s.pull = jx_expression_to_function(canonical_name + "." + aggregates[s.aggregate])
            es_query.aggs[canonical_name].extended_stats.script = s.value.to_es14_script(schema).script(schema)

    decoders = get_decoders_by_depth(query)
    start = 0

    #<TERRIBLE SECTION> THIS IS WHERE WE WEAVE THE where CLAUSE WITH nested
    split_where = split_expression_by_depth(query.where, schema=frum.schema)

    if len(split_field(frum.name)) > 1:
        if any(split_where[2::]):
            Log.error("Where clause is too deep")

        for d in decoders[1]:
            es_query = d.append_query(es_query, start)
            start += d.num_columns

        if split_where[1]:
            #TODO: INCLUDE FILTERS ON EDGES
            filter_ = AndOp("and", split_where[1]).to_es14_filter(schema)
            es_query = Data(
                aggs={"_filter": set_default({"filter": filter_}, es_query)}
            )

        es_query = wrap({
            "aggs": {"_nested": set_default(
                {"nested": {"path": schema.query_path[0]}},
                es_query
            )}
        })
    else:
        if any(split_where[1::]):
            Log.error("Where clause is too deep")

    if decoders:
        for d in jx.reverse(decoders[0]):
            es_query = d.append_query(es_query, start)
            start += d.num_columns

    if split_where[0]:
        #TODO: INCLUDE FILTERS ON EDGES
        # NAMED filter_ SO THE BUILTIN filter IS NOT SHADOWED
        filter_ = AndOp("and", split_where[0]).to_es14_filter(schema)
        es_query = Data(
            aggs={"_filter": set_default({"filter": filter_}, es_query)}
        )
    # </TERRIBLE SECTION>

    if not es_query:
        es_query = wrap({"query": {"match_all": {}}})

    es_query.size = 0

    with Timer("ES query time", silent=True) as es_duration:
        result = es_post(es, es_query, query.limit)

    try:
        format_time = Timer("formatting", silent=True)
        with format_time:
            decoders = [d for ds in decoders for d in ds]
            result.aggregations.doc_count = coalesce(result.aggregations.doc_count, result.hits.total)  # IT APPEARS THE OLD doc_count IS GONE

            formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[query.format]
            if query.edges:
                output = formatter(decoders, result.aggregations, start, query, select)
            elif query.groupby:
                output = groupby_formatter(decoders, result.aggregations, start, query, select)
            else:
                output = aggop_formatter(decoders, result.aggregations, start, query, select)

        output.meta.timing.formatting = format_time.duration
        output.meta.timing.es_search = es_duration.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        if query.format not in format_dispatch:
            Log.error("Format {{format|quote}} not supported yet", format=query.format, cause=e)
        Log.error("Some problem", cause=e)
    def append_query(self, es_query, start):
        """
        Wrap `es_query` in a terms aggregation over this decoder's edge value.

        :param es_query: aggregation built so far; used as defaults for the new aggregations
        :param start: recorded on `self.start`
        :return: wrapped query; a field-based `_match` plus `_missing` when the edge value
                 is a variable, otherwise a script-based `_match` (bare when `self.exists`
                 is TRUE, else placed under an exists filter together with `_missing`)
        """
        self.start = start

        if isinstance(self.edge.value, Variable):
            es_field = first(self.schema.leaves(self.edge.value.var)).es_column
            match = set_default(
                {
                    "terms": {
                        "field": es_field,
                        "size": self.domain.limit,
                        "order": self.es_order
                    }
                },
                es_query
            )
            missing = set_default(
                {"filter": self.missing.to_es14_filter(self.schema)},
                es_query
            )
            return wrap({"aggs": {"_match": match, "_missing": missing}})

        def script_terms():
            return {
                "terms": {
                    "script": self.script.expr,
                    "size": self.domain.limit,
                    "order": self.es_order
                }
            }

        if self.exists is TRUE:
            # IF True THEN WE DO NOT NEED THE _filter OR THE _missing (THIS RARELY HAPPENS THOUGH)
            return wrap({"aggs": {"_match": set_default(script_terms(), es_query)}})

        # _match AND _filter REVERSED SO _match LINES UP WITH _missing
        exists_filter = self.exists.to_es14_filter(self.schema)
        inner = set_default(script_terms(), es_query)
        missing = set_default(
            {"filter": self.missing.to_es14_filter(self.schema)},
            es_query
        )
        return wrap({
            "aggs": {
                "_match": {
                    "filter": exists_filter,
                    "aggs": {"_filter": inner}
                },
                "_missing": missing
            }
        })