コード例 #1
0
ファイル: query.py プロジェクト: mozilla/cia-tasks
def _normalize_select_no_context(select, schema=None):
    """
    NORMALIZE ONE SELECT CLAUSE WHEN THERE IS NO SOURCE OF COLUMNS

    :param select: TEXT (USED AS THE value) OR A SELECT STRUCTURE
    :param schema: HANDED TO jx_expression() WHEN PARSING THE value
    :return: A COPY OF select WITH name, value, aggregate AND default FILLED
             IN; Null WHEN select HAS NO value, NO name AND NO PROPERTIES
    """
    if not _Column:
        _late_import()

    select = Data(value=select) if is_text(select) else wrap(select)
    output = select.copy()
    value = select.value

    if not value:
        # NO value: THE name (OR aggregate) IS ALL WE HAVE, SO SELECT "."
        output.name = coalesce(select.name, select.aggregate)
        if not output.name:
            if not len(select):
                return Null
            Log.error(BAD_SELECT, select=select)
        else:
            output.value = jx_expression(".", schema=schema)
    elif is_text(value):
        if value.endswith(".*"):
            # "a.b.*" MEANS ALL THE LEAVES OF a.b
            name = value[:-2].lstrip(".")
            output.name = coalesce(select.name, name)
            output.value = LeavesOp(
                Variable(name),
                prefix=coalesce(select.prefix, name),
            )
        elif value == ".":
            output.name = coalesce(select.name, select.aggregate, ".")
            output.value = jx_expression(value, schema=schema)
        elif value == "*":
            output.name = coalesce(select.name, select.aggregate, ".")
            output.value = LeavesOp(Variable("."))
        else:
            output.name = coalesce(
                select.name, value.lstrip("."), select.aggregate
            )
            output.value = jx_expression(value, schema=schema)
    else:
        # A NUMBER WITHOUT A name IS NAMED AFTER ITS OWN TEXT
        if is_number(output.value) and not output.name:
            output.name = text(output.value)
        output.value = jx_expression(value, schema=schema)

    if not output.name:
        Log.error("expecting select to have a name: {{select}}", select=select)
    if output.name.endswith(".*"):
        Log.error("{{name|quote}} is invalid select", name=output.name)

    canonical = canonical_aggregates[select.aggregate]
    output.aggregate = coalesce(canonical.name, select.aggregate, "none")
    output.default = coalesce(
        select.default, canonical_aggregates[output.aggregate].default
    )
    return output
コード例 #2
0
def _normalize_select_no_context(select, schema=None):
    """
    NORMALIZE ONE SELECT CLAUSE WHEN THERE IS NO SOURCE OF COLUMNS

    :param select: TEXT (USED AS THE value) OR A SELECT STRUCTURE
    :param schema: ACCEPTED, BUT NOT READ BY THIS FUNCTION
    :return: A COPY OF select WITH name, value, aggregate AND default FILLED
             IN; THE UNCHANGED COPY WHEN THERE IS NEITHER value NOR name
    """
    if not _Column:
        _late_import()

    if isinstance(select, basestring):
        select = Data(value=select)
    else:
        select = wrap(select)
    output = select.copy()
    value = select.value

    if not value:
        # NO value: THE name (OR aggregate) IS ALL WE HAVE, SO SELECT "."
        output.name = coalesce(select.name, select.aggregate)
        if not output.name:
            return output
        output.value = jx_expression(".")
    elif isinstance(value, basestring):
        if value.endswith(".*"):
            # "a.b.*" MEANS ALL THE LEAVES OF a.b
            output.name = coalesce(select.name, value[:-2], select.aggregate)
            output.value = LeavesOp("leaves", Variable(value[:-2]))
        elif value == ".":
            output.name = coalesce(select.name, select.aggregate, ".")
            output.value = jx_expression(value)
        elif value == "*":
            output.name = coalesce(select.name, select.aggregate, ".")
            output.value = LeavesOp("leaves", Variable("."))
        else:
            output.name = coalesce(select.name, value, select.aggregate)
            output.value = jx_expression(value)
    else:
        # A NUMBER WITHOUT A name IS NAMED AFTER ITS OWN TEXT
        if isinstance(value, (int, float)) and not output.name:
            output.name = text_type(value)
        output.value = jx_expression(value)

    if not output.name:
        Log.error("expecting select to have a name: {{select}}", select=select)
    if output.name.endswith(".*"):
        Log.error("{{name|quote}} is invalid select", name=output.name)

    canonical = canonical_aggregates[select.aggregate]
    output.aggregate = coalesce(canonical.name, select.aggregate, "none")
    output.default = coalesce(
        select.default, canonical_aggregates[output.aggregate].default
    )
    return output
コード例 #3
0
ファイル: utils.py プロジェクト: klahnakoski/pyLibrary
def query_to_outer_joins(query, all_paths, split_select, var_to_columns):
    """
    CONVERT FROM JSON QUERY EXPRESSION TO A NUMBER OF OUTER JOINS
    :param query: THE QUERY; ONLY frum AND where ARE READ
    :param all_paths: PATHS, PAIRED (BY POSITION) WITH THE SPLIT CONDITIONS
    :param split_select: MAP FROM PATH TO SELECT; MISSING PATHS SELECT NULL
    :param var_to_columns: MAP FROM VARIABLE NAME TO COLUMNS
    :return: ConcatOp OF EVERY OUTER JOIN THAT DID NOT SIMPLIFY TO NULL
    """
    frum = query.frum
    where = query.where
    query_path = frum.schema.query_path[0]

    # MAP TO es_columns, INCLUDE NESTED EXISTENCE IN EACH VARIABLE
    expanded = split_nested_inner_variables(where, query_path, var_to_columns)

    # ATTACH SELECTS: ONE NestedOp PER PATH, ONE OUTER JOIN PER SPLIT
    joins = []
    for conditions in _split_expression(expanded, frum.schema, all_paths):
        nests = [
            NestedOp(
                Variable(path),
                select=coalesce(split_select.get(path), NULL),
                where=AndOp(terms),
            )
            for path, terms in zip(all_paths, conditions)
        ]
        joined = OuterJoinOp(frum, nests).partial_eval()
        if joined is not NULL:
            joins.append(joined)

    return ConcatOp(joins)
コード例 #4
0
def to_ruby(self, schema):
    """
    CONVERT THIS VARIABLE REFERENCE TO A RUBY SCRIPT

    :param schema: USED TO FIND THE COLUMNS BEHIND self.var
    :return: "_source" FOR ".", A SPECIAL EXPRESSION FOR "_id", OTHERWISE A
             Ruby FOR THE ONE COLUMN, A COALESCE OVER MANY COLUMNS, OR THE
             NULL SCRIPT WHEN NO COLUMNS ARE FOUND
    """
    if self.var == ".":
        return "_source"

    if self.var == "_id":
        # THE _id IS THE PART OF _uid AFTER THE '#'
        return Ruby(
            type=STRING,
            expr='doc["_uid"].value.substring(doc["_uid"].value.indexOf(\'#\')+1)',
            frum=self,
        )

    scripts = []
    for column in schema.values(self.var):
        source = Variable(column.es_column)
        quoted = quote(column.es_column)
        scripts.append(
            Ruby(
                miss=source.missing(),
                type=column.type,
                expr="doc[" + quoted + "].values",
                frum=source,
                many=True,
            )
        )

    if not scripts:
        return NULL.to_ruby(schema)
    if len(scripts) == 1:
        return scripts[0]
    return CoalesceOp("coalesce", scripts).to_ruby(schema)
コード例 #5
0
ファイル: utils.py プロジェクト: klahnakoski/pyLibrary
def split_nested_inner_variables(where, focal_path, var_to_columns):
    """
    SOME VARIABLES ARE BOTH NESTED AND INNER, EXPAND QUERY TO HANDLE BOTH

    EVERY VARIABLE IS REPLACED BY EACH OF ITS COLUMNS IN TURN, SO THE NUMBER
    OF EXPRESSIONS IS MULTIPLIED BY THE NUMBER OF COLUMNS OF EACH VARIABLE

    :param where: THE EXPRESSION TO EXPAND (ITS map() METHOD IS USED)
    :param focal_path: THE QUERY PATH; A COLUMN IS REFERENCED DIRECTLY WHEN
                       startswith_field(focal_path, <column's deepest path>),
                       OTHERWISE IT IS WRAPPED IN A NestedOp
    :param var_to_columns: MAP FROM VARIABLE NAME TO ITS COLUMNS; A VARIABLE
                           WITH NO COLUMNS IS REPLACED BY NULL
    :return: OrOp OVER ALL THE EXPANDED EXPRESSIONS
    """
    wheres = [where]

    # WE DO THIS EXPANSION TO CAPTURE A VARIABLE OVER DIFFERENT NESTED LEVELS
    # EXPAND VARS TO COLUMNS, MULTIPLY THE EXPRESSIONS
    for v, cols in var_to_columns.items():
        more_exprs = []
        if not cols:
            for e in wheres:
                more_exprs.append(e.map({v: NULL}))
        else:
            for c in cols:
                deepest = c.nested_path[0]
                # DOES NOT DEPEND ON THE EXPRESSION, SO DECIDE ONCE PER COLUMN
                is_inner = startswith_field(focal_path, deepest)
                for e in wheres:
                    if is_inner:
                        replacement = Variable(
                            c.es_column, type=c.jx_type, multi=c.multi
                        )
                    else:
                        replacement = NestedOp(
                            path=Variable(deepest),
                            select=Variable(c.es_column),
                            where=Variable(c.es_column).exists(),
                        )
                    more_exprs.append(e.map({v: replacement}))
        wheres = more_exprs
        # THE ORIGINAL REBUILT var_to_columns HERE ON EVERY PASS; THE RESULT
        # WAS NEVER READ (THE LOOP ITERATES THE ORIGINAL MAP) AND IT FAILED
        # ON A VARIABLE MAPPED TO None, SO IT HAS BEEN REMOVED

    return OrOp(wheres)
コード例 #6
0
ファイル: utils.py プロジェクト: klahnakoski/pyLibrary
def outer_to_inner(expr, paths_to_cols):
    """
    REWRITE OUTER JOINS AS A CONCATENATION OF INNER JOINS

    :param expr: NULL, A ConcatOp (EACH TERM IS CONVERTED AND THE RESULTS ARE
                 FLATTENED), OR AN OuterJoinOp
    :param paths_to_cols: MAP FROM NESTED PATH TO THE COLUMNS ON THAT PATH;
                          USED TO SET THOSE COLUMNS TO NULL WHEN TESTING IF A
                          where CAN STILL HOLD WITHOUT THE NESTED RECORDS
    :return: NULL WHEN expr IS NULL, OTHERWISE A ConcatOp OF InnerJoinOp
    """
    # JSON QUERY EXPRESSIONS ASSUME OUTER JOIN
    # ES ONLY HAS INNER JOIN
    # ACCOUNT FOR WHEN NESTED RECORDS ARE MISSING
    if expr is NULL:
        return NULL
    elif is_op(expr, ConcatOp):
        # FLATTEN: EACH TERM BECOMES A ConcatOp, COLLECT ALL THEIR TERMS
        output = []
        for outer in expr.terms:
            for inner in outer_to_inner(outer, paths_to_cols).terms:
                output.append(inner)
        return ConcatOp(output)
    elif is_op(expr, OuterJoinOp):
        # THE MAIN INNER JOIN
        output = [InnerJoinOp(expr.frum, expr.nests)]
        # ALL THE OUTER JOIN RESIDUES
        for deepest in expr.nests[:-1]:  # LAST '.' NOT NEEDED
            deepest_path = deepest.path.var
            inner_join = InnerJoinOp(expr.frum, [])
            # CONDITION CARRIED FROM THE deepest NEST TO THE NEXT SHORTER ONE
            deeper_conditions = TRUE
            # NOTE(review): the logic below appears to rely on expr.nests
            # being ordered deepest-first, so the nest matching deepest_path
            # is seen before the shorter paths - confirm against the caller
            for nest in expr.nests:
                nest_path = nest.path.var
                # PATHS ARE COMPARED BY STRING LENGTH; NESTS WITH A LONGER
                # PATH THAN deepest_path MATCH NEITHER BRANCH AND ARE DROPPED
                if len(nest_path) < len(deepest_path):
                    new_nest = NestedOp(
                        path=nest.path,
                        select=nest.select,
                        where=AndOp([deeper_conditions, nest.where]),
                        sort=nest.sort,
                        limit=nest.limit,
                    )
                    inner_join.nests.append(new_nest)
                    # THE CARRIED CONDITION IS APPLIED ONCE ONLY
                    deeper_conditions = TRUE
                elif nest_path == deepest_path:
                    # assume the deeper is null
                    set_null = {
                        d.es_column: NULL
                        for d in paths_to_cols[deepest_path]
                    }
                    set_null[deepest_path] = NULL
                    deeper_exists = nest.where.map(set_null).partial_eval()

                    if deeper_exists is FALSE:
                        # WHERE CAN NOT BE SATISFIED IF NESTED IS NULL
                        deeper_conditions = FALSE
                    else:
                        # ENSURE THIS IS NOT "OPTIMIZED" TO FALSE
                        deeper_conditions = NotOp(
                            NestedOp(path=Variable(nest_path),
                                     where=TRUE,
                                     select=NULL))
                        deeper_conditions.simplified = True

            # KEEP THE RESIDUE ONLY IF ITS missing() IS NOT TRUE AFTER
            # SIMPLIFICATION
            inner_join = inner_join.partial_eval()
            if inner_join.missing() is not TRUE:
                output.append(inner_join)
        return ConcatOp(output)
    else:
        Log.error("do not know what to do yet")
コード例 #7
0
def get_decoders_by_path(query, schema):
    """
    RETURN MAP FROM QUERY PATH TO LIST OF DECODER ARRAYS

    NOTE: query IS MODIFIED IN PLACE; ITS edges (OR groupby) ARE REPLACED BY
    sort_edges() WHEN THERE IS A sort AND THE FORMAT IS NOT "cube"

    :param query: THE QUERY; edges, groupby, sort, format AND limit ARE READ
    :param schema: USED TO LOOK UP THE COLUMNS (AND THEIR nested_path) BEHIND
                   THE VARIABLES OF EACH EDGE
    :return: dict FROM NESTED PATH TO LIST OF AggsDecoder() RESULTS
    """
    output = {}

    if query.edges:
        if query.sort and query.format != "cube":
            # REORDER EDGES/GROUPBY TO MATCH THE SORT
            query.edges = sort_edges(query, "edges")
    elif query.groupby:
        if query.sort and query.format != "cube":
            query.groupby = sort_edges(query, "groupby")

    for edge in to_data(coalesce(query.edges, query.groupby, [])):
        # LIMIT PRECEDENCE: THE EDGE'S DOMAIN, THEN THE QUERY, THEN DEFAULT
        limit = coalesce(edge.domain.limit, query.limit, DEFAULT_LIMIT)
        vars_ = coalesce(edge.value.vars(), set())

        # COLLECT ALL THE VARIABLES THE EDGE DEPENDS ON
        if edge.range:
            vars_ |= edge.range.min.vars() | edge.range.max.vars()
            for v in vars_:
                if not schema[v.var]:
                    Log.error("{{var}} does not exist in schema", var=v)
        elif edge.domain.dimension:
            vars_ |= set(Variable(v) for v in edge.domain.dimension.fields)
            # COPY BEFORE REPLACING THE fields WITH THEIR es_column NAMES
            edge.domain.dimension = edge.domain.dimension.copy()
            edge.domain.dimension.fields = [
                schema[v.var].es_column for v in vars_
            ]
        elif edge.domain.partitions.where and all(
                edge.domain.partitions.where):
            # EVERY PARTITION HAS A where; ITS VARIABLES COUNT TOO
            for p in edge.domain.partitions:
                vars_ |= p.where.vars()
        else:
            # SIMPLE edge.value
            # NOTE(review): unlike the general case below, this branch does
            # not check that depths is non-empty or has a single entry
            decoder = AggsDecoder(edge, query, limit)
            depths = set(c.nested_path[0] for v in vars_
                         for c in schema.leaves(v.var))
            output.setdefault(first(depths), []).append(decoder)
            continue

        # ALL VARIABLES OF THE EDGE MUST BE AT THE SAME NESTED DEPTH
        depths = set(c.nested_path[0] for v in vars_
                     for c in schema.leaves(v.var))
        if not depths:
            Log.error("Do not know of column {{column}}",
                      column=unwraplist(
                          [v for v in vars_ if schema[v.var] == None]))
        if len(depths) > 1:
            Log.error("expression {{expr|quote}} spans tables, can not handle",
                      expr=edge.value)

        decoder = AggsDecoder(edge, query, limit)
        output.setdefault(first(depths), []).append(decoder)
    return output
コード例 #8
0
ファイル: decoders.py プロジェクト: mars-f/ActiveData
    def append_query(self, query_path, es_query):
        """
        WRAP es_query IN THE AGGREGATIONS FOR THIS EDGE

        :param query_path: THE NESTED PATH OF THE QUERY; COMPARED TO EACH
                           PATH IN self.schema.query_path WHEN COUNTING NULLS
        :param es_query: THE AGGREGATION TO NEST UNDER EACH BUCKET
        :return: Aggs HOLDING A "_filter" (ONLY THE DOMAIN'S PARTITION VALUES)
                 OVER A "_match" TERMS AGGREGATION, PLUS "_missing0" /
                 "_missing1" FILTERS WHEN THE EDGE ALLOWS NULLS
        """
        domain = self.domain
        domain_key = domain.key
        value = Painless[self.edge.value]
        # CONVERT EACH PARTITION'S KEY USING THE FUNCTION FOR THE VALUE'S TYPE
        cnv = pull_functions[value.type]
        include = tuple(cnv(p[domain_key]) for p in domain.partitions)

        # ONLY VALUES FOUND IN THE DOMAIN'S PARTITIONS ARE MATCHED
        exists = Painless[AndOp([InOp([value,
                                       Literal(include)])])].partial_eval()

        limit = coalesce(self.limit, len(domain.partitions))

        if is_op(value, Variable):
            # PLAIN COLUMN: TERMS AGGREGATION DIRECTLY ON THE FIELD
            es_field = first(self.query.frum.schema.leaves(
                value.var)).es_column  # ALREADY CHECKED THERE IS ONLY ONE
            match = TermsAggs(
                "_match", {
                    "field": es_field,
                    "size": limit,
                    "order": {
                        "_term": self.sorted
                    } if self.sorted else None
                }, self)
        else:
            # ANY OTHER EXPRESSION: TERMS AGGREGATION OVER A SCRIPT
            match = TermsAggs(
                "_match", {
                    "script": text_type(value.to_es_script(self.schema)),
                    "size": limit
                }, self)
        output = Aggs().add(
            FilterAggs("_filter", exists, None).add(match.add(es_query)))

        if self.edge.allowNulls:
            # FIND NULLS AT EACH NESTED LEVEL
            for p in self.schema.query_path:
                if p == query_path:
                    # MISSING AT THE QUERY DEPTH
                    output.add(
                        NestedAggs(p).add(
                            FilterAggs("_missing0", NotOp(exists),
                                       self).add(es_query)))
                else:
                    # PARENT HAS NO CHILDREN, SO MISSING
                    # NOTE(review): the lookup uses query_path, not the loop
                    # variable p, so it is the same column for every
                    # shallower path - confirm this is intended
                    column = first(
                        self.schema.values(query_path, (OBJECT, EXISTS)))
                    output.add(
                        NestedAggs(column.nested_path[0]).add(
                            FilterAggs(
                                "_missing1",
                                NotOp(
                                    ExistsOp(
                                        Variable(
                                            column.es_column.replace(
                                                NESTED_TYPE, EXISTS_TYPE)))),
                                self).add(es_query)))
        return output
コード例 #9
0
def to_sql(self, schema, not_null=False, boolean=False):
    """
    LIST THE SQL FOR EVERY COLUMN UNDER self.term (WHICH MUST BE A Variable)

    :param schema: PROVIDES map_to_sql() AND get_column_name()
    :param not_null: NOT USED
    :param boolean: NOT USED
    :return: WRAPPED LIST OF {"name", "sql"}; THE name IS THE COLUMN NAME
             WITH AS MANY LEADING PATH STEPS DROPPED AS THE TERM HAS
    """
    if not isinstance(self.term, Variable):
        Log.error("Can only handle Variable")
    term = self.term.var
    prefix_length = len(split_field(term))

    selected = []
    for _, cols in schema.map_to_sql(term).items():
        for column in cols:
            column_name = schema.get_column_name(column)
            relative_name = join_field(split_field(column_name)[prefix_length:])
            selected.append({
                "name": relative_name,
                "sql": Variable(column_name).to_sql(schema)[0].sql,
            })
    return wrap(selected)
コード例 #10
0
ファイル: query.py プロジェクト: klahnakoski/annotations
def _normalize_group(edge, dim_index, limit, schema=None):
    """
    :param edge: Not normalized groupby
    :param dim_index: Dimensions are ordered; this is this groupby's index into that order
    :param limit: domain limit; applied only to a plain text groupby
    :param schema: for context
    :return: a list (wrapped) of normalized groupby
    """
    if not is_text(edge):
        # STRUCTURED GROUPBY: ONLY THE DEFAULT DOMAIN IS ACCEPTED
        edge = wrap(edge)
        if (edge.domain and edge.domain.type != "default") or edge.allowNulls != None:
            Log.error("groupby does not accept complicated domains")

        if not edge.name and not is_text(edge.value):
            Log.error("You must name compound edges: {{edge}}",  edge= edge)

        return wrap([{
            "name": coalesce(edge.name, edge.value),
            "value": jx_expression(edge.value, schema=schema),
            "allowNulls": True,
            "dim": dim_index,
            "domain": {"type": "default"}
        }])

    if not edge.endswith(".*"):
        # PLAIN TEXT: GROUP BY THE NAMED EXPRESSION
        return wrap([{
            "name": edge,
            "value": jx_expression(edge, schema=schema),
            "allowNulls": True,
            "dim": dim_index,
            "domain": Domain(type="default", limit=limit)
        }])

    prefix = edge[:-2]
    if not schema:
        # NO SCHEMA TO EXPAND WITH, LEAVE IT TO LeavesOp
        return wrap([{
            "name": untype_path(prefix),
            "put": {"name": literal_field(untype_path(prefix))},
            "value": LeavesOp(Variable(prefix)),
            "allowNulls": True,
            "dim": dim_index,
            "domain": {"type": "default"}
        }])

    def leaf_group(column):
        # BECAUSE THIS IS A GROUPBY, EARLY SPLIT INTO LEAVES WORKS JUST FINE
        plain_name = untype_path(column.name)
        return {
            "name": concat_field(prefix, literal_field(relative_field(plain_name, prefix))),
            "put": {"name": literal_field(plain_name)},
            "value": jx_expression(column.es_column, schema=schema),
            "allowNulls": True,
            "domain": {"type": "default"}
        }

    return wrap([leaf_group(c) for c in schema.leaves(prefix)])
コード例 #11
0
ファイル: decoders.py プロジェクト: mars-f/ActiveData
 def append_query(self, query_path, es_query):
     """
     WRAP es_query IN ONE LAYER OF AGGREGATIONS PER FIELD

     EACH LAYER HOLDS A "_match" TERMS AGGREGATION AND A "_missing" FILTER,
     BOTH CONTAINING THE PREVIOUS LAYER. ONLY THE FIRST (INNERMOST) LAYER
     IS GIVEN THIS DECODER; LATER LAYERS GET None

     :param query_path: NOT USED
     :param es_query: THE AGGREGATION TO WRAP
     :return: THE OUTERMOST Aggs (es_query ITSELF WHEN THERE ARE NO FIELDS)
     """
     owner = self
     for field in self.fields:
         matched = TermsAggs(
             "_match",
             {"field": field, "size": self.domain.limit},
             owner,
         ).add(es_query)
         layer = Aggs().add(matched)
         missing = FilterAggs(
             "_missing", MissingOp(Variable(field)), owner
         ).add(es_query)
         es_query = layer.add(missing)
         owner = None
     return es_query
コード例 #12
0
ファイル: expressions.py プロジェクト: vpathak2019/jx-sqlite
def to_sql(self, schema, not_null=False, boolean=False):
    """
    LIST THE SQL FOR EVERY COLUMN MAPPED TO self.term (MUST BE A Variable)

    :param schema: PROVIDES map_to_sql() AND get_column_name()
    :param not_null: NOT USED
    :param boolean: NOT USED
    :return: WRAPPED LIST OF {"name", "sql"}; COLUMNS UNDER THE TERM ARE
             NAMED RELATIVE TO IT, ALL OTHERS KEEP THEIR FULL COLUMN NAME
    """
    if not isinstance(self.term, Variable):
        Log.error("Can only handle Variable")
    term = self.term.var
    prefix_length = len(split_field(term))

    def describe(column):
        col = schema.get_column_name(column)
        if startswith_field(col, term):
            # DROP THE TERM'S OWN PATH FROM THE FRONT OF THE NAME
            name = join_field(split_field(col)[prefix_length:])
        else:
            name = col
        return {"name": name, "sql": Variable(col).to_sql(schema)[0].sql}

    return wrap([
        describe(c)
        for _, cols in schema.map_to_sql(term).items()
        for c in cols
    ])
コード例 #13
0
def to_sql(self, schema, not_null=False, boolean=False):
    """
    LIST THE SQL FOR THE SCHEMA'S COLUMNS UNDER self.term (MUST BE A Variable)

    :param schema: PROVIDES columns, nested_path AND get_column_name()
    :param not_null: NOT USED
    :param boolean: NOT USED
    :return: WRAPPED LIST OF {"name", "sql"}, NAMED RELATIVE TO THE TERM
    """
    if not is_op(self.term, Variable):
        Log.error("Can only handle Variable")
    term = self.term.var
    prefix_length = len(split_field(term))

    def is_selected(column):
        if not startswith_field(column.name, term):
            return False
        # PRIMITIVE COLUMN, AND THE SCHEMA'S PATH STARTS WITH THE COLUMN'S
        if column.jx_type not in (EXISTS, OBJECT, NESTED) and startswith_field(
                schema.nested_path[0], column.nested_path[0]):
            return True
        # OTHERWISE ANYTHING BUT EXISTS/OBJECT, ON EXACTLY THE SCHEMA'S PATH
        return (column.jx_type not in (EXISTS, OBJECT)
                and schema.nested_path[0] == column.nested_path[0])

    selected = []
    for column in schema.columns:
        if not is_selected(column):
            continue
        column_name = schema.get_column_name(column)
        relative_name = join_field(split_field(column_name)[prefix_length:])
        selected.append({
            "name": relative_name,
            "sql": Variable(column_name).to_sql(schema)[0].sql,
        })
    return wrap(selected)
コード例 #14
0
def es_query_proto(selects, op, wheres, schema):
    """
    RETURN AN ES QUERY
    :param selects: MAP FROM path TO ESSelect INSTANCE
    :param op: OPERATOR USED TO COMBINE THE DEEPER QUERY WITH EACH PATH'S
               CONDITION; op.zero IS THE STARTING (DEEPEST) QUERY
    :param wheres: MAP FROM path TO LIST OF WHERE CONDITIONS
    :param schema: PASSED TO to_es() FOR THE FINAL CONVERSION
    :return: es_query
    """
    es_query = op.zero
    all_paths = set(wheres.keys()) | set(selects.keys())

    # DEEPEST TO SHALLOW, EACH PATH WRAPS THE QUERY BUILT SO FAR
    for path in sorted(all_paths, reverse=True):
        condition = wheres.get(path, TRUE)
        path_select = selects.get(path, Null)
        es_query = NestedOp(
            path=Variable(path),
            query=op([es_query, condition]),
            select=path_select,
        )
    return es_query.partial_eval().to_es(schema)
コード例 #15
0
    def _normalize_select(self, select):
        """
        EXPAND A "." SELECT INTO ONE SELECT PER COLUMN NAME

        A COLUMN NAME IS INCLUDED WHEN AT LEAST ONE OF ITS COLUMNS HAS A
        TYPE NOT IN STRUCT. ANY OTHER SELECT VALUE IS NOT IMPLEMENTED

        :param select: THE SELECT CLAUSE; COPIED FOR EACH COLUMN NAME
        :return: LIST OF SELECT CLAUSES
        """
        expanded = []
        if select.value == ".":
            for cname, cs in self.columns.items():
                if all(c.type in STRUCT for c in cs):
                    # NOTHING BUT STRUCTURE UNDER THIS NAME
                    continue

                column_select = select.copy()
                column_select.name = cname
                column_select.value = Variable(cname)
                expanded.append(column_select)
        elif select.value.endswith(".*"):
            Log.error("not done")
        else:
            Log.error("not done")
        return expanded
コード例 #16
0
ファイル: expressions.py プロジェクト: yoyogias2011/TUID
def to_es_script(self, schema, many=True):
    """
    CONVERT THIS VARIABLE REFERENCE TO AN ES SCRIPT

    :param schema: USED TO FIND THE COLUMNS BEHIND self.var
    :param many: WHEN TRUE NON-BOOLEAN COLUMNS ARE READ WITH ".values",
                 OTHERWISE WITH ".value"
    :return: "_source" FOR ".", A SPECIAL EXPRESSION FOR "_id", OTHERWISE AN
             EsScript FOR THE ONE COLUMN, A COALESCE OVER MANY COLUMNS, OR
             THE NULL SCRIPT WHEN NO COLUMNS ARE FOUND
    """
    if self.var == ".":
        return "_source"

    if self.var == "_id":
        # THE _id IS THE PART OF _uid AFTER THE '#'
        return EsScript(
            type=STRING,
            expr='doc["_uid"].value.substring(doc["_uid"].value.indexOf(\'#\')+1)',
            frum=self,
        )

    accessor = "].values" if many else "].value"
    scripts = []
    for column in schema.values(self.var):
        source = Variable(column.es_column)
        quoted = quote(column.es_column)
        if column.jx_type != BOOLEAN:
            expr = "doc[" + quoted + accessor
        else:
            # BOOLEANS ARE COMPARED TO "T", WHATEVER many IS
            expr = "doc[" + quoted + "].value==\"T\""
        # many=True IS PASSED EVEN WHEN THE many ARGUMENT IS FALSE
        scripts.append(
            EsScript(
                miss=source.missing(),
                type=column.jx_type,
                expr=expr,
                frum=source,
                many=True,
            )
        )

    if not scripts:
        return NULL.to_es_script(schema)
    if len(scripts) == 1:
        return scripts[0]
    return CoalesceOp("coalesce", scripts).to_es_script(schema)
コード例 #17
0
def es_setop(es, query):
    """
    RUN A SET OPERATION (PLAIN SELECT, NO AGGREGATION) AGAINST ES

    STEPS, AS FOUND IN THE CODE BELOW:
    1. EXPAND EACH SELECT CLAUSE INTO new_select ENTRIES ("name", "value",
       "put", AND SOMETIMES "pull"), WHILE RECORDING PER-NESTED-PATH WHAT
       MUST BE REQUESTED (fields, scripts, OR THE WHOLE SOURCE)
    2. GIVE A "pull" FUNCTION TO EVERY ENTRY THAT DOES NOT HAVE ONE YET
    3. BUILD THE ES QUERY FROM THE SELECTS AND THE SPLIT where, POST IT
    4. FORMAT THE HITS WITH THE FORMATTER FOR query.format

    :param es: PASSED TO es_post() TO SEND THE QUERY
    :param query: THE QUERY; frum.schema, select, where, limit, sort AND
                  format ARE READ
    :return: THE FORMATTED OUTPUT, WITH meta.timing.es, meta.content_type
             AND meta.es_query SET
    """
    schema = query.frum.schema
    query_path = schema.query_path[0]

    # ONE ESSelect PER NESTED PATH, THE ROOT IS ALWAYS PRESENT
    split_select = {".": ESSelect('.')}

    def get_select(path):
        """RETURN THE ESSelect FOR path, MAKING IT ON FIRST USE"""
        es_select = split_select.get(path)
        if not es_select:
            es_select = split_select[path] = ESSelect(path)
        return es_select

    # WORK ON COPIES OF THE SELECT CLAUSES
    selects = wrap([unwrap(s.copy()) for s in listwrap(query.select)])
    new_select = FlatList()

    # put_index IS THE OUTPUT COLUMN POSITION RECORDED IN EACH "put"
    put_index = 0
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if is_op(select.value, LeavesOp) and is_op(select.value.term,
                                                   Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            # EVERY LEAF GETS ITS OWN OUTPUT COLUMN
            for c in leaves:
                full_name = concat_field(
                    select.name, relative_field(untype_path(c.name), term.var))
                if c.jx_type == NESTED:
                    # NESTED ARRAYS ARE PULLED FROM THE SOURCE
                    get_select('.').use_source = True
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {
                            "name": literal_field(full_name),
                            "index": put_index,
                            "child": "."
                        },
                        "pull": get_pull_source(c.es_column)
                    })
                    put_index += 1
                else:
                    get_select(c.nested_path[0]).fields.append(c.es_column)
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {
                            "name": literal_field(full_name),
                            "index": put_index,
                            "child": "."
                        }
                    })
                    put_index += 1
        elif is_op(select.value, Variable):
            s_column = select.value.var

            if s_column == ".":
                # PULL ALL SOURCE
                get_select('.').use_source = True
                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    },
                    "pull": get_pull_source(".")
                })
                # NOTE(review): this continue skips the `put_index += 1` at
                # the end of this branch, so the next select clause gets the
                # same index - confirm this is intended
                continue

            leaves = schema.leaves(s_column)  # LEAVES OF OBJECT
            # nested_selects = {}
            if leaves:
                # ALL LEAVES OF ONE SELECT SHARE put_index; THE "child" SAYS
                # WHERE EACH LEAF GOES INSIDE THAT ONE OUTPUT VALUE
                if any(c.jx_type == NESTED for c in leaves):
                    # PULL WHOLE NESTED ARRAYS
                    get_select('.').use_source = True
                    for c in leaves:
                        if len(
                                c.nested_path
                        ) == 1:  # NESTED PROPERTIES ARE IGNORED, CAPTURED BY THESE FIRST LEVEL PROPERTIES
                            pre_child = join_field(
                                decode_property(n)
                                for n in split_field(c.name))
                            new_select.append({
                                "name":
                                select.name,
                                "value":
                                Variable(c.es_column),
                                "put": {
                                    "name":
                                    select.name,
                                    "index":
                                    put_index,
                                    "child":
                                    untype_path(
                                        relative_field(pre_child, s_column))
                                },
                                "pull":
                                get_pull_source(c.es_column)
                            })
                else:
                    # PULL ONLY WHAT'S NEEDED
                    for c in leaves:
                        c_nested_path = c.nested_path[0]
                        if c_nested_path == ".":
                            if c.es_column == "_id":
                                # THE _id IS READ STRAIGHT OFF THE HIT
                                new_select.append({
                                    "name":
                                    select.name,
                                    "value":
                                    Variable(c.es_column),
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": "."
                                    },
                                    "pull":
                                    lambda row: row._id
                                })
                            elif c.jx_type == NESTED:
                                # NOTE(review): the enclosing else is only
                                # reached when no leaf is NESTED, so this
                                # branch looks unreachable - confirm
                                get_select('.').use_source = True
                                pre_child = join_field(
                                    decode_property(n)
                                    for n in split_field(c.name))
                                new_select.append({
                                    "name":
                                    select.name,
                                    "value":
                                    Variable(c.es_column),
                                    "put": {
                                        "name":
                                        select.name,
                                        "index":
                                        put_index,
                                        "child":
                                        untype_path(
                                            relative_field(
                                                pre_child, s_column))
                                    },
                                    "pull":
                                    get_pull_source(c.es_column)
                                })
                            else:
                                # ORDINARY ROOT-LEVEL COLUMN: REQUEST THE
                                # FIELD, THE pull IS ASSIGNED LATER
                                get_select(c_nested_path).fields.append(
                                    c.es_column)
                                pre_child = join_field(
                                    decode_property(n)
                                    for n in split_field(c.name))
                                new_select.append({
                                    "name":
                                    select.name,
                                    "value":
                                    Variable(c.es_column),
                                    "put": {
                                        "name":
                                        select.name,
                                        "index":
                                        put_index,
                                        "child":
                                        untype_path(
                                            relative_field(
                                                pre_child, s_column))
                                    }
                                })
                        else:
                            # COLUMN LIVES IN A NESTED DOCUMENT: REQUEST THE
                            # FIELD ON THAT PATH AND ACCUMULATE THE VALUES
                            es_select = get_select(c_nested_path)
                            es_select.fields.append(c.es_column)

                            child = relative_field(
                                untype_path(
                                    relative_field(c.name,
                                                   schema.query_path[0])),
                                s_column)
                            pull = accumulate_nested_doc(
                                c_nested_path,
                                Variable(
                                    relative_field(
                                        s_column, unnest_path(c_nested_path))))
                            new_select.append({
                                "name": select.name,
                                "value": select.value,
                                "put": {
                                    "name": select.name,
                                    "index": put_index,
                                    "child": child
                                },
                                "pull": pull
                            })
            else:
                # NO SUCH COLUMN: A PLACEHOLDER VARIABLE KEEPS THE OUTPUT
                # COLUMN IN PLACE
                new_select.append({
                    "name": select.name,
                    "value": Variable("$dummy"),
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    }
                })
            put_index += 1
        else:
            # ANY OTHER EXPRESSION BECOMES A SCRIPT ON EACH PATH IT TOUCHES
            split_scripts = split_expression_by_path(select.value,
                                                     schema,
                                                     lang=Painless)
            for p, script in split_scripts.items():
                es_select = get_select(p)
                es_select.scripts[select.name] = {
                    "script":
                    text_type(Painless[first(
                        script)].partial_eval().to_es_script(schema))
                }
                new_select.append({
                    "name":
                    select.name,
                    "pull":
                    jx_expression_to_function("fields." +
                                              literal_field(select.name)),
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    }
                })
                put_index += 1

    # ASSIGN A pull TO EVERYTHING THAT DOES NOT HAVE ONE YET
    for n in new_select:
        if n.pull:
            continue
        elif is_op(n.value, Variable):
            if get_select('.').use_source:
                n.pull = get_pull_source(n.value.var)
            # NOTE(review): n.value is a Variable here and is compared to
            # text; n.value.var may have been meant - confirm
            elif n.value == "_id":
                n.pull = jx_expression_to_function("_id")
            else:
                n.pull = jx_expression_to_function(
                    concat_field("fields", literal_field(n.value.var)))
        else:
            Log.error("Do not know what to do")

    split_wheres = split_expression_by_path(query.where, schema, lang=ES52)
    # NOTE(review): verify this argument order against the es_query_proto()
    # in scope for this module
    es_query = es_query_proto(query_path, split_select, split_wheres, schema)
    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)
    es_query.sort = jx_sort_to_es_sort(query.sort, schema)

    with Timer("call to ES", silent=DEBUG) as call_timer:
        data = es_post(es, es_query, query.limit)

    T = data.hits.hits

    # Log.note("{{output}}", output=T)

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        with Timer("formatter", silent=True):
            output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
コード例 #18
0
def split_expression_by_path(expr, schema, lang=Language):
    """
    SPLIT AN EXPRESSION INTO ONE SUB-EXPRESSION PER NESTED PATH

    :param expr: EXPRESSION TO INSPECT
    :param schema: THE SCHEMA; schema.values() RESOLVES EACH VARIABLE TO ITS
                   COLUMNS, AND schema.query_path IS WALKED TO FIND RESIDUALS
    :param lang: ONLY FORWARDED TO THE RECURSIVE CALLS
    :return: (OP, MAP) PAIR WHERE OP (AndOp OR OrOp) IS THE OPERATOR TO APPLY
             ON MAP ITEMS, AND MAP IS FROM PATH TO EXPRESSION
    """
    if is_op(expr, AndOp):
        if not expr.terms:
            return AndOp, {".": TRUE}
        elif len(expr.terms) == 1:
            return split_expression_by_path(expr.terms[0], schema, lang=lang)

        output = {}

        def and_term(path, term):
            # APPEND term TO THE AndOp COLLECTED FOR path, MAKING IT ON FIRST USE
            ae = output.get(path)
            if not ae:
                output[path] = ae = AndOp([])
            ae.terms.append(term)

        for w in expr.terms:
            op, split = split_expression_by_path(w, schema, lang=lang)
            if op == AndOp:
                for v, es in split.items():
                    and_term(v, es)
            elif len(output) == 1 and all(c.jx_type == EXISTS
                                          for v in split["."].vars()
                                          for c in schema.values(v.var)):
                # THE "." PART ONLY REFERS TO EXISTS COLUMNS: DROP IT, KEEP THE REST
                for v, es in split.items():
                    if v == ".":
                        continue
                    and_term(v, es)
            else:
                Log.error("confused")
        return AndOp, output

    expr_vars = expr.vars()
    var_to_columns = {v.var: schema.values(v.var) for v in expr_vars}
    all_paths = set(c.nested_path[0] for v in expr_vars
                    for c in var_to_columns[v.var])

    def add(v, c):
        # REGISTER COLUMN c UNDER VARIABLE NAME v, WITHOUT DUPLICATES
        cols = var_to_columns.get(v)
        if not cols:
            var_to_columns[v] = cols = []
        if c not in cols:
            cols.append(c)

    # all_paths MAY BE MISSING SHALLOW PATHS
    exprs = [expr]
    for p in schema.query_path:
        # CALCULATE THE RESIDUAL EXPRESSION
        # REPLACE EACH DEEPER VAR WITH null
        # TODO: NOT ACCOUNTING FOR DEEP QUERIES ON SHALLOW TABLE
        mapping = {
            v: c
            for v, cols in var_to_columns.items() for c in cols
            if len(c.nested_path[0]) > len(p)
        }
        if mapping:
            acc = []
            for col in mapping.values():
                nested_exists = join_field(
                    split_field(col.nested_path[0])[:-1] + [EXISTS_TYPE])
                e = schema.values(nested_exists)
                if not e:
                    Log.error("do not know how to handle")
                add(nested_exists, first(e))  # REGISTER THE EXISTENCE VARIABLE
                acc.append(MissingOp(Variable(nested_exists)))
            acc.append(expr.map({v: NULL for v in mapping}))
            with_nulls = AndOp(acc).partial_eval()
            if with_nulls is not FALSE:
                # THE RESIDUAL CAN STILL MATCH AT THIS PATH, SO KEEP IT
                all_paths.add(p)
                exprs.append(with_nulls)

    if len(all_paths) == 0:
        return AndOp, {".": expr}  # CONSTANTS
    elif len(all_paths) == 1:
        return AndOp, {first(all_paths): expr}

    # EXPAND EXPRESSION TO ALL REALIZED COLUMNS
    # ITERATE OVER A SNAPSHOT: add() INSERTS NEW KEYS INTO var_to_columns
    for v, cols in list(var_to_columns.items()):
        for col in cols:
            add(col.es_column, col)
        if len(cols) <= 1:
            continue

        more_expr = []
        for e in exprs:
            for col in cols:
                more_expr.append(e.map({v: col.es_column}))
        exprs = more_expr

    # GROUP THE EXPANDED EXPRESSIONS BY THE SINGLE PATH EACH ONE REFERS TO
    acc = {}
    for e in exprs:
        nestings = list(
            set(c.nested_path[0] for v in e.vars() for c in var_to_columns[v]))
        if not nestings:
            a = acc.get(".")
            if not a:
                acc["."] = a = OrOp([])
            a.terms.append(e)
        elif len(nestings) == 1:
            a = acc.get(nestings[0])
            if not a:
                acc[nestings[0]] = a = OrOp([])
            a.terms.append(e)
        else:
            Log.error("Expression is too complex")

    return OrOp, acc
コード例 #19
0
def query_to_outer_joins(query, all_paths, split_select, var_to_columns):
    """
    CONVERT FROM JSON QUERY EXPRESSION TO A NUMBER OF OUTER JOINS

    :param query: THE QUERY; ITS frum AND where ARE READ
    :param all_paths: THE PATHS; EVERY OUTER JOIN GETS ONE NestedOp PER PATH
    :param split_select: MAP FROM PATH TO SELECT (NULL IS USED WHEN MISSING)
    :param var_to_columns: FORWARDED TO split_nested_inner_variables
    :return: partial_eval() OF THE ConcatOp HOLDING ALL THE OuterJoinOp
    """
    table = query.frum
    where = query.where
    focal_path = table.schema.query_path[0]

    def only_at(term, target):
        # SINGLE ALTERNATIVE: term SITS IN THE SLOT OF target, OTHER SLOTS EMPTY
        return [tuple([term] if path == target else [] for path in all_paths)]

    def decompose(expr):
        """
        :param expr: JSON EXPRESSION
        :return: ARRAY INDEX BY (CONCAT, OUTER JOIN, AND)
        """
        expr = expr.partial_eval()

        if is_op(expr, AndOp):
            # CROSS PRODUCT: EVERY ALTERNATIVE OF A TERM MEETS EVERY ALTERNATIVE SO FAR
            combos = [tuple([] for _ in all_paths)]
            for term in expr.terms:
                combos = [
                    tuple(fresh + prior for fresh, prior in zip(part, combo))
                    for part in decompose(term)
                    for combo in combos
                ]
            return combos

        if is_op(expr, OrOp):
            # EACH TERM IS TAKEN ONLY WHERE ALL EARLIER TERMS ARE NEGATED
            alternatives = []
            negated = []
            for term in expr.terms:
                alternatives.extend(decompose(AndOp([AndOp(negated), term])))
                negated.append(NotOp(term))
            return alternatives

        nests = list(
            set(col.nested_path[0] for var in expr.vars()
                for col in table.schema.values(var.var)))

        if not nests:
            return only_at(expr, ".")
        if len(nests) == 1:
            return only_at(expr, nests[0])
        Log.error("do not know how to handle")

    # MAP TO es_columns, INCLUDE NESTED EXISTENCE IN EACH VARIABLE
    wheres = split_nested_inner_variables(where, focal_path, var_to_columns)
    alternatives = decompose(wheres)

    # ATTACH SELECTS
    joined = ConcatOp([])
    for alternative in alternatives:
        join = OuterJoinOp(table, [])
        for path, conditions in zip(all_paths, alternative):
            path_select = coalesce(split_select.get(path), NULL)
            nest = NestedOp(Variable(path),
                            select=path_select,
                            where=AndOp(conditions))
            join.nests.append(nest)
        joined.terms.append(join)

    return joined.partial_eval()
コード例 #20
0
ファイル: set_op.py プロジェクト: klahnakoski/mo-etl
def get_selects(query):
    """
    SPLIT query.select INTO RESULT-SET COLUMNS AND PER-PATH ES SELECTS

    EACH ENTRY APPENDED TO new_select HAS A "name" AND A "put" (WITH "name",
    "index" AND "child"), USUALLY A "value", AND A "pull" THAT IS EITHER SET
    WHEN THE ENTRY IS MADE OR FILLED IN BY THE LAST LOOP OF THIS FUNCTION

    :param query: THE QUERY; query.frum.schema AND query.select ARE READ
    :return: (new_select, split_select) PAIR, WHERE split_select MAPS
             PATH TO ESSelect
    """
    schema = query.frum.schema
    split_select = {".": ESSelect(".")}

    def get_select(path):
        # RETURN THE ESSelect FOR path, MAKING (AND REGISTERING) ONE IF MISSING
        es_select = split_select.get(path)
        if not es_select:
            es_select = split_select[path] = ESSelect(path)
        return es_select

    # WORK ON COPIES OF THE SELECT CLAUSES
    selects = wrap([unwrap(s.copy()) for s in listwrap(query.select)])
    new_select = FlatList()
    put_index = 0
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if is_op(select.value, LeavesOp) and is_op(select.value.term, Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            # EVERY LEAF GETS ITS OWN ENTRY AND ITS OWN put_index
            for c in leaves:
                full_name = concat_field(
                    select.name, relative_field(untype_path(c.name), term.var)
                )
                if c.jx_type == NESTED:
                    get_select(".").set_op = True
                    new_select.append(
                        {
                            "name": full_name,
                            "value": Variable(c.es_column),
                            "put": {
                                "name": literal_field(full_name),
                                "index": put_index,
                                "child": ".",
                            },
                            "pull": get_pull_source(c.es_column),
                        }
                    )
                    put_index += 1
                else:
                    get_select(c.nested_path[0]).fields.append(c.es_column)
                    # NO "pull" HERE; IT IS FILLED IN BY THE LAST LOOP
                    new_select.append(
                        {
                            "name": full_name,
                            "value": Variable(c.es_column),
                            "put": {
                                "name": literal_field(full_name),
                                "index": put_index,
                                "child": ".",
                            },
                        }
                    )
                    put_index += 1
        elif is_op(select.value, Variable):
            s_column = select.value.var

            if s_column == ".":
                # PULL ALL SOURCE
                get_select(".").set_op = True
                new_select.append(
                    {
                        "name": select.name,
                        "value": select.value,
                        "put": {"name": select.name, "index": put_index, "child": "."},
                        "pull": get_pull_source("."),
                    }
                )
                # NOTE(review): THIS continue SKIPS THE `put_index += 1` AT THE END OF
                # THIS BRANCH, SO THE NEXT SELECT GETS THE SAME INDEX - CONFIRM INTENDED
                continue

            leaves = schema.leaves(s_column)  # LEAVES OF OBJECT
            # nested_selects = {}
            # ALL ENTRIES MADE BELOW FOR THIS SELECT SHARE ONE put_index;
            # THEY DIFFER ONLY BY put.child
            if leaves:
                if any(c.jx_type == NESTED for c in leaves):
                    # PULL WHOLE NESTED ARRAYS
                    get_select(".").set_op = True
                    for c in leaves:
                        if (
                            len(c.nested_path) == 1
                        ):  # NESTED PROPERTIES ARE IGNORED, CAPTURED BY THESE FIRST LEVEL PROPERTIES
                            pre_child = join_field(
                                decode_property(n) for n in split_field(c.name)
                            )
                            new_select.append(
                                {
                                    "name": select.name,
                                    "value": Variable(c.es_column),
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": untype_path(
                                            relative_field(pre_child, s_column)
                                        ),
                                    },
                                    "pull": get_pull_source(c.es_column),
                                }
                            )
                else:
                    # PULL ONLY WHAT'S NEEDED
                    for c in leaves:
                        c_nested_path = c.nested_path[0]
                        if c_nested_path == ".":
                            if c.es_column == "_id":
                                # _id IS READ STRAIGHT OFF THE ROW
                                new_select.append(
                                    {
                                        "name": select.name,
                                        "value": Variable(c.es_column),
                                        "put": {
                                            "name": select.name,
                                            "index": put_index,
                                            "child": ".",
                                        },
                                        "pull": lambda row: row._id,
                                    }
                                )
                            elif c.jx_type == NESTED:
                                # NOTE(review): THE ENCLOSING else IS ONLY REACHED WHEN NO LEAF
                                # IS NESTED, SO THIS BRANCH LOOKS UNREACHABLE - CONFIRM
                                get_select(".").set_op = True
                                pre_child = join_field(
                                    decode_property(n) for n in split_field(c.name)
                                )
                                new_select.append(
                                    {
                                        "name": select.name,
                                        "value": Variable(c.es_column),
                                        "put": {
                                            "name": select.name,
                                            "index": put_index,
                                            "child": untype_path(
                                                relative_field(pre_child, s_column)
                                            ),
                                        },
                                        "pull": get_pull_source(c.es_column),
                                    }
                                )
                            else:
                                # PLAIN FIELD; "pull" IS FILLED IN BY THE LAST LOOP
                                get_select(c_nested_path).fields.append(c.es_column)
                                pre_child = join_field(
                                    decode_property(n) for n in split_field(c.name)
                                )
                                new_select.append(
                                    {
                                        "name": select.name,
                                        "value": Variable(c.es_column),
                                        "put": {
                                            "name": select.name,
                                            "index": put_index,
                                            "child": untype_path(
                                                relative_field(pre_child, s_column)
                                            ),
                                        },
                                    }
                                )
                        else:
                            # COLUMN LIVES UNDER A PATH OTHER THAN "."
                            es_select = get_select(c_nested_path)
                            es_select.fields.append(c.es_column)

                            child = relative_field(
                                untype_path(
                                    relative_field(c.name, schema.query_path[0])
                                ),
                                s_column,
                            )
                            pull = accumulate_nested_doc(
                                c_nested_path,
                                Variable(
                                    relative_field(s_column, unnest_path(c_nested_path))
                                ),
                            )
                            new_select.append(
                                {
                                    "name": select.name,
                                    "value": select.value,
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": child,
                                    },
                                    "pull": pull,
                                }
                            )
            else:
                # NO LEAVES FOUND FOR THIS VARIABLE: EMIT A PLACEHOLDER COLUMN
                new_select.append(
                    {
                        "name": select.name,
                        "value": Variable("$dummy"),
                        "put": {"name": select.name, "index": put_index, "child": "."},
                    }
                )
            put_index += 1
        else:
            # ANY OTHER EXPRESSION BECOMES A SCRIPT ON THE ES SELECT OF ITS PATH
            # NOTE(review): ASSUMES split_expression_by_path RETURNS A MAPPING OF
            # PATH TO SCRIPT (IT IS USED VIA .items()) - TODO CONFIRM
            split_scripts = split_expression_by_path(
                select.value, schema, lang=Painless
            )
            for p, script in split_scripts.items():
                es_select = get_select(p)
                es_select.scripts[select.name] = {
                    "script": text(
                        Painless[first(script)].partial_eval().to_es_script(schema)
                    )
                }
                new_select.append(
                    {
                        "name": select.name,
                        "pull": jx_expression_to_function(
                            "fields." + literal_field(select.name)
                        ),
                        "put": {"name": select.name, "index": put_index, "child": "."},
                    }
                )
                put_index += 1
    # FILL IN "pull" FOR EVERY ENTRY THAT DID NOT GET ONE ABOVE
    for n in new_select:
        if n.pull:
            continue
        elif is_op(n.value, Variable):
            if get_select(".").set_op:
                n.pull = get_pull_source(n.value.var)
            elif n.value == "_id":
                n.pull = jx_expression_to_function("_id")
            else:
                n.pull = jx_expression_to_function(
                    concat_field("fields", literal_field(n.value.var))
                )
        else:
            Log.error("Do not know what to do")
    return new_select, split_select
コード例 #21
0
ファイル: set_op.py プロジェクト: klahnakoski/pyLibrary
def get_selects(query):
    """
    SPLIT query.select INTO RESULT-SET COLUMNS, PER-PATH ES SELECTS, AND A ROW ITERATOR

    :param query: THE QUERY; query.frum.schema AND query.select ARE READ
    :return: (new_select, split_select, inners(schema.query_path, "0")) WHERE
             new_select IS THE LIST OF COLUMN DESCRIPTIONS ("name", "value",
             "put", "pull"), split_select MAPS PATH TO ESSelectOp, AND THE
             LAST ITEM IS THE FUNCTION BUILT BY inners()
    """
    schema = query.frum.schema
    query_level = len(schema.query_path)
    query_path = schema.query_path[0]
    # SPLIT select INTO ES_SELECT AND RESULTSET SELECT
    split_select = OrderedDict((p, ESSelectOp(p)) for p in schema.query_path)

    def expand_split_select(c_nested_path):
        # RETURN THE ESSelectOp FOR c_nested_path; WHEN MISSING, A NEW ONE IS PUT
        # AT THE FRONT OF split_select (THE DICT IS CLEARED AND REBUILT TO DO SO)
        es_select = split_select.get(c_nested_path)
        if not es_select:
            temp = [(k, v) for k, v in split_select.items()]
            split_select.clear()
            split_select.update({c_nested_path: ESSelectOp(c_nested_path)})
            split_select.update(temp)
        return split_select[c_nested_path]

    new_select = FlatList()
    # NOT POPULATED ANYWHERE IN THIS FUNCTION; ONLY READ BY base_case() BELOW
    post_expressions = {}

    # WORK ON COPIES OF THE SELECT CLAUSES
    selects = list_to_data([unwrap(s.copy()) for s in listwrap(query.select)])

    # WHAT PATH IS _source USED, IF ANY?
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if is_op(select.value, LeavesOp) and is_op(select.value.term,
                                                   Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            if any(c.jx_type == NESTED for c in leaves):
                split_select["."].source_path = "."
        elif is_op(select.value, Variable):
            for selected_column in schema.values(select.value.var,
                                                 exclude_type=(OBJECT,
                                                               EXISTS)):
                if selected_column.jx_type == NESTED:
                    expand_split_select(
                        selected_column.es_column
                    ).source_path = selected_column.es_column
                    continue
                leaves = schema.leaves(selected_column.es_column)
                for c in leaves:
                    if c.jx_type == NESTED:
                        # NOTE(review): DIRECT LOOKUP, UNLIKE expand_split_select() ABOVE;
                        # ASSUMES THE KEY IS ALREADY IN split_select - TODO CONFIRM
                        split_select[c.es_column].source_path = c.es_column

    # IF WE GET THE SOURCE FOR PARENT, WE ASSUME WE GOT SOURCE FOR CHILD
    # WALK split_select IN REVERSE: THE FIRST source_path FOUND IS COPIED TO
    # EVERY SELECT VISITED AFTER IT, AND ITS 1-BASED POSITION IS source_level
    source_path = None
    source_level = 0
    for level, es_select in enumerate(reversed(list(split_select.values()))):
        if source_path:
            es_select.source_path = source_path
        elif es_select.source_path:
            source_level = level + 1
            source_path = es_select.source_path

    def get_pull_source(c):
        """
        :param c: COLUMN-LIKE OBJECT; ONLY es_column AND nested_path ARE READ
        :return: FUNCTION THAT TAKES A ROW AND RETURNS THE VALUE(S) FOR c
        """
        nested_path = c.nested_path
        nested_level = len(nested_path)
        pos = text(nested_level)

        if nested_level <= query_level:
            # THE ROW HAS AN ENTRY KEYED BY pos FOR THIS LEVEL
            if not source_level or nested_level < source_level:
                # READ FROM "fields"
                field = join_field([pos, "fields", c.es_column])
                return jx_expression_to_function(field)
            elif nested_level == source_level:
                # READ FROM _source
                field = relative_field(c.es_column, nested_path[0])

                def pull_source(row):
                    return untyped(row.get(pos, Null)._source[field])

                return pull_source
            else:
                # READ THE PROPERTY DIRECTLY OFF THE ENTRY AT pos
                field = relative_field(c.es_column, nested_path[0])

                def pull_property(row):
                    return untyped(row.get(pos, Null)[field])

                return pull_property
        else:
            # COLUMN IS DEEPER THAN THE QUERY: READ inner_hits OF THE ENTRY AT query_level
            pos = text(query_level)

            if not source_level or nested_level < source_level:
                # PULL FIELDS AND THEN AGGREGATE THEM
                value = jx_expression_to_function(
                    join_field(["fields", c.es_column]))
                name = literal_field(nested_path[0])
                index = jx_expression_to_function("_nested.offset")

                def pull_nested_field(doc):
                    hits = doc.get(pos, Null).inner_hits[name].hits.hits
                    if not hits:
                        return []

                    # PLACE EACH VALUE AT THE POSITION GIVEN BY ITS _nested.offset
                    temp = [(index(h), value(h)) for h in hits]
                    acc = [None] * len(temp)
                    for i, v in temp:
                        acc[i] = unwraplist(v)
                    return acc

                return pull_nested_field
            else:
                # PULL SOURCES
                value = jx_expression_to_function(
                    concat_field("_source",
                                 relative_field(c.es_column, nested_path[0])))
                name = literal_field(nested_path[0])
                # ONE "_nested" STEP PER LEVEL BELOW THE TOP, THEN "offset"
                index = jx_expression_to_function(
                    join_field(["_nested"] * (len(c.nested_path) - 1) +
                               ["offset"]))

                def pull_nested_source(doc):
                    hits = doc.get(pos, Null).inner_hits[name].hits.hits
                    if not hits:
                        return []

                    # PLACE EACH VALUE AT THE POSITION GIVEN BY ITS OFFSET
                    temp = [(index(h), value(h)) for h in hits]
                    acc = [None] * len(temp)
                    for i, v in temp:
                        acc[i] = untyped(v)
                    return acc

                return pull_nested_source

    put_index = 0
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if is_op(select.value, LeavesOp) and is_op(select.value.term,
                                                   Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            # EVERY LEAF GETS ITS OWN ENTRY AND ITS OWN put_index
            for c in leaves:
                c_nested_path = c.nested_path[0]
                simple_name = relative_field(c.es_column,
                                             query_path).lstrip(".")
                name = concat_field(select.name, untype_path(simple_name))
                put_name = concat_field(
                    select.name, literal_field(untype_path(simple_name)))
                split_select[c_nested_path].fields.append(c.es_column)
                new_select.append({
                    "name": name,
                    "value": Variable(c.es_column),
                    "put": {
                        "name": put_name,
                        "index": put_index,
                        "child": ".",
                    },
                    "pull": get_pull_source(c),
                })
                put_index += 1
        elif is_op(select.value, Variable):
            if select.value.var == ".":
                # PULL ALL SOURCE
                new_select.append({
                    "name":
                    select.name,
                    "value":
                    select.value,
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    },
                    "pull":
                    get_pull_source(
                        Data(es_column=query_path,
                             nested_path=schema.query_path)),
                })
                # NOTE(review): THIS continue LEAVES put_index UNCHANGED, SO THE NEXT
                # SELECT GETS THE SAME INDEX - CONFIRM INTENDED
                continue

            for selected_column in schema.values(select.value.var,
                                                 exclude_type=(EXISTS,
                                                               OBJECT)):
                if selected_column.jx_type == NESTED:
                    # PULL THE WHOLE NESTED COLUMN; ITS OWN es_column IS PUT IN
                    # FRONT OF ITS nested_path FOR get_pull_source()
                    new_select.append({
                        "name":
                        select.name,
                        "value":
                        select.value,
                        "put": {
                            "name": select.name,
                            "index": put_index,
                            "child": "."
                        },
                        "pull":
                        get_pull_source(
                            Data(
                                es_column=selected_column.es_column,
                                nested_path=(selected_column.es_column, ) +
                                selected_column.nested_path,
                            )),
                    })
                    # NOTE(review): ALSO SKIPS THE put_index INCREMENT BELOW - CONFIRM
                    continue

                leaves = schema.leaves(
                    selected_column.es_column,
                    exclude_type=INTERNAL)  # LEAVES OF OBJECT
                if leaves:
                    # ALL LEAVES OF THIS COLUMN SHARE ONE put_index; THEY DIFFER BY put.child
                    for c in leaves:
                        if c.es_column == "_id":
                            new_select.append({
                                "name": select.name,
                                "value": Variable(c.es_column),
                                "put": {
                                    "name": select.name,
                                    "index": put_index,
                                    "child": ".",
                                },
                                "pull": pull_id,
                            })
                            continue
                        c_nested_path = c.nested_path[0]
                        expand_split_select(c_nested_path).fields.append(
                            c.es_column)
                        child = untype_path(
                            relative_field(
                                c.es_column,
                                selected_column.es_column,
                            ))
                        new_select.append({
                            "name": select.name,
                            "value": Variable(c.es_column),
                            "put": {
                                "name": select.name,
                                "index": put_index,
                                "child": child,
                            },
                            "pull": get_pull_source(c),
                        })

                else:
                    # NO LEAVES: EMIT A NULL-VALUED COLUMN (NO "pull" IS SET)
                    new_select.append({
                        "name": select.name,
                        "value": NULL,
                        "put": {
                            "name": select.name,
                            "index": put_index,
                            "child": "."
                        },
                    })
                # INCREMENTED ONCE PER selected_column THAT REACHES THIS LINE
                put_index += 1
        else:
            # ANY OTHER EXPRESSION BECOMES A SCRIPT ON THE ES SELECT OF ITS PATH
            op, split_scripts = split_expression_by_path(select.value,
                                                         schema,
                                                         lang=Painless)
            for p, script in split_scripts.items():
                es_select = split_select[p]
                es_select.scripts[select.name] = {
                    "script":
                    text(Painless[script].partial_eval().to_es_script(schema))
                }
                new_select.append({
                    "name":
                    select.name,
                    "pull":
                    jx_expression_to_function(
                        join_field([
                            text(p),
                            "fields",
                            select.name,
                        ])),
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    },
                })
                put_index += 1

    def inners(query_path, parent_pos):
        """
        BUILD, RECURSIVELY, THE FUNCTION THAT EXPANDS ES RESULTS INTO ROWS

        :param query_path: THE PATHS STILL TO EXPAND; THE LAST ITEM IS HANDLED
                           HERE AND THE REST ARE PASSED TO THE RECURSIVE CALL
        :param parent_pos: TEXT KEY OF THE PARENT LEVEL IN THE ROW ("0" AT THE START)
        :return:  ITERATOR OVER TUPLES ROWS AS TUPLES, WHERE  row[len(nested_path)] HAS INNER HITS
                  AND row[0] HAS post_expressions
        """
        pos = text(int(parent_pos) + 1)
        if not query_path:
            # NOTHING LEFT TO EXPAND: EVALUATE post_expressions INTO row["0"]

            def base_case(row):
                extra = {}
                for k, e in post_expressions.items():
                    extra[k] = e(row)
                row["0"] = extra
                yield row

            return base_case

        if pos == "1":
            # TOP LEVEL: EVERY HIT OF EVERY RESULT SEEDS A NEW ROW
            more = inners(query_path[:-1], "1")

            def first_case(results):
                for result in results:
                    for hit in result.hits.hits:
                        seed = {"0": None, pos: hit}
                        for row in more(seed):
                            yield row

            return first_case

        else:
            more = inners(query_path[:-1], pos)
            # NOTE(review): `<` HERE COMPARES source_path WITH THE PATH; PRESUMABLY
            # MEANT AS "THE SOURCE WAS PULLED AT A PARENT OF THIS PATH" - TODO CONFIRM
            if source_path and source_path < query_path[-1]:
                rel_path = relative_field(query_path[-1], source_path)

                def source(acc):
                    # INNER ROWS ARE READ FROM THE PARENT ENTRY AT rel_path
                    for inner_row in acc[parent_pos][rel_path]:
                        acc[pos] = inner_row
                        for tt in more(acc):
                            yield tt

                return source
            else:
                path = literal_field(query_path[-1])

                def recurse(acc):
                    # INNER ROWS ARE READ FROM THE PARENT'S inner_hits FOR THIS PATH
                    hits = acc[parent_pos].inner_hits[path].hits.hits
                    if hits:
                        for inner_row in hits:
                            acc[pos] = inner_row
                            for tt in more(acc):
                                yield tt
                    else:
                        # NO INNER HITS: STILL EMIT THE ROWS, WITHOUT SETTING acc[pos]
                        for tt in more(acc):
                            yield tt

                return recurse

    return new_select, split_select, inners(schema.query_path, "0")