Beispiel #1
0
    def __getitem__(self, item):
        # TODO: SOLVE FUNDAMENTAL QUESTION OF IF SELECTING A PART OF AN
        # EDGE REMOVES THAT EDGE FROM THIS RESULT, OR ADDS THE PART
        # AS A select {"name":edge.name, "value":edge.domain.partitions[coord]}
        # PROBABLY NOT, THE value IS IDENTICAL OVER THE REMAINING
        if is_data(item):
            coordinates = [None] * len(self.edges)

            # MAP DICT TO NUMERIC INDICES
            for name, v in item.items():
                ei, parts = list_to_data([(i, e.domain.partitions)
                                          for i, e in enumerate(self.edges)
                                          if e.name == name])[0]
                if not parts:
                    Log.error(
                        "Can not find {{name}}=={{value|quote}} in list of edges, maybe this feature is not implemented yet",
                        name=name,
                        value=v)
                part = list_to_data([p for p in parts if p.value == v])[0]
                if not part:
                    return Null
                else:
                    coordinates[ei] = part.dataIndex

            edges = [e for e, v in zip(self.edges, coordinates) if v is None]
            if not edges:
                # ZERO DIMENSIONAL VALUE
                return dict_to_data({
                    k: v.__getitem__(coordinates)
                    for k, v in self.data.items()
                })
            else:
                output = Cube(select=self.select,
                              edges=list_to_data([
                                  e for e, v in zip(self.edges, coordinates)
                                  if v is None
                              ]),
                              data={
                                  k: Matrix(values=c.__getitem__(coordinates))
                                  for k, c in self.data.items()
                              })
                return output
        elif is_text(item):
            # RETURN A VALUE CUBE
            if self.is_value:
                if item != self.select.name:
                    Log.error("{{name}} not found in cube", name=item)
                return self

            if item not in self.select.name:
                Log.error("{{name}} not found in cube", name=item)

            output = Cube(select=[s for s in self.select if s.name == item][0],
                          edges=self.edges,
                          data={item: self.data[item]})
            return output
        else:
            Log.error("not implemented yet")
    def __init__(self, select, edges, data, frum=None):
        """
        data IS EXPECTED TO BE A dict TO MATRICES, BUT OTHER COLLECTIONS ARE
        ALLOWED, USING THE select AND edges TO DESCRIBE THE data
        """

        self.is_value = False if is_list(select) else True
        self.select = select
        self.meta = Data(format="cube")       # PUT EXTRA MARKUP HERE
        self.is_none = False

        if not all(data.values()):
            is_none = True

        # ENSURE frum IS PROPER FORM
        if is_list(select):
            if edges and OR(not isinstance(v, Matrix) for v in data.values()):
                Log.error("Expecting data to be a dict with Matrix values")

        if not edges:
            if not data:
                if is_list(select):
                    Log.error("not expecting a list of records")

                data = {select.name: Matrix.ZERO}
                self.edges = Null
            elif is_data(data):
                # EXPECTING NO MORE THAN ONE rownum EDGE IN THE DATA
                length = MAX([len(v) for v in data.values()])
                if length >= 1:
                    self.edges = list_to_data([{"name": "rownum", "domain": {"type": "rownum"}}])
                else:
                    self.edges = Null
            elif is_list(data):
                if is_list(select):
                    Log.error("not expecting a list of records")

                data = {select.name: Matrix.wrap(data)}
                self.edges = list_to_data([{"name": "rownum", "domain": {"type": "rownum", "min": 0, "max": len(data), "interval": 1}}])
            elif isinstance(data, Matrix):
                if is_list(select):
                    Log.error("not expecting a list of records")

                data = {select.name: data}
            else:
                if is_list(select):
                    Log.error("not expecting a list of records")

                data = {select.name: Matrix(value=data)}
                self.edges = Null
        else:
            self.edges = to_data(edges)

        self.data = data
Beispiel #3
0
 def search(self, query):
     query = to_data(query)
     f = jx.get(query.query.filtered.filter)
     filtered = list_to_data([{
         "_id": i,
         "_source": d
     } for i, d in self.data.items() if f(d)])
     if query.fields:
         return dict_to_data({
             "hits": {
                 "total":
                 len(filtered),
                 "hits": [{
                     "_id":
                     d._id,
                     "fields":
                     unwrap(
                         jx.select([unwrap(d._source)], query.fields)[0])
                 } for d in filtered]
             }
         })
     else:
         return dict_to_data(
             {"hits": {
                 "total": len(filtered),
                 "hits": filtered
             }})
Beispiel #4
0
    def __init__(self, **desc):
        Domain.__init__(self, **desc)
        self.type = "time"
        self.NULL = Null
        self.min = Date(self.min)
        self.max = Date(self.max)
        self.interval = Duration(self.interval)
        self.sort = Null

        if self.partitions:
            # IGNORE THE min, max, interval
            if not self.key:
                Log.error("Must have a key value")

            Log.error("not implemented yet")

            # VERIFY PARTITIONS DO NOT OVERLAP
            return

        self.verify_attributes_not_null(["min", "max", "interval"])
        self.key = "min"
        self.partitions = list_to_data([{
            "min": v,
            "max": v + self.interval,
            "dataIndex": i
        } for i, v in enumerate(Date.range(self.min, self.max, self.interval))
                                        ])
Beispiel #5
0
def filter(data, where):
    """
    where  - a function that accepts (record, rownum, rows) and returns boolean
    """
    if len(data) == 0 or where == None or where == TRUE:
        return data

    if isinstance(data, Container):
        return data.filter(where)

    if is_container(data):
        temp = get(where)
        dd = to_data(data)
        return list_to_data(
            [unwrap(d) for i, d in enumerate(data) if temp(to_data(d), i, dd)])
    else:
        Log.error("Do not know how to handle type {{type}}",
                  type=data.__class__.__name__)

    try:
        return drill_filter(where, data)
    except Exception as _:
        # WOW!  THIS IS INEFFICIENT!
        return to_data([
            unwrap(d)
            for d in drill_filter(where, [DataObject(d) for d in data])
        ])
Beispiel #6
0
    def __init__(self, **desc):
        Domain.__init__(self, **desc)
        self.type = "duration"
        self.NULL = Null
        self.min = Duration(self.min)
        self.max = Duration(self.max)
        self.interval = Duration(self.interval)

        if self.partitions:
            # IGNORE THE min, max, interval
            if not self.key:
                Log.error("Must have a key value")

            Log.error("not implemented yet")

            # VERIFY PARTITIONS DO NOT OVERLAP
            return
        elif not all([self.min, self.max, self.interval]):
            Log.error("Can not handle missing parameter")

        self.key = "min"
        self.partitions = list_to_data([{
            "min": v,
            "max": v + self.interval,
            "dataIndex": i
        } for i, v in enumerate(
            Duration.range(self.min, self.max, self.interval))])
Beispiel #7
0
    def _groupby(self, edges):
        """
        RETURNS LIST OF (coord, values) TUPLES, WHERE
            coord IS THE INDEX INTO self CUBE (-1 INDEX FOR COORDINATES NOT GROUPED BY)
            values ALL VALUES THAT BELONG TO THE SLICE

        """
        edges = FlatList([n for e in edges for n in _normalize_edge(e)])

        stacked = [e for e in self.edges if e.name in edges.name]
        remainder = [e for e in self.edges if e.name not in edges.name]
        selector = [1 if e.name in edges.name else 0 for e in self.edges]

        if len(stacked) + len(remainder) != len(self.edges):
            Log.error("can not find some edges to group by")
        # CACHE SOME RESULTS
        keys = edges.name
        getKey = [e.domain.getKey for e in self.edges]
        lookup = [[
            getKey[i](p)
            for p in e.domain.partitions + ([None] if e.allowNulls else [])
        ] for i, e in enumerate(self.edges)]

        def coord2term(coord):
            output = leaves_to_data(
                {keys[i]: lookup[i][c]
                 for i, c in enumerate(coord)})
            return output

        if is_list(self.select):
            selects = listwrap(self.select)
            index, v = transpose(*self.data[selects[0].name].groupby(selector))

            coord = list_to_data([coord2term(c) for c in index])

            values = [v]
            for s in selects[1::]:
                i, v = transpose(*self.data[s.name].group_by(selector))
                values.append(v)

            output = transpose(coord, [
                Cube(self.select, remainder,
                     {s.name: v[i]
                      for i, s in enumerate(selects)}) for v in zip(*values)
            ])
        elif not remainder:
            # v IS A VALUE, NO NEED TO WRAP IT IN A Cube
            output = (
                (coord2term(coord), v)
                for coord, v in self.data[self.select.name].groupby(selector))
        else:
            output = (
                (coord2term(coord), Cube(self.select, remainder, v))
                for coord, v in self.data[self.select.name].groupby(selector))

        return output
Beispiel #8
0
def get_columns(data, leaves=False):
    # TODO Split this into two functions
    if not leaves:
        return list_to_data([{
            "name": n
        } for n in UNION(set(d.keys()) for d in data)])
    else:
        return to_data([{
            "name": leaf
        } for leaf in set(leaf for row in data for leaf, _ in row.leaves())])
Beispiel #9
0
    def done_count(self):
        columns = list(map(text, range(len(self.fields))))
        parts = list_to_data([{text(i): p for i, p in enumerate(part)} for part in set(self.parts)])
        self.parts = None
        sorted_parts = jx.sort(parts, columns)

        self.edge.domain = self.domain = SimpleSetDomain(
            key="value",
            partitions=[{"value": tuple(v[k] for k in columns), "dataIndex": i} for i, v in enumerate(sorted_parts)]
        )
Beispiel #10
0
def es_query_template(path):
    """
    RETURN TEMPLATE AND PATH-TO-FILTER AS A 2-TUPLE
    :param path: THE NESTED PATH (NOT INCLUDING TABLE NAME)
    :return: (es_query, es_filters) TUPLE
    """

    if not is_text(path):
        Log.error("expecting path to be a string")

    if path != ".":
        f0 = {}
        f1 = {}
        output = dict_to_data({
            "query":
            es_and([
                f0, {
                    "nested": {
                        "path": path,
                        "query": f1,
                        "inner_hits": {
                            "size": 100000
                        }
                    }
                }
            ]),
            "from":
            0,
            "size":
            0,
            "sort": []
        })
        return output, list_to_data([f0, f1])
    else:
        f0 = {}
        output = dict_to_data({
            "query": es_and([f0]),
            "from": 0,
            "size": 0,
            "sort": []
        })
        return output, list_to_data([f0])
Beispiel #11
0
    def groupby(self, edges):
        """
        SLICE THIS CUBE IN TO ONES WITH LESS DIMENSIONALITY
        simple==True WILL HAVE GROUPS BASED ON PARTITION VALUE, NOT PARTITION OBJECTS
        """
        edges = FlatList([n for e in edges for n in _normalize_edge(e)])

        stacked = [e for e in self.edges if e.name in edges.name]
        remainder = [e for e in self.edges if e.name not in edges.name]
        selector = [1 if e.name in edges.name else 0 for e in self.edges]

        if len(stacked) + len(remainder) != len(self.edges):
            Log.error("can not find some edges to group by")
        # CACHE SOME RESULTS
        keys = edges.name
        getKey = [e.domain.getKey for e in self.edges]
        lookup = [[
            getKey[i](p)
            for p in e.domain.partitions + ([None] if e.allowNulls else [])
        ] for i, e in enumerate(self.edges)]

        def coord2term(coord):
            output = leaves_to_data(
                {keys[i]: lookup[i][c]
                 for i, c in enumerate(coord)})
            return output

        if is_list(self.select):
            selects = listwrap(self.select)
            index, v = transpose(*self.data[selects[0].name].groupby(selector))

            coord = list_to_data([coord2term(c) for c in index])

            values = [v]
            for s in selects[1::]:
                i, v = zip(*self.data[s.name].group_by(selector))
                values.append(v)

            output = transpose(coord, [
                Cube(self.select, remainder,
                     {s.name: v[i]
                      for i, s in enumerate(selects)}) for v in zip(*values)
            ])
        elif not remainder:
            # v IS A VALUE, NO NEED TO WRAP IT IN A Cube
            output = (
                (coord2term(coord), v)
                for coord, v in self.data[self.select.name].groupby(selector))
        else:
            output = (
                (coord2term(coord), Cube(self.select, remainder, v))
                for coord, v in self.data[self.select.name].groupby(selector))

        return output
Beispiel #12
0
def _normalize_groupby(groupby, limit, schema=None):
    if groupby == None:
        return None
    output = list_to_data([
        n for e in listwrap(groupby)
        for n in _normalize_group(e, None, limit, schema=schema)
    ])
    for i, o in enumerate(output):
        o.dim = i
    if any(o == None for o in output):
        Log.error("not expected")
    return output
Beispiel #13
0
    def map(self, map_):
        def map_select(s, map_):
            return set_default({"value": s.value.map(map_)}, s)

        def map_edge(e, map_):
            partitions = unwraplist([
                set_default({"where": p.where.map(map_)}, p)
                for p in e.domain.partitions
            ])

            domain = copy(e.domain)
            domain.where = e.domain.where.map(map_)
            domain.partitions = partitions

            edge = copy(e)
            edge.value = e.value.map(map_)
            edge.domain = domain
            if e.range:
                edge.range.min = e.range.min.map(map_)
                edge.range.max = e.range.max.map(map_)
            return edge

        if is_list(self.select):
            select = list_to_data([map_select(s, map_) for s in self.select])
        else:
            select = map_select(self.select, map_)

        return QueryOp(
            frum=self.frum.map(map_),
            select=select,
            edges=list_to_data([map_edge(e, map_) for e in self.edges]),
            groupby=list_to_data([g.map(map_) for g in self.groupby]),
            window=list_to_data([w.map(map_) for w in self.window]),
            where=self.where.map(map_),
            sort=list_to_data(
                [map_select(s, map_) for s in listwrap(self.sort)]),
            limit=self.limit,
            format=self.format,
        )
    def get_meta(self, key, conforming=True):
        """
        RETURN METADATA ON FILE IN BUCKET
        :param key:  KEY, OR PREFIX OF KEY
        :param conforming: TEST IF THE KEY CONFORMS TO REQUIRED PATTERN
        :return: METADATA, IF UNIQUE, ELSE ERROR
        """
        try:
            metas = list(self.bucket.list(prefix=str(key)))
            metas = list_to_data([m for m in metas if text(m.name).find(".json") != -1])

            perfect = Null
            favorite = Null
            too_many = False
            error = None
            for m in metas:
                try:
                    simple = strip_extension(m.key)
                    if conforming:
                        self._verify_key_format(simple)
                    if simple == key:
                        perfect = m
                        too_many = False
                    if simple.startswith(key + ".") or simple.startswith(key + ":"):
                        if favorite and not perfect:
                            too_many = True
                        favorite = m
                except Exception as e:
                    error = e

            if too_many:
                Log.error(
                    "multiple keys in {{bucket}} with prefix={{prefix|quote}}: {{list}}",
                    bucket=self.name,
                    prefix=key,
                    list=[k.name for k in metas],
                )
            if not perfect and error:
                Log.error("Problem with key request", error)
            return coalesce(perfect, favorite)
        except Exception as e:
            Log.error(
                READ_ERROR + " can not read {{key}} from {{bucket}}",
                key=key,
                bucket=self.bucket.name,
                cause=e,
            )
Beispiel #15
0
    def __init__(self, **desc):
        Domain.__init__(self, **desc)
        self.type = "range"
        self.NULL = Null

        if self.partitions:
            # IGNORE THE min, max, interval
            if not self.key:
                Log.error("Must have a key value")

            parts = listwrap(self.partitions)
            for i, p in enumerate(parts):
                self.min = MIN([self.min, p.min])
                self.max = MAX([self.max, p.max])
                if p.dataIndex != None and p.dataIndex != i:
                    Log.error(
                        "Expecting `dataIndex` to agree with the order of the parts"
                    )
                if p[self.key] == None:
                    Log.error(
                        "Expecting all parts to have {{key}} as a property",
                        key=self.key)
                p.dataIndex = i

            # VERIFY PARTITIONS DO NOT OVERLAP, HOLES ARE FINE
            for p, q in itertools.product(parts, parts):
                if p.min <= q.min and q.min < p.max and unwrap(
                        p) is not unwrap(q):
                    Log.error("partitions overlap!")

            self.partitions = to_data(parts)
            return
        elif any([self.min == None, self.max == None, self.interval == None]):
            Log.error("Can not handle missing parameter")

        self.key = "min"
        self.partitions = list_to_data([{
            "min": v,
            "max": v + self.interval,
            "dataIndex": i
        } for i, v in enumerate(frange(self.min, self.max, self.interval))])
Beispiel #16
0
def drill_filter(esfilter, data):
    """
    PARTIAL EVALUATE THE FILTER BASED ON DATA GIVEN

    TODO:  FIX THIS MONUMENTALLY BAD IDEA
    """
    esfilter = unwrap(esfilter)
    primary_nested = []  # track if nested, changes if not
    primary_column = []  # only one path allowed
    primary_branch = (
        []
    )  # CONTAINS LISTS OF RECORDS TO ITERATE: constantly changing as we dfs the tree

    def parse_field(fieldname, data, depth):
        """
        RETURN (first, rest) OF fieldname
        """
        col = split_field(fieldname)
        d = data
        for i, c in enumerate(col):
            try:
                d = d[c]
            except Exception as e:
                Log.error("{{name}} does not exist", name=fieldname)
            if is_list(d) and len(col) > 1:
                if len(primary_column) <= depth + i:
                    primary_nested.append(True)
                    primary_column.append(c)
                    primary_branch.append(d)
                elif primary_nested[depth] and primary_column[depth + i] != c:
                    Log.error("only one branch of tree allowed")
                else:
                    primary_nested[depth + i] = True
                    primary_column[depth + i] = c
                    primary_branch[depth + i] = d

                return c, join_field(col[i + 1:])
            else:
                if len(primary_column) <= depth + i:
                    primary_nested.append(False)
                    primary_column.append(c)
                    primary_branch.append([d])
        return fieldname, None

    def pe_filter(filter, data, depth):
        """
        PARTIAL EVALUATE THE filter BASED ON data GIVEN
        """
        if filter is TRUE:
            return True
        if filter is FALSE:
            return False

        filter = to_data(filter)

        if filter["and"]:
            result = True
            output = FlatList()
            for a in filter["and"]:
                f = pe_filter(a, data, depth)
                if f is False:
                    result = False
                elif f is not True:
                    output.append(f)
            if result and output:
                return {"and": output}
            else:
                return result
        elif filter["or"]:
            output = FlatList()
            for o in filter["or"]:
                f = pe_filter(o, data, depth)
                if f is True:
                    return True
                elif f is not False:
                    output.append(f)
            if output:
                return {"or": output}
            else:
                return False
        elif filter["not"]:
            f = pe_filter(filter["not"], data, depth)
            if f is True:
                return False
            elif f is False:
                return True
            else:
                return {"not": f}
        elif filter.term or filter.eq:
            eq = coalesce(filter.term, filter.eq)
            result = True
            output = {}
            for col, val in eq.items():
                first, rest = parse_field(col, data, depth)
                d = data[first]
                if not rest:
                    if d != val:
                        result = False
                else:
                    output[rest] = val

            if result and output:
                return {"term": output}
            else:
                return result
        elif filter.equal:
            a, b = filter["equal"]
            first_a, rest_a = parse_field(a, data, depth)
            first_b, rest_b = parse_field(b, data, depth)
            val_a = data[first_a]
            val_b = data[first_b]
            if not rest_a:
                if not rest_b:
                    if val_a != val_b:
                        return False
                    else:
                        return True
                else:
                    return {"term": {rest_b: val_a}}
            else:
                if not rest_b:
                    return {"term": {rest_a: val_b}}
                else:
                    return {"equal": [rest_a, rest_b]}

        elif filter.terms:
            result = True
            output = {}
            for col, vals in filter["terms"].items():
                first, rest = parse_field(col, data, depth)
                d = data[first]
                if not rest:
                    if d not in vals:
                        result = False
                else:
                    output[rest] = vals
            if result and output:
                return {"terms": output}
            else:
                return result

        elif filter.range:
            result = True
            output = {}
            for col, ranges in filter["range"].items():
                first, rest = parse_field(col, data, depth)
                d = data[first]
                if not rest:
                    for sign, val in ranges.items():
                        if sign in ("gt", ">") and d <= val:
                            result = False
                        if sign == "gte" and d < val:
                            result = False
                        if sign == "lte" and d > val:
                            result = False
                        if sign == "lt" and d >= val:
                            result = False
                else:
                    output[rest] = ranges
            if result and output:
                return {"range": output}
            else:
                return result
        elif filter.missing:
            if is_text(filter.missing):
                field = filter["missing"]
            else:
                field = filter["missing"]["field"]

            first, rest = parse_field(field, data, depth)
            d = data[first]
            if not rest:
                if d == None:
                    return True
                return False
            else:
                return {"missing": rest}
        elif filter.prefix:
            result = True
            output = {}
            for col, val in filter["prefix"].items():
                first, rest = parse_field(col, data, depth)
                d = data[first]
                if not rest:
                    if d == None or not d.startswith(val):
                        result = False
                else:
                    output[rest] = val
            if result and output:
                return {"prefix": output}
            else:
                return result

        elif filter.exists:
            if is_text(filter["exists"]):
                field = filter["exists"]
            else:
                field = filter["exists"]["field"]

            first, rest = parse_field(field, data, depth)
            d = data[first]
            if not rest:
                if d != None:
                    return True
                return False
            else:
                return {"exists": rest}
        else:
            Log.error("Can not interpret esfilter: {{esfilter}}",
                      {"esfilter": filter})

    output = []  # A LIST OF OBJECTS MAKING THROUGH THE FILTER

    def main(sequence, esfilter, row, depth):
        """
        RETURN A SEQUENCE OF REFERENCES OF OBJECTS DOWN THE TREE
        SHORT SEQUENCES MEANS ALL NESTED OBJECTS ARE INCLUDED
        """
        new_filter = pe_filter(esfilter, row, depth)
        if new_filter is True:
            seq = list(sequence)
            seq.append(row)
            output.append(seq)
            return
        elif new_filter is False:
            return

        seq = list(sequence)
        seq.append(row)
        for d in primary_branch[depth]:
            main(seq, new_filter, d, depth + 1)

    # OUTPUT
    for i, d in enumerate(data):
        if is_data(d):
            main([], esfilter, to_data(d), 0)
        else:
            Log.error("filter is expecting a dict, not {{type}}",
                      type=d.__class__)

    # AT THIS POINT THE primary_column[] IS DETERMINED
    # USE IT TO EXPAND output TO ALL NESTED OBJECTS
    max = 0  # EVEN THOUGH A ROW CAN HAVE MANY VALUES, WE ONLY NEED UP TO max
    for i, n in enumerate(primary_nested):
        if n:
            max = i + 1

    # OUTPUT IS A LIST OF ROWS,
    # WHERE EACH ROW IS A LIST OF VALUES SEEN DURING A WALK DOWN A PATH IN THE HIERARCHY
    uniform_output = FlatList()

    def recurse(row, depth):
        if depth == max:
            uniform_output.append(row)
        else:
            nested = row[-1][primary_column[depth]]
            if not nested:
                # PASSED FILTER, BUT NO CHILDREN, SO ADD NULL CHILDREN
                for i in range(depth, max):
                    row.append(None)
                uniform_output.append(row)
            else:
                for d in nested:
                    r = list(row)
                    r.append(d)
                    recurse(r, depth + 1)

    for o in output:
        recurse(o, 0)

    if not max:
        # SIMPLE LIST AS RESULT
        return list_to_data([unwrap(u[0]) for u in uniform_output])

    return PartFlatList(primary_column[0:max], uniform_output)
Beispiel #17
0
def get_selects(query):
    schema = query.frum.schema
    query_level = len(schema.query_path)
    query_path = schema.query_path[0]
    # SPLIT select INTO ES_SELECT AND RESULTSET SELECT
    split_select = OrderedDict((p, ESSelectOp(p)) for p in schema.query_path)

    def expand_split_select(c_nested_path):
        es_select = split_select.get(c_nested_path)
        if not es_select:
            temp = [(k, v) for k, v in split_select.items()]
            split_select.clear()
            split_select.update({c_nested_path: ESSelectOp(c_nested_path)})
            split_select.update(temp)
        return split_select[c_nested_path]

    new_select = FlatList()
    post_expressions = {}

    selects = list_to_data([unwrap(s.copy()) for s in listwrap(query.select)])

    # WHAT PATH IS _source USED, IF ANY?
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if is_op(select.value, LeavesOp) and is_op(select.value.term,
                                                   Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            if any(c.jx_type == NESTED for c in leaves):
                split_select["."].source_path = "."
        elif is_op(select.value, Variable):
            for selected_column in schema.values(select.value.var,
                                                 exclude_type=(OBJECT,
                                                               EXISTS)):
                if selected_column.jx_type == NESTED:
                    expand_split_select(
                        selected_column.es_column
                    ).source_path = selected_column.es_column
                    continue
                leaves = schema.leaves(selected_column.es_column)
                for c in leaves:
                    if c.jx_type == NESTED:
                        split_select[c.es_column].source_path = c.es_column

    # IF WE GET THE SOURCE FOR PARENT, WE ASSUME WE GOT SOURCE FOR CHILD
    source_path = None
    source_level = 0
    for level, es_select in enumerate(reversed(list(split_select.values()))):
        if source_path:
            es_select.source_path = source_path
        elif es_select.source_path:
            source_level = level + 1
            source_path = es_select.source_path

    def get_pull_source(c):
        nested_path = c.nested_path
        nested_level = len(nested_path)
        pos = text(nested_level)

        if nested_level <= query_level:
            if not source_level or nested_level < source_level:
                field = join_field([pos, "fields", c.es_column])
                return jx_expression_to_function(field)
            elif nested_level == source_level:
                field = relative_field(c.es_column, nested_path[0])

                def pull_source(row):
                    return untyped(row.get(pos, Null)._source[field])

                return pull_source
            else:
                field = relative_field(c.es_column, nested_path[0])

                def pull_property(row):
                    return untyped(row.get(pos, Null)[field])

                return pull_property
        else:
            pos = text(query_level)

            if not source_level or nested_level < source_level:
                # PULL FIELDS AND THEN AGGREGATE THEM
                value = jx_expression_to_function(
                    join_field(["fields", c.es_column]))
                name = literal_field(nested_path[0])
                index = jx_expression_to_function("_nested.offset")

                def pull_nested_field(doc):
                    hits = doc.get(pos, Null).inner_hits[name].hits.hits
                    if not hits:
                        return []

                    temp = [(index(h), value(h)) for h in hits]
                    acc = [None] * len(temp)
                    for i, v in temp:
                        acc[i] = unwraplist(v)
                    return acc

                return pull_nested_field
            else:
                # PULL SOURCES
                value = jx_expression_to_function(
                    concat_field("_source",
                                 relative_field(c.es_column, nested_path[0])))
                name = literal_field(nested_path[0])
                index = jx_expression_to_function(
                    join_field(["_nested"] * (len(c.nested_path) - 1) +
                               ["offset"]))

                def pull_nested_source(doc):
                    hits = doc.get(pos, Null).inner_hits[name].hits.hits
                    if not hits:
                        return []

                    temp = [(index(h), value(h)) for h in hits]
                    acc = [None] * len(temp)
                    for i, v in temp:
                        acc[i] = untyped(v)
                    return acc

                return pull_nested_source

    put_index = 0
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if is_op(select.value, LeavesOp) and is_op(select.value.term,
                                                   Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                c_nested_path = c.nested_path[0]
                simple_name = relative_field(c.es_column,
                                             query_path).lstrip(".")
                name = concat_field(select.name, untype_path(simple_name))
                put_name = concat_field(
                    select.name, literal_field(untype_path(simple_name)))
                split_select[c_nested_path].fields.append(c.es_column)
                new_select.append({
                    "name": name,
                    "value": Variable(c.es_column),
                    "put": {
                        "name": put_name,
                        "index": put_index,
                        "child": ".",
                    },
                    "pull": get_pull_source(c),
                })
                put_index += 1
        elif is_op(select.value, Variable):
            if select.value.var == ".":
                # PULL ALL SOURCE
                new_select.append({
                    "name":
                    select.name,
                    "value":
                    select.value,
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    },
                    "pull":
                    get_pull_source(
                        Data(es_column=query_path,
                             nested_path=schema.query_path)),
                })
                continue

            for selected_column in schema.values(select.value.var,
                                                 exclude_type=(EXISTS,
                                                               OBJECT)):
                if selected_column.jx_type == NESTED:
                    new_select.append({
                        "name":
                        select.name,
                        "value":
                        select.value,
                        "put": {
                            "name": select.name,
                            "index": put_index,
                            "child": "."
                        },
                        "pull":
                        get_pull_source(
                            Data(
                                es_column=selected_column.es_column,
                                nested_path=(selected_column.es_column, ) +
                                selected_column.nested_path,
                            )),
                    })
                    continue

                leaves = schema.leaves(
                    selected_column.es_column,
                    exclude_type=INTERNAL)  # LEAVES OF OBJECT
                if leaves:
                    for c in leaves:
                        if c.es_column == "_id":
                            new_select.append({
                                "name": select.name,
                                "value": Variable(c.es_column),
                                "put": {
                                    "name": select.name,
                                    "index": put_index,
                                    "child": ".",
                                },
                                "pull": pull_id,
                            })
                            continue
                        c_nested_path = c.nested_path[0]
                        expand_split_select(c_nested_path).fields.append(
                            c.es_column)
                        child = untype_path(
                            relative_field(
                                c.es_column,
                                selected_column.es_column,
                            ))
                        new_select.append({
                            "name": select.name,
                            "value": Variable(c.es_column),
                            "put": {
                                "name": select.name,
                                "index": put_index,
                                "child": child,
                            },
                            "pull": get_pull_source(c),
                        })

                else:
                    new_select.append({
                        "name": select.name,
                        "value": NULL,
                        "put": {
                            "name": select.name,
                            "index": put_index,
                            "child": "."
                        },
                    })
                put_index += 1
        else:
            op, split_scripts = split_expression_by_path(select.value,
                                                         schema,
                                                         lang=Painless)
            for p, script in split_scripts.items():
                es_select = split_select[p]
                es_select.scripts[select.name] = {
                    "script":
                    text(Painless[script].partial_eval().to_es_script(schema))
                }
                new_select.append({
                    "name":
                    select.name,
                    "pull":
                    jx_expression_to_function(
                        join_field([
                            text(p),
                            "fields",
                            select.name,
                        ])),
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    },
                })
                put_index += 1

    def inners(query_path, parent_pos):
        """
        :param query_path:
        :return:  ITERATOR OVER TUPLES ROWS AS TUPLES, WHERE  row[len(nested_path)] HAS INNER HITS
                  AND row[0] HAS post_expressions
        """
        pos = text(int(parent_pos) + 1)
        if not query_path:

            def base_case(row):
                extra = {}
                for k, e in post_expressions.items():
                    extra[k] = e(row)
                row["0"] = extra
                yield row

            return base_case

        if pos == "1":
            more = inners(query_path[:-1], "1")

            def first_case(results):
                for result in results:
                    for hit in result.hits.hits:
                        seed = {"0": None, pos: hit}
                        for row in more(seed):
                            yield row

            return first_case

        else:
            more = inners(query_path[:-1], pos)
            if source_path and source_path < query_path[-1]:
                rel_path = relative_field(query_path[-1], source_path)

                def source(acc):
                    for inner_row in acc[parent_pos][rel_path]:
                        acc[pos] = inner_row
                        for tt in more(acc):
                            yield tt

                return source
            else:
                path = literal_field(query_path[-1])

                def recurse(acc):
                    hits = acc[parent_pos].inner_hits[path].hits.hits
                    if hits:
                        for inner_row in hits:
                            acc[pos] = inner_row
                            for tt in more(acc):
                                yield tt
                    else:
                        for tt in more(acc):
                            yield tt

                return recurse

    return new_select, split_select, inners(schema.query_path, "0")
Beispiel #18
0
def get_selects(query):
    schema = query.frum.schema
    split_select = {".": ESSelectOp(".")}

    def get_select(path):
        es_select = split_select.get(path)
        if not es_select:
            es_select = split_select[path] = ESSelectOp(path)
        return es_select

    selects = list_to_data([unwrap(s.copy()) for s in listwrap(query.select)])
    new_select = FlatList()
    put_index = 0
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if is_op(select.value, LeavesOp) and is_op(select.value.term,
                                                   Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                full_name = concat_field(
                    select.name, relative_field(untype_path(c.name), term.var))
                if c.jx_type == NESTED:
                    get_select(".").get_source = True
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {
                            "name": literal_field(full_name),
                            "index": put_index,
                            "child": ".",
                        },
                        "pull": get_pull_source(c.es_column),
                    })
                    put_index += 1
                else:
                    get_select(c.nested_path[0]).fields.append(c.es_column)
                    new_select.append({
                        "name": full_name,
                        "value": Variable(c.es_column),
                        "put": {
                            "name": literal_field(full_name),
                            "index": put_index,
                            "child": ".",
                        },
                    })
                    put_index += 1
        elif is_op(select.value, Variable):
            s_column = select.value.var

            if s_column == ".":
                # PULL ALL SOURCE
                get_select(".").get_source = True
                new_select.append({
                    "name": select.name,
                    "value": select.value,
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    },
                    "pull": get_pull_source("."),
                })
                continue

            leaves = schema.leaves(s_column)  # LEAVES OF OBJECT
            # nested_selects = {}
            if leaves:
                if any(c.jx_type == NESTED for c in leaves):
                    # PULL WHOLE NESTED ARRAYS
                    get_select(".").get_source = True
                    for c in leaves:
                        if len(
                                c.nested_path
                        ) == 1:  # NESTED PROPERTIES ARE IGNORED, CAPTURED BY THESE FIRST LEVEL PROPERTIES
                            pre_child = join_field(
                                decode_property(n)
                                for n in split_field(c.name))
                            new_select.append({
                                "name":
                                select.name,
                                "value":
                                Variable(c.es_column),
                                "put": {
                                    "name":
                                    select.name,
                                    "index":
                                    put_index,
                                    "child":
                                    untype_path(
                                        relative_field(pre_child, s_column)),
                                },
                                "pull":
                                get_pull_source(c.es_column),
                            })
                else:
                    # PULL ONLY WHAT'S NEEDED
                    for c in leaves:
                        c_nested_path = c.nested_path[0]
                        if c_nested_path == ".":
                            if c.es_column == "_id":
                                new_select.append({
                                    "name":
                                    select.name,
                                    "value":
                                    Variable(c.es_column),
                                    "put": {
                                        "name": select.name,
                                        "index": put_index,
                                        "child": ".",
                                    },
                                    "pull":
                                    lambda row: row._id,
                                })
                            elif c.jx_type == NESTED:
                                get_select(".").get_source = True
                                pre_child = join_field(
                                    decode_property(n)
                                    for n in split_field(c.name))
                                new_select.append({
                                    "name":
                                    select.name,
                                    "value":
                                    Variable(c.es_column),
                                    "put": {
                                        "name":
                                        select.name,
                                        "index":
                                        put_index,
                                        "child":
                                        untype_path(
                                            relative_field(
                                                pre_child, s_column)),
                                    },
                                    "pull":
                                    get_pull_source(c.es_column),
                                })
                            else:
                                get_select(c_nested_path).fields.append(
                                    c.es_column)
                                pre_child = join_field(
                                    decode_property(n)
                                    for n in split_field(c.name))
                                new_select.append({
                                    "name":
                                    select.name,
                                    "value":
                                    Variable(c.es_column),
                                    "put": {
                                        "name":
                                        select.name,
                                        "index":
                                        put_index,
                                        "child":
                                        untype_path(
                                            relative_field(
                                                pre_child, s_column)),
                                    },
                                })
                        else:
                            es_select = get_select(c_nested_path)
                            es_select.fields.append(c.es_column)

                            child = relative_field(
                                untype_path(
                                    relative_field(c.name,
                                                   schema.query_path[0])),
                                s_column,
                            )
                            pull = accumulate_nested_doc(
                                c_nested_path,
                                Variable(
                                    relative_field(
                                        s_column, unnest_path(c_nested_path))),
                            )
                            new_select.append({
                                "name": select.name,
                                "value": select.value,
                                "put": {
                                    "name": select.name,
                                    "index": put_index,
                                    "child": child,
                                },
                                "pull": pull,
                            })
            else:
                new_select.append({
                    "name": select.name,
                    "value": Variable("$dummy"),
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    },
                })
            put_index += 1
        else:
            op, split_scripts = split_expression_by_path(select.value,
                                                         schema,
                                                         lang=Painless)
            for p, script in split_scripts.items():
                es_select = get_select(p)
                es_select.scripts[select.name] = {
                    "script":
                    text(Painless[script].partial_eval().to_es_script(schema))
                }
                new_select.append({
                    "name":
                    select.name,
                    "pull":
                    jx_expression_to_function("fields." +
                                              literal_field(select.name)),
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    },
                })
                put_index += 1
    for n in new_select:
        if n.pull:
            continue
        elif is_op(n.value, Variable):
            if get_select(".").get_source:
                n.pull = get_pull_source(n.value.var)
            elif n.value == "_id":
                n.pull = jx_expression_to_function("_id")
            else:
                n.pull = jx_expression_to_function(
                    concat_field("fields", literal_field(n.value.var)))
        else:
            Log.error("Do not know what to do")
    return new_select, split_select
Beispiel #19
0
def _normalize_group(edge, dim_index, limit, schema=None):
    """
    :param edge: Not normalized groupby
    :param dim_index: Dimensions are ordered; this is this groupby's index into that order
    :param schema: for context
    :return: a normalized groupby
    """
    if is_text(edge):
        if edge.endswith(".*"):
            prefix = edge[:-2]
            if schema:
                output = list_to_data([
                    {  # BECASUE THIS IS A GROUPBY, EARLY SPLIT INTO LEAVES WORKS JUST FINE
                        "name":
                        concat_field(
                            prefix,
                            literal_field(
                                relative_field(untype_path(c.name), prefix))),
                        "put": {
                            "name": literal_field(untype_path(c.name))
                        },
                        "value":
                        jx_expression(c.es_column, schema=schema),
                        "allowNulls":
                        True,
                        "domain": {
                            "type": "default"
                        }
                    } for c in schema.leaves(prefix)
                ])
                return output
            else:
                return list_to_data([{
                    "name": untype_path(prefix),
                    "put": {
                        "name": literal_field(untype_path(prefix))
                    },
                    "value": LeavesOp(Variable(prefix)),
                    "allowNulls": True,
                    "dim": dim_index,
                    "domain": {
                        "type": "default"
                    }
                }])

        return list_to_data([{
            "name": edge,
            "value": jx_expression(edge, schema=schema),
            "allowNulls": True,
            "dim": dim_index,
            "domain": Domain(type="default", limit=limit)
        }])
    else:
        edge = to_data(edge)
        if (edge.domain and edge.domain.type != "default"):
            Log.error("groupby does not accept complicated domains")

        if not edge.name and not is_text(edge.value):
            Log.error("You must name compound edges: {{edge}}", edge=edge)

        return list_to_data([{
            "name": coalesce(edge.name, edge.value),
            "value": jx_expression(edge.value, schema=schema),
            "allowNulls": True,
            "dim": dim_index,
            "domain": {
                "type": "default"
            }
        }])
Beispiel #20
0
def _normalize_edges(edges, limit, schema=None):
    return list_to_data([
        n for ie, e in enumerate(listwrap(edges))
        for n in _normalize_edge(e, ie, limit=limit, schema=schema)
    ])
def es_deepop(es, query):
    schema = query.frum.schema
    query_path = schema.query_path[0]

    # TODO: FIX THE GREAT SADNESS CAUSED BY EXECUTING post_expressions
    # THE EXPRESSIONS SHOULD BE PUSHED TO THE CONTAINER:  ES ALLOWS
    # {"inner_hit":{"script_fields":[{"script":""}...]}}, BUT THEN YOU
    # LOOSE "_source" BUT GAIN "fields", FORCING ALL FIELDS TO BE EXPLICIT
    post_expressions = {}
    es_query, es_filters = es_query_template(query_path)

    # SPLIT WHERE CLAUSE BY DEPTH
    wheres = split_expression_by_depth(query.where, schema)
    for f, w in zip_longest(es_filters, wheres):
        script = ES52[AndOp(w).partial_eval()].to_es(schema)
        set_default(f, script)

    if not wheres[1]:
        # INCLUDE DOCS WITH NO NESTED DOCS
        more_filter = {
            "bool": {
                "filter": [AndOp(wheres[0]).partial_eval().to_es(schema)],
                "must_not": {
                    "nested": {
                        "path": query_path,
                        "query": MATCH_ALL
                    }
                }
            }
        }
    else:
        more_filter = None

    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)

    # es_query.sort = jx_sort_to_es_sort(query.sort)
    map_to_es_columns = schema.map_to_es()
    # {c.name: c.es_column for c in schema.leaves(".")}
    query_for_es = query.map(map_to_es_columns)
    es_query.sort = jx_sort_to_es_sort(query_for_es.sort, schema)

    es_query.stored_fields = []

    is_list = is_list_(query.select)
    selects = list_to_data([unwrap(s.copy()) for s in listwrap(query.select)])
    new_select = FlatList()

    put_index = 0
    for select in selects:
        if is_op(select.value, LeavesOp) and is_op(select.value.term,
                                                   Variable):
            # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
            leaves = schema.leaves(select.value.term.var)
            col_names = set()
            for c in leaves:
                if c.nested_path[0] == ".":
                    if c.jx_type in INTERNAL:
                        continue
                    es_query.stored_fields += [c.es_column]
                c_name = untype_path(relative_field(c.name, query_path))
                col_names.add(c_name)
                new_select.append({
                    "name": concat_field(select.name, c_name),
                    "nested_path": c.nested_path[0],
                    "put": {
                        "name": concat_field(select.name,
                                             literal_field(c_name)),
                        "index": put_index,
                        "child": "."
                    },
                    "pull": get_pull_function(c)
                })
                put_index += 1

            # REMOVE DOTS IN PREFIX IF NAME NOT AMBIGUOUS
            for n in new_select:
                if n.name.startswith("..") and n.name.lstrip(
                        ".") not in col_names:
                    n.put.name = n.name = n.name.lstrip(".")
                    col_names.add(n.name)
        elif is_op(select.value, Variable):
            net_columns = schema.leaves(select.value.var)
            if not net_columns:
                new_select.append({
                    "name": select.name,
                    "nested_path": ".",
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    },
                    "pull": NULL
                })
            else:
                for n in net_columns:
                    if n.nested_path[0] == ".":
                        if n.jx_type == NESTED:
                            continue
                        es_query.stored_fields += [n.es_column]

                    if len(n.nested_path[0]) > len(query_path):
                        # SELECTING INNER PROPERTIES IS NOT ALLOWED
                        continue
                    # WE MUST FIGURE OUT WHICH NAMESPACE s.value.var IS USING SO WE CAN EXTRACT THE child
                    for np in n.nested_path:
                        c_name = untype_path(relative_field(n.name, np))
                        if startswith_field(c_name, select.value.var):
                            child = relative_field(c_name, select.value.var)
                            break
                    else:
                        raise Log.error("Not expected")

                    pull = get_pull_function(n)
                    new_select.append({
                        "name": select.name,
                        "pull": pull,
                        "nested_path": n.nested_path[0],
                        "put": {
                            "name": select.name,
                            "index": put_index,
                            "child": child
                        }
                    })
            put_index += 1
        else:
            expr = select.value
            for v in expr.vars():
                for c in schema[v.var]:
                    if c.nested_path[0] == ".":
                        es_query.stored_fields += [c.es_column]
                    # else:
                    #     Log.error("deep field not expected")

            pull_name = EXPRESSION_PREFIX + select.name
            map_to_local = MapToLocal(schema)
            pull = jx_expression_to_function(pull_name)
            post_expressions[pull_name] = jx_expression_to_function(
                expr.map(map_to_local))

            new_select.append({
                "name": select.name if is_list else ".",
                "pull": pull,
                "value": expr.__data__(),
                "put": {
                    "name": select.name,
                    "index": put_index,
                    "child": "."
                }
            })
            put_index += 1

    # <COMPLICATED> ES needs two calls to get all documents
    more = []

    def get_more(please_stop):
        more.append(
            es.search(
                Data(query=more_filter, stored_fields=es_query.stored_fields)))

    if more_filter:
        need_more = Thread.run("get more", target=get_more)

    with Timer("call to ES") as call_timer:
        data = es.search(es_query)

    # EACH A HIT IS RETURNED MULTIPLE TIMES FOR EACH INNER HIT, WITH INNER HIT INCLUDED
    def inners():
        for t in data.hits.hits:
            for i in t.inner_hits[literal_field(query_path)].hits.hits:
                t._inner = i._source
                for k, e in post_expressions.items():
                    t[k] = e(t)
                yield t
        if more_filter:
            Thread.join(need_more)
            for t in more[0].hits.hits:
                yield t

    # </COMPLICATED>

    try:
        formatter, row_formatter, mime_type = set_formatters[query.format]

        output = formatter(inners(), new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
Beispiel #22
0
    {"a": "x", "t": Date("today-2day").unix, "v": 3},
    {"a": "x", "t": Date("today-3day").unix, "v": 5},
    {"a": "x", "t": Date("today-4day").unix, "v": 7},
    {"a": "x", "t": Date("today-5day").unix, "v": 11},
    {"a": "x", "t": null, "v": 27},
    {"a": "y", "t": Date("today-day").unix, "v": 13},
    {"a": "y", "t": Date("today-2day").unix, "v": 17},
    {"a": "y", "t": Date("today-4day").unix, "v": 19},
    {"a": "y", "t": Date("today-5day").unix, "v": 23}
]

expected_list_1 = list_to_data([
    {"t": (TODAY - WEEK).unix, "v": NULL},
    {"t": (TODAY - 6 * DAY).unix, "v": NULL},
    {"t": (TODAY - 5 * DAY).unix, "v": 34},
    {"t": (TODAY - 4 * DAY).unix, "v": 26},
    {"t": (TODAY - 3 * DAY).unix, "v": 5},
    {"t": (TODAY - 2 * DAY).unix, "v": 20},
    {"t": (TODAY - 1 * DAY).unix, "v": 15},
    {"v": 29}
])

expected2 = list_to_data([
    {"a": "x", "t": (TODAY - WEEK).unix,    "v": NULL},
    {"a": "x", "t": (TODAY - 6 * DAY).unix, "v": NULL},
    {"a": "x", "t": (TODAY - 5 * DAY).unix, "v": 11},
    {"a": "x", "t": (TODAY - 4 * DAY).unix, "v": 7},
    {"a": "x", "t": (TODAY - 3 * DAY).unix, "v": 5},
    {"a": "x", "t": (TODAY - 2 * DAY).unix, "v": 3},
    {"a": "x", "t": (TODAY - 1 * DAY).unix, "v": 2},
    {"a": "x",                              "v": 29},
def table2list(
        column_names,  # tuple of columns names
        rows  # list of tuples
):
    return list_to_data([dict(zip(column_names, r)) for r in rows])
Beispiel #24
0
#
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Contact: Kyle Lahnakoski ([email protected])
#

from __future__ import absolute_import, division, unicode_literals

from jx_base.expressions import NULL
from mo_dots import list_to_data
from tests.test_jx import BaseTestCase, TEST_TABLE

lots_of_data = list_to_data([{"a": i} for i in range(30)])


class TestFilters(BaseTestCase):
    def test_where_expression(self):
        test = {
            "data": [  # PROPERTIES STARTING WITH _ ARE NESTED AUTOMATICALLY
                {
                    "a": {
                        "b": 0,
                        "c": 0
                    }
                },
                {
                    "a": {
                        "b": 0,