def tuple(data, field_name):
    """
    RETURN LIST OF TUPLES
    """
    if isinstance(data, Cube):
        Log.error("not supported yet")

    if isinstance(data, FlatList):
        Log.error("not supported yet")

    if isinstance(field_name, Mapping) and "value" in field_name:
        # SIMPLIFY {"value": value} AS STRING
        field_name = field_name["value"]

    # SIMPLE PYTHON ITERABLE ASSUMED
    if isinstance(field_name, basestring):
        if len(split_field(field_name)) == 1:
            return [(d[field_name],) for d in data]
        else:
            path = split_field(field_name)
            output = []
            flat_list._tuple1(data, path, 0, output)
            return output
    elif isinstance(field_name, list):
        paths = [_select_a_field(f) for f in field_name]
        output = DictList()
        _tuple((), unwrap(data), paths, 0, output)
        return output
    else:
        paths = [_select_a_field(field_name)]
        output = DictList()
        _tuple((), data, paths, 0, output)
        return output
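# USAGE SKETCH (illustrative only; the sample rows are hypothetical): the
# single-field branch above reduces to a list comprehension, so the expected
# result is one 1-tuple per record.
def _example_tuple():
    data = [{"a": 1, "b": 2}, {"a": 3, "b": 4}]
    print tuple(data, "a")  # -> [(1,), (3,)]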
def right(self, num=None):
    """
    WITH SLICES BEING FLAT, WE NEED A SIMPLE WAY TO SLICE FROM THE RIGHT [-num:]
    """
    self._convert()
    if num == None:
        return DictList([self.list[-1]])
    if num <= 0:
        return Null
    return DictList(self.list[-num:])
def not_right(self, num):
    """
    WITH SLICES BEING FLAT, WE NEED A SIMPLE WAY TO SLICE FROM THE LEFT [:-num:]
    """
    self._convert()
    if num == None:
        return DictList(self.list[:-1:])
    if num <= 0:
        return Null
    return DictList(self.list[:-num:])
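# USAGE SKETCH (illustrative): `right` keeps the last num elements while
# `not_right` drops them; the expected values assume a DictList over plain ints.
def _example_right_not_right():
    d = DictList([1, 2, 3, 4])
    print d.right(2)      # -> [3, 4]
    print d.not_right(1)  # -> [1, 2, 3]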
def groupby(data, keys=None, size=None, min_size=None, max_size=None, contiguous=False):
    """
    RETURN LIST OF (keys, values) PAIRS, WHERE
        keys IS THE SET OF GROUP-BY KEY VALUES
        values IS A LIST OF ALL data THAT HAS THOSE keys
    contiguous - MAINTAIN THE ORDER OF THE DATA, STARTING A NEW GROUP WHEN THE SELECTOR CHANGES
    """
    if size != None or min_size != None or max_size != None:
        if size != None:
            max_size = size
        return groupby_min_max_size(data, min_size=min_size, max_size=max_size)

    if isinstance(data, Cube):
        return data.groupby(keys)

    keys = listwrap(keys)

    def get_keys(d):
        output = Dict()
        for k in keys:
            output[k] = d[k]
        return output

    if contiguous:
        try:
            if not data:
                return wrap([])

            agg = DictList()
            acc = DictList()
            curr_key = value2key(keys, data[0])
            for d in data:
                key = value2key(keys, d)
                if key != curr_key:
                    agg.append((get_keys(acc[0]), acc))
                    curr_key = key
                    acc = [d]
                else:
                    acc.append(d)
            agg.append((get_keys(acc[0]), acc))
            return wrap(agg)
        except Exception, e:
            Log.error("Problem grouping contiguous values", e)
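# USAGE SKETCH (illustrative; rows are hypothetical): the contiguous branch
# starts a new group each time the key changes, preserving input order.
def _example_groupby():
    data = wrap([{"a": 1, "v": 10}, {"a": 1, "v": 20}, {"a": 2, "v": 30}])
    for key, group in groupby(data, keys="a", contiguous=True):
        print key.a, len(group)  # -> 1 2  THEN  2 1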
def _normalize_sort(sort=None):
    """
    CONVERT SORT PARAMETERS TO A NORMAL FORM SO EASIER TO USE
    """
    if sort == None:
        return DictList.EMPTY

    output = DictList()
    for s in listwrap(sort):
        if isinstance(s, basestring):
            output.append({"value": jx_expression(s), "sort": 1})
        elif isinstance(s, Expression):
            output.append({"value": s, "sort": 1})
        elif Math.is_integer(s):
            output.append({"value": OffsetOp("offset", s), "sort": 1})
        elif all(d in sort_direction for d in s.values()) and not s.sort and not s.value:
            # ASSUME {field: direction} FORM
            for v, d in s.items():
                output.append({"value": jx_expression(v), "sort": sort_direction[d]})
        else:
            output.append({
                "value": jx_expression(coalesce(s.value, s.field)),
                "sort": coalesce(sort_direction[s.sort], 1)
            })
    return output
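# NORMALIZATION SKETCH (illustrative): every accepted form reduces to
# {"value": <expression>, "sort": <direction>}, e.g.
#     _normalize_sort("a")            # -> [{"value": jx_expression("a"), "sort": 1}]
#     _normalize_sort({"a": "desc"})  # -> [{"value": jx_expression("a"), "sort": -1}]
#     _normalize_sort({"value": "a", "sort": -1})  # same, via the final branch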
def es_fieldop(es, query):
    es_query = es14.util.es_query_template()
    select = listwrap(query.select)
    es_query.query = {
        "filtered": {
            "query": {"match_all": {}},
            "filter": simplify_esfilter(qb_expression_to_esfilter(query.where))
        }
    }
    es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
    es_query.sort = qb_sort_to_es_sort(query.sort)
    es_query.fields = DictList()

    source = "fields"
    for s in select.value:
        if s == "*":
            es_query.fields = None
            source = "_source"
        elif s == ".":
            es_query.fields = None
            source = "_source"
        elif isinstance(s, basestring) and is_keyword(s):
            es_query.fields.append(s)
        elif isinstance(s, list) and es_query.fields is not None:
            es_query.fields.extend(s)
        elif isinstance(s, Mapping) and es_query.fields is not None:
            es_query.fields.extend(s.values())
        elif es_query.fields is not None:
            es_query.fields.append(s)
    es_query.sort = [{s.field: "asc" if s.sort >= 0 else "desc"} for s in query.sort]

    return extract_rows(es, es_query, source, select, query)
def es_setop(es, query):
    es_query = es14.util.es_query_template()
    select = listwrap(query.select)

    es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
    es_query.fields = DictList()
    es_query.sort = qb_sort_to_es_sort(query.sort)
    source = "fields"
    for s in select:
        if s.value == "*":
            es_query.fields = None
            es_query.script_fields = None
            source = "_source"
        elif s.value == ".":
            es_query.fields = None
            es_query.script_fields = None
            source = "_source"
        elif isinstance(s.value, basestring) and is_keyword(s.value):
            es_query.fields.append(s.value)
        elif isinstance(s.value, list) and es_query.fields is not None:
            es_query.fields.extend(s.value)
        else:
            es_query.script_fields[literal_field(s.name)] = {
                "script": qb_expression_to_ruby(s.value)
            }

    return extract_rows(es, es_query, source, select, query)
def groupby_size(data, size):
    if hasattr(data, "next"):
        iterator = data
    elif hasattr(data, "__iter__"):
        iterator = data.__iter__()
    else:
        Log.error("do not know how to handle this type")

    done = DictList()

    def more():
        output = DictList()
        for i in range(size):
            try:
                output.append(iterator.next())
            except StopIteration:
                done.append(True)
                break
        return output

    # THIS IS LAZY
    i = 0
    while True:
        output = more()
        yield (i, output)
        if len(done) > 0:
            break
        i += 1
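# USAGE SKETCH (illustrative): consume any iterator in fixed-size chunks; each
# chunk is materialized by more() before it is yielded. NOTE: when the data
# length is an exact multiple of size, the final yielded chunk is empty.
def _example_groupby_size():
    for i, chunk in groupby_size(iter(range(5)), 2):
        print i, list(chunk)  # -> 0 [0, 1]; 1 [2, 3]; 2 [4]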
def post(sql):
    # FIND OUT THE default DOMAIN SIZES
    result = self.db.column_query(sql)
    num_edges = len(edges)
    for e, edge in enumerate(edges):
        domain = edge.domain
        if domain.type == "default":
            domain.type = "set"
            parts = set(result[e])
            domain.partitions = [{"index": i, "value": p} for i, p in enumerate(parts)]
            domain.map = {p: i for i, p in enumerate(parts)}
        else:
            Log.error("Do not know what to do here, yet")

    # FILL THE DATA CUBE
    maps = [(unwrap(e.domain.map), result[i]) for i, e in enumerate(edges)]
    cubes = DictList()
    for c, s in enumerate(select):
        data = Matrix(*[len(e.domain.partitions) + (1 if e.allow_nulls else 0) for e in edges])
        for rownum, value in enumerate(result[c + num_edges]):
            coord = [m[r[rownum]] for m, r in maps]
            data[coord] = value
        cubes.append(data)

    if isinstance(query.select, list):
        return cubes
    else:
        return cubes[0]
def __init__(self, **desc):
    Domain.__init__(self, **desc)
    self.NULL = Null
    self.partitions = DictList()
    self.map = dict()
    self.map[None] = self.NULL
def _select(template, data, fields, depth):
    output = DictList()
    deep_path = []
    deep_fields = UniqueIndex(["name"])
    for d in data:
        if isinstance(d, Dict):
            Log.error("programmer error, _select can not handle Dict")

        record = template.copy()
        children = None
        for f in fields:
            index, c = _select_deep(d, f, depth, record)
            children = c if children is None else children
            if index:
                path = f.value[0:index:]
                if not deep_fields[f]:
                    deep_fields.add(f)  # KEEP TRACK OF WHICH FIELDS NEED DEEPER SELECT
                short = MIN([len(deep_path), len(path)])
                if path[:short:] != deep_path[:short:]:
                    Log.error("Dangerous to select into more than one branch at time")
                if len(deep_path) < len(path):
                    deep_path = path
        if not children:
            output.append(record)
        else:
            output.extend(_select(record, children, deep_fields, depth + 1))

    return output
def __init__(self, **desc):
    Domain.__init__(self, **desc)
    desc = wrap(desc)

    self.type = "set"
    self.order = {}
    self.NULL = Null
    self.partitions = DictList()

    if isinstance(self.key, set):
        Log.error("problem")

    if isinstance(desc.partitions[0], basestring):
        # ASSUME PARTS ARE STRINGS, CONVERT TO REAL PART OBJECTS
        self.key = ("value",)
        self.map = dict()
        self.map[None] = self.NULL
        self.order[None] = len(desc.partitions)
        for i, p in enumerate(desc.partitions):
            part = {"name": p, "value": p}
            self.partitions.append(part)
            self.map[p] = part
            self.order[p] = i
    elif desc.partitions and desc.dimension.fields and len(desc.dimension.fields) > 1:
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.dimension.fields)
    elif desc.partitions and isinstance(desc.key, (list, set)):
        # TODO: desc.key CAN BE MUCH LIKE A SELECT, WHICH UniqueIndex CAN NOT HANDLE
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.key)
    elif desc.partitions and isinstance(desc.partitions[0][desc.key], Mapping):
        self.key = desc.key
        self.map = UniqueIndex(keys=desc.key)
        # self.key = UNION(set(d[desc.key].keys()) for d in desc.partitions)
        # self.map = UniqueIndex(keys=self.key)
    elif desc.key == None:
        Log.error("Domains must have keys")
    elif self.key:
        self.key = desc.key
        self.map = dict()
        self.map[None] = self.NULL
        self.order[None] = len(desc.partitions)
        for i, p in enumerate(desc.partitions):
            self.map[p[self.key]] = p
            self.order[p[self.key]] = i
    elif all(p.esfilter for p in self.partitions):
        # EVERY PART HAS AN esfilter DEFINED, SO USE THEM
        for i, p in enumerate(self.partitions):
            p.dataIndex = i
    else:
        Log.error("Can not handle")

    self.label = coalesce(self.label, "name")

    if isinstance(desc.partitions, list):
        self.partitions = desc.partitions.copy()
    else:
        Log.error("expecting a list of partitions")
def es_setop(es, query):
    es_query, filters = es14.util.es_query_template(query.frum.name)
    set_default(filters[0], simplify_esfilter(query.where.to_esfilter()))
    es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
    es_query.sort = jx_sort_to_es_sort(query.sort)
    es_query.fields = DictList()

    return extract_rows(es, es_query, query)
def _iter():
    g = 0
    out = DictList()
    try:
        for i, d in enumerate(data):
            out.append(d)
            if (i + 1) % max_size == 0:
                yield g, out
                g += 1
                out = DictList()
        if out:
            yield g, out
    except Exception, e:
        if out:
            # AT LEAST TRY TO RETURN WHAT HAS BEEN PROCESSED SO FAR
            yield g, out
        Log.error("Problem inside qb.groupby", e)
def es_fieldop(es, query):
    FromES = es09.util.build_es_query(query)
    select = listwrap(query.select)
    FromES.query = {
        "filtered": {
            "query": {"match_all": {}},
            "filter": simplify_esfilter(jx_expression(query.where).to_esfilter())
        }
    }
    FromES.size = coalesce(query.limit, 200000)
    FromES.fields = DictList()
    for s in select.value:
        if s == "*":
            FromES.fields = None
        elif isinstance(s, list):
            FromES.fields.extend(s)
        elif isinstance(s, Mapping):
            FromES.fields.extend(s.values())
        else:
            FromES.fields.append(s)
    FromES.sort = [{s.field: "asc" if s.sort >= 0 else "desc"} for s in query.sort]

    data = es09.util.post(es, FromES, query.limit)

    T = data.hits.hits
    matricies = {}
    for s in select:
        if s.value == "*":
            matricies[s.name] = Matrix.wrap([t._source for t in T])
        elif isinstance(s.value, Mapping):
            # for k, v in s.value.items():
            #     matricies[join_field(split_field(s.name) + [k])] = Matrix.wrap([unwrap(t.fields)[v] for t in T])
            matricies[s.name] = Matrix.wrap([
                {k: unwrap(t.fields).get(v, None) for k, v in s.value.items()}
                for t in T
            ])
        elif isinstance(s.value, list):
            matricies[s.name] = Matrix.wrap([
                tuple(unwrap(t.fields).get(ss, None) for ss in s.value)
                for t in T
            ])
        elif not s.value:
            matricies[s.name] = Matrix.wrap([unwrap(t.fields).get(s.value, None) for t in T])
        else:
            try:
                matricies[s.name] = Matrix.wrap([unwrap(t.fields).get(s.value, None) for t in T])
            except Exception, e:
                Log.error("", e)
def groupby(self, edges):
    """
    SLICE THIS CUBE IN TO ONES WITH LESS DIMENSIONALITY
    simple==True WILL HAVE GROUPS BASED ON PARTITION VALUE, NOT PARTITION OBJECTS
    """
    edges = DictList([_normalize_edge(e) for e in edges])

    stacked = [e for e in self.edges if e.name in edges.name]
    remainder = [e for e in self.edges if e.name not in edges.name]
    selector = [1 if e.name in edges.name else 0 for e in self.edges]

    if len(stacked) + len(remainder) != len(self.edges):
        Log.error("can not find some edges to group by")

    # CACHE SOME RESULTS
    keys = edges.name
    getKey = [e.domain.getKey for e in self.edges]
    lookup = [
        [getKey[i](p) for p in e.domain.partitions + ([None] if e.allowNulls else [])]
        for i, e in enumerate(self.edges)
    ]

    def coord2term(coord):
        output = wrap_leaves({keys[i]: lookup[i][c] for i, c in enumerate(coord)})
        return output

    if isinstance(self.select, list):
        selects = listwrap(self.select)
        index, v = zip(*self.data[selects[0].name].groupby(selector))

        coord = wrap([coord2term(c) for c in index])

        values = [v]
        for s in selects[1::]:
            i, v = zip(*self.data[s.name].group_by(selector))
            values.append(v)

        output = zip(coord, [
            Cube(self.select, remainder, {s.name: v[i] for i, s in enumerate(selects)})
            for v in zip(*values)
        ])
    elif not remainder:
        # v IS A VALUE, NO NEED TO WRAP IT IN A Cube
        output = (
            (coord2term(coord), v)
            for coord, v in self.data[self.select.name].groupby(selector)
        )
    else:
        output = (
            (coord2term(coord), Cube(self.select, remainder, v))
            for coord, v in self.data[self.select.name].groupby(selector)
        )

    return output
def select(data, field_name):
    """
    RETURN LIST WITH VALUES FROM field_name
    """
    if isinstance(data, Cube):
        return data._select(_normalize_selects(field_name))

    if isinstance(data, FlatList):
        return data.select(field_name)

    if isinstance(data, UniqueIndex):
        data = data._data.values()  # THE SELECT ROUTINE REQUIRES dicts, NOT Dict WHILE ITERATING

    if isinstance(data, Mapping):
        return select_one(data, field_name)

    if isinstance(field_name, Mapping):
        field_name = wrap(field_name)
        if field_name.value in ["*", "."]:
            return data

        if field_name.value:
            # SIMPLIFY {"value": value} AS STRING
            field_name = field_name.value

    # SIMPLE PYTHON ITERABLE ASSUMED
    if isinstance(field_name, basestring):
        path = split_field(field_name)
        if len(path) == 1:
            return DictList([d[field_name] for d in data])
        else:
            output = DictList()
            flat_list._select1(data, path, 0, output)
            return output
    elif isinstance(field_name, list):
        keys = [_select_a_field(wrap(f)) for f in field_name]
        return _select(Dict(), unwrap(data), keys, 0)
    else:
        keys = [_select_a_field(field_name)]
        return _select(Dict(), unwrap(data), keys, 0)
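# USAGE SKETCH (illustrative; rows are hypothetical): a single-segment field
# name is a plain column extraction.
def _example_select():
    data = [{"a": 1}, {"a": 2}]
    print select(data, "a")  # -> [1, 2]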
def _where_terms(master, where, schema):
    """
    USE THE SCHEMA TO CONVERT DIMENSION NAMES TO ES FILTERS
    master - TOP LEVEL WHERE (FOR PLACING NESTED FILTERS)
    """
    if isinstance(where, Mapping):
        if where.term:
            # MAP TERM
            try:
                output = _map_term_using_schema(master, [], where.term, schema.edges)
                return output
            except Exception, e:
                Log.error("programmer problem?", e)
        elif where.terms:
            # MAP TERM
            output = DictList()
            for k, v in where.terms.items():
                if not isinstance(v, (list, set)):
                    Log.error("terms filter expects list of values")
                edge = schema.edges[k]
                if not edge:
                    output.append({"terms": {k: v}})
                else:
                    if isinstance(edge, basestring):
                        # DIRECT FIELD REFERENCE
                        return {"terms": {edge: v}}
                    try:
                        domain = edge.getDomain()
                    except Exception, e:
                        Log.error("programmer error", e)
                    fields = domain.dimension.fields
                    if isinstance(fields, Mapping):
                        or_agg = []
                        for vv in v:
                            and_agg = []
                            for local_field, es_field in fields.items():
                                vvv = vv[local_field]
                                if vvv != None:
                                    and_agg.append({"term": {es_field: vvv}})
                            or_agg.append({"and": and_agg})
                        output.append({"or": or_agg})
                    elif isinstance(fields, list) and len(fields) == 1 and is_keyword(fields[0]):
                        output.append({"terms": {fields[0]: v}})
                    elif domain.partitions:
                        output.append({"or": [domain.getPartByKey(vv).esfilter for vv in v]})
            return {"and": output}
def _aggop(self, query):
    """
    SINGLE ROW RETURNED WITH AGGREGATES
    """
    if isinstance(query.select, list):
        # RETURN SINGLE OBJECT WITH AGGREGATES
        for s in query.select:
            if s.aggregate not in aggregates:
                Log.error("Expecting all columns to have an aggregate: {{select}}", select=s)

        selects = DictList()
        for s in query.select:
            selects.append(aggregates[s.aggregate].replace("{{code}}", s.value) + " AS " + self.db.quote_column(s.name))

        sql = expand_template("""
            SELECT
                {{selects}}
            FROM
                {{table}}
            {{where}}
        """, {
            "selects": SQL(",\n".join(selects)),
            "table": self._subquery(query["from"])[0],
            "where": self._where2sql(query.filter)
        })

        return sql, lambda sql: self.db.column(sql)[0]  # RETURNING SINGLE OBJECT WITH AGGREGATE VALUES
    else:
        # RETURN SINGLE VALUE
        s0 = query.select
        if s0.aggregate not in aggregates:
            Log.error("Expecting all columns to have an aggregate: {{select}}", select=s0)

        select = aggregates[s0.aggregate].replace("{{code}}", s0.value) + " AS " + self.db.quote_column(s0.name)

        sql = expand_template("""
            SELECT
                {{selects}}
            FROM
                {{table}}
            {{where}}
        """, {
            "selects": SQL(select),
            "table": self._subquery(query["from"])[0],
            "where": self._where2sql(query.where)
        })

        def post(sql):
            result = self.db.column_query(sql)
            return result[0][0]

        return sql, post  # RETURN SINGLE VALUE
def _groupby(self, edges):
    """
    RETURNS LIST OF (coord, values) TUPLES, WHERE
        coord IS THE INDEX INTO self CUBE (-1 INDEX FOR COORDINATES NOT GROUPED BY)
        values ALL VALUES THAT BELONG TO THE SLICE
    """
    edges = DictList([_normalize_edge(e) for e in edges])

    stacked = [e for e in self.edges if e.name in edges.name]
    remainder = [e for e in self.edges if e.name not in edges.name]
    selector = [1 if e.name in edges.name else 0 for e in self.edges]

    if len(stacked) + len(remainder) != len(self.edges):
        Log.error("can not find some edges to group by")

    # CACHE SOME RESULTS
    keys = edges.name
    getKey = [e.domain.getKey for e in self.edges]
    lookup = [
        [getKey[i](p) for p in e.domain.partitions + ([None] if e.allowNulls else [])]
        for i, e in enumerate(self.edges)
    ]

    def coord2term(coord):
        return wrap_leaves({keys[i]: lookup[i][c] for i, c in enumerate(coord)})

    if isinstance(self.select, list):
        selects = listwrap(self.select)
        index, v = zip(*self.data[selects[0].name].groupby(selector))

        coord = wrap([coord2term(c) for c in index])

        values = [v]
        for s in selects[1::]:
            i, v = zip(*self.data[s.name].group_by(selector))
            values.append(v)

        output = zip(coord, [
            Cube(self.select, remainder, {s.name: v[i] for i, s in enumerate(selects)})
            for v in zip(*values)
        ])
    elif not remainder:
        # v IS A VALUE, NO NEED TO WRAP IT IN A Cube
        output = (
            (coord2term(coord), v)
            for coord, v in self.data[self.select.name].groupby(selector)
        )
    else:
        output = (
            (coord2term(coord), Cube(self.select, remainder, v))
            for coord, v in self.data[self.select.name].groupby(selector)
        )

    return output
def sort(data, fieldnames=None, already_normalized=False):
    """
    PASS A FIELD NAME, OR LIST OF FIELD NAMES, OR LIST OF STRUCTS WITH {"field": field_name, "sort": direction}
    """
    try:
        if data == None:
            return Null

        if not fieldnames:
            return wrap(sorted(data, value_compare))

        if already_normalized:
            formal = fieldnames
        else:
            formal = query._normalize_sort(fieldnames)

        funcs = [(jx_expression_to_function(f.value), f.sort) for f in formal]

        def comparer(left, right):
            for func, sort_ in funcs:
                try:
                    result = value_compare(func(left), func(right), sort_)
                    if result != 0:
                        return result
                except Exception, e:
                    Log.error("problem with compare", e)
            return 0

        if isinstance(data, list):
            output = DictList([unwrap(d) for d in sorted(data, cmp=comparer)])
        elif hasattr(data, "__iter__"):
            output = DictList([unwrap(d) for d in sorted(list(data), cmp=comparer)])
        else:
            Log.error("Do not know how to handle")
            output = None

        return output
    except Exception, e:
        Log.error("Problem sorting", e)
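# USAGE SKETCH (illustrative; rows are hypothetical): sort descending on one field.
def _example_sort():
    data = [{"a": 2}, {"a": 1}, {"a": 3}]
    print sort(data, {"field": "a", "sort": -1})  # -> rows with a = 3, 2, 1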
def wrap(v):
    type_ = _get(v, "__class__")

    if type_ is dict:
        m = Dict(v)
        return m
    elif type_ is NoneType:
        return Null
    elif type_ is list:
        return DictList(v)
    elif type_ is GeneratorType:
        return (wrap(vv) for vv in v)
    else:
        return v
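# USAGE SKETCH (illustrative): plain containers become their null-safe
# counterparts; everything else passes through unchanged.
def _example_wrap():
    print type(wrap({"a": 1}))  # -> Dict
    print type(wrap([1, 2]))    # -> DictList
    print wrap(None)            # -> Null
    print wrap(42)              # -> 42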
def _getAllEdges(facetEdges, edgeDepth):
    """
    RETURN ALL PARTITION COMBINATIONS: A LIST OF ORDERED TUPLES
    """
    if edgeDepth == len(facetEdges):
        return [()]
    edge = facetEdges[edgeDepth]

    deeper = _getAllEdges(facetEdges, edgeDepth + 1)

    output = DictList()
    partitions = edge.domain.partitions
    for part in partitions:
        for deep in deeper:
            output.append((part,) + deep)
    return output
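# SKETCH (illustrative; the edges below are hypothetical stand-ins exposing only
# domain.partitions): the output is the cross product of all edge partitions.
def _example_getAllEdges():
    color = wrap({"domain": {"partitions": ["red", "blue"]}})
    size = wrap({"domain": {"partitions": ["S", "M"]}})
    print _getAllEdges([color, size], 0)
    # -> [("red", "S"), ("red", "M"), ("blue", "S"), ("blue", "M")]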
def es_deepop(es, mvel, query):
    FromES = es09.util.build_es_query(query)

    select = query.edges

    temp_query = query.copy()
    temp_query.select = select
    temp_query.edges = DictList()
    FromES.facets.mvel = {
        "terms": {
            "script_field": mvel.code(temp_query),
            "size": query.limit
        },
        "facet_filter": simplify_esfilter(jx_expression(query.where).to_esfilter())
    }

    data = es09.util.post(es, FromES, query.limit)

    rows = unpack_terms(data.facets.mvel, query.edges)
    terms = zip(*rows)

    # NUMBER ALL EDGES FOR JSON EXPRESSION INDEXING
    edges = query.edges
    for f, e in enumerate(edges):
        for r in terms[f]:
            e.domain.getPartByKey(r)

        e.index = f
        for p, part in enumerate(e.domain.partitions):
            part.dataIndex = p
        e.domain.NULL.dataIndex = len(e.domain.partitions)

    # MAKE CUBE
    dims = [len(e.domain.partitions) for e in query.edges]
    output = Matrix(*dims)

    # FILL CUBE
    for r in rows:
        term_coord = [e.domain.getPartByKey(r[i]).dataIndex for i, e in enumerate(edges)]
        output[term_coord] = SUM(output[term_coord], r[-1])

    cube = Cube(query.select, query.edges, {query.select.name: output})
    cube.frum = query
    return cube
def _tuple(template, data, fields, depth, output):
    deep_path = None
    deep_fields = DictList()
    for d in data:
        record = template
        for f in fields:
            index, children, record = _tuple_deep(d, f, depth, record)
            if index:
                path = f.value[0:index:]
                deep_fields.append(f)
                if deep_path and path != deep_path:
                    Log.error("Dangerous to select into more than one branch at time")
                deep_path = path
        if not children:
            output.append(record)
        else:
            _tuple(record, children, deep_fields, depth + 1, output)

    return output
def update(self, command):
    """
    EXPECTING command == {"set": term, "where": where}
    THE set CLAUSE IS A DICT MAPPING NAMES TO VALUES
    THE where CLAUSE IS AN ES FILTER
    """
    command = wrap(command)

    # GET IDS OF DOCUMENTS
    results = self._es.search({
        "fields": [],
        "query": {
            "filtered": {
                "query": {"match_all": {}},
                "filter": _normalize_where(command.where, self)
            }
        },
        "size": 200000
    })

    # SCRIPT IS SAME FOR ALL (CAN ONLY HANDLE ASSIGNMENT TO CONSTANT)
    scripts = DictList()
    for k, v in command.set.items():
        if not is_keyword(k):
            Log.error("Only support simple paths for now")
        scripts.append("ctx._source." + k + " = " + expressions.qb_expression_to_ruby(v) + ";\n")
    script = "".join(scripts)

    if results.hits.hits:
        command = []
        for id in results.hits.hits._id:
            command.append({"update": {"_id": id}})
            command.append({"script": script})
        content = ("\n".join(convert.value2json(c) for c in command) + "\n").encode('utf-8')
        self._es.cluster._post(
            self._es.path + "/_bulk",
            data=content,
            headers={"Content-Type": "application/json"}
        )
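# COMMAND SHAPE (illustrative; the index and values are hypothetical): assign a
# constant to one field on every document matching the filter.
#     index.update({
#         "set": {"status": "closed"},
#         "where": {"term": {"id": 42}}
#     })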
def addParts(parentPart, childPath, count, index):
    """
    BUILD A hierarchy BY REPEATEDLY CALLING THIS METHOD WITH VARIOUS childPaths
    count IS THE NUMBER FOUND FOR THIS PATH
    """
    if index == None:
        index = 0

    if index == len(childPath):
        return
    c = childPath[index]
    parentPart.count = coalesce(parentPart.count, 0) + count

    if parentPart.partitions == None:
        parentPart.partitions = DictList()
    for i, part in enumerate(parentPart.partitions):
        if part.name == c.name:
            addParts(part, childPath, count, index + 1)
            return

    parentPart.partitions.append(c)
    addParts(c, childPath, count, index + 1)
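# USAGE SKETCH (illustrative; parts are hypothetical): counts accumulate down
# the path, and missing intermediate parts are appended as they are first seen.
def _example_addParts():
    root = Dict(name="root")
    addParts(root, wrap([{"name": "2015"}, {"name": "Q1"}]), 3, 0)
    print root.count                             # -> 3
    print root.partitions[0].name                # -> 2015
    print root.partitions[0].partitions[0].name  # -> Q1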
def normalize_sort(sort=None):
    """
    CONVERT SORT PARAMETERS TO A NORMAL FORM SO EASIER TO USE
    """
    if not sort:
        return DictList.EMPTY

    output = DictList()
    for s in listwrap(sort):
        if isinstance(s, basestring) or Math.is_integer(s):
            output.append({"value": s, "sort": 1})
        elif not s.field and not s.value and s.sort == None:
            # ASSUME {name: sort} FORM
            for n, v in s.items():
                output.append({"value": n, "sort": sort_direction[v]})
        else:
            output.append({
                "value": coalesce(s.field, s.value),
                "sort": coalesce(sort_direction[s.sort], 1)
            })
    return wrap(output)
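# SKETCH (illustrative): unlike _normalize_sort above, this variant keeps raw
# field names instead of compiling them to expressions, e.g.
#     normalize_sort("a")            # -> [{"value": "a", "sort": 1}]
#     normalize_sort({"a": "desc"})  # -> [{"value": "a", "sort": -1}]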