def select(self, select):
    """
    Project each record onto the requested select clause(s), returning a new
    ListContainer (or self when the query is the trivial "." -> "." select).
    """
    terms = listwrap(select)

    # The identity select ("." named ".") can reuse this container unchanged;
    # a bare "." value under a different name keeps the schema but re-shapes rows.
    if len(terms) == 1 and isinstance(terms[0].value, Variable) and terms[0].value.var == ".":
        new_schema = self.schema
        if terms[0].name == ".":
            return self
    else:
        new_schema = None

    if isinstance(select, list):
        # pre-compile one pull function per clause
        pairs = [(t.name, jx_expression_to_function(t.value)) for t in terms]

        def project(row):
            acc = Data()
            for name, pull in pairs:
                acc[name] = unwraplist(pull(wrap(row)))
            return unwrap(acc)

        new_data = map(project, self.data)
    else:
        pull = jx_expression_to_function(select.value)
        new_data = map(pull, self.data)

    return ListContainer("from " + self.name, data=new_data, schema=new_schema)
def select(self, select):
    """
    Return a new ListContainer holding only the selected columns.
    Only simple Variable selects are supported here.
    """
    clauses = listwrap(select)

    # reject anything that is not a plain variable reference
    if not all(isinstance(c.value, Variable) for c in clauses):
        Log.error("selecting on structure, or expressions, not supported yet")

    # identity select: keep schema, maybe keep the whole container
    if len(clauses) == 1 and isinstance(clauses[0].value, Variable) and clauses[0].value.var == ".":
        new_schema = self.schema
        if clauses[0].name == ".":
            return self
    else:
        new_schema = None

    if isinstance(select, list):
        movers = [(c.name, jx_expression_to_function(c.value)) for c in clauses]

        def reshape(record):
            out = Data()
            for name, pull in movers:
                out[name] = pull(wrap(record))
            return unwrap(out)

        new_data = map(reshape, self.data)
    else:
        pull = jx_expression_to_function(select.value)
        new_data = map(pull, self.data)

    return ListContainer("from " + self.name, data=new_data, schema=new_schema)
def select(self, select):
    """
    Project rows onto the given select clause(s); returns a new ListContainer,
    or self for the trivial identity select.
    """
    wanted = listwrap(select)

    # only plain variable references are supported
    if not all(isinstance(w.value, Variable) for w in wanted):
        Log.error("selecting on structure, or expressions, not supported yet")

    if len(wanted) == 1 and isinstance(wanted[0].value, Variable) and wanted[0].value.var == ".":
        new_schema = self.schema
        if wanted[0].name == ".":
            return self
    else:
        new_schema = None

    if isinstance(select, list):
        extractors = [(w.name, jx_expression_to_function(w.value)) for w in wanted]

        def build_row(record):
            row = Dict()
            for name, pull in extractors:
                row[name] = pull(wrap(record))
            return unwrap(row)

        new_data = map(build_row, self.data)
    else:
        pull = jx_expression_to_function(select.value)
        new_data = map(pull, self.data)

    return ListContainer("from " + self.name, data=new_data, schema=new_schema)
def make_accessor(e):
    """
    Build a function that maps a row to the list of partition indexes (in
    e.domain) the row belongs to.

    Two shapes of edge are handled:
    * e.value  - a single expression keyed into the domain
    * e.range  - {"min": expr, "max": expr} matched against domain parts
    """
    d = e.domain
    if e.value:
        accessor = jx_expression_to_function(e.value)
        if e.allowNulls:
            # index len(d.partitions) is the null slot, so any key is acceptable
            def output1(row):
                return [d.getIndexByKey(accessor(row))]
            return output1
        else:
            def output2(row):
                c = d.getIndexByKey(accessor(row))
                # getIndexByKey returning len(partitions) means "no match";
                # without allowNulls the row simply lands nowhere
                if c == len(d.partitions):
                    return []
                else:
                    return [c]
            return output2
    elif e.range:
        # every part must be a closed interval for range matching to work
        for p in d.partitions:
            if p["max"] == None or p["min"] == None:
                Log.error(
                    "Inclusive expects domain parts to have `min` and `max` properties"
                )

        mi_accessor = jx_expression_to_function(e.range.min)
        ma_accessor = jx_expression_to_function(e.range.max)

        if e.range.mode == "inclusive":
            # row [mi, ma) overlaps part [min, max] -> part is included
            def output3(row):
                mi, ma = mi_accessor(row), ma_accessor(row)
                output = [
                    p.dataIndex
                    for p in d.partitions
                    if mi <= p["max"] and p["min"] < ma
                ]
                if e.allowNulls and not output:
                    return [len(d.partitions)]  # ENSURE THIS IS NULL
                return output
            return output3
        else:
            # "snapshot" mode: part key must fall inside the row's [mi, ma)
            def output4(row):
                mi, ma = mi_accessor(row), ma_accessor(row)
                var = d.key
                output = [
                    p.dataIndex
                    for p in d.partitions
                    if mi <= p[var] < ma
                ]
                if e.allowNulls and not output:
                    return [len(d.partitions)]  # ENSURE THIS IS NULL
                return output
            return output4
def groupby(data, keys=None, size=None, min_size=None, max_size=None, contiguous=False):
    """
    return list of (keys, values) pairs where
        group by the set of keys
        values IS LIST OF ALL data that has those keys
    contiguous - MAINTAIN THE ORDER OF THE DATA, STARTING THE NEW GROUP WHEN THE SELECTOR CHANGES
    """
    # size-based grouping is a different operation entirely
    if size != None or min_size != None or max_size != None:
        if size != None:
            max_size = size
        return groupby_min_max_size(data, min_size=min_size, max_size=max_size)

    # containers know how to group themselves
    if isinstance(data, Container):
        return data.groupby(keys)

    try:
        keys = listwrap(keys)
        get_key = jx_expression_to_function(keys)
        if not contiguous:
            # itertools.groupby requires runs of equal keys to be adjacent
            data = sorted(data, key=get_key)

        def _output():
            for g, v in itertools.groupby(data, get_key):
                group = Dict()
                for k, gg in zip(keys, g):
                    group[k] = gg
                yield (group, wrap(v))

        return _output()
    except Exception as e:
        # FIX: was `except Exception, e` (Python 2-only syntax) and passed the
        # exception positionally; use cause= as the other groupby variants do
        Log.error("Problem grouping", cause=e)
def groupby(data, keys=None, size=None, min_size=None, max_size=None, contiguous=False):
    """
    return list of (keys, values) pairs where
        group by the set of keys
        values IS LIST OF ALL data that has those keys
    contiguous - MAINTAIN THE ORDER OF THE DATA, STARTING THE NEW GROUP WHEN THE SELECTOR CHANGES
    """
    # size-based grouping is a different operation entirely
    if size != None or min_size != None or max_size != None:
        if size != None:
            max_size = size
        return groupby_min_max_size(data, min_size=min_size, max_size=max_size)

    # containers know how to group themselves
    if isinstance(data, Container):
        return data.groupby(keys)

    try:
        keys = listwrap(keys)
        get_key = jx_expression_to_function(keys)
        if not contiguous:
            # itertools.groupby requires runs of equal keys to be adjacent
            data = sorted(data, key=get_key)
        # lazily yield ({key_name: key_value, ...}, group_rows) pairs
        return ((wrap({k: v for k, v in zip(keys, g)}), wrap(v)) for g, v in itertools.groupby(data, get_key))
    except Exception as e:
        # FIX: was `except Exception, e` (Python 2-only syntax) and passed the
        # exception positionally; use cause= as the other groupby variants do
        Log.error("Problem grouping", cause=e)
def filter(data, where):
    """
    where - a function that accepts (record, rownum, rows) and returns boolean

    Returns the subset of data for which `where` holds.
    """
    # trivial cases: nothing to filter, or filter accepts everything
    if len(data) == 0 or where == None or where == TRUE_FILTER:
        return data

    if isinstance(data, Container):
        return data.filter(where)

    if isinstance(data, (list, set)):
        temp = jx_expression_to_function(where)
        dd = wrap(data)
        return wrap(
            [unwrap(d) for i, d in enumerate(data) if temp(wrap(d), i, dd)])
    else:
        Log.error("Do not know how to handle type {{type}}",
                  type=data.__class__.__name__)

    # NOTE(review): both branches above return (and Log.error presumably
    # raises), so this fallback looks unreachable — confirm before removing
    try:
        return drill_filter(where, data)
    except Exception:
        # FIX: was `except Exception, _` (Python 2-only syntax)
        # WOW! THIS IS INEFFICIENT!
        return wrap([
            unwrap(d)
            for d in drill_filter(where, [DataObject(d) for d in data])
        ])
def groupby(data, keys=None, size=None, min_size=None, max_size=None, contiguous=False):
    """
    :param data:
    :param keys:
    :param size:
    :param min_size:
    :param max_size:
    :param contiguous: MAINTAIN THE ORDER OF THE DATA, STARTING THE NEW GROUP WHEN THE SELECTOR CHANGES
    :return: return list of (keys, values) PAIRS, WHERE
                 keys IS IN LEAF FORM (FOR USE WITH {"eq": terms} OPERATOR
                 values IS GENERATOR OF ALL VALUE THAT MATCH keys
        contiguous -
    """
    # containers know how to group themselves
    if isinstance(data, Container):
        return data.groupby(keys)

    # size-based grouping is a different operation entirely
    if size != None or min_size != None or max_size != None:
        if size != None:
            max_size = size
        return groupby_min_max_size(data, min_size=min_size, max_size=max_size)

    try:
        keys = listwrap(keys)
        if not contiguous:
            from pyLibrary.queries import jx
            data = jx.sort(data, keys)

        if not data:
            return Null

        accessor = jx_expression_to_function(TupleOp(
            "tuple", keys))  # CAN RETURN Null, WHICH DOES NOT PLAY WELL WITH __cmp__

        def _output():
            # walk the (sorted) data, emitting a group every time the key tuple changes
            start = 0
            prev = accessor(data[0])
            for i, d in enumerate(data):
                curr = accessor(d)
                if curr != prev:
                    group = {}
                    for k, gg in zip(keys, prev):
                        group[k] = gg
                    yield Data(group), data[start:i]
                    start = i
                    prev = curr
            # final group runs to the end of the data
            group = {}
            for k, gg in zip(keys, prev):
                group[k] = gg
            yield Data(group), data[start:]

        return _output()
    except Exception as e:
        # FIX: was `except Exception, e` (Python 2-only syntax); the near-identical
        # variant of this function already uses `as e`
        Log.error("Problem grouping", cause=e)
def window(data, param):
    """
    MAYBE WE CAN DO THIS WITH NUMPY (no, the edges of windows are not graceful with numpy)
    data - list of records

    Assign a window-function result to column `param.name` on every record,
    grouped by `param.edges`, ordered by `param.sort`, over the relative
    window `param.range`.  Mutates the records in `data` in place.
    """
    name = param.name  # column to assign window function result
    edges = param.edges  # columns to group by
    where = param.where  # DO NOT CONSIDER THESE VALUES
    sortColumns = param.sort  # columns to sort by
    calc_value = wrap_function(jx_expression_to_function(param.value))  # function that takes a record and returns a value (for aggregation)
    aggregate = param.aggregate  # WindowFunction to apply
    _range = param.range  # of form {"min":-10, "max":0} to specify the size and relative position of window

    data = filter(data, where)

    # CASE 1: no aggregate, no grouping -> simple per-row calculated value
    if not aggregate and not edges:
        if sortColumns:
            data = sort(data, sortColumns, already_normalized=True)
        # SIMPLE CALCULATED VALUE
        for rownum, r in enumerate(data):
            r[name] = calc_value(r, rownum, data)
        return

    # CASE 2: grouped, but still no real aggregate -> per-row value within group
    if not aggregate or aggregate == "none":
        for _, values in groupby(data, edges.value):
            if not values:
                continue  # CAN DO NOTHING WITH THIS ZERO-SAMPLE
            sequence = sort(values, sortColumns, already_normalized=True)
            for rownum, r in enumerate(sequence):
                r[name] = calc_value(r, rownum, sequence)
        return

    # CASE 3: true windowed aggregate per group
    for keys, values in groupby(data, edges.value):
        if not values:
            continue  # CAN DO NOTHING WITH THIS ZERO-SAMPLE
        sequence = sort(values, sortColumns)

        # stash each row's value so the window can re-read it cheaply
        for rownum, r in enumerate(sequence):
            r["__temp__"] = calc_value(r, rownum, sequence)
        head = coalesce(_range.max, _range.stop)
        tail = coalesce(_range.min, _range.start)

        # PRELOAD total
        total = aggregate()
        for i in range(tail, head):
            total.add(sequence[i].__temp__)

        # WINDOW FUNCTION APPLICATION
        # NOTE(review): i + head / i + tail run past the ends of `sequence`;
        # this appears to rely on wrapped-list out-of-range access yielding
        # Null (and negative preload indexes wrapping) — confirm that
        # `sort` returns such a wrapped list before touching this loop
        for i, r in enumerate(sequence):
            r[name] = total.end()
            total.add(sequence[i + head].__temp__)
            total.sub(sequence[i + tail].__temp__)

    for r in data:
        r["__temp__"] = None  # CLEANUP
def groupby(data, keys=None, size=None, min_size=None, max_size=None, contiguous=False):
    """
    :param data:
    :param keys:
    :param size:
    :param min_size:
    :param max_size:
    :param contiguous: MAINTAIN THE ORDER OF THE DATA, STARTING THE NEW GROUP WHEN THE SELECTOR CHANGES
    :return: return list of (keys, values) PAIRS, WHERE
                 keys IS IN LEAF FORM (FOR USE WITH {"eq": terms} OPERATOR
                 values IS GENERATOR OF ALL VALUE THAT MATCH keys
        contiguous -
    """
    # containers know how to group themselves
    if isinstance(data, Container):
        return data.groupby(keys)

    # size-based grouping is a different operation entirely
    if size != None or min_size != None or max_size != None:
        if size != None:
            max_size = size
        return groupby_min_max_size(data, min_size=min_size, max_size=max_size)

    try:
        keys = listwrap(keys)
        if not contiguous:
            from pyLibrary.queries import jx
            data = jx.sort(data, keys)

        if not data:
            return Null

        if any(isinstance(k, Expression) for k in keys):
            Log.error("can not handle expressions")
        else:
            accessor = jx_expression_to_function(jx_expression({"tuple": keys}))  # CAN RETURN Null, WHICH DOES NOT PLAY WELL WITH __cmp__

        def _output():
            # walk the (sorted) data, emitting a group whenever the key tuple changes
            start = 0
            prev = accessor(data[0])
            for i, d in enumerate(data):
                curr = accessor(d)
                if curr != prev:
                    group = {}
                    for k, gg in zip(keys, prev):
                        group[k] = gg
                    yield Data(group), data[start:i:]
                    start = i
                    prev = curr
            # final group runs to the end of the data
            group = {}
            for k, gg in zip(keys, prev):
                group[k] = gg
            yield Data(group), data[start::]

        return _output()
    except Exception as e:
        Log.error("Problem grouping", cause=e)
def make_accessor(e):
    """
    Build a function mapping a row to the list of partition indexes (in
    e.domain) the row belongs to.  Handles both value-keyed edges and
    range edges ({"min": expr, "max": expr}).
    """
    domain = e.domain

    if e.value:
        pull = jx_expression_to_function(e.value)

        if e.allowNulls:
            # the slot at len(partitions) is the null partition, so every key maps somewhere
            def to_parts_with_null(row):
                return [domain.getIndexByKey(pull(row))]
            return to_parts_with_null
        else:
            def to_parts(row):
                i = domain.getIndexByKey(pull(row))
                # index == len(partitions) means "no match"; drop the row
                if i == len(domain.partitions):
                    return []
                return [i]
            return to_parts
    elif e.range:
        # range matching requires closed intervals on every part
        for part in domain.partitions:
            if part["max"] == None or part["min"] == None:
                Log.error("Inclusive expects domain parts to have `min` and `max` properties")

        pull_min = jx_expression_to_function(e.range.min)
        pull_max = jx_expression_to_function(e.range.max)

        if e.range.mode == "inclusive":
            # part is included when the row's [lo, hi) overlaps the part's [min, max]
            def overlapping_parts(row):
                lo, hi = pull_min(row), pull_max(row)
                hits = [p.dataIndex for p in domain.partitions if lo <= p["max"] and p["min"] < hi]
                if e.allowNulls and not hits:
                    return [len(domain.partitions)]  # ENSURE THIS IS NULL
                return hits
            return overlapping_parts
        else:
            # part is included when its key falls inside the row's [lo, hi)
            def containing_parts(row):
                lo, hi = pull_min(row), pull_max(row)
                key = domain.key
                hits = [p.dataIndex for p in domain.partitions if lo <= p[key] < hi]
                if e.allowNulls and not hits:
                    return [len(domain.partitions)]  # ENSURE THIS IS NULL
                return hits
            return containing_parts
def groupby(self, keys, contiguous=False):
    """
    Yield (group_keys, rows) pairs from self.data, grouped by `keys`.

    :param keys: field name(s) to group by
    :param contiguous: when True, keep the data's order and start a new
        group each time the key changes (no sorting)
    """
    try:
        keys = listwrap(keys)
        get_key = jx_expression_to_function(keys)
        if not contiguous:
            # itertools.groupby needs equal keys adjacent
            data = sorted(self.data, key=get_key)
        else:
            # FIX: `data` was left undefined when contiguous=True,
            # causing a NameError inside _output
            data = self.data

        def _output():
            for g, v in itertools.groupby(data, get_key):
                group = Dict()
                for k, gg in zip(keys, g):
                    group[k] = gg
                yield (group, wrap(list(v)))

        return _output()
    except Exception as e:
        # FIX: was `except Exception, e` (Python 2-only syntax) with the
        # exception passed positionally
        Log.error("Problem grouping", cause=e)
def groupby(self, keys, contiguous=False):
    """
    Yield (group_keys, rows) pairs from self.data, grouped by `keys`.

    :param keys: field name(s) to group by
    :param contiguous: when True, keep the data's order and start a new
        group each time the key changes (no sorting)
    """
    try:
        keys = listwrap(keys)
        get_key = jx_expression_to_function(keys)
        if not contiguous:
            # itertools.groupby needs equal keys adjacent
            data = sorted(self.data, key=get_key)
        else:
            # FIX: `data` was left undefined when contiguous=True,
            # causing a NameError inside _output
            data = self.data

        def _output():
            for g, v in itertools.groupby(data, get_key):
                group = Data()
                for k, gg in zip(keys, g):
                    group[k] = gg
                yield (group, wrap(list(v)))

        return _output()
    except Exception as e:
        # FIX: was `except Exception, e` (Python 2-only syntax) with the
        # exception passed positionally
        Log.error("Problem grouping", cause=e)
def select(self, select):
    """
    Project each record onto the requested select clause(s); returns a new
    ListContainer, or self for the trivial "." -> "." select.
    """
    selects = listwrap(select)

    # identity select: keep schema, maybe keep the whole container
    if len(selects) == 1 and isinstance(selects[0].value, Variable) and selects[0].value.var == ".":
        new_schema = self.schema
        if selects[0].name == ".":
            return self
    else:
        new_schema = None

    push_and_pull = [(s.name, jx_expression_to_function(s.value)) for s in selects]

    def constructor(d):
        output = Dict()
        for n, p in push_and_pull:
            output[n] = p(d)
        # FIX: was `_get(d, "_dict")`, which returned the ORIGINAL record's
        # dict and silently discarded the freshly-built `output`
        return _get(output, "_dict")

    new_data = map(constructor, self.data)
    return ListContainer("from " + self.name, data=new_data, schema=new_schema)
def es_aggop(es, mvel, query):
    """
    Run an aggregate-only (no edges) query against Elasticsearch using
    statistical facets, returning a 0-dimensional Cube of the results.
    """
    select = listwrap(query.select)
    FromES = build_es_query(query)

    # pure counts do not need statistical facets
    isSimple = AND(aggregates[s.aggregate] == "count" for s in select)
    if isSimple:
        return es_countop(es, query)  # SIMPLE, USE TERMS FACET INSTEAD

    value2facet = dict()  # ONLY ONE FACET NEEDED PER
    name2facet = dict()  # MAP name TO FACET WITH STATS

    for s in select:
        if s.value not in value2facet:
            if isinstance(s.value, Variable):
                # plain field reference: let ES compute stats on the field
                unwrap(FromES.facets)[s.name] = {
                    "statistical": {
                        "field": s.value.var
                    },
                    "facet_filter": simplify_esfilter(query.where.to_esfilter())
                }
            else:
                # computed value: ship a script to ES
                unwrap(FromES.facets)[s.name] = {
                    "statistical": {
                        "script": jx_expression_to_function(s.value)
                    },
                    "facet_filter": simplify_esfilter(query.where)
                }
            value2facet[s.value] = s.name
        name2facet[s.name] = value2facet[s.value]

    data = es09.util.post(es, FromES, query.limit)

    # one 0-d Matrix per select clause, pulling its aggregate out of the facet stats
    matricies = {
        s.name: Matrix(value=fix_es_stats(data.facets[literal_field(s.name)])[
            aggregates[s.aggregate]])
        for s in select
    }
    cube = Cube(query.select, [], matricies)
    cube.frum = query
    return cube
def sort(data, fieldnames=None, already_normalized=False):
    """
    PASS A FIELD NAME, OR LIST OF FIELD NAMES, OR LIST OF STRUCTS WITH {"field":field_name, "sort":direction}

    Returns a FlatList of the sorted (unwrapped) records; Null for None input.
    """
    try:
        if data == None:
            return Null

        if not fieldnames:
            # no sort spec: natural ordering
            # NOTE(review): value_compare is passed positionally as a cmp
            # function — Python 2 only; sorted() in Python 3 has no cmp arg
            return wrap(sorted(data, value_compare))

        if already_normalized:
            formal = fieldnames
        else:
            formal = query._normalize_sort(fieldnames)

        # one (value-extractor, direction) pair per sort clause
        funcs = [(jx_expression_to_function(f.value), f.sort) for f in formal]

        def comparer(left, right):
            # compare clause-by-clause; first non-zero result decides
            for func, sort_ in funcs:
                try:
                    result = value_compare(func(left), func(right), sort_)
                    if result != 0:
                        return result
                except Exception as e:
                    Log.error("problem with compare", e)
            return 0

        if isinstance(data, list):
            output = FlatList([unwrap(d) for d in sorted(data, cmp=comparer)])
        elif hasattr(data, "__iter__"):
            output = FlatList(
                [unwrap(d) for d in sorted(list(data), cmp=comparer)])
        else:
            Log.error("Do not know how to handle")
            output = None

        return output
    except Exception as e:
        Log.error("Problem sorting\n{{data}}", data=data, cause=e)
def sort(data, fieldnames=None, already_normalized=False):
    """
    Sort `data` by one or more fields.

    `fieldnames` may be a single field name, a list of field names, or a
    list of {"field": name, "sort": direction} structs.  Returns a FlatList
    of the sorted (unwrapped) records; Null for None input.
    """
    try:
        if data == None:
            return Null

        if not fieldnames:
            # no sort spec: natural ordering via value_compare
            return wrap(sorted(data, value_compare))

        formal = fieldnames if already_normalized else query._normalize_sort(fieldnames)

        # one (value-extractor, direction) pair per sort clause
        orderings = [(jx_expression_to_function(f.value), f.sort) for f in formal]

        def multi_field_cmp(left, right):
            # clause-by-clause comparison; first non-zero result decides
            for pull, direction in orderings:
                try:
                    diff = value_compare(pull(left), pull(right), direction)
                    if diff != 0:
                        return diff
                except Exception as e:
                    Log.error("problem with compare", e)
            return 0

        if isinstance(data, list):
            return FlatList([unwrap(d) for d in sorted(data, cmp=multi_field_cmp)])
        elif hasattr(data, "__iter__"):
            return FlatList([unwrap(d) for d in sorted(list(data), cmp=multi_field_cmp)])
        else:
            Log.error("Do not know how to handle")
            return None
    except Exception as e:
        Log.error("Problem sorting\n{{data}}", data=data, cause=e)
def filter(data, where):
    """
    where - a function that accepts (record, rownum, rows) and returns boolean

    Returns the subset of data for which `where` holds.
    """
    # trivial cases: nothing to filter, or filter accepts everything
    if len(data) == 0 or where == None or where == TRUE_FILTER:
        return data

    if isinstance(data, Container):
        return data.filter(where)

    if isinstance(data, (list, set)):
        temp = jx_expression_to_function(where)
        dd = wrap(data)
        return [d for i, d in enumerate(data) if temp(wrap(d), i, dd)]
    else:
        Log.error("Do not know how to handle type {{type}}",
                  type=data.__class__.__name__)

    # NOTE(review): both branches above return (and Log.error presumably
    # raises), so this fallback looks unreachable — confirm before removing
    try:
        return drill_filter(where, data)
    except Exception:
        # FIX: was `except Exception, _` (Python 2-only syntax)
        # WOW! THIS IS INEFFICIENT!
        return wrap([unwrap(d) for d in drill_filter(where, [DictObject(d) for d in data])])
def es_aggop(es, mvel, query):
    """
    Run an aggregate-only (no edges) query against Elasticsearch via
    statistical facets; returns a 0-dimensional Cube of the results.
    """
    select = listwrap(query.select)
    FromES = build_es_query(query)

    # when everything is a count, the cheaper terms-facet path suffices
    isSimple = AND(aggregates[s.aggregate] == "count" for s in select)
    if isSimple:
        return es_countop(es, query)  # SIMPLE, USE TERMS FACET INSTEAD

    value2facet = dict()  # ONLY ONE FACET NEEDED PER
    name2facet = dict()   # MAP name TO FACET WITH STATS

    for s in select:
        if s.value not in value2facet:
            if isinstance(s.value, Variable):
                # plain field reference: ES computes stats on the field directly
                unwrap(FromES.facets)[s.name] = {
                    "statistical": {
                        "field": s.value.var
                    },
                    "facet_filter": simplify_esfilter(query.where.to_esfilter())
                }
            else:
                # computed value: ship a script to ES
                unwrap(FromES.facets)[s.name] = {
                    "statistical": {
                        "script": jx_expression_to_function(s.value)
                    },
                    "facet_filter": simplify_esfilter(query.where)
                }
            value2facet[s.value] = s.name
        name2facet[s.name] = value2facet[s.value]

    data = es09.util.post(es, FromES, query.limit)

    # one 0-d Matrix per select clause, pulling its aggregate from the facet stats
    matricies = {s.name: Matrix(value=fix_es_stats(data.facets[literal_field(s.name)])[aggregates[s.aggregate]]) for s in select}
    cube = Cube(query.select, [], matricies)
    cube.frum = query
    return cube
def select(self, select):
    """
    Project each record onto the requested select clause(s); returns a new
    ListContainer, or self for the trivial "." -> "." select.
    """
    selects = listwrap(select)

    # identity select: keep schema, maybe keep the whole container
    if len(selects) == 1 and isinstance(
            selects[0].value, Variable) and selects[0].value.var == ".":
        new_schema = self.schema
        if selects[0].name == ".":
            return self
    else:
        new_schema = None

    push_and_pull = [(s.name, jx_expression_to_function(s.value)) for s in selects]

    def constructor(d):
        output = Dict()
        for n, p in push_and_pull:
            output[n] = p(d)
        # FIX: was `_get(d, "_dict")`, which returned the ORIGINAL record's
        # dict and silently discarded the freshly-built `output`
        return _get(output, "_dict")

    new_data = map(constructor, self.data)
    return ListContainer("from " + self.name, data=new_data, schema=new_schema)
def cube_aggs(frum, query): select = listwrap(query.select) #MATCH EDGES IN QUERY TO ONES IN frum for e in query.edges: for fs in frum.select: if fs.name == e.value: Log.error("Not implemented yet") if isinstance(e.domain, DefaultDomain): # DEFAULT DOMAINS CAN EASILY BE LOOKED UP FROM frum for fe in frum.edges: if fe.name == e.value: e.domain = SimpleSetDomain(**fe.domain.as_dict()) e.value = e.value + "." + fe.domain.key break else: for fe in frum.edges: if fe.name == e.value: e.value = e.value + "." + fe.domain.key break result = { s.name: Matrix(dims=[ len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges ], zeros=s.default) for s in select } where = jx_expression_to_function(query.where) for d in filter(where, frum.values()): coord = [ ] # LIST OF MATCHING COORDINATE FAMILIES, USUALLY ONLY ONE PER FAMILY BUT JOINS WITH EDGES CAN CAUSE MORE for e in query.edges: matches = get_matches(e, d) coord.append(matches) if len(matches) == 1 and d[e.name] == None: d[e.name] = e.domain.partitions[matches[0]] for s in select: mat = result[s.name] agg = s.aggregate var = s.value expr = jx_expression_to_function(var) val = expr(d) if agg == "count": if var == "." or var == None: for c in itertools.product(*coord): mat[c] += 1 continue if val != None: for c in itertools.product(*coord): mat[c] += 1 else: for c in itertools.product(*coord): acc = mat[c] if acc == None: acc = windows.name2accumulator.get(agg) if acc == None: Log.error( "select aggregate {{agg}} is not recognized", agg=agg) acc = acc(**s) mat[c] = acc acc.add(val) for s in select: if s.aggregate == "count": continue m = result[s.name] for c, var in m.items(): if var != None: m[c] = var.end() from pyLibrary.queries.containers.cube import Cube return Cube(select, query.edges, result)
def cube_aggs(frum, query): select = listwrap(query.select) #MATCH EDGES IN QUERY TO ONES IN frum for e in query.edges: for fs in frum.select: if fs.name == e.value: Log.error("Not implemented yet") if isinstance(e.domain, DefaultDomain): # DEFAULT DOMAINS CAN EASILY BE LOOKED UP FROM frum for fe in frum.edges: if fe.name == e.value: e.domain = SimpleSetDomain(**fe.domain.as_dict()) e.value = e.value + "." + fe.domain.key break else: for fe in frum.edges: if fe.name == e.value: e.value = e.value + "." + fe.domain.key break result = { s.name: Matrix( dims=[len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges], zeros=s.default ) for s in select } where = jx_expression_to_function(query.where) for d in filter(where, frum.values()): coord = [] # LIST OF MATCHING COORDINATE FAMILIES, USUALLY ONLY ONE PER FAMILY BUT JOINS WITH EDGES CAN CAUSE MORE for e in query.edges: matches = get_matches(e, d) coord.append(matches) if len(matches) == 1 and d[e.name] == None: d[e.name] = e.domain.partitions[matches[0]] for s in select: mat = result[s.name] agg = s.aggregate var = s.value expr = jx_expression_to_function(var) val = expr(d) if agg == "count": if var == "." or var == None: for c in itertools.product(*coord): mat[c] += 1 continue if val != None: for c in itertools.product(*coord): mat[c] += 1 else: for c in itertools.product(*coord): acc = mat[c] if acc == None: acc = windows.name2accumulator.get(agg) if acc == None: Log.error("select aggregate {{agg}} is not recognized", agg= agg) acc = acc(**s) mat[c] = acc acc.add(val) for s in select: if s.aggregate == "count": continue m = result[s.name] for c, var in m.items(): if var != None: m[c] = var.end() from pyLibrary.queries.containers.cube import Cube return Cube(select, query.edges, result)
def list_aggs(frum, query):
    """
    Aggregate a plain list of records along query.edges, applying the
    aggregates in query.select; returns a Cube.
    """
    frum = wrap(frum)
    select = listwrap(query.select)

    # materialize default domains by scanning the data for distinct values
    for e in query.edges:
        if isinstance(e.domain, DefaultDomain):
            accessor = jx_expression_to_function(e.value)
            unique_values = set(map(accessor, frum))
            if None in unique_values:
                e.allowNulls = coalesce(e.allowNulls, True)
                unique_values -= {None}
            e.domain = SimpleSetDomain(partitions=list(sorted(unique_values)))
        else:
            pass

    s_accessors = [(ss.name, compile_expression(ss.value.to_python())) for ss in select]

    # one accumulator Matrix per select clause; the extra slot holds nulls
    # NOTE(review): the zeros lambda closes over the comprehension variable
    # `s` — this is only safe if Matrix invokes zeros during construction
    # (while `s` is still the current clause); confirm before reusing lazily
    result = {
        s.name: Matrix(
            dims=[len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges],
            zeros=lambda: windows.name2accumulator.get(s.aggregate)(**s)
        )
        for s in select
    }
    where = jx_expression_to_function(query.where)
    coord = [None]*len(query.edges)
    edge_accessor = [(i, make_accessor(e)) for i, e in enumerate(query.edges)]

    # edge names that do not come from the data itself
    net_new_edge_names = set(wrap(query.edges).name) - UNION(e.value.vars() for e in query.edges)
    if net_new_edge_names & UNION(ss.value.vars() for ss in select):
        # s_accessor NEEDS THESE EDGES, SO WE PASS THEM ANYWAY
        for d in filter(where, frum):
            d = d.copy()
            for c, get_matches in edge_accessor:
                coord[c] = get_matches(d)
            for s_name, s_accessor in s_accessors:
                mat = result[s_name]
                for c in itertools.product(*coord):
                    acc = mat[c]
                    # back-fill the edge part values so the select can read them
                    for e, cc in zip(query.edges, c):
                        d[e.name] = e.domain.partitions[cc]
                    val = s_accessor(d, c, frum)
                    acc.add(val)
    else:
        # FASTER
        for d in filter(where, frum):
            for c, get_matches in edge_accessor:
                coord[c] = get_matches(d)
            for s_name, s_accessor in s_accessors:
                mat = result[s_name]
                for c in itertools.product(*coord):
                    acc = mat[c]
                    val = s_accessor(d, c, frum)
                    acc.add(val)

    # finalize: replace each accumulator with its aggregate value
    for s in select:
        # if s.aggregate == "count":
        #     continue
        m = result[s.name]
        for c, var in m.items():
            if var != None:
                m[c] = var.end()

    from pyLibrary.queries.containers.cube import Cube
    output = Cube(select, query.edges, result)
    return output
def get(expr):
    """
    RETURN FUNCTION FOR EXPRESSION

    Thin convenience wrapper around jx_expression_to_function.
    """
    compiled = jx_expression_to_function(expr)
    return compiled