def __init__(self, query_path, snowflake):
    # Resolve query_path against the snowflake: either it names one of the
    # snowflake's known query paths, or it names a multi-valued column that
    # is being treated as a nested table.
    if not is_list(snowflake.query_paths[0]):
        Log.error("Snowflake query paths should be a list of string tuples (well, technically, a list of lists of strings)")
    self.snowflake = snowflake
    try:
        # FIND query_path AMONG THE SNOWFLAKE'S DECLARED QUERY PATHS
        path = [
            p
            for p in snowflake.query_paths
            if untype_path(p[0]) == query_path
        ]
        if path:
            # WE DO NOT NEED TO LOOK INTO MULTI-VALUED FIELDS AS A TABLE
            self.multi = None
            self.query_path = path[0]
        else:
            # LOOK INTO A SPECIFIC MULTI VALUED COLUMN
            try:
                self.multi = [
                    c
                    for c in self.snowflake.columns
                    if untype_path(c.name) == query_path and c.multi > 1
                ][0]
                self.query_path = [self.multi.name] + self.multi.nested_path
            except Exception as e:
                # PROBLEM WITH METADATA UPDATE
                # fall back to treating query_path as a root-level path
                self.multi = None
                self.query_path = [query_path] + ["."]
                Log.warning("Problem getting query path {{path|quote}} in snowflake {{sf|quote}}", path=query_path, sf=snowflake.name, cause=e)
        # INVARIANT: query_path IS A LIST ENDING WITH "." (THE ROOT)
        if not is_list(self.query_path) or self.query_path[len(self.query_path) - 1] != ".":
            Log.error("error")
    except Exception as e:
        Log.error("logic error", cause=e)
def write(self, content):
    """
    Write text (or an iterable of text) to this file, encrypting when a
    key is configured.

    :param content: text, or iterable of text
    :return:
    """
    if not self.parent.exists:
        self.parent.create()
    with open(self._filename, "wb") as output:
        many = is_list(content)
        if many and self.key:
            Log.error(u"list of data and keys are not supported, encrypt before sending to file")
        if isinstance(content, text) and not many:
            # A SINGLE STRING IS TREATED AS A ONE-ELEMENT SEQUENCE
            pieces = [content]
        else:
            # lists and other iterables are consumed as-is
            pieces = content
        for piece in pieces:
            if not is_text(piece):
                Log.error(u"Expecting unicode data only")
            if self.key:
                from mo_math.aes_crypto import encrypt
                output.write(encrypt(piece, self.key).encode("utf8"))
            else:
                output.write(piece.encode("utf8"))
def _normalize_selects(selects, frum, schema=None):
    """
    Normalize a select clause (or a list of them) against frum/schema.
    Raises when two normalized selects share a name.
    """
    if frum == None or isinstance(frum, (list, set, text)):
        # NO REAL CONTAINER: NORMALIZE WITHOUT CONTEXT
        if not is_list(selects):
            return _normalize_select_no_context(selects, schema=schema)
        if len(selects) == 0:
            return Null
        output = [_normalize_select_no_context(s, schema=schema) for s in selects]
    elif is_list(selects):
        output = [
            normalized
            for s in selects
            for normalized in _normalize_select(s, frum=frum, schema=schema)
        ]
    else:
        output = _normalize_select(selects, frum, schema=schema)

    seen = set()
    for s in output:
        if s.name in seen:
            Log.error("{{name}} has already been defined", name=s.name)
        seen.add(s.name)
    return output
def compare_to_expected(query, result, expect, places):
    """
    Assert that a query result matches the expected output, normalizing
    column order and (when the query has no sort) row order first.

    :param query: the query that produced result (wrapped to Data)
    :param result: actual output, with result.meta.format of "table"/"list"/"cube"
    :param expect: expected output in the same format
    :param places: float comparison precision, passed to assertAlmostEqual
    :raises: via Log.error / AssertionError when the outputs do not match
    """
    query = wrap(query)
    expect = wrap(expect)

    if result.meta.format == "table":
        try:
            assertAlmostEqual(set(result.header), set(expect.header))
        except Exception as e:
            Log.error("format=table headers do not match", cause=e)

        # MAP FROM expected COLUMN TO result COLUMN
        mapping = transpose(*transpose(*filter(
            lambda v: v[0][1] == v[1][1],
            itertools.product(enumerate(expect.header), enumerate(result.header))
        ))[1])[0]
        result.header = [result.header[m] for m in mapping]

        if result.data:
            columns = transpose(*unwrap(result.data))
            result.data = transpose(*(columns[m] for m in mapping))

        if not query.sort:
            sort_table(result)
            sort_table(expect)
    elif result.meta.format == "list":
        if not query.sort:
            try:
                # result.data MAY BE A LIST OF VALUES, NOT OBJECTS
                data_columns = jx.sort(set(jx.get_columns(result.data, leaves=True)) | set(jx.get_columns(expect.data, leaves=True)), "name")
            except Exception:
                data_columns = [{"name": "."}]

            sort_order = listwrap(coalesce(query.edges, query.groupby)) + data_columns

            if is_list(expect.data):
                try:
                    expect.data = jx.sort(expect.data, sort_order.name)
                except Exception as _:
                    pass

            if is_list(result.data):
                try:
                    result.data = jx.sort(result.data, sort_order.name)
                except Exception as _:
                    pass
    elif result.meta.format == "cube" and len(result.edges) == 1 and result.edges[0].name == "rownum" and not query.sort:
        result_data, result_header = cube2list(result.data)
        # BUG FIX: map() returns a one-shot iterator in Python 3; it was
        # consumed by jx.sort() below, leaving an empty header for
        # list2cube().  Materialize it as a list so it can be used twice.
        result_header = list(map(literal_field, result_header))
        result_data = unwrap(jx.sort(result_data, result_header))
        result.data = list2cube(result_data, result_header)

        expect_data, expect_header = cube2list(expect.data)
        expect_header = list(map(literal_field, expect_header))
        expect_data = jx.sort(expect_data, expect_header)
        expect.data = list2cube(expect_data, expect_header)

    # CONFIRM MATCH
    assertAlmostEqual(result, expect, places=places)
def parse_sql(sql):
    """
    Convert an SQL string into a JSON query expression ("table" format).
    """
    # TODO: CONVERT tuple OF LITERALS INTO LITERAL LIST
    # # IF ALL MEMBERS OF A LIST ARE LITERALS, THEN MAKE THE LIST LITERAL
    # if all(isinstance(r, number_types) for r in output):
    #     pass
    # elif all(isinstance(r, number_types) or (is_data(r) and "literal" in r.keys()) for r in output):
    #     output = {"literal": [r['literal'] if is_data(r) else r for r in output]}
    query = wrap(moz_sql_parser.parse(sql))
    redundant_select = []

    # PULL OUT THE AGGREGATES
    for sel in listwrap(query.select):
        val = sel if sel == '*' else sel.value

        # EXTRACT KNOWN AGGREGATE FUNCTIONS
        if is_data(val):
            for agg_name in KNOWN_SQL_AGGREGATES:
                params = val[agg_name]
                if params == None:
                    continue
                if is_list(params):
                    # AGGREGATE WITH PARAMETERS EG percentile(value, 0.90)
                    sel.aggregate = agg_name
                    sel[agg_name] = unwraplist(params[1::])
                    sel.value = params[0]
                else:
                    # SIMPLE AGGREGATE
                    sel.aggregate = agg_name
                    sel.value = params
                break

        # LOOK FOR GROUPBY COLUMN IN SELECT CLAUSE, REMOVE DUPLICATION
        for g in listwrap(query.groupby):
            try:
                assertAlmostEqual(g.value, val, "")
                g.name = sel.name
                redundant_select.append(sel)
                break
            except Exception:
                pass

    # REMOVE THE REDUNDANT select
    if is_list(query.select):
        for r in redundant_select:
            query.select.remove(r)
    elif query.select and redundant_select:
        query.select = None

    # RENAME orderby TO sort
    query.sort, query.orderby = query.orderby, None
    query.format = "table"
    return query
def format_list(aggs, es_query, query, decoders, select):
    """
    Convert aggregation results into "list" format (one Data record per
    row); collapse to a single value when there are no edges or groupby.
    """
    table = format_table(aggs, es_query, query, decoders, select)
    header = table.header

    if query.edges or query.groupby:
        format = "list"
        data = []
        for row in table.data:
            record = Data()
            for name, value in zip(header, row):
                record[name] = value
            data.append(record)
    elif is_list(query.select):
        # SINGLE ROW, MANY SELECTS: RETURN ONE RECORD
        format = "value"
        data = Data()
        for name, value in zip(header, table.data[0]):
            data[name] = value
    else:
        # SINGLE ROW, SINGLE SELECT: RETURN THE BARE VALUE
        format = "value"
        data = table.data[0][0]

    return Data(meta={"format": format}, data=data)
def assertAlmostEqualValue(test, expected, digits=None, places=None, msg=None, delta=None):
    """
    Snagged from unittest/case.py, then modified (Aug2014)

    Assert test is "close enough" to expected.  Exactly one of digits
    (absolute decimal places), places (significant digits, default 15) or
    delta (absolute difference) may be given.  Raises AssertionError on
    mismatch, TypeError when more than one tolerance is supplied.
    """
    if expected is NULL:
        if test == None:  # pandas dataframes reject any comparision with an exception!
            return
        else:
            raise AssertionError(expand_template("{{test}} != {{expected}}", locals()))
    if expected == None:  # None has no expectations
        return
    if test == expected:
        # shortcut
        return

    if not is_number(expected):
        # SOME SPECIAL CASES, EXPECTING EMPTY CONTAINERS IS THE SAME AS EXPECTING NULL
        if is_list(expected) and len(expected) == 0 and test == None:
            return
        if is_data(expected) and not expected.keys() and test == None:
            return
        if test != expected:
            raise AssertionError(expand_template("{{test}} != {{expected}}", locals()))
        return

    # COUNT THE TOLERANCE PARAMETERS; ONLY ONE IS ALLOWED
    num_param = 0
    if digits != None:
        num_param += 1
    if places != None:
        num_param += 1
    if delta != None:
        num_param += 1
    if num_param > 1:
        raise TypeError("specify only one of digits, places or delta")

    if digits is not None:
        # COMPARE BY ABSOLUTE DECIMAL PLACES
        with suppress_exception:
            diff = log10(abs(test - expected))
            if diff < digits:
                return
        standardMsg = expand_template("{{test}} != {{expected}} within {{digits}} decimal places", locals())
    elif delta is not None:
        # COMPARE BY ABSOLUTE DIFFERENCE
        if abs(test - expected) <= delta:
            return
        standardMsg = expand_template("{{test}} != {{expected}} within {{delta}} delta", locals())
    else:
        # COMPARE BY SIGNIFICANT DIGITS (DEFAULT 15)
        if places is None:
            places = 15
        with suppress_exception:
            diff = mo_math.log10(abs(test - expected))
            if diff < mo_math.ceiling(mo_math.log10(abs(test))) - places:
                return
        standardMsg = expand_template("{{test|json}} != {{expected|json}} within {{places}} places", locals())

    raise AssertionError(coalesce(msg, "") + ": (" + standardMsg + ")")
def select(self, select):
    """
    Apply the given select clause(s) to this container; return a new
    ListContainer (or self, when selecting "." under its own name).
    """
    selects = listwrap(select)

    new_schema = None
    if len(selects) == 1 and is_op(selects[0].value, Variable) and selects[0].value.var == ".":
        if selects[0].name == ".":
            return self
        new_schema = self.schema

    if not is_list(select):
        # SINGLE SELECT
        puller = jx_expression_to_function(select.value)
        new_data = map(puller, self.data)
        if is_op(select.value, Variable):
            column = copy(first(c for c in self.schema.columns if c.name == select.value.var))
            column.name = '.'
            new_schema = Schema("from " + self.name, [column])
    else:
        # MANY SELECTS
        if all(is_op(s.value, Variable) and s.name == s.value.var for s in select):
            names = set(s.value.var for s in select)
            new_schema = Schema(".", [c for c in self.schema.columns if c.name in names])
        pullers = [(s.name, jx_expression_to_function(s.value)) for s in selects]

        def selector(record):
            acc = Data()
            for name, pull in pullers:
                acc[name] = unwraplist(pull(wrap(record)))
            return unwrap(acc)

        new_data = map(selector, self.data)

    return ListContainer("from " + self.name, data=new_data, schema=new_schema)
def _update_meta(self):
    """
    Refresh the stats (count, cardinality, partitions, multi) on each
    meta.columns description; no-op unless marked dirty.
    """
    if not self.dirty:
        return

    for column_list in self.data.get("meta.columns").values():
        for meta_col in column_list:
            count = 0
            scalar_values = set()
            objects = 0
            multi = 1
            for column in self._all_columns():
                value = column[meta_col.name]
                if value == None:
                    continue
                count += 1
                if is_list(value):
                    multi = max(multi, len(value))
                    try:
                        scalar_values |= set(value)
                    except Exception:
                        # UNHASHABLE MEMBERS: COUNT THEM AS OBJECTS
                        objects += len(value)
                elif is_data(value):
                    objects += 1
                else:
                    scalar_values.add(value)
            meta_col.count = count
            meta_col.cardinality = len(scalar_values) + objects
            meta_col.partitions = jx.sort(scalar_values)
            meta_col.multi = multi
            meta_col.last_updated = Date.now()

    self.dirty = False
def tuple(data, field_name):
    """
    RETURN LIST OF TUPLES
    """
    if isinstance(data, Cube):
        Log.error("not supported yet")
    if isinstance(data, FlatList):
        Log.error("not supported yet")

    if is_data(field_name) and "value" in field_name:
        # SIMPLIFY {"value":value} AS STRING
        field_name = field_name["value"]

    # SIMPLE PYTHON ITERABLE ASSUMED
    if is_text(field_name):
        path = split_field(field_name)
        if len(path) == 1:
            return [(row[field_name],) for row in data]
        output = []
        flat_list._tuple1(data, path, 0, output)
        return output

    if is_list(field_name):
        paths = [_select_a_field(f) for f in field_name]
        output = FlatList()
        _tuple((), unwrap(data), paths, 0, output)
        return output

    paths = [_select_a_field(field_name)]
    output = FlatList()
    _tuple((), data, paths, 0, output)
    return output
def _expand(template, seq):
    """
    seq IS TUPLE OF OBJECTS IN PATH ORDER INTO THE DATA TREE
    """
    if is_text(template):
        return _simple_expand(template, seq)

    if is_data(template):
        # EXPAND LISTS OF ITEMS USING THIS FORM
        # {"from":from, "template":template, "separator":separator}
        template = wrap(template)
        assert template["from"], "Expecting template to have 'from' attribute"
        assert template.template, "Expecting template to have 'template' attribute"

        rows = seq[-1][template["from"]]
        expanded = [_expand(template.template, seq + (row,)) for row in rows]
        return coalesce(template.separator, "").join(expanded)

    if is_list(template):
        return "".join(_expand(item, seq) for item in template)

    if not _Log:
        _late_import()
    _Log.error("can not handle")
def parse_field(fieldname, data, depth):
    """
    RETURN (first, rest) OF fieldname

    Walks fieldname's dotted path into data; splits at the first
    list-valued step, recording the traversal in the primary_* state.
    NOTE(review): primary_column/primary_nested/primary_branch appear to be
    module-level accumulators shared across calls — confirm against callers.
    """
    col = split_field(fieldname)
    d = data
    for i, c in enumerate(col):
        try:
            d = d[c]
        except Exception as e:
            Log.error("{{name}} does not exist", name=fieldname)
        if is_list(d) and len(col) > 1:
            # FOUND A NESTED LIST: RECORD THE BRANCH POINT
            if len(primary_column) <= depth + i:
                primary_nested.append(True)
                primary_column.append(c)
                primary_branch.append(d)
            elif primary_nested[depth] and primary_column[depth + i] != c:
                Log.error("only one branch of tree allowed")
            else:
                primary_nested[depth + i] = True
                primary_column[depth + i] = c
                primary_branch[depth + i] = d
            return c, join_field(col[i + 1 :])
        else:
            # SCALAR STEP: RECORD IT (AS A ONE-ELEMENT BRANCH) IF NEW
            if len(primary_column) <= depth + i:
                primary_nested.append(False)
                primary_column.append(c)
                primary_branch.append([d])
    return fieldname, None
def _select_deep(v, field, depth, record):
    """
    field = {"name":name, "value":["attribute", "path"]}
    r[field.name]=v[field.value], BUT WE MUST DEAL WITH POSSIBLE LIST IN field.value PATH
    """
    if hasattr(field.value, "__call__"):
        # CALLABLE SELECTOR: APPLY DIRECTLY
        try:
            record[field.name] = field.value(wrap(v))
        except Exception as e:
            record[field.name] = None
        return 0, None

    for step, attr in enumerate(field.value[depth : len(field.value) - 1 :]):
        v = v.get(attr)
        if v is None:
            return 0, None
        if is_list(v):
            # NESTED LIST FOUND: REPORT WHERE TO RESUME
            return depth + step + 1, v

    last = field.value.last()
    try:
        if not last:
            # NO NAME FIELD INDICATES SELECT VALUE
            record[field.name] = v
        else:
            record[field.name] = v.get(last)
    except Exception as e:
        Log.error(
            "{{value}} does not have {{field}} property", value=v, field=last, cause=e
        )
    return 0, None
def _select_deep(v, field, depth, record):
    """
    field = {"name":name, "value":["attribute", "path"]}
    r[field.name]=v[field.value], BUT WE MUST DEAL WITH POSSIBLE LIST IN field.value PATH
    """
    value_path = field.value
    if hasattr(value_path, "__call__"):
        # CALLABLE SELECTOR: APPLY DIRECTLY
        try:
            record[field.name] = value_path(wrap(v))
        except Exception as e:
            record[field.name] = None
        return 0, None

    remaining = value_path[depth:len(value_path) - 1:]
    for offset, attr in enumerate(remaining):
        v = v.get(attr)
        if v is None:
            return 0, None
        if is_list(v):
            # NESTED LIST: CALLER MUST RECURSE FROM HERE
            return depth + offset + 1, v

    tail = value_path.last()
    try:
        # EMPTY tail INDICATES SELECT VALUE
        record[field.name] = v if not tail else v.get(tail)
    except Exception as e:
        Log.error("{{value}} does not have {{field}} property", value=v, field=tail, cause=e)
    return 0, None
def tuple(data, field_name):
    """
    RETURN LIST OF TUPLES
    """
    if isinstance(data, Cube):
        Log.error("not supported yet")
    if isinstance(data, FlatList):
        Log.error("not supported yet")

    if is_data(field_name) and "value" in field_name:
        # SIMPLIFY {"value":value} AS STRING
        field_name = field_name["value"]

    # SIMPLE PYTHON ITERABLE ASSUMED
    if is_text(field_name):
        if len(split_field(field_name)) == 1:
            return [(record[field_name],) for record in data]
        path = split_field(field_name)
        accumulator = []
        flat_list._tuple1(data, path, 0, accumulator)
        return accumulator
    elif is_list(field_name):
        selected = [_select_a_field(f) for f in field_name]
        accumulator = FlatList()
        _tuple((), unwrap(data), selected, 0, accumulator)
        return accumulator
    else:
        selected = [_select_a_field(field_name)]
        accumulator = FlatList()
        _tuple((), data, selected, 0, accumulator)
        return accumulator
def parse_field(fieldname, data, depth):
    """
    RETURN (first, rest) OF fieldname

    Descends fieldname's dotted path through data, splitting at the first
    list-valued step.  NOTE(review): primary_column/primary_nested/
    primary_branch look like module-level accumulators — verify scope.
    """
    col = split_field(fieldname)
    d = data
    for i, c in enumerate(col):
        try:
            d = d[c]
        except Exception as e:
            Log.error("{{name}} does not exist", name=fieldname)
        if is_list(d) and len(col) > 1:
            # LIST ENCOUNTERED MID-PATH: RECORD BRANCH AND SPLIT HERE
            if len(primary_column) <= depth + i:
                primary_nested.append(True)
                primary_column.append(c)
                primary_branch.append(d)
            elif primary_nested[depth] and primary_column[depth + i] != c:
                Log.error("only one branch of tree allowed")
            else:
                primary_nested[depth + i] = True
                primary_column[depth + i] = c
                primary_branch[depth + i] = d
            return c, join_field(col[i + 1:])
        else:
            # SCALAR STEP: EXTEND THE RECORDED PATH IF IT IS NEW
            if len(primary_column) <= depth + i:
                primary_nested.append(False)
                primary_column.append(c)
                primary_branch.append([d])
    return fieldname, None
def _expand(template, seq):
    """
    Expand template against seq, the tuple of objects in path order into
    the data tree.
    """
    if is_text(template):
        return _simple_expand(template, seq)
    elif is_data(template):
        # EXPAND LISTS OF ITEMS USING THIS FORM
        # {"from":from, "template":template, "separator":separator}
        template = to_data(template)
        assert template["from"], "Expecting template to have 'from' attribute"
        assert template.template, "Expecting template to have 'template' attribute"

        rows = seq[-1][template["from"]]
        parts = [_expand(template.template, seq + (row,)) for row in rows]
        separator = coalesce(template.separator, "")
        return separator.join(parts)
    elif is_list(template):
        return "".join(_expand(item, seq) for item in template)
    else:
        if not _Log:
            _late_import()
        _Log.error("can not handle")
def _update_meta(self):
    """
    Refresh count/cardinality/partitions/multi stats on every column
    description in META_COLUMNS_DESC; no-op unless marked dirty.
    """
    if not self.dirty:
        return

    now = Date.now()
    for meta_col in META_COLUMNS_DESC.columns:
        count = 0
        simple_values = set()
        objects = 0
        multi = 1
        for column in self._all_columns():
            value = column[meta_col.name]
            if value == None:
                continue
            count += 1
            if is_list(value):
                multi = max(multi, len(value))
                try:
                    simple_values |= set(value)
                except Exception:
                    # UNHASHABLE MEMBERS: COUNT AS OBJECTS
                    objects += len(value)
            elif is_data(value):
                objects += 1
            else:
                simple_values.add(value)
        meta_col.count = count
        meta_col.cardinality = len(simple_values) + objects
        meta_col.partitions = jx.sort(simple_values)
        meta_col.multi = multi
        meta_col.last_updated = now

    META_COLUMNS_DESC.last_updated = now
    self.dirty = False
def select(self, select):
    """
    Apply the given select clause(s) to this container; return a new
    ListContainer (or self, when selecting "." under its own name).
    """
    selects = listwrap(select)

    new_schema = None
    if len(selects) == 1 and is_op(selects[0].value, Variable) and selects[0].value.var == ".":
        if selects[0].name == ".":
            return self
        new_schema = self.schema

    if not is_list(select):
        # SINGLE SELECT
        pull = jx_expression_to_function(select.value)
        new_data = list(map(pull, self.data))
        if is_op(select.value, Variable):
            column = dict(**first(c for c in self.schema.columns if c.name == select.value.var))
            column.update({"name": ".", "jx_type": NESTED, "es_type": "nested", "multi": 1001, "cardinality": 1})
            new_schema = Schema("from " + self.name, [Column(**column)])
    else:
        # MANY SELECTS
        if all(is_op(s.value, Variable) and s.name == s.value.var for s in select):
            names = set(s.value.var for s in select)
            new_schema = Schema(".", [c for c in self.schema.columns if c.name in names])
        pulls = [(s.name, jx_expression_to_function(s.value)) for s in selects]

        def selector(record):
            acc = Data()
            for name, extract in pulls:
                acc[name] = unwraplist(extract(to_data(record)))
            return unwrap(acc)

        new_data = list(map(selector, self.data))

    return ListContainer("from " + self.name, data=new_data, schema=new_schema)
def command_loop(local):
    """
    Main loop of the mo-python subprocess: read one JSON command per line
    from stdin, execute it against the shared context, and write a JSON
    response (DONE or {"out":...}/{"err":...}) to STDOUT.

    SECURITY: commands are exec'd verbatim — this assumes the only writer
    to stdin is the trusted parent process.
    """
    DEBUG and Log.note("mo-python process running with {{config|json}}", config=local['config'])
    while not please_stop:
        line = sys.stdin.readline()
        try:
            # NOTE(review): .decode('utf8') implies stdin yields bytes here —
            # confirm stdin is in binary mode in this process
            command = json2value(line.decode('utf8'))
            DEBUG and Log.note("got {{command}}", command=command)

            if "import" in command:
                dummy = {}
                if is_text(command['import']):
                    exec("from " + command['import'] + " import *", dummy, context)
                else:
                    exec(
                        "from " + command['import']['from'] + " import " + ",".join(listwrap(command['import']['vars'])),
                        dummy,
                        context
                    )
                STDOUT.write(DONE)
            elif "set" in command:
                for k, v in command.set.items():
                    context[k] = v
                STDOUT.write(DONE)
            elif "get" in command:
                # LOOK UP IN local FIRST, THEN THE SHARED context
                STDOUT.write(
                    value2json({
                        "out": coalesce(local.get(command['get']), context.get(command['get']))
                    }))
                STDOUT.write('\n')
            elif "stop" in command:
                STDOUT.write(DONE)
                please_stop.go()
            elif "exec" in command:
                if not is_text(command['exec']):
                    Log.error("exec expects only text")
                exec(command['exec'], context, local)
                STDOUT.write(DONE)
            else:
                # ANY OTHER KEY IS A FUNCTION CALL: {func_name: args}
                for k, v in command.items():
                    if is_list(v):
                        # POSITIONAL ARGUMENTS
                        exec(
                            "_return = " + k + "(" + ",".join(map(value2json, v)) + ")",
                            context,
                            local)
                    else:
                        # KEYWORD ARGUMENTS
                        exec(
                            "_return = " + k + "(" + ",".join(kk + "=" + value2json(vv) for kk, vv in v.items()) + ")",
                            context,
                            local)
                STDOUT.write(value2json({"out": local['_return']}))
                STDOUT.write('\n')
        except Exception as e:
            STDOUT.write(value2json({"err": e}))
            STDOUT.write('\n')
        finally:
            STDOUT.flush()
def _replace_ref(node, url):
    """
    Recursively replace {"$ref": <url>} entries in node with the content
    they point to.  url is the location of the current document, used to
    resolve scheme-relative references.  Local (fragment-only) refs are
    left for a later pass.
    """
    if url.path.endswith("/"):
        url.path = url.path[:-1]

    if is_data(node):
        ref = None
        output = {}
        for k, v in node.items():
            if k == "$ref":
                ref = URL(v)
            else:
                output[k] = _replace_ref(v, url)

        if not ref:
            return output

        node = output

        if not ref.scheme and not ref.path:
            # DO NOT TOUCH LOCAL REF YET
            output["$ref"] = ref
            return output

        if not ref.scheme:
            # SCHEME RELATIVE IMPLIES SAME PROTOCOL AS LAST TIME, WHICH
            # REQUIRES THE CURRENT DOCUMENT'S SCHEME
            ref.scheme = url.scheme

        # FIND THE SCHEME AND LOAD IT
        if ref.scheme in scheme_loaders:
            new_value = scheme_loaders[ref.scheme](ref, url)
        else:
            raise Log.error("unknown protocol {{scheme}}", scheme=ref.scheme)

        if ref.fragment:
            new_value = get_attr(new_value, ref.fragment)

        DEBUG and Log.note(
            "Replace {{ref}} with {{new_value}}", ref=ref, new_value=new_value)

        if not output:
            # NOTHING BESIDES THE $ref: RETURN THE LOADED VALUE DIRECTLY
            output = new_value
        elif is_text(output):
            Log.error("Can not handle set_default({{output}},{{new_value}})", output=output, new_value=new_value)
        else:
            # MERGE LOADED VALUE UNDER THE EXISTING KEYS
            output = unwrap(set_default(output, new_value))

        DEBUG and Log.note("Return {{output}}", output=output)
        return output
    elif is_list(node):
        output = [_replace_ref(n, url) for n in node]
        # if all(p[0] is p[1] for p in zip(output, node)):
        #     return node
        return output

    return node
def value(self):
    """
    Return the sole value held by this cube; error when it still has
    edges or multiple selects.
    """
    if self.is_none:
        return Null
    if self.edges:
        Log.error("can not get value of with dimension")
    if is_list(self.select):
        Log.error("can not get value of multi-valued cubes")
    selected = self.data[self.select.name]
    return selected.cube
def parse_hg_date(date):
    """
    Convert an hg changeset date (a string, or a (timestamp, time_zone)
    tuple) into a Date.
    """
    if is_text(date):
        return Date(date)
    if is_list(date):
        # FIRST IN TUPLE (timestamp, time_zone) TUPLE, WHERE timestamp IS GMT
        timestamp = date[0]
        return Date(timestamp)
    Log.error("Can not deal with date like {{date|json}}", date=date)
def error(
    cls,
    template,  # human readable template
    default_params={},  # parameters for template
    cause=None,  # pausible cause
    stack_depth=0,
    **more_params
):
    """
    raise an exception with a trace for the cause too

    :param template: *string* human readable string with placeholders for parameters
    :param default_params: *dict* parameters to fill in template
    :param cause: *Exception* for chaining
    :param stack_depth: *int* how many calls you want popped off the stack to report the *true* caller
    :param log_context: *dict* extra key:value pairs for your convenience
    :param more_params: *any more parameters (which will overwrite default_params)
    :return:
    """
    if not is_text(template):
        # sys.stderr.write(str("Log.error was expecting a unicode template"))
        Log.error("Log.error was expecting a unicode template")

    # ALLOW CALLERS TO PASS THE CAUSE IN THE params POSITION
    if default_params and isinstance(
            listwrap(default_params)[0], BaseException):
        cause = default_params
        default_params = {}

    params = Data(dict(default_params, **more_params))

    add_to_trace = False
    if cause == None:
        causes = None
    elif is_list(cause):
        causes = []
        for c in listwrap(
                cause
        ):  # CAN NOT USE LIST-COMPREHENSION IN PYTHON3 (EXTRA STACK DEPTH FROM THE IN-LINED GENERATOR)
            causes.append(Except.wrap(c, stack_depth=1))
        causes = FlatList(causes)
    elif isinstance(cause, BaseException):
        causes = Except.wrap(cause, stack_depth=1)
    else:
        causes = None
        Log.error("can only accept Exception, or list of exceptions")

    trace = exceptions.get_stacktrace(stack_depth + 1)

    # NOTE(review): add_to_trace is always False here, so this branch is
    # currently dead code — confirm whether it was meant to be settable
    if add_to_trace:
        cause[0].trace.extend(trace[1:])

    e = Except(
        context=exceptions.ERROR,
        template=template,
        params=params,
        cause=causes,
        trace=trace,
    )
    raise_from_none(e)
def select(self, fields):
    """
    Extract values from this nested list: a single field name returns a
    plain list of values; a list of select clauses returns a FlatList of
    Data records.
    """
    if is_data(fields):
        fields = fields.value

    if is_text(fields):
        # RETURN LIST OF VALUES
        if len(split_field(fields)) == 1:
            if self.path[0] == fields:
                return [d[1] for d in self.data]
            else:
                return [d[0][fields] for d in self.data]
        else:
            keys = split_field(fields)
            depth = coalesce(MIN([i for i, (k, p) in enumerate(zip(keys, self.path)) if k != p]), len(self.path))  # LENGTH OF COMMON PREFIX
            short_key = keys[depth:]

            output = FlatList()
            _select1((wrap(d[depth]) for d in self.data), short_key, 0, output)
            return output

    if is_list(fields):
        output = FlatList()

        # BUILD (name, extractor) PAIRS; CALLABLES ARE USED AS-IS
        meta = []
        for f in fields:
            if hasattr(f.value, "__call__"):
                meta.append((f.name, f.value))
            else:
                meta.append((f.name, functools.partial(lambda v, d: d[v], f.value)))

        for row in self._values():
            agg = Data()
            for name, f in meta:
                agg[name] = f(row)
            output.append(agg)
        return output

    # meta = []
    # for f in fields:
    #     keys = split_field(f.value)
    #     depth = coalesce(MIN([i for i, (k, p) in enumerate(zip(keys, self.path)) if k != p]), len(self.path))  # LENGTH OF COMMON PREFIX
    #     short_key = join_field(keys[depth:])
    #
    #     meta.append((f.name, depth, short_key))
    #
    # for row in self._data:
    #     agg = Data()
    #     for name, depth, short_key in meta:
    #         if short_key:
    #             agg[name] = row[depth][short_key]
    #         else:
    #             agg[name] = row[depth]
    #     output.append(agg)
    # return output

    Log.error("multiselect over FlatList not supported")
def get(self, select):
    """
    :param select: the variable to extract from list, or a list of such variables
    :return: a simple list of the extraction (a list of tuples when select is a list)
    """
    if is_list(select):
        # BUG FIX: this was a generator expression inside the list
        # comprehension, which produced a list of one-shot generator
        # objects instead of the extracted values; build real tuples
        return [tuple(d[s] for s in select) for d in self.data]
    else:
        return [d[select] for d in self.data]
def parse_sql(sql):
    """
    Parse SQL text into a JSON query expression with "table" format.
    """
    query = wrap(moz_sql_parser.parse(sql))
    duplicated = []

    # PULL OUT THE AGGREGATES
    for s in listwrap(query.select):
        val = s if s == '*' else s.value

        # EXTRACT KNOWN AGGREGATE FUNCTIONS
        if is_data(val):
            for agg in KNOWN_SQL_AGGREGATES:
                arg = val[agg]
                if arg == None:
                    continue
                s.aggregate = agg
                if is_list(arg):
                    # AGGREGATE WITH PARAMETERS EG percentile(value, 0.90)
                    s[agg] = unwraplist(arg[1::])
                    s.value = arg[0]
                else:
                    # SIMPLE AGGREGATE
                    s.value = arg
                break

        # LOOK FOR GROUPBY COLUMN IN SELECT CLAUSE, REMOVE DUPLICATION
        for g in listwrap(query.groupby):
            try:
                assertAlmostEqual(g.value, val, "")
                g.name = s.name
                duplicated.append(s)
                break
            except Exception:
                pass

    # REMOVE THE REDUNDANT select
    if is_list(query.select):
        for r in duplicated:
            query.select.remove(r)
    elif query.select and duplicated:
        query.select = None

    # RENAME orderby TO sort
    query.sort, query.orderby = query.orderby, None
    query.format = "table"
    return query
def _groupby(self, edges):
    """
    RETURNS LIST OF (coord, values) TUPLES, WHERE
        coord IS THE INDEX INTO self CUBE (-1 INDEX FOR COORDINATES NOT GROUPED BY)
        values ALL VALUES THAT BELONG TO THE SLICE
    """
    edges = FlatList([n for e in edges for n in _normalize_edge(e)])

    # PARTITION self.edges INTO THOSE GROUPED ON AND THE REST
    stacked = [e for e in self.edges if e.name in edges.name]
    remainder = [e for e in self.edges if e.name not in edges.name]
    selector = [1 if e.name in edges.name else 0 for e in self.edges]

    if len(stacked) + len(remainder) != len(self.edges):
        Log.error("can not find some edges to group by")

    # CACHE SOME RESULTS
    keys = edges.name
    getKey = [e.domain.getKey for e in self.edges]
    lookup = [[
        getKey[i](p)
        for p in e.domain.partitions + ([None] if e.allowNulls else [])
    ] for i, e in enumerate(self.edges)]

    def coord2term(coord):
        # MAP A COORDINATE TUPLE BACK TO {edge_name: partition_key}
        output = wrap_leaves(
            {keys[i]: lookup[i][c] for i, c in enumerate(coord)})
        return output

    if is_list(self.select):
        selects = listwrap(self.select)
        index, v = transpose(*self.data[selects[0].name].groupby(selector))

        coord = wrap([coord2term(c) for c in index])

        values = [v]
        for s in selects[1::]:
            # NOTE(review): group_by here vs groupby above — confirm both
            # methods exist on the underlying matrix, or this is a typo
            i, v = transpose(*self.data[s.name].group_by(selector))
            values.append(v)

        output = transpose(coord, [
            Cube(self.select, remainder,
                 {s.name: v[i] for i, s in enumerate(selects)})
            for v in zip(*values)
        ])
    elif not remainder:
        # v IS A VALUE, NO NEED TO WRAP IT IN A Cube
        output = (
            (coord2term(coord), v)
            for coord, v in self.data[self.select.name].groupby(selector))
    else:
        output = (
            (coord2term(coord), Cube(self.select, remainder, v))
            for coord, v in self.data[self.select.name].groupby(selector))

    return output
def _replace_ref(node, url):
    """
    Recursively resolve {"$ref": <url>} entries in node, loading referenced
    content via scheme_loaders.  url locates the current document and
    supplies the scheme for scheme-relative refs.  Fragment-only refs are
    deferred to a later (local) pass.
    """
    if url.path.endswith("/"):
        url.path = url.path[:-1]

    if is_data(node):
        ref = None
        output = {}
        for k, v in node.items():
            if k == "$ref":
                ref = URL(v)
            else:
                output[k] = _replace_ref(v, url)

        if not ref:
            return output

        node = output

        if not ref.scheme and not ref.path:
            # DO NOT TOUCH LOCAL REF YET
            output["$ref"] = ref
            return output

        if not ref.scheme:
            # SCHEME RELATIVE IMPLIES SAME PROTOCOL AS LAST TIME, WHICH
            # REQUIRES THE CURRENT DOCUMENT'S SCHEME
            ref.scheme = url.scheme

        # FIND THE SCHEME AND LOAD IT
        if ref.scheme in scheme_loaders:
            new_value = scheme_loaders[ref.scheme](ref, url)
        else:
            raise Log.error("unknown protocol {{scheme}}", scheme=ref.scheme)

        if ref.fragment:
            new_value = mo_dots.get_attr(new_value, ref.fragment)

        DEBUG and Log.note("Replace {{ref}} with {{new_value}}", ref=ref, new_value=new_value)

        if not output:
            # ONLY THE $ref WAS PRESENT: RETURN THE LOADED VALUE DIRECTLY
            output = new_value
        elif is_text(output):
            Log.error("Can not handle set_default({{output}},{{new_value}})", output=output, new_value=new_value)
        else:
            # MERGE LOADED VALUE UNDER THE EXISTING KEYS
            output = unwrap(set_default(output, new_value))

        DEBUG and Log.note("Return {{output}}", output=output)
        return output
    elif is_list(node):
        output = [_replace_ref(n, url) for n in node]
        # if all(p[0] is p[1] for p in zip(output, node)):
        #     return node
        return output

    return node
def _normalize_selects(selects, frum, schema=None):
    """
    Normalize select clause(s) against frum/schema; error when two
    normalized selects share a name.
    """
    if frum == None or isinstance(frum, (list, set, text_type)):
        # NO REAL CONTAINER: NORMALIZE WITHOUT CONTEXT
        if not is_list(selects):
            return _normalize_select_no_context(selects, schema=schema)
        if len(selects) == 0:
            return Null
        output = [_normalize_select_no_context(s, schema=schema) for s in selects]
    elif is_list(selects):
        output = [
            normalized
            for s in selects
            for normalized in _normalize_select(s, frum=frum, schema=schema)
        ]
    else:
        output = _normalize_select(selects, frum, schema=schema)

    names_seen = set()
    for s in output:
        if s.name in names_seen:
            Log.error("{{name}} has already been defined", name=s.name)
        names_seen.add(s.name)
    return output
def groupby(self, edges):
    """
    SLICE THIS CUBE IN TO ONES WITH LESS DIMENSIONALITY
    simple==True WILL HAVE GROUPS BASED ON PARTITION VALUE, NOT PARTITION OBJECTS
    """
    edges = FlatList([n for e in edges for n in _normalize_edge(e)])

    # PARTITION self.edges INTO THOSE GROUPED ON AND THE REST
    stacked = [e for e in self.edges if e.name in edges.name]
    remainder = [e for e in self.edges if e.name not in edges.name]
    selector = [1 if e.name in edges.name else 0 for e in self.edges]

    if len(stacked) + len(remainder) != len(self.edges):
        Log.error("can not find some edges to group by")

    # CACHE SOME RESULTS
    keys = edges.name
    getKey = [e.domain.getKey for e in self.edges]
    lookup = [[
        getKey[i](p)
        for p in e.domain.partitions + ([None] if e.allowNulls else [])
    ] for i, e in enumerate(self.edges)]

    def coord2term(coord):
        # MAP A COORDINATE TUPLE BACK TO {edge_name: partition_key}
        output = wrap_leaves(
            {keys[i]: lookup[i][c] for i, c in enumerate(coord)})
        return output

    if is_list(self.select):
        selects = listwrap(self.select)
        index, v = transpose(*self.data[selects[0].name].groupby(selector))

        coord = wrap([coord2term(c) for c in index])

        values = [v]
        for s in selects[1::]:
            # NOTE(review): group_by here vs groupby above — confirm both
            # methods exist on the underlying matrix, or this is a typo
            i, v = zip(*self.data[s.name].group_by(selector))
            values.append(v)

        output = transpose(coord, [
            Cube(self.select, remainder,
                 {s.name: v[i] for i, s in enumerate(selects)})
            for v in zip(*values)
        ])
    elif not remainder:
        # v IS A VALUE, NO NEED TO WRAP IT IN A Cube
        output = (
            (coord2term(coord), v)
            for coord, v in self.data[self.select.name].groupby(selector))
    else:
        output = (
            (coord2term(coord), Cube(self.select, remainder, v))
            for coord, v in self.data[self.select.name].groupby(selector))

    return output
def groupby(self, edges):
    """
    SLICE THIS CUBE IN TO ONES WITH LESS DIMENSIONALITY
    simple==True WILL HAVE GROUPS BASED ON PARTITION VALUE, NOT PARTITION OBJECTS
    """
    edges = FlatList([n for e in edges for n in _normalize_edge(e)])

    # SPLIT self.edges INTO THOSE GROUPED ON AND THE REST
    stacked = [e for e in self.edges if e.name in edges.name]
    remainder = [e for e in self.edges if e.name not in edges.name]
    selector = [1 if e.name in edges.name else 0 for e in self.edges]

    if len(stacked) + len(remainder) != len(self.edges):
        Log.error("can not find some edges to group by")

    # CACHE SOME RESULTS
    keys = edges.name
    getKey = [e.domain.getKey for e in self.edges]
    lookup = [[getKey[i](p) for p in e.domain.partitions + ([None] if e.allowNulls else [])] for i, e in enumerate(self.edges)]

    def coord2term(coord):
        # MAP A COORDINATE TUPLE BACK TO {edge_name: partition_key}
        output = wrap_leaves({keys[i]: lookup[i][c] for i, c in enumerate(coord)})
        return output

    if is_list(self.select):
        selects = listwrap(self.select)
        index, v = transpose(*self.data[selects[0].name].groupby(selector))

        coord = wrap([coord2term(c) for c in index])

        values = [v]
        for s in selects[1::]:
            # NOTE(review): group_by here vs groupby above — confirm both
            # methods exist on the underlying matrix, or this is a typo
            i, v = zip(*self.data[s.name].group_by(selector))
            values.append(v)

        output = transpose(coord, [Cube(self.select, remainder, {s.name: v[i] for i, s in enumerate(selects)}) for v in zip(*values)])
    elif not remainder:
        # v IS A VALUE, NO NEED TO WRAP IT IN A Cube
        output = (
            (
                coord2term(coord),
                v
            )
            for coord, v in self.data[self.select.name].groupby(selector)
        )
    else:
        output = (
            (
                coord2term(coord),
                Cube(self.select, remainder, v)
            )
            for coord, v in self.data[self.select.name].groupby(selector)
        )
    return output
def _where_terms(master, where, schema):
    """
    USE THE SCHEMA TO CONVERT DIMENSION NAMES TO ES FILTERS
    master - TOP LEVEL WHERE (FOR PLACING NESTED FILTERS)
    """
    if is_data(where):
        if where.term:
            # MAP TERM
            try:
                output = _map_term_using_schema(master, [], where.term, schema.edges)
                return output
            except Exception as e:
                Log.error("programmer problem?", e)
        elif where.terms:
            # MAP TERM
            output = FlatList()
            for k, v in where.terms.items():
                if not is_container(v):
                    Log.error("terms filter expects list of values")
                edge = schema.edges[k]
                if not edge:
                    # NOT A DIMENSION: PASS THE terms FILTER THROUGH UNCHANGED
                    output.append({"terms": {k: v}})
                else:
                    if is_text(edge):
                        # DIRECT FIELD REFERENCE
                        return {"terms": {edge: v}}
                    try:
                        domain = edge.getDomain()
                    except Exception as e:
                        Log.error("programmer error", e)
                    fields = domain.dimension.fields
                    if is_data(fields):
                        # COMPOUND DIMENSION: OR OVER VALUES, AND OVER FIELDS
                        or_agg = []
                        for vv in v:
                            and_agg = []
                            for local_field, es_field in fields.items():
                                vvv = vv[local_field]
                                if vvv != None:
                                    and_agg.append({"term": {es_field: vvv}})
                            or_agg.append({"and": and_agg})
                        output.append({"or": or_agg})
                    elif is_list(fields) and len(fields) == 1 and is_variable_name(fields[0]):
                        # SINGLE-FIELD DIMENSION
                        output.append({"terms": {fields[0]: v}})
                    elif domain.partitions:
                        # MATCH BY PARTITION KEY
                        output.append({"or": [domain.getPartByKey(vv).esfilter for vv in v]})
            return {"and": output}
        elif where["or"]:
            return {"or": [unwrap(_where_terms(master, vv, schema)) for vv in where["or"]]}
        elif where["and"]:
            return {"and": [unwrap(_where_terms(master, vv, schema)) for vv in where["and"]]}
        elif where["not"]:
            return {"not": unwrap(_where_terms(master, where["not"], schema))}
    return where
def is_type(value, type):
    """
    TEST IF value BELONGS TO THE GIVEN JSON type NAME

    :param value: value to test
    :param type: one of "string", "object", "number"
                 (parameter keeps its historical name for keyword-compat,
                 despite shadowing the builtin)
    :return: bool
    """
    if value == None:
        return False
    elif is_text(value) and type == "string":
        # BUG FIX: WAS `return value`, WHICH MADE THE EMPTY STRING REPORT AS
        # FALSY EVEN THOUGH IT IS A STRING; RETURN True LIKE EVERY OTHER BRANCH
        return True
    elif is_list(value):
        # LISTS NEVER MATCH A SCALAR TYPE NAME
        return False
    elif is_data(value) and type == "object":
        return True
    elif isinstance(value, (int, float, Date)) and type == "number":
        return True
    return False
def _replace_locals(node, doc_path):
    # RECURSIVELY DEEP-COPY node, RESOLVING $ref FRAGMENTS AND $concat
    # DIRECTIVES; doc_path IS THE STACK OF ANCESTOR VALUES (INNERMOST FIRST)
    # USED TO RESOLVE RELATIVE ("...") REFERENCES
    if is_data(node):
        # RECURS, DEEP COPY
        ref = None
        output = {}
        for k, v in node.items():
            if k == "$ref":
                # REMEMBER THE REFERENCE; RESOLVED AFTER THE OTHER KEYS
                ref = v
            elif k == "$concat":
                if not is_sequence(v):
                    Log.error("$concat expects an array of strings")
                # JOIN USING THE SIBLING "separator" PROPERTY (DEFAULT "")
                # NOTE: RETURNS IMMEDIATELY; OTHER SIBLING KEYS ARE DISCARDED
                return coalesce(node.get("separator"), "").join(v)
            elif v == None:
                continue
            else:
                output[k] = _replace_locals(v, [v] + doc_path)

        if not ref:
            return output

        # REFER TO SELF
        frag = ref.fragment
        if frag[0] == ".":
            # RELATIVE: THE NUMBER OF LEADING DOTS SELECTS HOW FAR UP THE
            # ANCESTOR STACK TO LOOK
            for i, p in enumerate(frag):
                if p != ".":
                    if i > len(doc_path):
                        Log.error(
                            "{{frag|quote}} reaches up past the root document",
                            frag=frag)
                    new_value = get_attr(doc_path[i - 1], frag[i::])
                    break
            else:
                # FRAGMENT IS ALL DOTS: REFERENCE AN ANCESTOR ITSELF
                new_value = doc_path[len(frag) - 1]
        else:
            # ABSOLUTE
            new_value = get_attr(doc_path[-1], frag)

        # RESOLVE ANY REFERENCES INSIDE THE REFERENCED VALUE TOO
        new_value = _replace_locals(new_value, [new_value] + doc_path)

        if not output:
            return new_value  # OPTIMIZATION FOR CASE WHEN node IS {}
        else:
            # LOCAL KEYS OVERRIDE THE REFERENCED DOCUMENT'S KEYS
            return unwrap(set_default(output, new_value))

    elif is_list(node):
        candidate = [_replace_locals(n, [n] + doc_path) for n in node]
        # if all(p[0] is p[1] for p in zip(candidate, node)):
        #     return node
        return candidate

    return node
def jx_expression_to_function(expr):
    """
    RETURN FUNCTION THAT REQUIRES PARAMETERS (row, rownum=None, rows=None):
    """
    if is_expression(expr):
        # A RAW SCRIPT OBJECT IS ALREADY CALLABLE; ANY OTHER EXPRESSION IS
        # TRANSLATED TO PYTHON AND COMPILED
        if is_op(expr, ScriptOp) and not is_text(expr.script):
            return expr.script
        return compile_expression(Python[expr].to_python())

    already_callable = (
        expr != None
        and not is_data(expr)
        and not is_list(expr)
        and hasattr(expr, "__call__")
    )
    if already_callable:
        # PLAIN FUNCTION: PASS THROUGH UNTOUCHED
        return expr

    # ANYTHING ELSE IS PARSED AS A jx EXPRESSION, THEN COMPILED
    return compile_expression(Python[jx_expression(expr)].to_python())
def write(self, data):
    """
    WRITE data TO THIS FILE AS UTF-8, CREATING THE PARENT DIRECTORY IF
    REQUIRED; data IS TEXT, OR AN ITERABLE OF TEXT. WHEN self.key IS SET,
    EACH PIECE IS ENCRYPTED BEFORE WRITING.
    """
    if not self.parent.exists:
        self.parent.create()
    with open(self._filename, "wb") as output:
        if is_list(data) and self.key:
            Log.error(u"list of data and keys are not supported, encrypt before sending to file")
        # A SINGLE STRING BECOMES A ONE-ELEMENT LIST; LISTS AND OTHER
        # ITERABLES ARE WRITTEN ELEMENT-BY-ELEMENT AS-IS
        if not is_list(data) and isinstance(data, (binary_type, text_type)):
            data = [data]
        for piece in data:
            if not is_text(piece):
                Log.error(u"Expecting unicode data only")
            if self.key:
                from mo_math.crypto import encrypt
                output.write(encrypt(piece, self.key).encode("utf8"))
            else:
                output.write(piece.encode("utf8"))
def __init__(self, select, edges, data, frum=None):
    """
    data IS EXPECTED TO BE A dict TO MATRICES, BUT OTHER
    COLLECTIONS ARE ALLOWED, USING THE select AND edges TO
    DESCRIBE THE data

    :param select: select clause(s); a list means multi-valued cube
    :param edges: edge definitions, or falsy to derive a rownum edge
    :param data: dict of Matrix, list, Matrix, or scalar value
    :param frum: unused here; kept for caller compatibility
    """
    self.is_value = False if is_list(select) else True
    self.select = select
    self.meta = Data(format="cube")       # PUT EXTRA MARKUP HERE
    self.is_none = False

    # NOTE(review): this assumes data has .values() at this point; a plain
    # list arriving here would fail before the normalization below — confirm
    if not all(data.values()):
        # BUG FIX: WAS `is_none = True`, WHICH BOUND A LOCAL AND LEFT
        # self.is_none PERMANENTLY False
        self.is_none = True

    # ENSURE frum IS PROPER FORM
    if is_list(select):
        if edges and OR(not isinstance(v, Matrix) for v in data.values()):
            Log.error("Expecting data to be a dict with Matrix values")

    if not edges:
        if not data:
            if is_list(select):
                Log.error("not expecting a list of records")
            data = {select.name: Matrix.ZERO}
            self.edges = Null
        elif is_data(data):
            # EXPECTING NO MORE THAN ONE rownum EDGE IN THE DATA
            length = MAX([len(v) for v in data.values()])
            if length >= 1:
                self.edges = list_to_data([{"name": "rownum", "domain": {"type": "rownum"}}])
            else:
                self.edges = Null
        elif is_list(data):
            if is_list(select):
                Log.error("not expecting a list of records")
            data = {select.name: Matrix.wrap(data)}
            self.edges = list_to_data([{"name": "rownum", "domain": {"type": "rownum", "min": 0, "max": len(data), "interval": 1}}])
        elif isinstance(data, Matrix):
            if is_list(select):
                Log.error("not expecting a list of records")
            data = {select.name: data}
            # BUG FIX: THIS BRANCH NEVER SET self.edges, LEAVING THE ATTRIBUTE
            # UNDEFINED; MATCH THE SIBLING BRANCHES
            self.edges = Null
        else:
            if is_list(select):
                Log.error("not expecting a list of records")
            data = {select.name: Matrix(value=data)}
            self.edges = Null
    else:
        self.edges = to_data(edges)

    self.data = data
def __init__(self, select, edges, data, frum=None):
    """
    data IS EXPECTED TO BE A dict TO MATRICES, BUT OTHER
    COLLECTIONS ARE ALLOWED, USING THE select AND edges TO
    DESCRIBE THE data

    :param select: select clause(s); a list means multi-valued cube
    :param edges: edge definitions, or falsy to derive a rownum edge
    :param data: dict of Matrix, list, Matrix, or scalar value
    :param frum: unused here; kept for caller compatibility
    """
    self.is_value = False if is_list(select) else True
    self.select = select
    self.meta = Data(format="cube")       # PUT EXTRA MARKUP HERE
    self.is_none = False

    # NOTE(review): this assumes data has .values() at this point; a plain
    # list arriving here would fail before the normalization below — confirm
    if not all(data.values()):
        # BUG FIX: WAS `is_none = True`, WHICH BOUND A LOCAL AND LEFT
        # self.is_none PERMANENTLY False
        self.is_none = True

    # ENSURE frum IS PROPER FORM
    if is_list(select):
        if edges and OR(not isinstance(v, Matrix) for v in data.values()):
            Log.error("Expecting data to be a dict with Matrix values")

    if not edges:
        if not data:
            if is_list(select):
                Log.error("not expecting a list of records")
            data = {select.name: Matrix.ZERO}
            self.edges = FlatList.EMPTY
        elif is_data(data):
            # EXPECTING NO MORE THAN ONE rownum EDGE IN THE DATA
            length = MAX([len(v) for v in data.values()])
            if length >= 1:
                self.edges = wrap([{"name": "rownum", "domain": {"type": "rownum"}}])
            else:
                self.edges = FlatList.EMPTY
        elif is_list(data):
            if is_list(select):
                Log.error("not expecting a list of records")
            data = {select.name: Matrix.wrap(data)}
            self.edges = wrap([{"name": "rownum", "domain": {"type": "rownum", "min": 0, "max": len(data), "interval": 1}}])
        elif isinstance(data, Matrix):
            if is_list(select):
                Log.error("not expecting a list of records")
            data = {select.name: data}
            # BUG FIX: THIS BRANCH NEVER SET self.edges, LEAVING THE ATTRIBUTE
            # UNDEFINED; MATCH THE SIBLING BRANCHES
            self.edges = FlatList.EMPTY
        else:
            if is_list(select):
                Log.error("not expecting a list of records")
            data = {select.name: Matrix(value=data)}
            self.edges = FlatList.EMPTY
    else:
        self.edges = wrap(edges)

    self.data = data
def _groupby(self, edges):
    """
    RETURNS LIST OF (coord, values) TUPLES, WHERE
        coord IS THE INDEX INTO self CUBE (-1 INDEX FOR COORDINATES NOT GROUPED BY)
        values ALL VALUES THAT BELONG TO THE SLICE
    """
    # NORMALIZE REQUESTED EDGES, THEN SPLIT self.edges INTO GROUPED ("stacked")
    # AND UNGROUPED ("remainder"); selector MARKS WHICH AXES ARE GROUPED
    edges = FlatList([n for e in edges for n in _normalize_edge(e)])
    stacked = [e for e in self.edges if e.name in edges.name]
    remainder = [e for e in self.edges if e.name not in edges.name]
    selector = [1 if e.name in edges.name else 0 for e in self.edges]

    if len(stacked) + len(remainder) != len(self.edges):
        Log.error("can not find some edges to group by")

    # CACHE SOME RESULTS
    keys = edges.name
    getKey = [e.domain.getKey for e in self.edges]
    # PER EDGE: PARTITION KEYS, PLUS A None SLOT WHEN allowNulls
    lookup = [[getKey[i](p) for p in e.domain.partitions + ([None] if e.allowNulls else [])] for i, e in enumerate(self.edges)]

    if is_list(self.select):
        selects = listwrap(self.select)
        index, v = transpose(*self.data[selects[0].name].groupby(selector))
        # NOTE(review): coord2term is not defined in this method (the public
        # groupby defines it locally) — confirm it resolves at module level,
        # otherwise this raises NameError; also the docstring promises raw
        # index coords, while coord2term would produce key documents
        coord = wrap([coord2term(c) for c in index])
        values = [v]
        for s in selects[1::]:
            # NOTE(review): .group_by vs .groupby — confirm both spellings exist
            i, v = transpose(*self.data[s.name].group_by(selector))
            values.append(v)
        output = transpose(coord, [Cube(self.select, remainder, {s.name: v[i] for i, s in enumerate(selects)}) for v in zip(*values)])
    elif not remainder:
        # v IS A VALUE, NO NEED TO WRAP IT IN A Cube
        output = (
            (
                coord2term(coord),
                v
            )
            for coord, v in self.data[self.select.name].groupby(selector)
        )
    else:
        output = (
            (
                coord2term(coord),
                Cube(self.select, remainder, v)
            )
            for coord, v in self.data[self.select.name].groupby(selector)
        )

    return output
def url_param2value(param):
    """
    CONVERT URL QUERY PARAMETERS INTO DICT

    :param param: query string (without leading "?"); None gives Null
    :return: Data mapping keys to decoded values; repeated keys collect
             into a list; bare keys (no "=") map to True
    """
    if param == None:
        return Null
    # (DUPLICATE None-CHECK REMOVED)

    def _decode(v):
        # PERCENT-DECODE v, THEN TRY TO INTERPRET THE RESULT AS JSON;
        # FALL BACK TO THE DECODED TEXT WHEN IT IS NOT VALID JSON
        output = []
        i = 0
        while i < len(v):
            c = v[i]
            if c == "%":
                d = hex2chr(v[i + 1:i + 3])
                output.append(d)
                i += 3
            else:
                output.append(c)
                i += 1
        output = text("".join(output))
        try:
            from mo_json import json2value
            return json2value(output)
        except Exception:
            pass
        return output

    query = Data()
    for p in param.split('&'):
        if not p:
            continue
        if p.find("=") == -1:
            k = p
            v = True
        else:
            # BUG FIX: SPLIT ON THE FIRST "=" ONLY; VALUES CONTAINING "="
            # (e.g. base64) MADE THE OLD two-way unpack RAISE ValueError
            k, v = p.split("=", 1)
            v = _decode(v)

        u = query.get(k)
        if u is None:
            query[k] = v
        elif is_list(u):
            u += [v]
        else:
            # SECOND OCCURRENCE OF A KEY: PROMOTE TO A LIST
            query[k] = [u, v]

    return query
def get_type(v):
    """
    MAP A PYTHON VALUE TO ITS JSON TYPE CONSTANT (BOOLEAN, STRING, OBJECT,
    NUMBER OR NESTED); RETURN None WHEN THERE IS NO MAPPING
    """
    if v == None:
        return None
    # bool IS TESTED BEFORE NUMBERS: A PYTHON bool IS ALSO AN int
    if isinstance(v, bool):
        return BOOLEAN
    if is_text(v):
        return STRING
    if is_data(v):
        return OBJECT
    if isinstance(v, (int, float, Date)):
        return NUMBER
    if is_list(v):
        return NESTED
    return None
def url_param2value(param):
    """
    CONVERT URL QUERY PARAMETERS INTO DICT

    :param param: query string (without leading "?"); None gives Null
    :return: Data mapping keys to decoded values; repeated keys collect
             into a list; bare keys (no "=") map to True
    """
    if param == None:
        return Null
    # (DUPLICATE None-CHECK REMOVED)

    def _decode(v):
        # PERCENT-DECODE v, THEN TRY TO INTERPRET THE RESULT AS JSON;
        # FALL BACK TO THE DECODED TEXT WHEN IT IS NOT VALID JSON
        output = []
        i = 0
        while i < len(v):
            c = v[i]
            if c == "%":
                d = hex2chr(v[i + 1:i + 3])
                output.append(d)
                i += 3
            else:
                output.append(c)
                i += 1
        output = text_type("".join(output))
        try:
            return json2value(output)
        except Exception:
            pass
        return output

    query = Data()
    for p in param.split('&'):
        if not p:
            continue
        if p.find("=") == -1:
            k = p
            v = True
        else:
            # BUG FIX: SPLIT ON THE FIRST "=" ONLY; VALUES CONTAINING "="
            # (e.g. base64) MADE THE OLD two-way unpack RAISE ValueError
            k, v = p.split("=", 1)
            v = _decode(v)

        u = query.get(k)
        if u is None:
            query[k] = v
        elif is_list(u):
            u += [v]
        else:
            # SECOND OCCURRENCE OF A KEY: PROMOTE TO A LIST
            query[k] = [u, v]

    return query
def _select1(data, field, depth, output):
    """
    SELECT A SINGLE FIELD

    :param data: iterable of documents
    :param field: full list of path segments
    :param depth: absolute index into field where selection resumes
    :param output: list that accumulates the selected values (mutated)
    """
    for d in data:
        for i, f in enumerate(field[depth:]):
            d = d[f]
            if d == None:
                output.append(None)
                break
            elif is_list(d):
                # BUG FIX: i IS RELATIVE TO THE field[depth:] SLICE, SO THE
                # ABSOLUTE DEPTH OF THE NEXT SEGMENT IS depth + i + 1; THE OLD
                # i + 1 RE-WALKED ALREADY-CONSUMED SEGMENTS WHEN depth > 0
                _select1(d, field, depth + i + 1, output)
                break
        else:
            # WALKED THE WHOLE PATH WITHOUT HITTING None OR A LIST
            output.append(d)
def _tuple_deep(v, field, depth, record):
    """
    field = {"name":name, "value":["attribute", "path"]}
    r[field.name]=v[field.value], BUT WE MUST DEAL WITH POSSIBLE LIST IN field.value PATH

    :param v: current document being walked
    :param depth: absolute index into field.value where walking resumes
    :param record: tuple accumulated so far
    :return: (next_depth, deeper_list, record); next_depth != 0 means a nested
             list was found and the caller must recurse into deeper_list,
             otherwise record has the selected value appended
    """
    if hasattr(field.value, "__call__"):
        # value IS AN ACCESSOR FUNCTION; APPLY IT DIRECTLY
        return 0, None, record + (field.value(v),)

    # WALK ALL BUT THE LAST PATH SEGMENT, STOPPING AT THE FIRST LIST
    for i, f in enumerate(field.value[depth : len(field.value) - 1 :]):
        v = v.get(f)
        if is_list(v):
            # i IS RELATIVE TO THE SLICE: ABSOLUTE DEPTH IS depth + i + 1
            return depth + i + 1, v, record

    # LAST SEGMENT DELIVERS THE SELECTED VALUE
    f = field.value.last()
    return 0, None, record + (v.get(f),)
def _convert_domain(self, domain=None):
    """
    COERCE domain INTO A Domain INSTANCE; ALREADY-TYPED VALUES PASS
    THROUGH, PLAIN DATA IS NORMALIZED (DEFAULT name, LIST partitions)
    BEFORE CONSTRUCTION
    """
    if not domain:
        return Domain(type="default")
    if isinstance(domain, Dimension):
        return domain.getDomain()
    if isinstance(domain, Domain):
        return domain

    # PLAIN DATA: DEFAULT THE NAME (COPY FIRST SO THE CALLER'S OBJECT IS
    # NOT MODIFIED) AND FORCE partitions TO BE A REAL LIST
    if not domain.name:
        domain = domain.copy()
        domain.name = domain.type

    if not is_list(domain.partitions):
        domain.partitions = list(domain.partitions)

    return Domain(**domain)
def _replace_locals(node, doc_path):
    # RECURSIVELY DEEP-COPY node, RESOLVING $ref FRAGMENTS AGAINST THE STACK
    # OF ANCESTOR VALUES doc_path (INNERMOST FIRST)
    if is_data(node):
        # RECURS, DEEP COPY
        ref = None
        output = {}
        for k, v in node.items():
            if k == "$ref":
                # REMEMBER THE REFERENCE; RESOLVED AFTER THE OTHER KEYS
                ref = v
            elif v == None:
                continue
            else:
                output[k] = _replace_locals(v, [v] + doc_path)

        if not ref:
            return output

        # REFER TO SELF
        frag = ref.fragment
        if frag[0] == ".":
            # RELATIVE: THE NUMBER OF LEADING DOTS SELECTS HOW FAR UP THE
            # ANCESTOR STACK TO LOOK
            for i, p in enumerate(frag):
                if p != ".":
                    if i>len(doc_path):
                        Log.error("{{frag|quote}} reaches up past the root document", frag=frag)
                    new_value = mo_dots.get_attr(doc_path[i-1], frag[i::])
                    break
            else:
                # FRAGMENT IS ALL DOTS: REFERENCE AN ANCESTOR ITSELF
                new_value = doc_path[len(frag) - 1]
        else:
            # ABSOLUTE
            new_value = mo_dots.get_attr(doc_path[-1], frag)

        # RESOLVE ANY REFERENCES INSIDE THE REFERENCED VALUE TOO
        new_value = _replace_locals(new_value, [new_value] + doc_path)

        if not output:
            return new_value  # OPTIMIZATION FOR CASE WHEN node IS {}
        else:
            # LOCAL KEYS OVERRIDE THE REFERENCED DOCUMENT'S KEYS
            return unwrap(set_default(output, new_value))

    elif is_list(node):
        candidate = [_replace_locals(n, [n] + doc_path) for n in node]
        # if all(p[0] is p[1] for p in zip(candidate, node)):
        #     return node
        return candidate

    return node
def quote_column(column_name, table=None):
    """
    RETURN SQL FOR A (POSSIBLY DOTTED) COLUMN NAME, OPTIONALLY QUALIFIED
    BY table; ACCEPTS text, bytes, A LIST OF NAMES, OR THE
    {"name": name, "value": value} ALIAS FORM
    """
    if column_name == None:
        Log.error("missing column_name")

    if is_text(column_name):
        if table:
            return join_column(table, column_name)
        # MYSQL QUOTE OF COLUMN NAMES
        return SQL("`" + '`.`'.join(split_field(column_name)) + "`")

    if is_binary(column_name):
        # NORMALIZE BYTES TO TEXT AND RETRY
        return quote_column(column_name.decode('utf8'), table)

    if is_list(column_name):
        # QUOTE EACH NAME AND JOIN INTO A COMMA-SEPARATED SQL LIST
        if table:
            return sql_list(join_column(table, c) for c in column_name)
        return sql_list(quote_column(c) for c in column_name)

    # ASSUME {"name":name, "value":value} FORM
    return SQL(sql_alias(column_name.value, quote_column(column_name.name)))
def jx_expression_to_function(expr):
    """
    RETURN FUNCTION THAT REQUIRES PARAMETERS (row, rownum=None, rows=None):
    """
    if is_expression(expr):
        # RAW SCRIPT OBJECTS ARE ALREADY CALLABLE; COMPILE EVERYTHING ELSE
        if is_op(expr, ScriptOp) and not is_text(expr.script):
            return expr.script
        return compile_expression(Python[expr].to_python())

    looks_like_function = (
        expr != None
        and not is_data(expr)
        and not is_list(expr)
        and hasattr(expr, "__call__")
    )
    if looks_like_function:
        return expr

    # PARSE AS A jx EXPRESSION, THEN COMPILE
    return compile_expression(Python[jx_expression(expr)].to_python())
def map(self, map_):
    """
    RETURN A NEW QueryOp WITH EVERY EXPRESSION (select values, edges,
    groupby, window, where, sort) MAPPED THROUGH map_; self IS REBUILT,
    NOT MODIFIED IN PLACE (BUT SEE range NOTE BELOW)
    """
    def map_select(s, map_):
        # MAP ONLY THE value EXPRESSION; OTHER SELECT PROPERTIES CARRY OVER
        return set_default(
            {"value": s.value.map(map_)},
            s
        )

    def map_edge(e, map_):
        # MAP THE EDGE'S value, ITS DOMAIN'S where AND EACH PARTITION'S where
        partitions = unwraplist([
            set_default(
                {"where": p.where.map(map_)},
                p
            )
            for p in e.domain.partitions
        ])

        domain = copy(e.domain)
        domain.where = e.domain.where.map(map_)
        domain.partitions = partitions

        edge = copy(e)
        edge.value = e.value.map(map_)
        edge.domain = domain
        if e.range:
            # NOTE(review): copy(e) is shallow, so these assignments mutate
            # the range object shared with the original edge — confirm intended
            edge.range.min = e.range.min.map(map_)
            edge.range.max = e.range.max.map(map_)
        return edge

    if is_list(self.select):
        select = wrap([map_select(s, map_) for s in self.select])
    else:
        select = map_select(self.select, map_)

    return QueryOp(
        frum=self.frum.map(map_),
        select=select,
        edges=wrap([map_edge(e, map_) for e in self.edges]),
        groupby=wrap([g.map(map_) for g in self.groupby]),
        window=wrap([w.map(map_) for w in self.window]),
        where=self.where.map(map_),
        sort=wrap([map_select(s, map_) for s in listwrap(self.sort)]),
        limit=self.limit,
        format=self.format
    )
def command_loop(local):
    """
    READ JSON COMMANDS FROM stdin, ONE PER LINE, AND EXECUTE THEM UNTIL A
    "stop" COMMAND (OR please_stop); RESULTS/ERRORS ARE WRITTEN TO STDOUT
    AS JSON LINES.

    SECURITY WARNING: THIS CALLS exec() ON TEXT RECEIVED OVER stdin; ONLY
    RUN THIS LOOP AGAINST A FULLY TRUSTED PARENT PROCESS.
    """
    DEBUG and Log.note("mo-python process running with {{config|json}}", config=local['config'])
    while not please_stop:
        line = sys.stdin.readline()
        try:
            command = json2value(line.decode('utf8'))
            DEBUG and Log.note("got {{command}}", command=command)

            if "import" in command:
                # "import": "module"  OR  {"from": module, "vars": [names]}
                dummy={}
                if is_text(command['import']):
                    exec ("from " + command['import'] + " import *", dummy, context)
                else:
                    exec ("from " + command['import']['from'] + " import " + ",".join(listwrap(command['import']['vars'])), dummy, context)
                STDOUT.write(DONE)
            elif "set" in command:
                # ASSIGN VARIABLES INTO THE SHARED context
                for k, v in command.set.items():
                    context[k] = v
                STDOUT.write(DONE)
            elif "get" in command:
                # READ A VARIABLE; local TAKES PRECEDENCE OVER context
                STDOUT.write(value2json({"out": coalesce(local.get(command['get']), context.get(command['get']))}))
                STDOUT.write('\n')
            elif "stop" in command:
                STDOUT.write(DONE)
                please_stop.go()
            elif "exec" in command:
                if not is_text(command['exec']):
                    Log.error("exec expects only text")
                exec (command['exec'], context, local)
                STDOUT.write(DONE)
            else:
                # ANYTHING ELSE IS A FUNCTION CALL: {func_name: args}, WHERE
                # args IS A LIST (POSITIONAL) OR A DICT (KEYWORD)
                for k, v in command.items():
                    if is_list(v):
                        exec ("_return = " + k + "(" + ",".join(map(value2json, v)) + ")", context, local)
                    else:
                        exec ("_return = " + k + "(" + ",".join(kk + "=" + value2json(vv) for kk, vv in v.items()) + ")", context, local)
                STDOUT.write(value2json({"out": local['_return']}))
                STDOUT.write('\n')
        except Exception as e:
            # REPORT THE FAILURE TO THE PARENT RATHER THAN DIE
            STDOUT.write(value2json({"err": e}))
            STDOUT.write('\n')
        finally:
            STDOUT.flush()
def select(data, field_name):
    """
    RETURN LIST WITH VALUES FROM field_name

    :param data: Cube, PartFlatList, UniqueIndex, dict-like, or iterable
    :param field_name: text path, {"value": path} form, or list of paths
    """
    # CONTAINER TYPES WITH THEIR OWN SELECT ROUTINES ARE DISPATCHED FIRST
    if isinstance(data, Cube):
        return data._select(_normalize_selects(field_name))
    if isinstance(data, PartFlatList):
        return data.select(field_name)
    if isinstance(data, UniqueIndex):
        # THE SELECT ROUTINE REQUIRES dicts, NOT Data WHILE ITERATING
        data = data._data.values()
    if is_data(data):
        return select_one(data, field_name)

    if is_data(field_name):
        field_name = wrap(field_name)
        if field_name.value in ["*", "."]:
            return data
        if field_name.value:
            # SIMPLIFY {"value":value} AS STRING
            field_name = field_name.value

    # SIMPLE PYTHON ITERABLE ASSUMED
    if is_text(field_name):
        segments = split_field(field_name)
        if len(segments) == 1:
            return FlatList([row[field_name] for row in data])
        collected = FlatList()
        flat_list._select1(data, segments, 0, collected)
        return collected

    if is_list(field_name):
        keys = [_select_a_field(wrap(f)) for f in field_name]
    else:
        keys = [_select_a_field(field_name)]
    return _select(Data(), unwrap(data), keys, 0)
def sort(data, fieldnames=None, already_normalized=False):
    """
    PASS A FIELD NAME, OR LIST OF FIELD NAMES, OR LIST OF STRUCTS WITH
    {"field":field_name, "sort":direction}

    :param data: list or iterable to sort; None gives Null
    :param fieldnames: sort spec; falsy means default value ordering
    :param already_normalized: True when fieldnames is already in formal form
    :return: FlatList of sorted (unwrapped) elements
    """
    try:
        if data == None:
            return Null

        if not fieldnames:
            # NO SORT SPEC: DEFAULT VALUE ORDERING
            return wrap(sort_using_cmp(data, value_compare))

        if already_normalized:
            formal = fieldnames
        else:
            formal = query._normalize_sort(fieldnames)

        # PRE-COMPILE ONE (accessor, direction) PAIR PER SORT CLAUSE
        funcs = [(jx_expression_to_function(f.value), f.sort) for f in formal]

        def comparer(left, right):
            # COMPARE CLAUSE-BY-CLAUSE; FIRST NON-ZERO RESULT DECIDES
            for func, sort_ in funcs:
                try:
                    result = value_compare(func(left), func(right), sort_)
                    if result != 0:
                        return result
                except Exception as e:
                    Log.error("problem with compare", e)
            return 0

        if is_list(data):
            output = FlatList([unwrap(d) for d in sort_using_cmp(data, cmp=comparer)])
        elif hasattr(data, "__iter__"):
            # MATERIALIZE GENERIC ITERABLES BEFORE SORTING
            output = FlatList(
                [unwrap(d) for d in sort_using_cmp(list(data), cmp=comparer)]
            )
        else:
            Log.error("Do not know how to handle")
            output = None

        return output
    except Exception as e:
        Log.error("Problem sorting\n{{data}}", data=data, cause=e)