def value2url_param(value):
    """
    :param value:
    :return: ascii URL
    """
    from mo_json import value2json, json2value

    def _encode(value):
        return "".join(_map2url[c] for c in value.encode("utf8"))

    if value == None:
        return None

    if is_data(value):
        value_ = to_data(value)
        output = "&".join(
            kk + "=" + vv
            for k, v in sorted(value_.leaves(), key=lambda p: p[0])
            for kk, vv in [(value2url_param(k), value2url_param(v))]
            if vv or vv == 0
        )
    elif is_text(value):
        try:
            json2value(value)
            output = _encode(value2json(value))
        except Exception:
            output = _encode(value)
    elif is_binary(value):
        output = "".join(_map2url[c] for c in value)
    elif is_many(value):
        output = ",".join(
            vv
            for v in value
            for vv in [value2url_param(v)]
            if vv or vv == 0
        )
    else:
        output = _encode(value2json(value))
    return output
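# Illustrative usage sketch: a dict is flattened into sorted "key=value" pairs
# joined by "&", and a list becomes a comma-separated value. The exact
# percent-encoding depends on the module-level _map2url table, so the outputs
# shown here are assumptions, not verified results.
# value2url_param({"b": 2, "a": "x y"})   # roughly "a=x%20y&b=2"
# value2url_param(["x", "y", 0])          # roughly "x,y,0"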
def test_assign3(self):
    # IMPOTENT ASSIGNMENTS DO NOTHING
    a = {}
    b = to_data(a)

    b.c = None
    expected = {}
    self.assertEqual(a, expected)

    b.c.d = None
    expected = {}
    self.assertEqual(a, expected)

    b["c.d"] = None
    expected = {}
    self.assertEqual(a, expected)

    b.c.d.e = None
    expected = {}
    self.assertEqual(a, expected)

    b.c["d.e"] = None
    expected = {}
    self.assertEqual(a, expected)
def define(cls, expr):
    expr = to_data(expr)
    term = expr.select
    terms = []
    if not is_container(term):
        raise Log.error("Expecting a list")
    for t in term:
        if is_text(t):
            if not is_variable_name(t):
                Log.error("expecting {{value}} a simple dot-delimited path name", value=t)
            terms.append({"name": t, "value": _jx_expression(t, cls.lang)})
        elif t.name == None:
            if t.value == None:
                Log.error("expecting select parameters to have name and value properties")
            elif is_text(t.value):
                if not is_variable_name(t.value):
                    Log.error(
                        "expecting {{value}} a simple dot-delimited path name",
                        value=t.value,
                    )
                else:
                    terms.append({
                        "name": t.value,
                        "value": _jx_expression(t.value, cls.lang),
                    })
            else:
                Log.error("expecting a name property")
        else:
            terms.append({"name": t.name, "value": jx_expression(t.value)})
    return SelectOp(terms)
def metas(self, prefix=None, limit=None, delimiter=None):
    """
    RETURN THE METADATA DESCRIPTORS FOR EACH KEY
    """
    limit = coalesce(limit, TOO_MANY_KEYS)
    keys = self.bucket.list(prefix=str(prefix), delimiter=str(delimiter))
    prefix_len = len(prefix)
    output = []
    for i, k in enumerate(
        k
        for k in keys
        if len(k.key) == prefix_len or k.key[prefix_len] in [".", ":"]
    ):
        output.append({
            "key": strip_extension(k.key),
            "etag": convert.quote2string(k.etag),
            "expiry_date": Date(k.expiry_date),
            "last_modified": Date(k.last_modified),
        })
        if i >= limit:
            break
    return to_data(output)
def DataClass(name, columns, constraint=None):
    """
    Use the DataClass to define a class, but with some extra features:
    1. restrict the datatype of property
    2. restrict if `required`, or if `nulls` are allowed
    3. generic constraints on object properties

    It is expected that this class become a real class (or be removed) in the
    long term because it is expensive to use and should only be good for
    verifying program correctness, not user input.

    :param name: Name of the class we are creating
    :param columns: Each columns[i] has properties {
            "name",     - (required) name of the property
            "required", - True if it must be provided (even if the value is None)
            "nulls",    - True if property can be None, or missing
            "default",  - A default value, if none is provided
            "type"      - a Python datatype
        }
    :param constraint: a JSON query Expression for extra constraints (return true if all constraints are met)
    :return: The class that has been created
    """
    columns = to_data([
        {"name": c, "required": True, "nulls": False, "type": object}
        if is_text(c)
        else c
        for c in columns
    ])
    constraint = {
        "and": [
            {"exists": c.name}
            for c in columns
            if not c.nulls and c.default == None
        ] + [constraint]
    }
    slots = columns.name
    required = to_data(
        filter(lambda c: c.required and c.default == None, columns)
    ).name
    # nulls = to_data(filter(lambda c: c.nulls, columns)).name
    defaults = {c.name: coalesce(c.default, None) for c in columns}
    types = {c.name: coalesce(c.jx_type, object) for c in columns}

    code = expand_template(
        """
from __future__ import unicode_literals
from mo_future import is_text, is_binary
from collections import Mapping

meta = None
types_ = {{types}}
defaults_ = {{defaults}}

class {{class_name}}(Mapping):
    __slots__ = {{slots}}

    def _constraint(row, rownum, rows):
        code = {{constraint_expr|quote}}
        if {{constraint_expr}}:
            return
        Log.error(
            "constraint\\n{" + "{code}}\\nnot satisfied {" + "{expect}}\\n{" + "{value|indent}}",
            code={{constraint_expr|quote}},
            expect={{constraint}},
            value=row
        )

    def __init__(self, **kwargs):
        if not kwargs:
            return
        for s in {{slots}}:
            object.__setattr__(self, s, kwargs.get(s, {{defaults}}.get(s, None)))
        missed = {{required}}-set(kwargs.keys())
        if missed:
            Log.error("Expecting properties {"+"{missed}}", missed=missed)
        illegal = set(kwargs.keys())-set({{slots}})
        if illegal:
            Log.error("{"+"{names}} are not valid properties", names=illegal)
        self._constraint(0, [self])

    def __getitem__(self, item):
        return getattr(self, item)

    def __setitem__(self, item, value):
        setattr(self, item, value)
        return self

    def __setattr__(self, item, value):
        if item not in {{slots}}:
            Log.error("{"+"{item|quote}} not valid attribute", item=item)
        if value==None and item in {{required}}:
            Log.error("Expecting property {"+"{item}}", item=item)
        object.__setattr__(self, item, value)
        self._constraint(0, [self])

    def __getattr__(self, item):
        Log.error("{"+"{item|quote}} not valid attribute", item=item)

    def __hash__(self):
        return object.__hash__(self)

    def __eq__(self, other):
        if isinstance(other, {{class_name}}) and dict(self)==dict(other) and self is not other:
            Log.error("expecting to be same object")
        return self is other

    def __dict__(self):
        return {k: getattr(self, k) for k in {{slots}}}

    def items(self):
        return ((k, getattr(self, k)) for k in {{slots}})

    def __copy__(self):
        _set = object.__setattr__
        output = object.__new__({{class_name}})
        {{assign}}
        return output

    def __iter__(self):
        return {{slots}}.__iter__()

    def __len__(self):
        return {{len_slots}}

    def __str__(self):
        return str({{dict}})
""",
        {
            "class_name": name,
            "slots": "(" + ", ".join(quote(s) for s in slots) + ")",
            "required": "{" + ", ".join(quote(s) for s in required) + "}",
            "defaults": Literal(defaults).to_python(),
            "len_slots": len(slots),
            "dict": "{" + ", ".join(quote(s) + ": self." + s for s in slots) + "}",
            "assign": "; ".join(
                "_set(output, " + quote(s) + ", self." + s + ")" for s in slots
            ),
            "types": "{" + ",".join(
                quote(k) + ": " + v.__name__ for k, v in types.items()
            ) + "}",
            "constraint_expr": jx_expression(not ENABLE_CONSTRAINTS or constraint).to_python(),
            "constraint": value2json(constraint),
        },
    )

    output = _exec(code, name)
    register_data(output)
    return output
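# Illustrative usage sketch: a plain-text column name becomes a required,
# non-null property of type object (per the branch above). The class name,
# properties, and constraint syntax below are assumptions for illustration.
# Point = DataClass("Point", ["x", "y"], constraint={"gte": ["x", 0]})
# p = Point(x=1, y=2)   # Point(x=1) would raise: property "y" is required
# p.x = -1              # would raise: constraint not satisfied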
def set(constants):
    """
    REACH INTO THE MODULES AND OBJECTS TO SET CONSTANTS.
    THINK OF THIS AS PRIMITIVE DEPENDENCY INJECTION FOR MODULES.
    USEFUL FOR SETTING DEBUG FLAGS.
    """
    if not constants:
        return
    constants = to_data(constants)

    for full_path, new_value in constants.leaves():
        errors = []
        k_path = split_field(full_path)
        if len(k_path) < 2:
            from mo_logs import Log

            Log.error(
                "expecting <module>.<constant> format, not {{path|quote}}",
                path=k_path
            )
        name = k_path[-1]

        try:
            mo_dots_set_attr(sys.modules, k_path, new_value)
            continue
        except Exception as e:
            errors.append(e)

        # ONE MODULE IS MISSING, THE CALLING MODULE
        try:
            caller_globals = sys._getframe(1).f_globals
            caller_file = caller_globals["__file__"]
            if not caller_file.endswith(".py"):
                raise Exception("do not know how to handle non-python caller")
            caller_module = caller_file[:-3].replace("\\", "/")
            module_path = caller_module.split("/")

            # ENSURE THERE IS SOME EVIDENCE THE MODULE MATCHES THE PATH
            if k_path[-2] != module_path[-1]:
                continue

            old_value = mo_dots_set_attr(caller_globals, [name], new_value)
            if DEBUG:
                from mo_logs import Log

                Log.note(
                    "Changed {{module}}[{{attribute}}] from {{old_value}} to"
                    " {{new_value}}",
                    module=caller_module,
                    attribute=name,
                    old_value=old_value,
                    new_value=new_value,
                )
            break
        except Exception as e:
            errors.append(e)

        if errors:
            from mo_logs import Log

            Log.error("Can not set constant {{path}}", path=full_path, cause=errors)
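# Illustrative usage sketch: each leaf path is <module>...<constant>; the
# module and flag names below are placeholders. Nested dicts produce the same
# dot-delimited leaves as flat keys.
# set({"mo_http.http.DEBUG": True})
# set({"my_module": {"MAX_RETRIES": 5}})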
def add(self, message):
    message = to_data(message)
    m = Message()
    m.set_body(value2json(message))
    self.queue.write(m)
from mo_math import is_nan
from mo_times import Date, YEAR, WEEK, MONTH
from pandas import DataFrame

from utils import nice_ceiling

# PROVINCE = 7   # Ontario
# PROVINCE = 10  # Alberta
PROVINCE = 11  # British Columbia

Log.start(trace=True)
http.DEBUG = True
http.default_headers = to_data({
    "From": "*****@*****.**",
    "Referer": "https://github.com/klahnakoski/mo-statcan",
    "User-Agent": "mo-statscan",
    "Accept": mimetype.ANY,
})

# LESS DETAILED CAUSES
CAUSE_OF_DEATH = (
    13_10_0394  # https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1310039401
)

# DETAILED CAUSES
GROUPED_CAUSE_DEATH = (
    13_10_0392  # https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1310039201
)

# WEEKLY_DEATHS = (
def pe_filter(filter, data, depth):
    """
    PARTIAL EVALUATE THE filter BASED ON data GIVEN
    """
    if filter is TRUE:
        return True
    if filter is FALSE:
        return False

    filter = to_data(filter)

    if filter["and"]:
        result = True
        output = FlatList()
        for a in filter["and"]:
            f = pe_filter(a, data, depth)
            if f is False:
                result = False
            elif f is not True:
                output.append(f)
        if result and output:
            return {"and": output}
        else:
            return result
    elif filter["or"]:
        output = FlatList()
        for o in filter["or"]:
            f = pe_filter(o, data, depth)
            if f is True:
                return True
            elif f is not False:
                output.append(f)
        if output:
            return {"or": output}
        else:
            return False
    elif filter["not"]:
        f = pe_filter(filter["not"], data, depth)
        if f is True:
            return False
        elif f is False:
            return True
        else:
            return {"not": f}
    elif filter.term or filter.eq:
        eq = coalesce(filter.term, filter.eq)
        result = True
        output = {}
        for col, val in eq.items():
            first, rest = parse_field(col, data, depth)
            d = data[first]
            if not rest:
                if d != val:
                    result = False
            else:
                output[rest] = val

        if result and output:
            return {"term": output}
        else:
            return result
    elif filter.equal:
        a, b = filter["equal"]
        first_a, rest_a = parse_field(a, data, depth)
        first_b, rest_b = parse_field(b, data, depth)
        val_a = data[first_a]
        val_b = data[first_b]

        if not rest_a:
            if not rest_b:
                if val_a != val_b:
                    return False
                else:
                    return True
            else:
                return {"term": {rest_b: val_a}}
        else:
            if not rest_b:
                return {"term": {rest_a: val_b}}
            else:
                return {"equal": [rest_a, rest_b]}
    elif filter.terms:
        result = True
        output = {}
        for col, vals in filter["terms"].items():
            first, rest = parse_field(col, data, depth)
            d = data[first]
            if not rest:
                if d not in vals:
                    result = False
            else:
                output[rest] = vals
        if result and output:
            return {"terms": output}
        else:
            return result
    elif filter.range:
        result = True
        output = {}
        for col, ranges in filter["range"].items():
            first, rest = parse_field(col, data, depth)
            d = data[first]
            if not rest:
                for sign, val in ranges.items():
                    if sign in ("gt", ">") and d <= val:
                        result = False
                    if sign == "gte" and d < val:
                        result = False
                    if sign == "lte" and d > val:
                        result = False
                    if sign == "lt" and d >= val:
                        result = False
            else:
                output[rest] = ranges

        if result and output:
            return {"range": output}
        else:
            return result
    elif filter.missing:
        if is_text(filter.missing):
            field = filter["missing"]
        else:
            field = filter["missing"]["field"]

        first, rest = parse_field(field, data, depth)
        d = data[first]
        if not rest:
            if d == None:
                return True
            return False
        else:
            return {"missing": rest}
    elif filter.prefix:
        result = True
        output = {}
        for col, val in filter["prefix"].items():
            first, rest = parse_field(col, data, depth)
            d = data[first]
            if not rest:
                if d == None or not d.startswith(val):
                    result = False
            else:
                output[rest] = val
        if result and output:
            return {"prefix": output}
        else:
            return result
    elif filter.exists:
        if is_text(filter["exists"]):
            field = filter["exists"]
        else:
            field = filter["exists"]["field"]

        first, rest = parse_field(field, data, depth)
        d = data[first]
        if not rest:
            if d != None:
                return True
            return False
        else:
            return {"exists": rest}
    else:
        Log.error("Can not interpret esfilter: {{esfilter}}", {"esfilter": filter})
def run(query, container=Null):
    """
    THIS FUNCTION IS SIMPLY SWITCHING BASED ON THE query["from"] CONTAINER,
    BUT IT IS ALSO PROCESSING A list CONTAINER; SEPARATE TO A ListContainer
    """
    if container == None:
        container = to_data(query)["from"]
        query_op = QueryOp.wrap(query, container=container, namespace=container.schema)
    else:
        query_op = QueryOp.wrap(query, container=container, namespace=container.namespace)

    if container == None:
        from jx_python.containers.list import DUAL

        return DUAL.query(query_op)
    elif isinstance(container, Container):
        return container.query(query_op)
    elif is_many(container):
        container = ListContainer(name=None, data=list(container))
    elif isinstance(container, Cube):
        if is_aggs(query_op):
            return cube_aggs(container, query_op)
    elif is_op(container, QueryOp):
        container = run(container)
    elif is_data(container):
        query = container
        container = query["from"]
        container = run(QueryOp.wrap(query, container, container.namespace), container)
    else:
        Log.error("Do not know how to handle {{type}}", type=container.__class__.__name__)

    if is_aggs(query_op):
        container = list_aggs(container, query_op)
    else:  # SETOP
        if query_op.where is not TRUE:
            container = filter(container, query_op.where)

        if query_op.sort:
            container = sort(container, query_op.sort, already_normalized=True)

        if query_op.select:
            container = select(container, query_op.select)

    if query_op.window:
        if isinstance(container, Cube):
            container = list(container.values())

        for param in query_op.window:
            window(container, param)

    # AT THIS POINT frum IS IN LIST FORMAT, NOW PACKAGE RESULT
    if query_op.format == "cube":
        container = list2cube(container)
    elif query_op.format == "table":
        container = list2table(container)
        container.meta.format = "table"
    else:
        container = dict_to_data({
            "meta": {"format": "list"},
            "data": container
        })

    return container
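# Illustrative usage sketch: run() accepts a JSON query expression whose
# "from" clause names the container to query. The data, field names, and
# result shape below are assumptions for illustration only.
# result = run({
#     "from": [{"a": 1, "b": "x"}, {"a": 2, "b": "y"}],
#     "select": "a",
#     "where": {"eq": {"b": "x"}},
# })
# result.meta.format   # "list" unless another format is requested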
def _output():
    for g, v in itertools.groupby(data, get_key):
        group = Data()
        for k, gg in zip(keys, g):
            group[k] = gg
        yield (group, to_data(list(v)))
def selector(d):
    output = Data()
    for n, p in push_and_pull:
        output[n] = unwraplist(p(to_data(d)))
    return unwrap(output)
def format_table(aggs, es_query, query, decoders, all_selects):
    new_edges = to_data(count_dim(aggs, es_query, decoders))
    dims = tuple(
        len(e.domain.partitions) + (0 if e.allowNulls is False else 1)
        for e in new_edges
    )
    rank = len(dims)
    header = tuple(new_edges.name + all_selects.name)
    name2index = {s.name: i + rank for i, s in enumerate(all_selects)}

    def data():
        is_sent = Matrix(dims=dims)
        give_me_zeros = query.sort and not query.groupby
        if give_me_zeros:
            # WE REQUIRE THE ZEROS FOR SORTING
            all_coord = is_sent._all_combos()  # TRACK THE EXPECTED COMBINATIONS
            ordered_coord = next(all_coord)[::-1]
            output = None
            for row, coord, agg, ss in aggs_iterator(aggs, es_query, decoders):
                if coord != ordered_coord:
                    # output HAS BEEN YIELDED, BUT SET THE DEFAULT VALUES
                    if output is not None:
                        for s in all_selects:
                            i = name2index[s.name]
                            if output[i] is None:
                                output[i] = s.default
                    # WE CAN GET THE SAME coord MANY TIMES, SO ONLY ADVANCE WHEN NOT
                    ordered_coord = next(all_coord)[::-1]

                while coord != ordered_coord:
                    # HAPPENS WHEN THE coord IS AHEAD OF ordered_coord
                    record = [d.get_value(ordered_coord[i]) for i, d in enumerate(decoders)] + [s.default for s in all_selects]
                    yield record
                    ordered_coord = next(all_coord)[::-1]
                # coord == missing_coord
                output = [d.get_value(c) for c, d in zip(coord, decoders)] + [None for s in all_selects]
                for select in ss:
                    v = select.pull(agg)
                    if v != None:
                        union(output, name2index[select.name], v, select.aggregate)
                yield output
        else:
            last_coord = None  # HANG ONTO THE output FOR A BIT WHILE WE FILL THE ELEMENTS
            output = None
            for row, coord, agg, ss in aggs_iterator(aggs, es_query, decoders):
                if coord != last_coord:
                    if output:
                        # SET DEFAULTS
                        for i, s in enumerate(all_selects):
                            v = output[rank + i]
                            if v == None:
                                output[rank + i] = s.default
                        yield output
                    output = is_sent[coord]
                    if output == None:
                        output = is_sent[coord] = [d.get_value(c) for c, d in zip(coord, decoders)] + [None for _ in all_selects]
                    last_coord = coord
                # THIS IS A TRICK! WE WILL UPDATE A ROW THAT WAS ALREADY YIELDED
                for select in ss:
                    v = select.pull(agg)
                    if v != None:
                        union(output, name2index[select.name], v, select.aggregate)

            if output:
                # SET DEFAULTS ON LAST ROW
                for i, s in enumerate(all_selects):
                    v = output[rank + i]
                    if v == None:
                        output[rank + i] = s.default
                yield output

        # EMIT THE MISSING CELLS IN THE CUBE
        if not query.groupby:
            for coord, output in is_sent:
                if output == None:
                    record = [d.get_value(c) for c, d in zip(coord, decoders)] + [s.default for s in all_selects]
                    yield record

    return Data(
        meta={"format": "table"},
        header=header,
        data=list(data())
    )
columns=to_data(
    [
        Column(
            name=c,
            es_index=META_COLUMNS_NAME,
            es_column=c,
            es_type="keyword",
            jx_type=STRING,
            last_updated=Date.now(),
            nested_path=ROOT_PATH,
        )
        for c in [
            "name",
            "es_type",
            "jx_type",
            "nested_path",
            "es_column",
            "es_index",
            "partitions",
        ]
    ]
    + [
        Column(
            name=c,
            es_index=META_COLUMNS_NAME,
            es_column=c,
            es_type="integer",
            jx_type=INTEGER,
            last_updated=Date.now(),
            nested_path=ROOT_PATH,
        )
        for c in ["count", "cardinality", "multi"]
    ]
    + [
        Column(
            name="last_updated",
            es_index=META_COLUMNS_NAME,
            es_column="last_updated",
            es_type="double",
            jx_type=NUMBER,
            last_updated=Date.now(),
            nested_path=ROOT_PATH,
        )
    ]
)
def list_aggs(frum, query):
    frum = to_data(frum)
    select = listwrap(query.select)

    for e in query.edges:
        if isinstance(e.domain, DefaultDomain):
            accessor = jx_expression_to_function(e.value)
            unique_values = set(map(accessor, frum))
            if None in unique_values:
                e.allowNulls = coalesce(e.allowNulls, True)
                unique_values -= {None}
            e.domain = SimpleSetDomain(partitions=list(sorted(unique_values)))
        else:
            pass

    s_accessors = [(ss.name, jx_expression_to_function(ss.value)) for ss in select]

    result = {
        s.name: Matrix(
            dims=[
                len(e.domain.partitions) + (1 if e.allowNulls else 0)
                for e in query.edges
            ],
            zeros=lambda: windows.name2accumulator.get(s.aggregate)(**s),
        )
        for s in select
    }
    where = jx_expression_to_function(query.where)
    coord = [None] * len(query.edges)
    edge_accessor = [(i, make_accessor(e)) for i, e in enumerate(query.edges)]

    net_new_edge_names = set(to_data(query.edges).name) - UNION(
        e.value.vars() for e in query.edges
    )
    if net_new_edge_names & UNION(ss.value.vars() for ss in select):
        # s_accessor NEEDS THESE EDGES, SO WE PASS THEM ANYWAY
        for d in filter(where, frum):
            d = d.copy()
            for c, get_matches in edge_accessor:
                coord[c] = get_matches(d)

            for s_name, s_accessor in s_accessors:
                mat = result[s_name]
                for c in itertools.product(*coord):
                    acc = mat[c]
                    for e, cc in zip(query.edges, c):
                        d[e.name] = e.domain.partitions[cc]
                    val = s_accessor(d, c, frum)
                    acc.add(val)
    else:
        # FASTER
        for d in filter(where, frum):
            for c, get_matches in edge_accessor:
                coord[c] = get_matches(d)

            for s_name, s_accessor in s_accessors:
                mat = result[s_name]
                for c in itertools.product(*coord):
                    acc = mat[c]
                    val = s_accessor(d, c, frum)
                    acc.add(val)

    for s in select:
        # if s.aggregate == "count":
        #     continue
        m = result[s.name]
        for c, var in m.items():
            if var != None:
                m[c] = var.end()

    from jx_python.containers.cube import Cube

    output = Cube(select, query.edges, result)
    return output
def doc_to_column(doc):
    now = Date.now()
    try:
        doc = to_data(untyped(doc))

        # I HAVE MANAGED TO MAKE MANY MISTAKES WRITING COLUMNS TO ES. HERE ARE THE FIXES

        # FIX
        if not doc.last_updated:
            doc.last_updated = Date.now() - YEAR

        # FIX
        if doc.es_type == None:
            if doc.jx_type == OBJECT:
                doc.es_type = "object"
            else:
                Log.warning("{{doc}} has no es_type", doc=doc)

        # FIX
        if doc.es_type == "nested":
            doc.multi = 1001
        if doc.multi == None:
            doc.multi = 1

        # FIX
        if doc.es_column.endswith("." + NESTED_TYPE):
            if doc.jx_type == OBJECT:
                doc.jx_type = NESTED
                doc.last_updated = now
            if doc.es_type == "nested":
                doc.es_type = "nested"
                doc.last_updated = now

        # FIX
        doc.nested_path = tuple(listwrap(doc.nested_path))
        if last(split_field(doc.es_column)) == NESTED_TYPE and doc.es_type != "nested":
            doc.es_type = "nested"
            doc.jx_type = NESTED
            doc.multi = 1001
            doc.last_updated = now

        # FIX
        expected_nested_path = get_nested_path(doc.es_column)
        if len(doc.nested_path) > 1 and doc.nested_path[-2] == '.':
            doc.nested_path = doc.nested_path[:-1]
            doc.last_updated = now

        # FIX
        if untype_path(doc.es_column) == doc.es_column:
            if doc.nested_path != (".",):
                if doc.es_index in {"repo"}:
                    pass
                else:
                    Log.note("not expected")
                    doc.nested_path = expected_nested_path
                    doc.last_updated = now
        else:
            if doc.nested_path != expected_nested_path:
                doc.nested_path = expected_nested_path
                doc.last_updated = now

        # FIX
        if last(split_field(doc.es_column)) == EXISTS_TYPE:
            if doc.jx_type != EXISTS:
                doc.jx_type = EXISTS
                doc.last_updated = now
            if doc.cardinality == None:
                doc.cardinality = 1
                doc.last_updated = now

        # FIX
        if doc.jx_type in STRUCT:
            if doc.cardinality not in [0, 1]:
                doc.cardinality = 1  # DO NOT KNOW IF EXISTS OR NOT
                doc.last_updated = now

        return Column(**doc)
    except Exception as e:
        try:
            mark_as_deleted(Column(**doc), now)
        except Exception:
            pass
        return None
def assign(source, destination):
    destination[name] = field.value(to_data(source))
    return 0, None
def __iter__(self):
    return (to_data(d) for d in self.data)
def drill_filter(esfilter, data):
    """
    PARTIAL EVALUATE THE FILTER BASED ON DATA GIVEN

    TODO: FIX THIS MONUMENTALLY BAD IDEA
    """
    esfilter = unwrap(esfilter)
    primary_nested = []  # track if nested, changes if not
    primary_column = []  # only one path allowed
    primary_branch = []  # CONTAINS LISTS OF RECORDS TO ITERATE: constantly changing as we dfs the tree

    def parse_field(fieldname, data, depth):
        """
        RETURN (first, rest) OF fieldname
        """
        col = split_field(fieldname)
        d = data
        for i, c in enumerate(col):
            try:
                d = d[c]
            except Exception as e:
                Log.error("{{name}} does not exist", name=fieldname)
            if is_list(d) and len(col) > 1:
                if len(primary_column) <= depth + i:
                    primary_nested.append(True)
                    primary_column.append(c)
                    primary_branch.append(d)
                elif primary_nested[depth] and primary_column[depth + i] != c:
                    Log.error("only one branch of tree allowed")
                else:
                    primary_nested[depth + i] = True
                    primary_column[depth + i] = c
                    primary_branch[depth + i] = d

                return c, join_field(col[i + 1:])
            else:
                if len(primary_column) <= depth + i:
                    primary_nested.append(False)
                    primary_column.append(c)
                    primary_branch.append([d])

        return fieldname, None

    def pe_filter(filter, data, depth):
        """
        PARTIAL EVALUATE THE filter BASED ON data GIVEN
        """
        if filter is TRUE:
            return True
        if filter is FALSE:
            return False

        filter = to_data(filter)

        if filter["and"]:
            result = True
            output = FlatList()
            for a in filter["and"]:
                f = pe_filter(a, data, depth)
                if f is False:
                    result = False
                elif f is not True:
                    output.append(f)
            if result and output:
                return {"and": output}
            else:
                return result
        elif filter["or"]:
            output = FlatList()
            for o in filter["or"]:
                f = pe_filter(o, data, depth)
                if f is True:
                    return True
                elif f is not False:
                    output.append(f)
            if output:
                return {"or": output}
            else:
                return False
        elif filter["not"]:
            f = pe_filter(filter["not"], data, depth)
            if f is True:
                return False
            elif f is False:
                return True
            else:
                return {"not": f}
        elif filter.term or filter.eq:
            eq = coalesce(filter.term, filter.eq)
            result = True
            output = {}
            for col, val in eq.items():
                first, rest = parse_field(col, data, depth)
                d = data[first]
                if not rest:
                    if d != val:
                        result = False
                else:
                    output[rest] = val

            if result and output:
                return {"term": output}
            else:
                return result
        elif filter.equal:
            a, b = filter["equal"]
            first_a, rest_a = parse_field(a, data, depth)
            first_b, rest_b = parse_field(b, data, depth)
            val_a = data[first_a]
            val_b = data[first_b]

            if not rest_a:
                if not rest_b:
                    if val_a != val_b:
                        return False
                    else:
                        return True
                else:
                    return {"term": {rest_b: val_a}}
            else:
                if not rest_b:
                    return {"term": {rest_a: val_b}}
                else:
                    return {"equal": [rest_a, rest_b]}
        elif filter.terms:
            result = True
            output = {}
            for col, vals in filter["terms"].items():
                first, rest = parse_field(col, data, depth)
                d = data[first]
                if not rest:
                    if d not in vals:
                        result = False
                else:
                    output[rest] = vals
            if result and output:
                return {"terms": output}
            else:
                return result
        elif filter.range:
            result = True
            output = {}
            for col, ranges in filter["range"].items():
                first, rest = parse_field(col, data, depth)
                d = data[first]
                if not rest:
                    for sign, val in ranges.items():
                        if sign in ("gt", ">") and d <= val:
                            result = False
                        if sign == "gte" and d < val:
                            result = False
                        if sign == "lte" and d > val:
                            result = False
                        if sign == "lt" and d >= val:
                            result = False
                else:
                    output[rest] = ranges

            if result and output:
                return {"range": output}
            else:
                return result
        elif filter.missing:
            if is_text(filter.missing):
                field = filter["missing"]
            else:
                field = filter["missing"]["field"]

            first, rest = parse_field(field, data, depth)
            d = data[first]
            if not rest:
                if d == None:
                    return True
                return False
            else:
                return {"missing": rest}
        elif filter.prefix:
            result = True
            output = {}
            for col, val in filter["prefix"].items():
                first, rest = parse_field(col, data, depth)
                d = data[first]
                if not rest:
                    if d == None or not d.startswith(val):
                        result = False
                else:
                    output[rest] = val
            if result and output:
                return {"prefix": output}
            else:
                return result
        elif filter.exists:
            if is_text(filter["exists"]):
                field = filter["exists"]
            else:
                field = filter["exists"]["field"]

            first, rest = parse_field(field, data, depth)
            d = data[first]
            if not rest:
                if d != None:
                    return True
                return False
            else:
                return {"exists": rest}
        else:
            Log.error("Can not interpret esfilter: {{esfilter}}", {"esfilter": filter})

    output = []  # A LIST OF OBJECTS MAKING IT THROUGH THE FILTER

    def main(sequence, esfilter, row, depth):
        """
        RETURN A SEQUENCE OF REFERENCES OF OBJECTS DOWN THE TREE
        SHORT SEQUENCES MEANS ALL NESTED OBJECTS ARE INCLUDED
        """
        new_filter = pe_filter(esfilter, row, depth)
        if new_filter is True:
            seq = list(sequence)
            seq.append(row)
            output.append(seq)
            return
        elif new_filter is False:
            return
        seq = list(sequence)
        seq.append(row)
        for d in primary_branch[depth]:
            main(seq, new_filter, d, depth + 1)

    # OUTPUT
    for i, d in enumerate(data):
        if is_data(d):
            main([], esfilter, to_data(d), 0)
        else:
            Log.error("filter is expecting a dict, not {{type}}", type=d.__class__)

    # AT THIS POINT THE primary_column[] IS DETERMINED
    # USE IT TO EXPAND output TO ALL NESTED OBJECTS
    max = 0  # EVEN THOUGH A ROW CAN HAVE MANY VALUES, WE ONLY NEED UP TO max
    for i, n in enumerate(primary_nested):
        if n:
            max = i + 1

    # OUTPUT IS A LIST OF ROWS,
    # WHERE EACH ROW IS A LIST OF VALUES SEEN DURING A WALK DOWN A PATH IN THE HIERARCHY
    uniform_output = FlatList()

    def recurse(row, depth):
        if depth == max:
            uniform_output.append(row)
        else:
            nested = row[-1][primary_column[depth]]
            if not nested:
                # PASSED FILTER, BUT NO CHILDREN, SO ADD NULL CHILDREN
                for i in range(depth, max):
                    row.append(None)
                uniform_output.append(row)
            else:
                for d in nested:
                    r = list(row)
                    r.append(d)
                    recurse(r, depth + 1)

    for o in output:
        recurse(o, 0)

    if not max:
        # SIMPLE LIST AS RESULT
        return list_to_data([unwrap(u[0]) for u in uniform_output])

    return PartFlatList(primary_column[0:max], uniform_output)
def select(self, fields):
    if is_data(fields):
        fields = fields.value

    if is_text(fields):
        # RETURN LIST OF VALUES
        if len(split_field(fields)) == 1:
            if self.path[0] == fields:
                return [d[1] for d in self.data]
            else:
                return [d[0][fields] for d in self.data]
        else:
            keys = split_field(fields)
            depth = coalesce(
                MIN([
                    i
                    for i, (k, p) in enumerate(zip(keys, self.path))
                    if k != p
                ]),
                len(self.path),
            )  # LENGTH OF COMMON PREFIX
            short_key = keys[depth:]

            output = FlatList()
            _select1((to_data(d[depth]) for d in self.data), short_key, 0, output)
            return output

    if is_list(fields):
        output = FlatList()

        meta = []
        for f in fields:
            if hasattr(f.value, "__call__"):
                meta.append((f.name, f.value))
            else:
                meta.append((f.name, functools.partial(lambda v, d: d[v], f.value)))

        for row in self._values():
            agg = Data()
            for name, f in meta:
                agg[name] = f(row)
            output.append(agg)

        return output

        # meta = []
        # for f in fields:
        #     keys = split_field(f.value)
        #     depth = coalesce(MIN([i for i, (k, p) in enumerate(zip(keys, self.path)) if k != p]), len(self.path))  # LENGTH OF COMMON PREFIX
        #     short_key = join_field(keys[depth:])
        #
        #     meta.append((f.name, depth, short_key))
        #
        # for row in self._data:
        #     agg = Data()
        #     for name, depth, short_key in meta:
        #         if short_key:
        #             agg[name] = row[depth][short_key]
        #         else:
        #             agg[name] = row[depth]
        #     output.append(agg)
        # return output

    Log.error("multiselect over FlatList not supported")
def request(method, url, headers=None, data=None, json=None, zip=None, retry=None, timeout=None, session=None, kwargs=None):
    """
    JUST LIKE requests.request() BUT WITH DEFAULT HEADERS AND FIXES
    DEMANDS data IS ONE OF:
    * A JSON-SERIALIZABLE STRUCTURE, OR
    * LIST OF JSON-SERIALIZABLE STRUCTURES, OR
    * None

    :param method: GET, POST, etc
    :param url: URL
    :param headers: dict OF HTTP REQUEST HEADERS
    :param data: BYTES (OR GENERATOR OF BYTES)
    :param json: JSON-SERIALIZABLE STRUCTURE
    :param zip: ZIP THE REQUEST BODY, IF BIG ENOUGH
    :param retry: {"times": x, "sleep": y} STRUCTURE
    :param timeout: SECONDS TO WAIT FOR RESPONSE
    :param session: Session OBJECT, IF YOU HAVE ONE
    :param kwargs: ALL PARAMETERS (DO NOT USE)
    :return:
    """
    global _warning_sent
    global request_count

    if not _warning_sent and not default_headers:
        Log.warning(text(
            "The mo_http.http module was meant to add extra "
            + "default headers to all requests, specifically the 'Referer' "
            + "header with a URL to the project. Use the `mo_logs.constants.set()` "
            + "function to set `mo_http.http.default_headers`"
        ))
    _warning_sent = True

    if is_list(url):
        # TRY MANY URLS
        failures = []
        for remaining, u in countdown(url):
            try:
                response = request(url=u, kwargs=kwargs)
                if mo_math.round(response.status_code, decimal=-2) not in [400, 500]:
                    return response
                if not remaining:
                    return response
            except Exception as e:
                e = Except.wrap(e)
                failures.append(e)
        Log.error(u"Tried {{num}} urls", num=len(url), cause=failures)

    if session:
        close_after_response = Null
    else:
        close_after_response = session = sessions.Session()

    with closing(close_after_response):
        if PY2 and is_text(url):
            # httplib.py WILL **FREAK OUT** IF IT SEES ANY UNICODE
            url = url.encode('ascii')

        try:
            set_default(kwargs, DEFAULTS)

            # HEADERS
            headers = unwrap(set_default(headers, session.headers, default_headers))
            _to_ascii_dict(headers)

            # RETRY
            retry = to_data(retry)
            if retry == None:
                retry = set_default({}, DEFAULTS['retry'])
            elif isinstance(retry, Number):
                retry = set_default({"times": retry}, DEFAULTS['retry'])
            elif isinstance(retry.sleep, Duration):
                retry.sleep = retry.sleep.seconds

            # JSON
            if json != None:
                data = value2json(json).encode('utf8')

            # ZIP
            zip = coalesce(zip, DEFAULTS['zip'])
            set_default(headers, {'Accept-Encoding': 'compress, gzip'})

            if zip:
                if is_sequence(data):
                    compressed = ibytes2icompressed(data)
                    headers['content-encoding'] = 'gzip'
                    data = compressed
                elif len(coalesce(data)) > 1000:
                    compressed = bytes2zip(data)
                    headers['content-encoding'] = 'gzip'
                    data = compressed
        except Exception as e:
            Log.error(u"Request setup failure on {{url}}", url=url, cause=e)

        errors = []
        for r in range(retry.times):
            if r:
                Till(seconds=retry.sleep).wait()

            try:
                request_count += 1
                with Timer(
                    "http {{method|upper}} to {{url}}",
                    param={"method": method, "url": text(url)},
                    verbose=DEBUG,
                ):
                    return _session_request(session, url=str(url), headers=headers, data=data, json=None, kwargs=kwargs)
            except Exception as e:
                e = Except.wrap(e)
                if retry['http'] and str(url).startswith("https://") and "EOF occurred in violation of protocol" in e:
                    url = URL("http://" + str(url)[8:])
                    Log.note("Changed {{url}} to http due to SSL EOF violation.", url=str(url))
                errors.append(e)

        if " Read timed out." in errors[0]:
            Log.error(u"Tried {{times}} times: Timeout failure (timeout was {{timeout}}", timeout=timeout, times=retry.times, cause=errors[0])
        else:
            Log.error(u"Tried {{times}} times: Request failure of {{url}}", url=url, times=retry.times, cause=errors[0])
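# Illustrative usage sketch: `retry` may be a number or a {"times", "sleep"}
# structure, and `json` is serialized into the request body. The URL and
# payload below are placeholders.
# response = request("post", "https://example.com/api", json={"q": 1}, retry={"times": 3, "sleep": 5})
# response = request("get", "https://example.com", zip=False)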
def _normalize_select(select, frum, schema=None):
    """
    :param select: ONE SELECT COLUMN
    :param frum: TABLE TO get_columns()
    :param schema: SCHEMA TO LOOKUP NAMES FOR DEFINITIONS
    :return: AN ARRAY OF SELECT COLUMNS
    """
    if is_text(select):
        canonical = select = Data(value=select)
    else:
        select = to_data(select)
        canonical = select.copy()

    canonical.aggregate = coalesce(
        canonical_aggregates[select.aggregate].name, select.aggregate, "none"
    )
    canonical.default = coalesce(
        select.default, canonical_aggregates[canonical.aggregate].default
    )

    if hasattr(unwrap(frum), "_normalize_select"):
        return frum._normalize_select(canonical)

    output = []

    if len(select) and not select.value:
        Log.error(BAD_SELECT, select=select)
    elif not select.value or select.value == ".":
        output.extend([
            set_default(
                {
                    "name": c.name,
                    "value": jx_expression(c.name, schema=schema),
                },
                canonical,
            )
            for c in schema.leaves('.')
            # TOP LEVEL COLUMNS ONLY
            if len(c.nested_path) == 1
        ])
    elif is_text(select.value):
        if select.value.endswith(".*"):
            canonical.name = coalesce(select.name, ".")
            value = jx_expression(select.value[:-2], schema=schema)
            if not is_op(value, Variable):
                Log.error("`*` over general expression not supported yet")
                output.append([
                    set_default(
                        {
                            "value": LeavesOp(value, prefix=select.prefix),
                            "format": "dict",  # MARKUP FOR DECODING
                        },
                        canonical,
                    )
                    for c in frum.get_columns()
                    if c.jx_type not in INTERNAL
                ])
            else:
                Log.error("do not know what to do")
        else:
            canonical.name = coalesce(select.name, select.value, select.aggregate)
            canonical.value = jx_expression(select.value, schema=schema)
            output.append(canonical)

    output = to_data(output)
    if any(n == None for n in output.name):
        Log.error("expecting select to have a name: {{select}}", select=select)
    return output
def __init__(self, select, edges, data, frum=None):
    """
    data IS EXPECTED TO BE A dict TO MATRICES, BUT OTHER COLLECTIONS ARE
    ALLOWED, USING THE select AND edges TO DESCRIBE THE data
    """
    self.is_value = False if is_list(select) else True
    self.select = select
    self.meta = Data(format="cube")  # PUT EXTRA MARKUP HERE
    self.is_none = False

    if not all(data.values()):
        self.is_none = True

    # ENSURE frum IS PROPER FORM
    if is_list(select):
        if edges and OR(not isinstance(v, Matrix) for v in data.values()):
            Log.error("Expecting data to be a dict with Matrix values")

    if not edges:
        if not data:
            if is_list(select):
                Log.error("not expecting a list of records")
            data = {select.name: Matrix.ZERO}
            self.edges = Null
        elif is_data(data):
            # EXPECTING NO MORE THAN ONE rownum EDGE IN THE DATA
            length = MAX([len(v) for v in data.values()])
            if length >= 1:
                self.edges = list_to_data([{"name": "rownum", "domain": {"type": "rownum"}}])
            else:
                self.edges = Null
        elif is_list(data):
            if is_list(select):
                Log.error("not expecting a list of records")
            data = {select.name: Matrix.wrap(data)}
            self.edges = list_to_data([{
                "name": "rownum",
                "domain": {"type": "rownum", "min": 0, "max": len(data), "interval": 1},
            }])
        elif isinstance(data, Matrix):
            if is_list(select):
                Log.error("not expecting a list of records")
            data = {select.name: data}
        else:
            if is_list(select):
                Log.error("not expecting a list of records")
            data = {select.name: Matrix(value=data)}
            self.edges = Null
    else:
        self.edges = to_data(edges)

    self.data = data
def _normalize_edge(edge, dim_index, limit, schema=None):
    """
    :param edge: Not normalized edge
    :param dim_index: Dimensions are ordered; this is this edge's index into that order
    :param schema: for context
    :return: a normalized edge
    """
    if not edge:
        Log.error("Edge has no value, or expression is empty")
    elif is_text(edge):
        if schema:
            leaves = unwraplist(list(schema.leaves(edge)))
            if not leaves or is_container(leaves):
                return [
                    Data(
                        name=edge,
                        value=jx_expression(edge, schema=schema),
                        allowNulls=True,
                        dim=dim_index,
                        domain=_normalize_domain(None, limit),
                    )
                ]
            elif isinstance(leaves, Column):
                return [
                    Data(
                        name=edge,
                        value=jx_expression(edge, schema=schema),
                        allowNulls=True,
                        dim=dim_index,
                        domain=_normalize_domain(domain=leaves, limit=limit, schema=schema),
                    )
                ]
            elif is_list(leaves.fields) and len(leaves.fields) == 1:
                return [
                    Data(
                        name=leaves.name,
                        value=jx_expression(leaves.fields[0], schema=schema),
                        allowNulls=True,
                        dim=dim_index,
                        domain=leaves.getDomain(),
                    )
                ]
            else:
                return [
                    Data(
                        name=leaves.name,
                        allowNulls=True,
                        dim=dim_index,
                        domain=leaves.getDomain(),
                    )
                ]
        else:
            return [
                Data(
                    name=edge,
                    value=jx_expression(edge, schema=schema),
                    allowNulls=True,
                    dim=dim_index,
                    domain=DefaultDomain(),
                )
            ]
    else:
        edge = to_data(edge)
        if not edge.name and not is_text(edge.value):
            Log.error("You must name compound and complex edges: {{edge}}", edge=edge)

        if is_container(edge.value) and not edge.domain:
            # COMPLEX EDGE IS SHORT HAND
            domain = _normalize_domain(schema=schema)
            domain.dimension = Data(fields=edge.value)

            return [
                Data(
                    name=edge.name,
                    value=jx_expression(edge.value, schema=schema),
                    allowNulls=bool(coalesce(edge.allowNulls, True)),
                    dim=dim_index,
                    domain=domain,
                )
            ]

        domain = _normalize_domain(edge.domain, schema=schema)

        return [
            Data(
                name=coalesce(edge.name, edge.value),
                value=jx_expression(edge.value, schema=schema),
                range=_normalize_range(edge.range),
                allowNulls=bool(coalesce(edge.allowNulls, True)),
                dim=dim_index,
                domain=domain,
            )
        ]
def assertAlmostEqual(test, expected, digits=None, places=None, msg=None, delta=None):
    show_detail = True
    test = unwrap(test)
    expected = unwrap(expected)
    try:
        if test is None and (is_null_op(expected) or expected is None):
            return
        elif test is expected:
            return
        elif is_text(expected):
            assertAlmostEqualValue(test, expected, msg=msg, digits=digits, places=places, delta=delta)
        elif isinstance(test, UniqueIndex):
            if test ^ expected:
                Log.error("Sets do not match")
        elif is_data(expected) and is_data(test):
            for k, e in unwrap(expected).items():
                t = test.get(k)
                assertAlmostEqual(
                    t,
                    e,
                    msg=coalesce(msg, "") + "key " + quote(k) + ": ",
                    digits=digits,
                    places=places,
                    delta=delta,
                )
        elif is_data(expected):
            if is_many(test):
                test = list(test)
                if len(test) != 1:
                    Log.error("Expecting data, not a list")
                test = test[0]

            for k, e in expected.items():
                if is_text(k):
                    t = mo_dots.get_attr(test, literal_field(k))
                else:
                    t = test[k]
                assertAlmostEqual(t, e, msg=msg, digits=digits, places=places, delta=delta)
        elif is_container(test) and isinstance(expected, set):
            test = set(to_data(t) for t in test)
            if len(test) != len(expected):
                Log.error(
                    "Sets do not match, element count different:\n{{test|json|indent}}\nexpecting{{expectedtest|json|indent}}",
                    test=test,
                    expected=expected,
                )

            for e in expected:
                for t in test:
                    try:
                        assertAlmostEqual(t, e, msg=msg, digits=digits, places=places, delta=delta)
                        break
                    except Exception as _:
                        pass
                else:
                    Log.error(
                        "Sets do not match. {{value|json}} not found in {{test|json}}",
                        value=e,
                        test=test,
                    )
        elif isinstance(expected, types.FunctionType):
            return expected(test)
        elif hasattr(test, "__iter__") and hasattr(expected, "__iter__"):
            if test.__class__.__name__ == "ndarray":  # numpy
                test = test.tolist()
            elif test.__class__.__name__ == "DataFrame":  # pandas
                test = test[test.columns[0]].values.tolist()
            elif test.__class__.__name__ == "Series":  # pandas
                test = test.values.tolist()

            if not expected and test == None:
                return
            if expected == None:
                expected = []  # REPRESENT NOTHING
            for t, e in zip_longest(test, expected):
                assertAlmostEqual(t, e, msg=msg, digits=digits, places=places, delta=delta)
        else:
            assertAlmostEqualValue(test, expected, msg=msg, digits=digits, places=places, delta=delta)
    except Exception as e:
        Log.error(
            "{{test|json|limit(10000)}} does not match expected {{expected|json|limit(10000)}}",
            test=test if show_detail else "[can not show]",
            expected=expected if show_detail else "[can not show]",
            cause=e,
        )
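# Illustrative usage sketch: nested structures are compared key-by-key and
# element-by-element, with numeric leaves compared to the given tolerance.
# The values below are made up.
# assertAlmostEqual({"a": [1.0001, 2]}, {"a": [1.0, 2]}, places=3)   # passes
# assertAlmostEqual({"a": 1}, {"a": 2})                              # raises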
def _normalize_group(edge, dim_index, limit, schema=None):
    """
    :param edge: Not normalized groupby
    :param dim_index: Dimensions are ordered; this is this groupby's index into that order
    :param schema: for context
    :return: a normalized groupby
    """
    if is_text(edge):
        if edge.endswith(".*"):
            prefix = edge[:-2]
            if schema:
                output = list_to_data([
                    {
                        # BECAUSE THIS IS A GROUPBY, EARLY SPLIT INTO LEAVES WORKS JUST FINE
                        "name": concat_field(
                            prefix,
                            literal_field(relative_field(untype_path(c.name), prefix)),
                        ),
                        "put": {"name": literal_field(untype_path(c.name))},
                        "value": jx_expression(c.es_column, schema=schema),
                        "allowNulls": True,
                        "domain": {"type": "default"},
                    }
                    for c in schema.leaves(prefix)
                ])
                return output
            else:
                return list_to_data([{
                    "name": untype_path(prefix),
                    "put": {"name": literal_field(untype_path(prefix))},
                    "value": LeavesOp(Variable(prefix)),
                    "allowNulls": True,
                    "dim": dim_index,
                    "domain": {"type": "default"},
                }])

        return list_to_data([{
            "name": edge,
            "value": jx_expression(edge, schema=schema),
            "allowNulls": True,
            "dim": dim_index,
            "domain": Domain(type="default", limit=limit),
        }])
    else:
        edge = to_data(edge)
        if edge.domain and edge.domain.type != "default":
            Log.error("groupby does not accept complicated domains")

        if not edge.name and not is_text(edge.value):
            Log.error("You must name compound edges: {{edge}}", edge=edge)

        return list_to_data([{
            "name": coalesce(edge.name, edge.value),
            "value": jx_expression(edge.value, schema=schema),
            "allowNulls": True,
            "dim": dim_index,
            "domain": {"type": "default"},
        }])
def _add_pending(self, delta):
    delta = to_data(delta)
    self.pending.append(delta)
def update(self, command):
    """
    EXPECTING command == {"set":term, "where":where}
    THE set CLAUSE IS A DICT MAPPING NAMES TO VALUES
    THE where CLAUSE IS AN ES FILTER
    """
    command = to_data(command)
    table = self.get_table(command['update'])

    es_index = self.es.cluster.get_index(read_only=False, alias=None, kwargs=self.es.settings)

    schema = table.schema

    # GET IDS OF DOCUMENTS
    query = {
        "from": command['update'],
        "select": [{"value": "_id"}] + [
            {"name": k, "value": v} for k, v in command.set.items()
        ],
        "where": command.where,
        "format": "list",
        "limit": 10000,
    }
    results = self.query(query)

    if results.data:
        content = "".join(
            t
            for r in results.data
            for _id, row in [(r._id, r)]
            for _ in [row.__setitem__('_id', None)]  # WARNING! DESTRUCTIVE TO row
            for update in map(value2json, ({"update": {"_id": _id}}, {"doc": row}))
            for t in (update, "\n")
        )
        response = self.es.cluster.post(
            es_index.path + "/" + "_bulk",
            data=content,
            timeout=self.settings.timeout,
            params={"wait_for_active_shards": self.settings.wait_for_active_shards},
        )
        if response.errors:
            Log.error(
                "could not update: {{error}}",
                error=[
                    e.error
                    for i in response["items"]
                    for e in i.values()
                    if e.status not in (200, 201)
                ],
            )

    # DELETE BY QUERY, IF NEEDED
    if "." in listwrap(command['clear']):
        es_filter = ES52Lang[jx_expression(command.where)].partial_eval().to_es(schema)
        self.es.delete_record(es_filter)
        return

    es_index.refresh()
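# Illustrative usage sketch: per the docstring above, the command carries a
# "set" clause and a "where" filter, with "update" naming the table. The
# index, field, and filter values below are placeholders.
# container.update({
#     "update": "my_index",
#     "set": {"status": "done"},
#     "where": {"eq": {"_id": "abc123"}},
# })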
def es_bulkaggsop(esq, frum, query):
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-terms-aggregation.html#_filtering_values_with_partitions
    query = query.copy()  # WE WILL MARK UP THIS QUERY
    chunk_size = min(coalesce(query.chunk_size, MAX_CHUNK_SIZE), MAX_CHUNK_SIZE)
    schema = frum.schema
    query_path = first(schema.query_path)
    selects = listwrap(query.select)

    variable = first(query.groupby).value

    # FIND CARDINALITY
    cardinality_check = Timer("Get cardinality for {{column}}", param={"column": variable.var})

    with cardinality_check:
        columns = schema.leaves(variable.var)
        if len(columns) != 1:
            Log.error("too many columns to bulk groupby:\n{{columns|json}}", columns=columns)
        column = first(columns)

        if query.where is TRUE:
            cardinality = column.cardinality
            if cardinality == None:
                esq.namespace._update_cardinality(column)
                cardinality = column.cardinality
        else:
            cardinality = esq.query({
                "select": {
                    "name": "card",
                    "value": variable,
                    "aggregate": "cardinality",
                },
                "from": frum.name,
                "where": query.where,
                "format": "cube",
            }).card

    num_partitions = (cardinality + chunk_size - 1) // chunk_size
    if num_partitions > MAX_PARTITIONS:
        Log.error("Requesting more than {{num}} partitions", num=num_partitions)
    if num_partitions == 0:
        num_partitions = 1

    acc, decoders, es_query = aggop_to_es_queries(selects, query_path, schema, query)

    guid = randoms.base64(32, extra="-_")
    abs_limit = mo_math.MIN((query.limit, first(query.groupby).domain.limit))
    formatter = formatters[query.format](abs_limit)

    Thread.run(
        "extract to " + guid + ".json",
        extractor,
        guid,
        num_partitions,
        esq,
        query,
        selects,
        query_path,
        schema,
        chunk_size,
        cardinality,
        abs_limit,
        formatter,
        parent_thread=Null,
    ).release()

    output = to_data({
        "url": URL_PREFIX / (guid + ".json"),
        "status": URL_PREFIX / (guid + ".status.json"),
        "meta": {
            "format": query.format,
            "timing": {"cardinality_check": cardinality_check.duration},
            "es_query": es_query,
            "num_partitions": num_partitions,
            "cardinality": cardinality,
        },
    })
    return output
def update(self, command):
    self.dirty = True
    try:
        command = to_data(command)
        DEBUG and Log.note(
            "Update {{timestamp}}: {{command|json}}",
            command=command,
            timestamp=Date(command["set"].last_updated),
        )
        eq = command.where.eq
        if eq.es_index:
            if len(eq) == 1:
                if unwraplist(command.clear) == ".":
                    d = self.data
                    i = eq.es_index
                    with self.locker:
                        cols = d[i]
                        del d[i]

                    for c in cols:
                        self.remove(c)
                    return

                # FASTEST
                all_columns = self.data.get(eq.es_index, {}).values()
                with self.locker:
                    columns = [c for cs in all_columns for c in cs]
            elif eq.es_column and len(eq) == 2:
                # FASTER
                all_columns = self.data.get(eq.es_index, {}).values()
                with self.locker:
                    columns = [
                        c
                        for cs in all_columns
                        for c in cs
                        if c.es_column == eq.es_column
                    ]
            else:
                # SLOWER
                all_columns = self.data.get(eq.es_index, {}).values()
                with self.locker:
                    columns = [
                        c
                        for cs in all_columns
                        for c in cs
                        if all(c[k] == v for k, v in eq.items())  # THIS LINE IS VERY SLOW
                    ]
        else:
            columns = list(self)
            columns = jx.filter(columns, command.where)

        with self.locker:
            for col in columns:
                DEBUG and Log.note(
                    "update column {{table}}.{{column}}",
                    table=col.es_index,
                    column=col.es_column,
                )
                for k in command["clear"]:
                    if k == ".":
                        mark_as_deleted(col, Date.now())
                        self.for_es_update.add(col)
                        lst = self.data[col.es_index]
                        cols = lst[col.name]
                        cols.remove(col)
                        if len(cols) == 0:
                            del lst[col.name]
                            if len(lst) == 0:
                                del self.data[col.es_index]
                        break
                    else:
                        col[k] = None
                else:
                    # DID NOT DELETE COLUMN ("."), CONTINUE TO SET PROPERTIES
                    for k, v in command.set.items():
                        col[k] = v
                        self.for_es_update.add(col)
    except Exception as e:
        Log.error("should not happen", cause=e)