def error(
    cls,
    template,  # human readable template
    default_params={},  # parameters for template
    cause=None,  # possible cause
    stack_depth=0,
    **more_params
):
    """
    raise an exception with a trace for the cause too

    :param template: *string* human readable string with placeholders for parameters
    :param default_params: *dict* parameters to fill in template
    :param cause: *Exception* for chaining
    :param stack_depth: *int* how many calls you want popped off the stack to report the *true* caller
    :param more_params: *any more parameters (which will overwrite default_params)
    :return: never returns; always raises Except
    """
    # CALLED AS error(template, cause): FIRST POSITIONAL MAY BE THE EXCEPTION
    if default_params and isinstance(listwrap(default_params)[0], BaseException):
        cause = default_params
        default_params = {}

    params = dict(unwrap(default_params), **more_params)
    cause = wrap(unwraplist([Except.wrap(c, stack_depth=1) for c in listwrap(cause)]))
    trace = exceptions.extract_stack(stack_depth + 1)
    # BUGFIX: removed dead code -- `add_to_trace` was hard-coded False, so the
    # `cause[0].trace.extend(trace[1:])` branch could never execute
    e = Except(exceptions.ERROR, template, params, cause, trace)
    raise e
def map(self, map_):
    # RETURN A NEW QueryOp WITH EVERY EXPRESSION PASSED THROUGH map_
    # (TYPICALLY A VARIABLE-RENAMING MAP); self IS NOT MODIFIED DIRECTLY

    def map_select(s, map_):
        # MAP ONE SELECT CLAUSE'S VALUE, KEEPING ITS OTHER PROPERTIES
        return set_default({"value": s.value.map(map_)}, s)

    def map_edge(e, map_):
        # MAP AN EDGE: ITS PARTITION FILTERS, DOMAIN WHERE, VALUE AND RANGE
        partitions = unwraplist([
            set_default({"where": p.where.map(map_)}, p)
            for p in e.domain.partitions
        ])

        domain = copy(e.domain)
        domain.where = e.domain.where.map(map_)
        domain.partitions = partitions

        edge = copy(e)
        edge.value = e.value.map(map_)
        edge.domain = domain
        if e.range:
            # NOTE(review): copy(e) is shallow, so edge.range IS e.range --
            # these assignments also mutate the original edge; confirm intent
            edge.range.min = e.range.min.map(map_)
            edge.range.max = e.range.max.map(map_)
        return edge

    return QueryOp(
        "from",
        frum=self.frum.map(map_),
        select=wrap([map_select(s, map_) for s in listwrap(self.select)]),
        edges=wrap([map_edge(e, map_) for e in self.edges]),
        groupby=wrap([g.map(map_) for g in self.groupby]),
        window=wrap([w.map(map_) for w in self.window]),
        where=self.where.map(map_),
        sort=wrap([map_select(s, map_) for s in listwrap(self.sort)]),
        limit=self.limit,
        format=self.format,
    )
def value_compare(l, r, ordering=1):
    """
    Three-way compare; None is the least value.

    :param l: LHS
    :param r: RHS
    :param ordering: +1 for ascending, -1 to invert the sort order
    :return: negative if l < r, zero if equal, positive if l > r
    """
    if l == None:
        if r == None:
            return 0
        else:
            return - ordering
    elif r == None:
        return ordering

    if isinstance(l, list) or isinstance(r, list):
        for a, b in zip(listwrap(l), listwrap(r)):
            c = value_compare(a, b) * ordering
            if c != 0:
                return c
        # BUGFIX: zip() truncates to the shorter sequence, so a strict prefix
        # used to compare equal; break the tie on length (shorter sorts first),
        # matching the behavior of the other value_compare variant in this code
        if len(listwrap(l)) < len(listwrap(r)):
            return - ordering
        elif len(listwrap(l)) > len(listwrap(r)):
            return ordering
        return 0
    elif isinstance(l, Mapping):
        if isinstance(r, Mapping):
            # COMPARE MAPPINGS OVER THE UNION OF THEIR KEYS
            for k in set(l.keys()) | set(r.keys()):
                c = value_compare(l[k], r[k]) * ordering
                if c != 0:
                    return c
            return 0
        else:
            # MAPPINGS SORT AFTER NON-MAPPINGS
            return 1
    elif isinstance(r, Mapping):
        return -1
    else:
        # PLAIN VALUES: DELEGATE TO PYTHON 2 cmp()
        return cmp(l, r) * ordering
def error(
    cls,
    template,  # human readable template
    default_params={},  # parameters for template
    cause=None,  # possible cause
    stack_depth=0,
    **more_params
):
    """
    raise an exception with a trace for the cause too
    """
    # CALLED AS error(template, cause): FIRST POSITIONAL MAY BE THE EXCEPTION
    if default_params and isinstance(listwrap(default_params)[0], BaseException):
        cause = default_params
        default_params = {}

    params = dict(unwrap(default_params), **more_params)
    cause = unwraplist([Except.wrap(c, stack_depth=1) for c in listwrap(cause)])
    trace = extract_stack(stack_depth + 1)
    # BUGFIX: removed dead code -- `add_to_trace` was hard-coded False, so the
    # `cause[0].trace.extend(trace[1:])` branch could never execute
    e = Except(ERROR, template, params, cause, trace)
    raise e
def get_all_vars(query):
    """Collect every variable referenced by the query's select, edges, groupby and where clauses."""
    variables = []
    for clause, extractor in (
        (query.select, select_get_all_vars),
        (query.edges, edges_get_all_vars),
        (query.groupby, edges_get_all_vars),
    ):
        for item in listwrap(clause):
            variables.extend(extractor(item))
    variables.extend(expressions.get_all_vars(query.where))
    return variables
def argparse(defs):
    """Build an ArgumentParser from a list of argument definitions; return parsed args as a wrapped dict."""
    parser = _argparse.ArgumentParser()
    for definition in listwrap(defs):
        params = definition.copy()
        flag_names = params.name
        params.name = None  # remove so the remaining keys map onto add_argument kwargs
        parser.add_argument(*unwrap(listwrap(flag_names)), **params)
    namespace = parser.parse_args()
    return wrap({attr: getattr(namespace, attr) for attr in vars(namespace)})
def _set_op(self, query):
    # COMPOSE A PLAIN (NO GROUP-BY) SELECT STATEMENT
    all_selects = listwrap(query.select)
    if all_selects[0].value == ".":
        select_clauses = ["*"]
    else:
        select_clauses = [
            s.value.to_sql() + " AS " + quote_table(s.name)
            for s in all_selects
        ]
    for window in query.window:
        select_clauses.append(self._window_op(self, query, window))

    from_clause = " FROM " + quote_table(self.name) + " a"
    where_clause = "\nWHERE " + query.where.to_sql()
    return "SELECT " + ",\n".join(select_clauses) + from_clause + where_clause
def errors(e, _buffer):  # HANDLE ERRORS FROM extend()
    # SPLIT THE CAUSES INTO THOSE THAT CAN NEVER SUCCEED (MAPPING/PARSE
    # ERRORS) AND THOSE THAT MAY SUCCEED ON RETRY; ONLY THE HOPELESS CASE
    # DISCARDS THE PENDING BUFFER
    if e.cause.cause:
        # NOTE(review): `"JsonParseException" in f` is a substring test against
        # each cause -- presumably Except supports __contains__; confirm
        not_possible = [f for f in listwrap(e.cause.cause) if "JsonParseException" in f or "400 MapperParsingException" in f]
        still_have_hope = [f for f in listwrap(e.cause.cause) if "JsonParseException" not in f and "400 MapperParsingException" not in f]
    else:
        # NO DEEPER CAUSE: TREAT THE WHOLE ERROR AS NOT RETRYABLE
        not_possible = [e]
        still_have_hope = []

    if still_have_hope:
        Log.warning("Problem with sending to ES", cause=still_have_hope)
    elif not_possible:
        # THERE IS NOTHING WE CAN DO
        Log.warning("Not inserted, will not try again", cause=not_possible[0:10:])
        del _buffer[:]
def value_compare(l, r, ordering=1):
    """
    SORT VALUES, NULL IS THE LEAST VALUE
    :param l: LHS
    :param r: RHS
    :param ordering: (-1, 0, 1) TO AFFECT SORT ORDER
    :return: The return value is negative if x < y, zero if x == y and strictly positive if x > y

    NOTE(review): with ordering=1 this returns +ordering when only l is None,
    which sorts None AFTER values -- that conflicts with the summary above;
    confirm which is intended.
    """
    if l == None:
        if r == None:
            return 0
        else:
            return ordering
    elif r == None:
        return - ordering

    if isinstance(l, list) or isinstance(r, list):
        # PAIRWISE COMPARE, THEN BREAK TIES ON LENGTH (SHORTER SORTS FIRST)
        for a, b in zip(listwrap(l), listwrap(r)):
            c = value_compare(a, b) * ordering
            if c != 0:
                return c

        if len(l) < len(r):
            return - ordering
        elif len(l) > len(r):
            return ordering
        else:
            return 0
    elif isinstance(l, builtin_tuple) and isinstance(r, builtin_tuple):
        # TUPLES: PAIRWISE ONLY; UNEQUAL LENGTHS COMPARE EQUAL ON COMMON PREFIX
        for a, b in zip(l, r):
            c = value_compare(a, b) * ordering
            if c != 0:
                return c
        return 0
    elif isinstance(l, Mapping):
        if isinstance(r, Mapping):
            # COMPARE MAPPINGS KEY-BY-KEY IN SORTED KEY ORDER
            for k in sorted(set(l.keys()) | set(r.keys())):
                c = value_compare(l.get(k), r.get(k)) * ordering
                if c != 0:
                    return c
            return 0
        else:
            # MAPPINGS SORT AFTER NON-MAPPINGS
            return 1
    elif isinstance(r, Mapping):
        return -1
    else:
        # PLAIN VALUES: DELEGATE TO PYTHON 2 cmp()
        return cmp(l, r) * ordering
def __getitem__(self, item):
    # LOOK FOR A MATCHING SELECT COLUMN FIRST, THEN A MATCHING EDGE PARTITION;
    # RETURNS None IMPLICITLY WHEN NEITHER MATCHES
    for select in listwrap(self.cube.select):
        if select.name == item:
            return self.cube.data[item]
    for dim, edge in enumerate(self.cube.edges):
        if edge.name == item:
            # NOTE(review): attribute is `partition` (singular) here -- confirm
            return edge.domain.partition[self.coord[dim]]
def _normalize_groupby(groupby, schema=None):
    # NORMALIZE EACH GROUPBY CLAUSE; None PASSES THROUGH UNTOUCHED
    if groupby == None:
        return None
    normalized = wrap([_normalize_group(g, schema=schema) for g in listwrap(groupby)])
    if any(g == None for g in normalized):
        Log.error("not expected")
    return normalized
def es_fieldop(es, query):
    # PLAIN FIELD-FETCH (NO AGGREGATION): TRANSLATE query TO AN ES 1.4
    # "filtered" QUERY AND DELEGATE ROW EXTRACTION TO extract_rows()
    es_query = es14.util.es_query_template()
    select = listwrap(query.select)
    es_query.query = {
        "filtered": {
            "query": {"match_all": {}},
            "filter": simplify_esfilter(qb_expression_to_esfilter(query.where))
        }
    }
    es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
    es_query.sort = qb_sort_to_es_sort(query.sort)
    es_query.fields = DictList()

    source = "fields"
    for s in select.value:
        if s == "*":
            # WHOLE DOCUMENT: SWITCH TO _source RETRIEVAL
            es_query.fields = None
            source = "_source"
        elif s == ".":
            es_query.fields = None
            source = "_source"
        elif isinstance(s, basestring) and is_keyword(s):
            es_query.fields.append(s)
        elif isinstance(s, list) and es_query.fields is not None:
            es_query.fields.extend(s)
        elif isinstance(s, Mapping) and es_query.fields is not None:
            es_query.fields.extend(s.values())
        elif es_query.fields is not None:
            es_query.fields.append(s)
    # NOTE(review): this overwrites the es_query.sort assigned above -- confirm
    # the earlier qb_sort_to_es_sort() call is intentional
    es_query.sort = [{s.field: "asc" if s.sort >= 0 else "desc"} for s in query.sort]

    return extract_rows(es, es_query, source, select, query)
def warning(
    cls,
    template,
    default_params={},
    cause=None,
    stack_depth=0,
    log_context=None,
    **more_params
):
    """
    Log a warning: template plus params, with chained cause and caller trace.

    :param template: *string* human readable string with placeholders for parameters
    :param default_params: *dict* parameters to fill in template
    :param cause: *Exception* for chaining
    :param stack_depth: *int* how many calls you want popped off the stack to report the *true* caller
    :param log_context: *dict* extra key:value pairs for your convenience
    :param more_params: *any more parameters (which will overwrite default_params)
    :return:
    """
    # CALLED AS warning(template, cause): FIRST POSITIONAL MAY BE THE EXCEPTION
    if isinstance(default_params, BaseException):
        cause = default_params
        default_params = {}

    # IDIOM FIX: membership test does not need .keys()
    if "values" in more_params:
        Log.error("Can not handle a logging parameter by name `values`")
    params = dict(unwrap(default_params), **more_params)
    cause = unwraplist([Except.wrap(c) for c in listwrap(cause)])
    trace = exceptions.extract_stack(stack_depth + 1)

    e = Except(exceptions.WARNING, template, params, cause, trace)
    Log.note(
        "{{error|unicode}}",
        error=e,
        log_context=set_default({"context": exceptions.WARNING}, log_context),
        stack_depth=stack_depth + 1
    )
def get_all_vars(query, exclude_where=False):
    """
    Collect every variable referenced by the query.

    :param query: the query to scan
    :param exclude_where: True to skip the where clause
    :return: all variables in use by query
    """
    found = []
    for item in listwrap(query.select):
        found.extend(select_get_all_vars(item))
    for item in listwrap(query.edges):
        found.extend(edges_get_all_vars(item))
    for item in listwrap(query.groupby):
        found.extend(edges_get_all_vars(item))
    if exclude_where:
        return found
    found.extend(expressions.get_all_vars(query.where))
    return found
def groupby(data, keys=None, size=None, min_size=None, max_size=None, contiguous=False):
    """
    return list of (keys, values) pairs where
        group by the set of keys
        values IS LIST OF ALL data that has those keys
    contiguous - MAINTAIN THE ORDER OF THE DATA, STARTING THE NEW GROUP WHEN THE SELECTOR CHANGES
    """
    # SIZE-BASED GROUPING TAKES PRECEDENCE OVER KEY-BASED GROUPING
    if size != None or min_size != None or max_size != None:
        if size != None:
            max_size = size
        return groupby_min_max_size(data, min_size=min_size, max_size=max_size)

    if isinstance(data, Container):
        # CONTAINERS KNOW HOW TO GROUP THEMSELVES
        return data.groupby(keys)

    try:
        keys = listwrap(keys)
        get_key = jx_expression_to_function(keys)
        if not contiguous:
            # SORT SO itertools.groupby SEES EACH KEY EXACTLY ONCE
            data = sorted(data, key=get_key)

        def _output():
            # LAZY GENERATOR OF (group Dict, wrapped values) PAIRS
            for g, v in itertools.groupby(data, get_key):
                group = Dict()
                for k, gg in zip(keys, g):
                    group[k] = gg
                yield (group, wrap(v))

        return _output()
    except Exception, e:
        Log.error("Problem grouping", e)
def es_countop(es, mvel, query):
    """
    RETURN SINGLE COUNT
    """
    select = listwrap(query.select)
    FromES = build_es_query(query)
    for s in select:
        if is_keyword(s.value):
            # PLAIN FIELD: COUNT VIA A terms FACET, EXCLUDING MISSING VALUES
            FromES.facets[s.name] = {
                "terms": {
                    "field": s.value,
                    "size": query.limit,
                },
                "facet_filter": {"exists": {"field": s.value}}
            }
        else:
            # COMPLICATED value IS PROBABLY A SCRIPT, USE IT
            FromES.facets[s.name] = {
                "terms": {
                    "script_field": es09.expressions.compile_expression(s.value, query),
                    "size": 200000
                }
            }

    data = es09.util.post(es, FromES, query.limit)

    # PULL THE TOTAL FROM EACH FACET INTO A SINGLE-VALUE MATRIX
    matricies = {}
    for s in select:
        matricies[s.name] = Matrix(value=data.hits.facets[s.name].total)

    cube = Cube(query.select, query.edges, matricies)
    cube.frum = query
    return cube
def query_get_all_vars(query, exclude_where=False):
    """
    Collect every variable referenced by the query, as a set.

    :param query: the query to scan
    :param exclude_where: True to skip the where clause
    :return: all variables in use by query
    """
    found = set()
    for item in listwrap(query.select):
        found |= select_get_all_vars(item)
    for clause in (query.edges, query.groupby):
        for item in listwrap(clause):
            found |= edges_get_all_vars(item)
    if exclude_where:
        return found
    return found | qb_expression(query.where).vars()
def split_expression_by_depth(where, schema, map_, output=None, var_to_depth=None):
    """
    It is unfortunate that ES can not handle expressions that
    span nested indexes.  This will split your where clause
    returning {"and": [filter_depth0, filter_depth1, ...]}
    """
    vars_ = where.vars()

    if var_to_depth is None:
        # TOP-LEVEL CALL: BUILD THE DEPTH MAP AND THE OUTPUT SLOTS
        if not vars_:
            return Null
        # MAP VARIABLE NAMES TO HOW DEEP THEY ARE
        var_to_depth = {v: len(listwrap(schema[v].nested_path)) for v in vars_}
        all_depths = set(var_to_depth.values())
        output = wrap([[] for _ in range(MAX(all_depths) + 1)])
    else:
        all_depths = set(var_to_depth[v] for v in vars_)

    if len(all_depths) == 1:
        # WHOLE EXPRESSION LIVES AT ONE DEPTH: MAP AND PLACE IT
        output[list(all_depths)[0]] += [where.map(map_)]
    elif isinstance(where, AndOp):
        # RECURSE: EACH CONJUNCT MAY LIVE AT A DIFFERENT DEPTH
        for a in where.terms:
            split_expression_by_depth(a, schema, map_, output, var_to_depth)
    else:
        Log.error("Can not handle complex where clause")

    return output
def warning(
    cls,
    template,
    default_params={},
    cause=None,
    stack_depth=0,  # stack trace offset (==1 if you do not want to report self)
    **more_params
):
    # LOG A WARNING: TEMPLATE PLUS PARAMS, WITH CHAINED CAUSE AND CALLER TRACE
    # CALLED AS warning(template, cause): FIRST POSITIONAL MAY BE THE EXCEPTION
    if isinstance(default_params, BaseException):
        cause = default_params
        default_params = {}

    params = dict(unwrap(default_params), **more_params)
    cause = unwraplist([Except.wrap(c) for c in listwrap(cause)])
    trace = extract_stack(stack_depth + 1)

    e = Except(WARNING, template, params, cause, trace)
    Log.note(
        unicode(e),
        {
            "warning": {  # REDUNDANT INFO
                "template": template,
                "params": params,
                "cause": cause,
                "trace": trace
            }
        },
        stack_depth=stack_depth + 1
    )
def es_setop(es, query):
    # PLAIN (NO AGGREGATION) SET OPERATION: FETCH FIELD/SCRIPT VALUES FROM ES
    es_query = es14.util.es_query_template()
    select = listwrap(query.select)
    es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
    es_query.fields = DictList()
    es_query.sort = qb_sort_to_es_sort(query.sort)
    source = "fields"
    for s in select:
        if s.value == "*":
            # WHOLE DOCUMENT: RETRIEVE _source INSTEAD OF INDIVIDUAL FIELDS
            es_query.fields = None
            es_query.script_fields = None
            source = "_source"
        elif s.value == ".":
            es_query.fields = None
            es_query.script_fields = None
            source = "_source"
        elif isinstance(s.value, basestring) and is_keyword(s.value):
            es_query.fields.append(s.value)
        elif isinstance(s.value, list) and es_query.fields is not None:
            es_query.fields.extend(s.value)
        else:
            # EXPRESSION: COMPILE TO A SCRIPT FIELD
            es_query.script_fields[literal_field(s.name)] = {"script": qb_expression_to_ruby(s.value)}

    return extract_rows(es, es_query, source, select, query)
def is_terms(query):
    # A QUERY IS A "terms" QUERY WHEN IT HAS NO SELECT, OR WHEN EVERY SELECT
    # USES ONLY A SIMPLE none/count AGGREGATE
    selects = listwrap(query.select)
    simple = not query.select or AND(aggregates[s.aggregate] in ("none", "count") for s in selects)
    return True if simple else False
def select(self, select):
    # PROJECT self.data DOWN TO THE GIVEN select CLAUSE(S), RETURNING A NEW
    # ListContainer (OR self WHEN THE SELECT IS THE IDENTITY ".")
    selects = listwrap(select)

    if not all(isinstance(s.value, Variable) for s in selects):
        Log.error("selecting on structure, or expressions, not supported yet")
    if len(selects) == 1 and isinstance(selects[0].value, Variable) and selects[0].value.var == ".":
        new_schema = self.schema
        if selects[0].name == ".":
            return self
    else:
        new_schema = None

    if isinstance(select, list):
        # MANY SELECTS: BUILD ONE OUTPUT Dict PER ROW
        push_and_pull = [(s.name, jx_expression_to_function(s.value)) for s in selects]

        def selector(d):
            output = Dict()
            for n, p in push_and_pull:
                output[n] = p(wrap(d))
            return unwrap(output)

        new_data = map(selector, self.data)
    else:
        # SINGLE SELECT: EXTRACT THE ONE VALUE PER ROW
        select_value = jx_expression_to_function(select.value)
        new_data = map(select_value, self.data)

    return ListContainer("from " + self.name, data=new_data, schema=new_schema)
def is_aggsop(es, query):
    # AGGS PROCESSING REQUIRES ES 1.4-1.7 AND SOME EDGE/GROUPBY/AGGREGATE CLAUSE
    es.cluster.get_metadata()
    version_ok = any(map(es.cluster.version.startswith, ["1.4.", "1.5.", "1.6.", "1.7."]))
    wants_aggs = (
        query.edges
        or query.groupby
        or any(a != None and a != "none" for a in listwrap(query.select).aggregate)
    )
    if version_ok and wants_aggs:
        return True
    return False
def _get_schema_from_list(frum, columns, prefix, nested_path):
    """
    SCAN THE LIST FOR COLUMN TYPES
    """
    names = {}
    for d in frum:
        for name, value in d.items():
            # MERGE THIS VALUE'S TYPE INTO WHAT WE HAVE SEEN FOR THIS NAME
            agg_type = names.get(name, "undefined")
            this_type = _type_to_name[value.__class__]
            new_type = _merge_type[agg_type][this_type]
            names[name] = new_type

            if this_type == "object":
                # INNER OBJECT: RECURSE WITH DEEPER PREFIX, SAME NESTED PATH
                _get_schema_from_list([value], columns, prefix + [name], nested_path)
            elif this_type == "nested":
                # NESTED ARRAY: PREPEND THE NEW PATH SEGMENT, THEN RECURSE
                np = listwrap(nested_path)
                newpath = unwraplist([".".join((np[0], name))] + np)
                _get_schema_from_list(value, columns, prefix + [name], newpath)

    for n, t in names.items():
        full_name = ".".join(prefix + [n])
        column = Column(
            table=".",
            name=full_name,
            abs_name=full_name,
            type=t,
            nested_path=nested_path
        )
        columns[column.name] = column
def get_decoders_by_depth(query):
    """
    RETURN A LIST OF DECODER ARRAYS, ONE ARRAY FOR EACH NESTED DEPTH
    """
    schema = query.frum
    output = DictList()
    for e in coalesce(query.edges, query.groupby, []):
        if e.value:
            # EXPRESSION EDGE: MAP ITS VARIABLES TO ABSOLUTE COLUMN NAMES
            e = e.copy()
            e.value = qb_expression(e.value)
            vars_ = e.value.vars()
            for v in vars_:
                if not schema[v]:
                    Log.error("{{var}} does not exist in schema", var=v)
            e.value = e.value.map({schema[v].name: schema[v].abs_name for v in vars_})
        else:
            # DIMENSION EDGE: REWRITE ITS FIELD LIST TO ABSOLUTE NAMES
            vars_ = e.domain.dimension.fields
            e.domain.dimension = e.domain.dimension.copy()
            e.domain.dimension.fields = [schema[v].abs_name for v in vars_]

        # ALL OF AN EDGE'S VARIABLES MUST LIVE AT ONE NESTED DEPTH
        depths = set(len(listwrap(schema[v].nested_path)) for v in vars_)
        if len(depths) > 1:
            Log.error("expression {{expr}} spans tables, can not handle", expr=e.value)
        depth = list(depths)[0]
        while len(output) <= depth:
            output.append([])
        output[depth].append(AggsDecoder(e, query))
    return output
def read_settings(filename=None, defs=None):
    """
    Load application settings from a JSON file.

    :param filename: path to the settings file; when None, the path comes from
        the --settings command-line argument (default ./settings.json)
    :param defs: extra argparse argument definitions
    :return: wrapped settings, with parsed command-line args at settings.args
    """
    # READ SETTINGS
    if filename:
        settings_file = File(filename)
        if not settings_file.exists:
            # BUGFIX: template had a garbled "{(unknown)}" placeholder and a
            # "Can not file" typo; use the parameter actually being passed
            Log.error("Can not find settings file {{filename}}", {
                "filename": settings_file.abspath
            })
        settings = ref.get("file:///" + settings_file.abspath)
        if defs:
            settings.args = argparse(defs)
        return settings
    else:
        defs = listwrap(defs)
        defs.append({
            "name": ["--settings", "--settings-file", "--settings_file"],
            "help": "path to JSON file with settings",
            "type": str,
            "dest": "filename",
            "default": "./settings.json",
            "required": False
        })
        args = argparse(defs)
        settings = ref.get("file://" + args.filename.replace(os.sep, "/"))
        settings.args = args
        return settings
def __init__(self, **desc):
    """
    Range domain: either built from explicit `partitions`, or generated
    from `min`, `max` and `interval`.
    """
    Domain.__init__(self, **desc)
    self.type = "range"
    self.NULL = Null

    if self.partitions:
        # IGNORE THE min, max, interval
        if not self.key:
            Log.error("Must have a key value")

        parts = listwrap(self.partitions)
        for i, p in enumerate(parts):
            self.min = Math.min(self.min, p.min)
            self.max = Math.max(self.max, p.max)
            if p.dataIndex != None and p.dataIndex != i:
                Log.error("Expecting `dataIndex` to agree with the order of the parts")
            if p[self.key] == None:
                Log.error("Expecting all parts to have {{key}} as a property", key=self.key)
            p.dataIndex = i

        # VERIFY PARTITIONS DO NOT OVERLAP, HOLES ARE FINE
        # BUGFIX: product(parts, parts) includes p==q, which made every
        # non-degenerate partition (min < max) "overlap" with itself;
        # skip the self-comparison
        for p, q in itertools.product(parts, parts):
            if p is not q and p.min <= q.min and q.min < p.max:
                Log.error("partitions overlap!")

        self.partitions = parts
        return
    elif any([self.min == None, self.max == None, self.interval == None]):
        Log.error("Can not handle missing parameter")

    self.key = "min"
    self.partitions = wrap([
        {"min": v, "max": v + self.interval, "dataIndex": i}
        for i, v in enumerate(frange(self.min, self.max, self.interval))
    ])
def fatal(
    cls,
    template,  # human readable template
    default_params={},  # parameters for template
    cause=None,  # possible cause
    stack_depth=0,
    log_context=None,
    **more_params
):
    """
    SEND TO STDERR

    :param template: *string* human readable string with placeholders for parameters
    :param default_params: *dict* parameters to fill in template
    :param cause: *Exception* for chaining
    :param stack_depth: *int* how many calls you want popped off the stack to report the *true* caller
    :param log_context: *dict* extra key:value pairs for your convenience
    :param more_params: *any more parameters (which will overwrite default_params)
    :return:
    """
    # CALLED AS fatal(template, cause): FIRST POSITIONAL MAY BE THE EXCEPTION
    if default_params and isinstance(listwrap(default_params)[0], BaseException):
        cause = default_params
        default_params = {}

    params = dict(unwrap(default_params), **more_params)
    cause = unwraplist([Except.wrap(c) for c in listwrap(cause)])
    trace = exceptions.extract_stack(stack_depth + 1)

    e = Except(exceptions.ERROR, template, params, cause, trace)
    str_e = unicode(e)

    # error_mode GUARDS AGAINST RECURSIVE ERROR REPORTING
    error_mode = cls.error_mode
    try:
        if not error_mode:
            cls.error_mode = True
            Log.note(
                "{{error|unicode}}",
                error=e,
                log_context=set_default({"context": exceptions.FATAL}, log_context),
                stack_depth=stack_depth + 1
            )
    except Exception:
        pass
    cls.error_mode = error_mode

    sys.stderr.write(str_e.encode('utf8'))
def store_data(path):
    # FLASK ENDPOINT: AUTHENTICATE (HAWK, OR PATTERN-MATCH FOR PUBLIC PATHS),
    # AUTHORIZE, STORE THE POSTED JSON, AND RESPOND WITH THE STORAGE LINK
    # NOTE(review): the outer `try:` has no visible except/finally -- the
    # handler appears truncated in this copy of the source; confirm
    try:
        request = flask.request
        auth = request.headers.get('Authorization')
        if not auth:
            # USE PATTERN MATCHING AUTH
            for c in all_creds:
                if c.path == path:
                    return store_public_data(path, c)
            raise Log.error(
                "No authentication provided. path={{path}} data.length={{length}}",
                path=path,
                length=len(request.get_data()),
            )

        try:
            receiver = Receiver(
                lookup_credentials,
                auth,
                request.url,
                request.method,
                content=request.get_data(),
                content_type=request.headers['Content-Type'],
                seen_nonce=seen_nonce
            )
        except Exception, e:
            e = Except.wrap(e)
            raise Log.error(
                "Authentication failed. path={{path}} data.length={{length}}\n{{auth|indent}}",
                path=path,
                length=len(request.get_data()),
                auth=auth,
                cause=e
            )

        permissions = lookup_user(receiver.parsed_header["id"])
        if path not in listwrap(permissions.resources):
            Log.error("{{user}} not allowed access to {{resource}}", user=permissions.hawk.id, resource=path)

        link, id = submit_data(path, permissions, request.json)

        response_content = convert.unicode2utf8(convert.value2json({
            "link": link,
            "etl": {"id": id}
        }))
        # SIGN THE RESPONSE SO THE CLIENT CAN VERIFY THE SERVER
        receiver.respond(
            content=response_content,
            content_type=RESPONSE_CONTENT_TYPE
        )

        return Response(
            response_content,
            status=200,
            headers={
                b'Server-Authorization': receiver.response_header,
                b'content-type': RESPONSE_CONTENT_TYPE
            }
        )
def es_terms(es, mvel, query):
    """
    RETURN LIST OF ALL EDGE QUERIES

    EVERY FACET IS NAMED <select.name>, <c1>, ... <cN> WHERE <ci> ARE THE ELEMENT COORDINATES
    WE TRY TO PACK DIMENSIONS INTO THE TERMS TO MINIMIZE THE CROSS-PRODUCT EXPLOSION
    """
    if len(query.edges) == 2:
        return _es_terms2(es, mvel, query)

    select = listwrap(query.select)
    FromES = build_es_query(query)
    packed_term = compileEdges2Term(mvel, query.edges, wrap([]))
    for s in select:
        FromES.facets[s.name] = {
            "terms": {
                "field": packed_term.field,
                "script_field": packed_term.expression,
                "size": coalesce(query.limit, 200000)
            },
            "facet_filter": simplify_esfilter(query.where)
        }

    term2Parts = packed_term.term2parts

    data = es09.util.post(es, FromES, query.limit)

    # GETTING ALL PARTS WILL EXPAND THE EDGES' DOMAINS
    # BUT HOW TO UNPACK IT FROM THE term FASTER IS UNKNOWN
    for k, f in data.facets.items():
        for t in f.terms:
            term2Parts(t.term)

    # NUMBER ALL EDGES FOR qb INDEXING
    for f, e in enumerate(query.edges):
        e.index = f
        if e.domain.type in ["uid", "default"]:
            # e.domain.partitions = qb.sort(e.domain.partitions, "value")
            for p, part in enumerate(e.domain.partitions):
                part.dataIndex = p
            e.domain.NULL.dataIndex = len(e.domain.partitions)

    # MAKE CUBE
    output = {}
    dims = [len(e.domain.partitions) + (1 if e.allowNulls else 0) for e in query.edges]
    for s in select:
        output[s.name] = Matrix(*dims)

    # FILL CUBE
    # EXPECTING ONLY SELECT CLAUSE FACETS
    for facetName, facet in data.facets.items():
        for term in facet.terms:
            term_coord = term2Parts(term.term).dataIndex
            for s in select:
                try:
                    output[s.name][term_coord] = term[aggregates[s.aggregate]]
                except Exception, e:
                    # USUALLY CAUSED BY output[s.name] NOT BEING BIG ENOUGH TO HANDLE NULL COUNTS
                    pass
    # NOTE(review): no value is returned here -- the visible text may be
    # truncated; a Cube built from `output` would be the expected result
def groupby(self, edges):
    """
    SLICE THIS CUBE IN TO ONES WITH LESS DIMENSIONALITY
    simple==True WILL HAVE GROUPS BASED ON PARTITION VALUE, NOT PARTITION OBJECTS
    """
    edges = DictList([_normalize_edge(e) for e in edges])

    stacked = [e for e in self.edges if e.name in edges.name]
    remainder = [e for e in self.edges if e.name not in edges.name]
    selector = [1 if e.name in edges.name else 0 for e in self.edges]

    if len(stacked) + len(remainder) != len(self.edges):
        Log.error("can not find some edges to group by")

    # CACHE SOME RESULTS
    keys = edges.name
    getKey = [e.domain.getKey for e in self.edges]
    lookup = [[getKey[i](p) for p in e.domain.partitions + ([None] if e.allowNulls else [])] for i, e in enumerate(self.edges)]

    def coord2term(coord):
        # TRANSLATE A CUBE COORDINATE INTO A {edge_name: partition_key} Dict
        output = wrap_dot({keys[i]: lookup[i][c] for i, c in enumerate(coord)})
        return output

    if isinstance(self.select, list):
        selects = listwrap(self.select)
        index, v = zip(*self.data[selects[0].name].groupby(selector))

        coord = wrap([coord2term(c) for c in index])

        values = [v]
        for s in selects[1::]:
            # NOTE(review): `group_by` here vs `groupby` above -- confirm both
            # methods exist on Matrix, or this is a typo
            i, v = zip(*self.data[s.name].group_by(selector))
            values.append(v)

        output = zip(coord, [Cube(self.select, remainder, {s.name: v[i] for i, s in enumerate(selects)}) for v in zip(*values)])
    elif not remainder:
        # v IS A VALUE, NO NEED TO WRAP IT IN A Cube
        output = (
            (
                coord2term(coord),
                v
            )
            for coord, v in self.data[self.select.name].groupby(selector)
        )
    else:
        output = (
            (
                coord2term(coord),
                Cube(self.select, remainder, v)
            )
            for coord, v in self.data[self.select.name].groupby(selector)
        )

    return output
def _select(self, select):
    # APPLY THE SELECT CLAUSE(S): AGGREGATING SELECTS COLLAPSE TO A
    # ZERO-DIMENSION CUBE, PLAIN SELECTS KEEP THE EXISTING EDGES
    selects = listwrap(select)
    if OR(s.aggregate != None and s.aggregate != "none" for s in selects):
        aggregated = {}
        for s in selects:
            aggregated[s.name] = Matrix(value=self.data[s.value].aggregate(s.aggregate))
        return Cube(select, [], aggregated)

    plain = {s.name: self.data[s.value] for s in selects}
    return Cube(select, self.edges, plain)
def _groupby(self, edges):
    """
    RETURNS LIST OF (coord, values) TUPLES, WHERE
        coord IS THE INDEX INTO self CUBE (-1 INDEX FOR COORDINATES NOT GROUPED BY)
        values ALL VALUES THAT BELONG TO THE SLICE
    """
    edges = DictList([_normalize_edge(e) for e in edges])

    stacked = [e for e in self.edges if e.name in edges.name]
    remainder = [e for e in self.edges if e.name not in edges.name]
    selector = [1 if e.name in edges.name else 0 for e in self.edges]

    if len(stacked) + len(remainder) != len(self.edges):
        Log.error("can not find some edges to group by")

    # CACHE SOME RESULTS
    keys = edges.name
    getKey = [e.domain.getKey for e in self.edges]
    lookup = [[getKey[i](p) for p in e.domain.partitions + ([None] if e.allowNulls else [])] for i, e in enumerate(self.edges)]

    # BUGFIX: coord2term WAS REFERENCED BELOW BUT NEVER DEFINED IN THIS SCOPE
    # (NameError at runtime); defined here, same as the sibling groupby()
    def coord2term(coord):
        # TRANSLATE A CUBE COORDINATE INTO A {edge_name: partition_key} Dict
        return wrap_dot({keys[i]: lookup[i][c] for i, c in enumerate(coord)})

    if isinstance(self.select, list):
        selects = listwrap(self.select)
        index, v = zip(*self.data[selects[0].name].groupby(selector))

        coord = wrap([coord2term(c) for c in index])

        values = [v]
        for s in selects[1::]:
            i, v = zip(*self.data[s.name].group_by(selector))
            values.append(v)

        output = zip(coord, [Cube(self.select, remainder, {s.name: v[i] for i, s in enumerate(selects)}) for v in zip(*values)])
    elif not remainder:
        # v IS A VALUE, NO NEED TO WRAP IT IN A Cube
        output = (
            (
                coord2term(coord),
                v
            )
            for coord, v in self.data[self.select.name].groupby(selector)
        )
    else:
        output = (
            (
                coord2term(coord),
                Cube(self.select, remainder, v)
            )
            for coord, v in self.data[self.select.name].groupby(selector)
        )

    return output
def is_deepop(es, query):
    # NESTED ("deep") QUERIES: NO EDGES/GROUPBY, AT LEAST ONE NON-AGGREGATING
    # SELECT, AND A MULTI-SEGMENT TABLE NAME (INDICATING A NESTED PATH)
    if query.edges or query.groupby:
        return False
    if all(s.aggregate not in (None, "none") for s in listwrap(query.select)):
        return False
    # ASSUME IT IS NESTED IF THE TABLE NAME HAS MORE THAN ONE SEGMENT
    return len(split_field(query.frum.name)) > 1
def compare_to_expected(query, result, expect):
    # TEST HELPER: NORMALIZE result AND expect IN PLACE (COLUMN ORDER, ROW
    # SORT ORDER) SO THEY CAN BE COMPARED DIRECTLY AFTERWARD
    query = wrap(query)
    expect = wrap(expect)

    if result.meta.format == "table":
        assertAlmostEqual(set(result.header), set(expect.header))

        # MAP FROM expected COLUMN TO result COLUMN
        mapping = zip(*zip(*filter(
            lambda v: v[0][1] == v[1][1],
            itertools.product(enumerate(expect.header), enumerate(result.header))
        ))[1])[0]
        result.header = [result.header[m] for m in mapping]

        if result.data:
            # REORDER result COLUMNS TO MATCH expect
            columns = zip(*unwrap(result.data))
            result.data = zip(*[columns[m] for m in mapping])

        if not query.sort:
            # NO EXPLICIT SORT: CANONICALIZE ROW ORDER ON BOTH SIDES
            sort_table(result)
            sort_table(expect)
    elif result.meta.format == "list":
        if query["from"].startswith("meta."):
            pass
        else:
            query = QueryOp.wrap(query)

        if not query.sort:
            try:
                # result.data MAY BE A LIST OF VALUES, NOT OBJECTS
                data_columns = jx.sort(
                    set(jx.get_columns(result.data, leaves=True)) | set(jx.get_columns(expect.data, leaves=True)),
                    "name"
                )
            except Exception:
                data_columns = [{"name": "."}]

            sort_order = listwrap(coalesce(query.edges, query.groupby)) + data_columns

            if isinstance(expect.data, list):
                try:
                    expect.data = jx.sort(expect.data, sort_order.name)
                except Exception, _:
                    pass

            if isinstance(result.data, list):
                try:
                    result.data = jx.sort(result.data, sort_order.name)
                except Exception, _:
                    pass
def send_email(self,
               from_address=None,
               to_address=None,
               subject=None,
               text_data=None,
               html_data=None
):
    """Sends an email.

    from_address is an email address; to_address is a list of email addresses.
    Addresses can be plain (e.g. "*****@*****.**") or with real names
    (e.g. "John Smith <*****@*****.**>").

    text_data and html_data are both strings.  You can specify one or both.
    If you specify both, the email will be sent as a MIME multipart
    alternative, i.e., the recipient will see the HTML content if his
    viewer supports it; otherwise he'll see the text content.
    """
    settings = self.settings

    # FALL BACK TO CONFIGURED ADDRESSES WHEN NOT GIVEN EXPLICITLY
    from_address = coalesce(from_address, settings["from"], settings.from_address)
    to_address = listwrap(coalesce(to_address, settings.to_address, settings.to_addrs))

    if not from_address or not to_address:
        raise Exception("Both from_addr and to_addrs must be specified")
    if not text_data and not html_data:
        raise Exception("Must specify either text_data or html_data")

    if not html_data:
        msg = MIMEText(text_data)
    elif not text_data:
        msg = MIMEText(html_data, 'html')
    else:
        # BOTH PROVIDED: multipart/alternative, TEXT FIRST, HTML PREFERRED
        msg = MIMEMultipart('alternative')
        msg.attach(MIMEText(text_data, 'plain'))
        msg.attach(MIMEText(html_data, 'html'))

    msg['Subject'] = coalesce(subject, settings.subject)
    msg['From'] = from_address
    msg['To'] = ', '.join(to_address)

    if self.server:
        # CALL AS PART OF A SMTP SESSION
        self.server.sendmail(from_address, to_address, msg.as_string())
    else:
        # CALL AS STAND-ALONE
        with self:
            self.server.sendmail(from_address, to_address, msg.as_string())
def groupby(data, keys=None, size=None, min_size=None, max_size=None, contiguous=False):
    """
    return list of (keys, values) pairs where
        group by the set of keys
        values IS LIST OF ALL data that has those keys
    contiguous - MAINTAIN THE ORDER OF THE DATA, STARTING THE NEW GROUP WHEN THE SELECTOR CHANGES
    """
    # SIZE-BASED GROUPING TAKES PRECEDENCE OVER KEY-BASED GROUPING
    if size != None or min_size != None or max_size != None:
        if size != None:
            max_size = size
        return groupby_min_max_size(data, min_size=min_size, max_size=max_size)

    if isinstance(data, Cube):
        return data.groupby(keys)

    keys = listwrap(keys)

    def get_keys(d):
        # EXTRACT THE KEY VALUES FROM ONE DATUM AS A Dict
        output = Dict()
        for k in keys:
            output[k] = d[k]
        return output

    if contiguous:
        # SINGLE PASS: START A NEW GROUP EACH TIME THE KEY CHANGES
        try:
            if not data:
                return wrap([])

            agg = DictList()
            acc = DictList()
            curr_key = value2key(keys, data[0])
            for d in data:
                key = value2key(keys, d)
                if key != curr_key:
                    agg.append((get_keys(acc[0]), acc))
                    curr_key = key
                    acc = [d]
                else:
                    acc.append(d)
            agg.append((get_keys(acc[0]), acc))
            return wrap(agg)
        except Exception, e:
            Log.error("Problem grouping contiguous values", e)
    # NOTE(review): no non-contiguous path is visible after this point -- the
    # function appears truncated in this copy of the source; confirm
def insert_new(self, table_name, candidate_key, new_record):
    # INSERT new_record ONLY WHEN NO EXISTING ROW MATCHES candidate_key
    # (INSERT ... SELECT ... LEFT JOIN GUARD); VALUES AND IDENTIFIERS GO
    # THROUGH quote_value/quote_column, WHICH IS WHAT PREVENTS SQL INJECTION
    candidate_key = listwrap(candidate_key)

    # MATCH CONDITION, WITH NULL-SAFE EQUALITY FOR None VALUES
    condition = " AND\n".join([
        self.quote_column(k) + "=" + self.quote_value(new_record[k])
        if new_record[k] != None
        else self.quote_column(k) + " IS Null"
        for k in candidate_key
    ])
    command = "INSERT INTO " + self.quote_column(table_name) + " (" + \
              ",".join([self.quote_column(k) for k in new_record.keys()]) + \
              ")\n" + \
              "SELECT a.* FROM (SELECT " + ",".join([self.quote_value(v) + " " + self.quote_column(k) for k, v in new_record.items()]) + " FROM DUAL) a\n" + \
              "LEFT JOIN " + \
              "(SELECT 'dummy' exist FROM " + self.quote_column(table_name) + " WHERE " + condition + " LIMIT 1) b ON 1=1 WHERE exist IS Null"
    self.execute(command, {})
def error(
    cls,
    template,  # human readable template
    default_params={},  # parameters for template
    cause=None,  # possible cause
    stack_depth=0,
    **more_params
):
    """
    raise an exception with a trace for the cause too

    :param template: *string* human readable string with placeholders for parameters
    :param default_params: *dict* parameters to fill in template
    :param cause: *Exception* for chaining
    :param stack_depth: *int* how many calls you want popped off the stack to report the *true* caller
    :param more_params: *any more parameters (which will overwrite default_params)
    :return: never returns; always raises Except
    """
    # CALLED AS error(template, cause): FIRST POSITIONAL MAY BE THE EXCEPTION
    if default_params and isinstance(listwrap(default_params)[0], BaseException):
        cause = default_params
        default_params = {}

    params = dict(unwrap(default_params), **more_params)
    cause = wrap(unwraplist([Except.wrap(c, stack_depth=1) for c in listwrap(cause)]))
    trace = exceptions.extract_stack(stack_depth + 1)
    # BUGFIX: removed dead code -- `add_to_trace` was hard-coded False, so the
    # `cause[0].trace.extend(trace[1:])` branch could never execute
    e = Except(exceptions.ERROR, template, params, cause, trace)
    raise e
def _normalize_window(window, schema=None):
    # CONVERT A window CLAUSE INTO ITS CANONICAL Dict FORM; VALUES THAT DO
    # NOT PARSE AS JX EXPRESSIONS FALL BACK TO RAW SCRIPTS
    raw_value = window.value
    try:
        expr = jx_expression(raw_value)
    except Exception:
        expr = ScriptOp("script", raw_value)

    return Dict(
        name=coalesce(window.name, window.value),
        value=expr,
        aggregate=window.aggregate,
        edges=[_normalize_edge(e, schema) for e in listwrap(window.edges)],
        where=_normalize_where(window.where, schema=schema),
        sort=_normalize_sort(window.sort),
        range=_normalize_range(window.range)
    )
def _convert_edge(self, edge):
    # REWRITE AN EDGE EXPRESSED OVER A DIMENSION NAME ONTO ITS UNDERLYING
    # FIELD(S); UNKNOWN DIMENSIONS PASS THROUGH UNCHANGED
    dim = self.dimensions[edge.value]
    if not dim:
        return edge

    if len(listwrap(dim.fields)) == 1:
        # TODO: CHECK IF EDGE DOMAIN AND DIMENSION DOMAIN CONFLICT
        # BUGFIX: removed unreachable `new_edge.domain = dim.getDomain()` that
        # followed this return; if the domain should be attached to the
        # single-field edge, do it BEFORE returning -- TODO confirm intent
        new_edge = set_default({"value": unwraplist(dim.fields)}, edge)
        return new_edge

    edge = copy(edge)
    edge.value = None
    edge.domain = dim.getDomain()
    return edge
def _edges_op(self, query):
    """
    Translate an edges (GROUP BY) query into a single SQL statement:
    each edge becomes a derived table of its DISTINCT values, LEFT JOINed
    to the data table, so empty groups still appear in the output.
    """
    selects = []
    for s in listwrap(query.select):
        if s.value == "." and s.aggregate == "count":
            selects.append("COUNT(1) AS " + quote_table(s.name))
        else:
            selects.append(sql_aggs[s.aggregate] + "(" + jx_expression(s.value).to_sql() + ") AS " + quote_table(s.name))
    for w in query.window:
        # NOTE(review): `self` is passed explicitly in addition to the bound call —
        # looks like _window_op is a staticmethod-style helper; confirm
        selects.append(self._window_op(self, query, w))

    agg_prefix = " FROM "
    agg_suffix = "\n"
    agg = ""
    ons = []
    groupby = ""
    groupby_prefix = "\nGROUP BY "
    for i, e in enumerate(query.edges):
        edge_alias = "e" + unicode(i)
        edge_value = e.value.to_sql()
        value = edge_value
        # qualify edge variables with the data-table alias "a" for the ON clause
        for v in e.value.vars():
            value = value.replace(quote_table(v), "a." + quote_table(v))
        edge_name = quote_table(e.name)
        selects.append(edge_alias + "." + edge_name + " AS " + edge_name)
        # one DISTINCT-values derived table per edge; first uses FROM, the rest LEFT JOIN
        agg += \
            agg_prefix + "(" + \
            "SELECT DISTINCT " + edge_value + " AS " + edge_name + " FROM " + quote_table(self.name) + \
            ") " + edge_alias + \
            agg_suffix
        agg_prefix = " LEFT JOIN "
        agg_suffix = " ON 1=1\n"
        ons.append(edge_alias + "." + edge_name + " = " + value)
        groupby += groupby_prefix + edge_alias + "." + edge_name
        groupby_prefix = ",\n"

    # finally join the data table itself, matching each row to its edge values
    agg += agg_prefix + quote_table(self.name) + " a ON " + " AND ".join(ons)
    where = "\nWHERE " + query.where.to_sql()
    return "SELECT " + (",\n".join(selects)) + agg + where + groupby
def fatal(
    cls,
    template,            # human readable template
    default_params={},   # parameters for template (read-only; never mutated here)
    cause=None,          # plausible cause  (fix: was typo "pausible")
    stack_depth=0,       # stack trace offset (==1 if you do not want to report self)
    **more_params        # overrides for default_params
):
    """
    SEND TO STDERR
    """
    # an exception passed in the params slot is really the cause
    if default_params and isinstance(listwrap(default_params)[0], BaseException):
        cause = default_params
        default_params = {}

    params = dict(unwrap(default_params), **more_params)

    # normalize cause to a list of Except
    if cause == None:
        cause = []
    elif isinstance(cause, list):
        pass
    elif isinstance(cause, Except):
        cause = [cause]
    else:
        cause = [Except(ERROR, unicode(cause), trace=extract_tb(stack_depth))]

    trace = extract_stack(1 + stack_depth)
    e = Except(ERROR, template, params, cause, trace)
    # NOTE(review): str_e is rendered but not written out in this block, and
    # cls.error_mode is never restored — confirm against the downstream code
    str_e = unicode(e)

    error_mode = cls.error_mode
    try:
        # error_mode guards against infinite recursion when logging itself fails
        if not error_mode:
            cls.error_mode = True
            Log.note(
                str_e,
                {
                    "error": {
                        "template": template,
                        "params": params,
                        "cause": cause,
                        "trace": trace
                    }
                }
            )
    except Exception:
        # fix: was Py2-only `except Exception, f` with an unused binding;
        # best-effort logging — failures here are deliberately swallowed
        pass
def __str__(self):
    """
    Render this exception: type, expanded template, trace, and causes.
    """
    output = self.type + ": " + self.template + "\n"
    if self.params:
        output = expand_template(output, self.params)

    if self.trace:
        output += indent(format_trace(self.trace))

    if self.cause:
        cause_strings = []
        for c in listwrap(self.cause):
            # best-effort: skip any cause that fails to render
            try:
                cause_strings.append(unicode(c))
            except Exception:
                # fix: was Py2-only `except Exception, e` with unused binding
                pass
        output += "caused by\n\t" + "and caused by\n\t".join(cause_strings)

    # fix: the original fell off the end and returned None, which breaks str();
    # __unicode__ (the sibling method) returns its output
    return output
def __unicode__(self):
    """Render this exception: type, expanded template, trace, and causes."""
    result = self.type + ": " + self.template + "\n"
    if self.params:
        result = expand_template(result, self.params)

    if self.trace:
        result += indent(format_trace(self.trace))

    if self.cause:
        rendered = []
        for c in listwrap(self.cause):
            # best-effort: silently skip any cause that fails to render
            with suppress_exception:
                rendered.append(unicode(c))
        result += "caused by\n\t" + "and caused by\n\t".join(rendered)

    return result
def update(self, command):
    """
    EXPECTING command == {"set":term, "clear":term, "where":where}
    THE set CLAUSE IS A DICT MAPPING NAMES TO VALUES
    THE where CLAUSE IS A JSON EXPRESSION FILTER
    """
    cmd = wrap(command)
    clear_keys = listwrap(cmd["clear"])
    assignments = cmd.set.items()
    matches = jx.get(cmd.where)

    for record in self.data:
        if not matches(record):
            continue
        # clear first, then apply the new values
        for key in clear_keys:
            record[key] = None
        for key, new_value in assignments:
            record[key] = new_value
def is_deep(query):
    """
    Return True when this query must be answered with the "deep" strategy:
    one simple select, multiple edges, and a nested "from" path.
    """
    selects = listwrap(query.select)
    if len(selects) > 1:
        return False
    if aggregates[selects[0].aggregate] not in ("none", "count"):
        return False
    if len(query.edges) <= 1:
        return False
    # LOOKING INTO NESTED WILL REQUIRE A SCRIPT; otherwise BETTER TO USE TERM QUERY
    return len(split_field(query["from"].name)) > 1
def groupby(self, keys, contiguous=False):
    """
    Group rows by `keys`, yielding (group, rows) pairs.

    :param keys: field name(s) to group by
    :param contiguous: True when rows with equal keys are already adjacent,
                       so the sort can be skipped
    :return: generator of (Dict of key values, wrapped list of rows)
    """
    try:
        keys = listwrap(keys)
        get_key = jx_expression_to_function(keys)
        if not contiguous:
            data = sorted(self.data, key=get_key)
        else:
            # fix: `data` was left undefined in the contiguous case,
            # causing a NameError inside _output()
            data = self.data

        def _output():
            for g, v in itertools.groupby(data, get_key):
                group = Dict()
                for k, gg in zip(keys, g):
                    group[k] = gg
                yield (group, wrap(list(v)))

        return _output()
    except Exception as e:
        # fix: was Py2-only `except Exception, e` syntax
        Log.error("Problem grouping", e)
def error(
    cls,
    template,            # human readable template
    default_params={},   # parameters for template
    cause=None,          # pausible cause
    stack_depth=0,       # stack trace offset (==1 if you do not want to report self)
    **more_params
):
    """
    raise an exception with a trace for the cause too
    """
    # a BaseException passed in the params slot is really the cause
    if default_params and isinstance(listwrap(default_params)[0], BaseException):
        cause = default_params
        default_params = {}

    params = dict(unwrap(default_params), **more_params)

    # normalize cause to a list of Except; remember when we must splice
    # our own trace onto a freshly-wrapped raw exception
    extend_cause_trace = False
    if cause == None:
        causes = []
    elif isinstance(cause, list):
        causes = cause
    elif isinstance(cause, Except):
        causes = [cause]
    else:
        extend_cause_trace = True
        if hasattr(cause, "message") and cause.message:
            detail = unicode(cause.message)
        else:
            detail = unicode(cause)
        causes = [Except(ERROR, detail, trace=extract_tb(stack_depth))]

    trace = extract_stack(1 + stack_depth)
    if extend_cause_trace:
        causes[0].trace.extend(trace[1:])
    raise Except(ERROR, template, params, causes, trace)
def post_process(sql):
    """
    Run `sql` and reshape the flattened result columns back into the
    structures the select clause asked for.
    NOTE: `self` and `query` are free variables from the enclosing scope.
    """
    result = self.db.query(sql)
    for s in listwrap(query.select):
        if isinstance(s.value, Mapping):
            # re-assemble "name.key" columns into a dict stored under `name`
            for r in result:
                r[s.name] = {}
                # NOTE(review): iterating the Mapping and unpacking (k, v) assumes
                # the project's dict type yields pairs — confirm; stdlib dicts yield keys only
                for k, v in s.value:
                    r[s.name][k] = r[s.name + "." + k]
                    r[s.name + "." + k] = None
        if isinstance(s.value, list):
            # REWRITE AS TUPLE: collect "name,i" columns into a tuple under `name`
            for r in result:
                r[s.name] = tuple(r[s.name + "," + str(i)] for i, ss in enumerate(s.value))
                for i, ss in enumerate(s.value):
                    r[s.name + "," + str(i)] = None
    expand_json(result)
    return result
def unique_index(data, keys=None, fail_on_dup=True):
    """
    RETURN dict THAT USES KEYS TO INDEX DATA
    ONLY ONE VALUE ALLOWED PER UNIQUE KEY

    :param data: iterable of records to index
    :param keys: field name(s) forming the unique key
    :param fail_on_dup: passed through to UniqueIndex
    :return: the populated UniqueIndex
    """
    o = UniqueIndex(listwrap(keys), fail_on_dup=fail_on_dup)

    for d in data:
        try:
            o.add(d)
        except Exception as e:
            # fix: was Py2-only `except Exception, e`, and contained a stray
            # second `o.add(d)` that re-raised and made this report unreachable
            Log.error(
                "index {{index}} is not unique {{key}} maps to both {{value1}} and {{value2}}",
                index=keys,
                key=select([d], keys)[0],
                value1=o[d],
                value2=d,
                cause=e
            )
    # fix: return the index so callers receive it (the name promises a value)
    return o
def is_setop(es, query):
    """
    Decide whether `query` can be run as a simple ES set operation
    (only on the ES 1.4 - 1.7 series).
    """
    supported_versions = ["1.4.", "1.5.", "1.6.", "1.7."]
    if not any(es.cluster.version.startswith(v) for v in supported_versions):
        return False

    selects = listwrap(query.select)
    if not query.edges:
        # LOOKING INTO NESTED WILL REQUIRE A SCRIPT
        is_deep = len(split_field(query.frum.name)) > 1
        simple_agg = AND([s.aggregate in ("count", "none") for s in selects])

        # NO EDGES IMPLIES SIMPLER QUERIES: EITHER A SET OPERATION,
        # OR RETURN SINGLE AGGREGATE
        if simple_agg or is_deep:
            return True
    else:
        is_smooth = AND(
            (e.domain.type in ALGEBRAIC and e.domain.interval == "none")
            for e in query.edges
        )
        if is_smooth:
            return True

    return False
def _counting_query(c):
    """Build the ES cardinality aggregation that counts distinct values of column `c`."""
    if not c.nested_path:
        return {"cardinality": {"field": c.es_column}}

    # FIRST ONE IS LONGEST
    deepest_path = listwrap(c.nested_path)[0]
    if c.type in _elasticsearch.ES_NUMERIC_TYPES:
        precision = 10
    else:
        precision = 100
    return {
        "nested": {
            "path": deepest_path
        },
        "aggs": {
            "_nested": {
                "cardinality": {
                    "field": c.es_column,
                    "precision_threshold": precision
                }
            }
        }
    }
def _subquery(self, query, isolate=True, stacked=False):
    """
    Translate `query` into (sql, post_processor); `isolate` wraps the SQL
    as a parenthesized derived table aliased "a".
    """
    # a plain table name needs no translation
    if isinstance(query, basestring):
        return self.db.quote_column(query), None
    # IT WOULD BE SAFER TO WRAP TABLE REFERENCES IN A TYPED OBJECT (Cube, MAYBE?)
    if query.name:
        return self.db.quote_column(query.name), None

    if query.edges:
        # RETURN A CUBE
        sql, post = self._grouped(query, stacked)
    elif listwrap(query.select)[0].aggregate != "none":
        sql, post = self._aggop(query)
    else:
        sql, post = self._setop(query)

    if not isolate:
        return sql, post
    return "(\n" + sql + "\n) a\n", post
def es_aggop(es, mvel, query):
    """
    Answer an aggregate-only query with ES statistical facets:
    one facet per distinct select value, results packed into a Cube.
    """
    select = listwrap(query.select)
    FromES = build_es_query(query)

    isSimple = AND(aggregates[s.aggregate] == "count" for s in select)
    if isSimple:
        # SIMPLE, USE TERMS FACET INSTEAD
        # NOTE(review): called with two args here, but the es_countop defined
        # later in this file takes (es, mvel, query) — likely different modules; confirm
        return es_countop(es, query)

    value2facet = dict()   # ONLY ONE FACET NEEDED PER
    name2facet = dict()    # MAP name TO FACET WITH STATS
    for s in select:
        if s.value not in value2facet:
            if is_keyword(s.value):
                unwrap(FromES.facets)[s.name] = {
                    "statistical": {
                        "field": s.value
                    },
                    "facet_filter": simplify_esfilter(query.where)
                }
            else:
                # COMPLICATED value IS PROBABLY A SCRIPT
                unwrap(FromES.facets)[s.name] = {
                    "statistical": {
                        "script": es09.expressions.compile_expression(s.value, query)
                    },
                    "facet_filter": simplify_esfilter(query.where)
                }
            value2facet[s.value] = s.name
        name2facet[s.name] = value2facet[s.value]

    data = es09.util.post(es, FromES, query.limit)

    # pull each select's statistic out of its facet
    matricies = {
        s.name: Matrix(value=fix_es_stats(data.facets[literal_field(s.name)])[aggregates[s.aggregate]])
        for s in select
    }
    cube = Cube(query.select, [], matricies)
    cube.frum = query
    return cube
def query(self, _query):
    """
    Execute `_query` against ES, dispatching to whichever strategy can
    handle it (aggs, field, or set operation). Raises on anything else.
    """
    try:
        if not self.ready:
            Log.error("Must use with clause for any instance of FromES")

        query = Query(_query, schema=self)

        # TODO: restore column/MVEL validation (a disabled, commented-out
        # get_columns/_MVEL block previously lived here)

        for s in listwrap(query.select):
            if not aggregates1_4[s.aggregate]:
                # fix: message referenced self.select[0] instead of the select
                # being validated (s) — compare the newer query() variant
                Log.error("ES can not aggregate " + s.name + " because '" + s.aggregate + "' is not a recognized aggregate")

        frum = query["from"]
        if isinstance(frum, Query):
            # sub-query: run it first, then query its result
            result = self.query(frum)
            q2 = query.copy()
            q2.frum = result
            return qb.run(q2)

        if is_aggsop(self._es, query):
            return es_aggsop(self._es, frum, query)
        if is_fieldop(self._es, query):
            return es_fieldop(self._es, query)
        if is_setop(self._es, query):
            return es_setop(self._es, query)

        Log.error("Can not handle")
    except Exception as e:
        # fix: was Py2-only `except Exception, e` syntax
        e = Except.wrap(e)
        if "Data too large, data for" in e:
            # fielddata blew up; clearing the ES cache may let a retry succeed
            http.post(self._es.cluster.path + "/_cache/clear")
            Log.error("Problem (Tried to clear Elasticsearch cache)", e)
        Log.error("problem", e)
def wrap(cls, e, stack_depth=0):
    """Normalize any error-ish value into an Except (or Null, or a list passed through)."""
    if e == None:
        return Null
    if isinstance(e, (list, Except)):
        return e
    if isinstance(e, Mapping):
        # a dict-like serialized exception: recursively wrap its causes
        e.cause = unwraplist([Except.wrap(c) for c in listwrap(e.cause)])
        return Except(**e)

    # a plain Exception: capture its message and traceback
    if hasattr(e, "message") and e.message:
        description = unicode(e.message)
    else:
        description = unicode(e)
    output = Except(ERROR, description, trace=_extract_traceback(0))
    # +2 = to remove the caller, and it's call to this' Except.wrap()
    output.trace.extend(extract_stack(stack_depth + 2))
    return output
def query(self, _query):
    """
    Execute `_query` against ES, converting through namespaces/typing,
    then dispatching to the first strategy that can handle it.
    """
    try:
        query = QueryOp.wrap(_query, schema=self)

        for n in self.namespaces:
            query = n.convert(query)
        if self.typed:
            query = Typed().convert(query)

        # validate the aggregates before hitting ES
        for s in listwrap(query.select):
            if not aggregates1_4.get(s.aggregate):
                Log.error(
                    "ES can not aggregate {{name}} because {{aggregate|quote}} is not a recognized aggregate",
                    name=s.name,
                    aggregate=s.aggregate
                )

        frum = query["from"]
        if isinstance(frum, QueryOp):
            # sub-query: run it first, then query its result
            result = self.query(frum)
            q2 = query.copy()
            q2.frum = result
            return jx.run(q2)

        if is_deepop(self._es, query):
            return es_deepop(self._es, query)
        if is_aggsop(self._es, query):
            return es_aggsop(self._es, frum, query)
        if is_setop(self._es, query):
            return es_setop(self._es, query)
        if es09_setop.is_setop(query):
            return es09_setop.es_setop(self._es, None, query)
        if es09_aggop.is_aggop(query):
            return es09_aggop.es_aggop(self._es, None, query)
        Log.error("Can not handle")
    except Exception as e:
        # fix: was Py2-only `except Exception, e` syntax
        e = Except.wrap(e)
        if "Data too large, data for" in e:
            # fielddata blew up; clearing the ES cache may let a retry succeed
            http.post(self._es.cluster.path + "/_cache/clear")
            Log.error("Problem (Tried to clear Elasticsearch cache)", e)
        Log.error("problem", e)
def __init__(self, **desc):
    """
    Range domain: either built from explicit `partitions`, or generated
    from `min`, `max`, `interval`.
    """
    Domain.__init__(self, **desc)
    self.type = "range"
    self.NULL = Null

    if self.partitions:
        # IGNORE THE min, max, interval
        if not self.key:
            Log.error("Must have a key value")

        parts = listwrap(self.partitions)
        for i, p in enumerate(parts):
            self.min = Math.min(self.min, p.min)
            self.max = Math.max(self.max, p.max)
            if p.dataIndex != None and p.dataIndex != i:
                Log.error("Expecting `dataIndex` to agree with the order of the parts")
            if p[self.key] == None:
                Log.error(
                    "Expecting all parts to have {{key}} as a property",
                    key=self.key
                )
            p.dataIndex = i

        # VERIFY PARTITIONS DO NOT OVERLAP, HOLES ARE FINE
        for p, q in itertools.product(parts, parts):
            # fix: added the identity guard — without `p is not q`, every
            # non-degenerate partition "overlapped" itself and always errored
            if p is not q and p.min <= q.min and q.min < p.max:
                Log.error("partitions overlap!")
        self.partitions = parts
        return
    elif any([self.min == None, self.max == None, self.interval == None]):
        Log.error("Can not handle missing parameter")

    # generate uniform partitions over [min, max) with the given interval
    self.key = "min"
    self.partitions = wrap([
        {"min": v, "max": v + self.interval, "dataIndex": i}
        for i, v in enumerate(frange(self.min, self.max, self.interval))
    ])
def es_countop(es, mvel, query):
    """
    RETURN SINGLE COUNT
    """
    selects = listwrap(query.select)
    es_query = build_es_query(query)

    for s in selects:
        if is_keyword(s.value):
            # plain field: terms facet, filtered to rows where the field exists
            es_query.facets[s.name] = {
                "terms": {
                    "field": s.value,
                    "size": query.limit,
                },
                "facet_filter": {
                    "exists": {
                        "field": s.value
                    }
                }
            }
        else:
            # COMPLICATED value IS PROBABLY A SCRIPT, USE IT
            es_query.facets[s.name] = {
                "terms": {
                    "script_field": es09.expressions.compile_expression(s.value, query),
                    "size": 200000
                }
            }

    data = es09.util.post(es, es_query, query.limit)

    matricies = {
        s.name: Matrix(value=data.hits.facets[s.name].total)
        for s in selects
    }
    cube = Cube(query.select, query.edges, matricies)
    cube.frum = query
    return cube