def get(url): """ USE json.net CONVENTIONS TO LINK TO INLINE OTHER JSON """ if not _Log: _late_import() if url.find("://") == -1: _Log.error("{{url}} must have a prototcol (eg http://) declared", url=url) base = URL("") if url.startswith("file://") and url[7] != "/": if os.sep=="\\": base = URL("file:///" + os.getcwd().replace(os.sep, "/").rstrip("/") + "/.") else: base = URL("file://" + os.getcwd().rstrip("/") + "/.") elif url[url.find("://") + 3] != "/": _Log.error("{{url}} must be absolute", url=url) phase1 = _replace_ref(wrap({"$ref": url}), base) # BLANK URL ONLY WORKS IF url IS ABSOLUTE try: phase2 = _replace_locals(phase1, [phase1]) return wrap(phase2) except Exception, e: _Log.error("problem replacing locals in\n{{phase1}}", phase1=phase1, cause=e)
def test_empty_dict(self):
    # AN EMPTY INNER OBJECT MUST STILL CARRY THE $object MARKER
    expected = u'{"$object": ".", "match_all": {"$object": "."}}'
    value = wrap({"match_all": wrap({})})
    self.assertEqual(typed_encode(value), expected)
    self.assertEqual(json2typed(pypy_json_encode(value)), expected)
def convert(self, expr):
    """
    EXPAND INSTANCES OF name TO value
    """
    # THE ORDER OF THESE TESTS MATTERS: LITERALS FIRST, THEN VARIABLE NAMES,
    # THEN COMPOUND STRUCTURES
    if expr is True or expr == None or expr is False:
        return expr
    elif Math.is_number(expr):
        return expr
    elif expr == ".":
        return "."
    elif is_keyword(expr):
        # A VALID VARIABLE NAME: MAP THROUGH THE DIMENSION LOOKUP, IF ANY
        return coalesce(self.dimensions[expr], expr)
    elif isinstance(expr, basestring):
        Log.error("{{name|quote}} is not a valid variable name", name=expr)
    elif isinstance(expr, Date):
        return expr
    elif isinstance(expr, QueryOp):
        return self._convert_query(expr)
    elif isinstance(expr, Mapping):
        if expr["from"]:
            # AN INNER QUERY
            return self._convert_query(expr)
        elif len(expr) >= 2:
            # ASSUME WE HAVE A NAMED STRUCTURE, NOT AN EXPRESSION
            return wrap({name: self.convert(value) for name, value in expr.leaves()})
        else:
            # ASSUME SINGLE-CLAUSE EXPRESSION
            k, v = expr.items()[0]
            return converter_map.get(k, self._convert_bop)(self, k, v)
    elif isinstance(expr, (list, set, tuple)):
        # CONVERT EACH ELEMENT
        return wrap([self.convert(value) for value in expr])
    else:
        return expr
def append_query(self, es_query, start):
    # BUILD A terms AGGREGATION (PLUS OPTIONAL _missing BUCKET) FOR THIS EDGE,
    # RESTRICTED TO THE PARTITION VALUES IN THE DOMAIN
    self.start = start
    domain = self.domain
    field = self.edge.value

    if isinstance(field, Variable):
        key = domain.key
        if isinstance(key, (tuple, list)) and len(key) == 1:
            # SINGLE-ELEMENT KEY LIST IS TREATED AS A SCALAR KEY
            key = key[0]
        include = [p[key] for p in domain.partitions]
        if self.edge.allowNulls:
            return wrap({"aggs": {
                "_match": set_default({"terms": {
                    "field": field.var,
                    "size": self.limit,
                    "include": include
                }}, es_query),
                # _missing CATCHES DOCS WHERE THE FIELD IS ABSENT, OR HAS A
                # VALUE OUTSIDE THE include LIST
                "_missing": set_default(
                    {"filter": {"or": [
                        field.missing().to_esfilter(),
                        {"not": {"terms": {field.var: include}}}
                    ]}},
                    es_query
                ),
            }})
        else:
            return wrap({"aggs": {
                "_match": set_default({"terms": {
                    "field": field.var,
                    "size": self.limit,
                    "include": include
                }}, es_query)
            }})
    else:
        # SCRIPTED EDGE: SAME STRUCTURE, BUT USING A script_field CALCULATION
        include = [p[domain.key] for p in domain.partitions]
        if self.edge.allowNulls:
            return wrap({"aggs": {
                "_match": set_default({"terms": {
                    "script_field": field.to_ruby(),
                    "size": self.limit,
                    "include": include
                }}, es_query),
                "_missing": set_default(
                    {"filter": {"or": [
                        field.missing().to_esfilter(),
                        NotOp("not", InOp("in", [field, Literal("literal", include)])).to_esfilter()
                    ]}},
                    es_query
                ),
            }})
        else:
            return wrap({"aggs": {
                "_match": set_default({"terms": {
                    "script_field": field.to_ruby(),
                    "size": self.limit,
                    "include": include
                }}, es_query)
            }})
def __init__(self, value): if not _convert: _late_import() try: self.scheme = None self.host = None self.port = None self.path = "" self.query = "" self.fragment = "" if value == None: return if value.startswith("file://") or value.startswith("//"): # urlparse DOES NOT WORK IN THESE CASES scheme, suffix = value.split("//") self.scheme = scheme.rstrip(":") parse(self, suffix, 0, 1) self.query = wrap(_convert.url_param2value(self.query)) else: output = urlparse(value) self.scheme = output.scheme self.port = output.port self.host = output.netloc.split(":")[0] self.path = output.path self.query = wrap(_convert.url_param2value(output.query)) self.fragment = output.fragment except Exception, e: _Log.error("problem parsing {{value}} to URL", value=value, cause=e)
def append_query(self, es_query, start):
    # BUILD A terms AGGREGATION FOR THIS EDGE; SCRIPTED EDGES GET A
    # script_field CALCULATION, PLAIN VARIABLES USE THE FIELD DIRECTLY
    self.start = start
    if not isinstance(self.edge.value, Variable):
        # SCRIPTED EDGE
        script_field = self.edge.value.to_ruby()
        missing = self.edge.value.missing()

        output = wrap({"aggs": {
            "_match": set_default(
                {"terms": {
                    "script_field": script_field,
                    "size": self.domain.limit
                }},
                es_query
            ),
            # ONLY EMIT A _missing BUCKET WHEN THE EXPRESSION CAN BE MISSING
            "_missing": set_default({"filter": missing.to_esfilter()}, es_query) if missing else None
        }})
        return output

    output = wrap({"aggs": {
        "_match": set_default(
            {"terms": {
                "field": self.edge.value.var,
                "size": self.domain.limit
            }},
            es_query
        ),
        "_missing": set_default({"missing": {"field": self.edge.value}}, es_query)  # TODO: Use Expression.missing().esfilter() TO GET OPTIMIZED FILTER
    }})
    return output
def _select_a_field(field):
    # NORMALIZE A select CLAUSE INTO {"name", "value"} FORM,
    # WITH value EXPRESSED AS A FIELD-PATH LIST WHERE POSSIBLE
    if isinstance(field, basestring):
        return wrap({"name": field, "value": split_field(field)})
    elif isinstance(wrap(field).value, basestring):
        clause = wrap(field)
        return wrap({"name": clause.name, "value": split_field(clause.value)})
    else:
        return wrap({"name": field.name, "value": field.value})
def search(self, query):
    # SIMULATE AN ELASTICSEARCH filtered SEARCH OVER THE IN-MEMORY self.data
    query = wrap(query)
    f = jx.get(query.query.filtered.filter)  # COMPILE THE FILTER TO A PREDICATE
    filtered = wrap([{"_id": i, "_source": d} for i, d in self.data.items() if f(d)])

    if query.fields:
        # RETURN ONLY THE REQUESTED FIELDS, ES "fields"-STYLE
        return wrap({"hits": {"total": len(filtered), "hits": [{"_id": d._id, "fields": unwrap(jx.select([unwrap(d._source)], query.fields)[0])} for d in filtered]}})
    else:
        return wrap({"hits": {"total": len(filtered), "hits": filtered}})
def iter(data, depth):
    # WALK A NESTED dict-OF-dicts: AT depth==0 data IS A SEQUENCE OF LEAVES,
    # OTHERWISE DESCEND INTO EACH VALUE
    # (NOTE: the name shadows the builtin `iter`; kept for caller compatibility)
    if depth == 0:
        for leaf in data:
            yield wrap(leaf)
    else:
        for child in data.values():
            for leaf in iter(child, depth - 1):
                yield wrap(leaf)
def list2tab(rows):
    # RENDER A LIST OF RECORDS AS TAB-DELIMITED TEXT, HEADER LINE FIRST.
    # COLUMNS ARE THE UNION OF ALL LEAF NAMES SEEN IN ANY ROW.
    all_columns = set()
    for row in wrap(rows):
        all_columns |= set(k for k, v in row.leaves())
    keys = list(all_columns)

    lines = []
    for row in wrap(rows):
        lines.append("\t".join(value2json(row[k]) for k in keys))

    return "\t".join(keys) + "\n" + "\n".join(lines)
def __init__(self, select, edges, data, frum=None):
    """
    data IS EXPECTED TO BE A dict TO MATRICES, BUT OTHER COLLECTIONS ARE
    ALLOWED, USING THE select AND edges TO DESCRIBE THE data
    """
    self.is_value = False if isinstance(select, list) else True
    self.select = select
    self.meta = Data(format="cube")  # PUT EXTRA MARKUP HERE
    self.is_none = False

    # FIX: ORIGINAL ASSIGNED A LOCAL `is_none` THAT WAS NEVER READ;
    # THE FLAG MUST BE SET ON self
    if not all(data.values()):
        self.is_none = True

    # ENSURE frum IS PROPER FORM
    if isinstance(select, list):
        if edges and OR(not isinstance(v, Matrix) for v in data.values()):
            Log.error("Expecting data to be a dict with Matrix values")

    if not edges:
        if not data:
            if isinstance(select, list):
                Log.error("not expecting a list of records")
            data = {select.name: Matrix.ZERO}
            self.edges = FlatList.EMPTY
        elif isinstance(data, Mapping):
            # EXPECTING NO MORE THAN ONE rownum EDGE IN THE DATA
            length = MAX([len(v) for v in data.values()])
            if length >= 1:
                self.edges = wrap([{"name": "rownum", "domain": {"type": "rownum"}}])
            else:
                self.edges = FlatList.EMPTY
        elif isinstance(data, list):
            if isinstance(select, list):
                Log.error("not expecting a list of records")
            data = {select.name: Matrix.wrap(data)}
            self.edges = wrap([{"name": "rownum", "domain": {"type": "rownum", "min": 0, "max": len(data), "interval": 1}}])
        elif isinstance(data, Matrix):
            if isinstance(select, list):
                Log.error("not expecting a list of records")
            data = {select.name: data}
        else:
            if isinstance(select, list):
                Log.error("not expecting a list of records")
            data = {select.name: Matrix(value=data)}
            self.edges = FlatList.EMPTY
    else:
        self.edges = wrap(edges)

    self.data = data
def __getitem__(self, item):
    # TODO: SOLVE FUNDAMENTAL QUESTION OF IF SELECTING A PART OF AN
    # EDGE REMOVES THAT EDGE FROM THIS RESULT, OR ADDS THE PART
    # AS A select {"name":edge.name, "value":edge.domain.partitions[coord]}
    # PROBABLY NOT, THE value IS IDENTICAL OVER THE REMAINING
    if isinstance(item, Mapping):
        # DICT ACCESS: PIN EACH NAMED EDGE TO THE PARTITION MATCHING ITS VALUE
        coordinates = [None] * len(self.edges)

        # MAP DICT TO NUMERIC INDICES
        for name, v in item.items():
            ei, parts = wrap([(i, e.domain.partitions) for i, e in enumerate(self.edges) if e.name == name])[0]
            if not parts:
                Log.error("Can not find {{name}}=={{value|quote}} in list of edges, maybe this feature is not implemented yet", name=name, value=v)
            part = wrap([p for p in parts if p.value == v])[0]
            if not part:
                return Null
            else:
                coordinates[ei] = part.dataIndex

        # KEEP ONLY THE EDGES NOT PINNED BY item
        edges = [e for e, v in zip(self.edges, coordinates) if v is None]
        if not edges:
            # ZERO DIMENSIONAL VALUE
            return wrap({k: v.__getitem__(coordinates) for k, v in self.data.items()})
        else:
            output = Cube(
                select=self.select,
                edges=wrap([e for e, v in zip(self.edges, coordinates) if v is None]),
                data={k: Matrix(values=c.__getitem__(coordinates)) for k, c in self.data.items()}
            )
            return output
    elif isinstance(item, basestring):
        # RETURN A VALUE CUBE
        if self.is_value:
            if item != self.select.name:
                Log.error("{{name}} not found in cube", name=item)
            return self

        if item not in self.select.name:
            Log.error("{{name}} not found in cube", name=item)

        output = Cube(
            select=[s for s in self.select if s.name == item][0],
            edges=self.edges,
            data={item: self.data[item]}
        )
        return output
    else:
        Log.error("not implemented yet")
def __getitem__(self, key):
    # LOOK UP ROW(S) BY KEY; A FULL KEY RETURNS ONE ROW, A PARTIAL KEY
    # RETURNS ALL MATCHING ROWS
    try:
        _key = value2key(self._keys, key)
        if len(self._keys) == 1 or len(_key) == len(self._keys):
            # FULL KEY: DIRECT DICT LOOKUP
            d = self._data.get(_key)
            return wrap(d)
        else:
            # PARTIAL KEY: LINEAR SCAN FOR ROWS MATCHING EVERY GIVEN COLUMN
            output = wrap([
                d
                for d in self._data.values()
                if all(wrap(d)[k] == v for k, v in _key.items())
            ])
            return output
    except Exception, e:
        Log.error("something went wrong", e)
def get_schema(self, retry=True):
    # RETURN THE MAPPING FOR self.settings.type, EITHER FROM CLUSTER
    # METADATA (WHEN ALLOWED) OR BY ASKING THE INDEX DIRECTLY
    if self.settings.explore_metadata:
        metadata = self.cluster.get_metadata()
        index = metadata.indices[self.settings.index]

        if index == None and retry:
            # TRY AGAIN, JUST IN CASE — FORCE A METADATA REFRESH FIRST
            self.cluster.cluster_state = None
            return self.get_schema(retry=False)

        if not index.mappings[self.settings.type]:
            Log.error(
                "ElasticSearch index {{index|quote}} does not have type {{type|quote}} in {{metadata|json}}",
                index=self.settings.index,
                type=self.settings.type,
                metadata=metadata
            )
        return index.mappings[self.settings.type]
    else:
        # METADATA EXPLORATION DISABLED: HIT THE _mapping ENDPOINT
        mapping = self.cluster.get(self.path + "/_mapping")
        if not mapping[self.settings.type]:
            Log.error(
                "ElasticSearch index {{index|quote}} does not have type {{type|quote}}",
                index=self.settings.index,
                type=self.settings.type
            )
        return wrap({"mappings": mapping[self.settings.type]})
def getDomain(self, **kwargs):
    # kwargs.depth IS MEANT TO REACH INTO SUB-PARTITIONS
    kwargs = wrap(kwargs)
    kwargs.depth = coalesce(kwargs.depth, len(self.fields) - 1 if isinstance(self.fields, list) else None)

    if not self.partitions and self.edges:
        # USE EACH EDGE AS A PARTITION, BUT isFacet==True SO IT ALLOWS THE OVERLAP
        partitions = [
            {
                "name": v.name,
                "value": v.name,
                "where": v.where,
                "style": v.style,
                "weight": v.weight  # YO! WHAT DO WE *NOT* COPY?
            }
            for i, v in enumerate(self.edges)
            if i < coalesce(self.limit, DEFAULT_QUERY_LIMIT) and v.where
        ]
        self.isFacet = True
    elif kwargs.depth == None:
        # ASSUME self.fields IS A dict
        partitions = FlatList()
        for i, part in enumerate(self.partitions):
            if i >= coalesce(self.limit, DEFAULT_QUERY_LIMIT):
                break
            partitions.append({
                "name": part.name,
                "value": part.value,
                "where": part.where,
                "style": coalesce(part.style, part.parent.style),
                "weight": part.weight  # YO! WHAT DO WE *NOT* COPY?
            })
    elif kwargs.depth == 0:
        # TOP-LEVEL PARTITIONS ONLY
        partitions = [
            {
                "name": v.name,
                "value": v.value,
                "where": v.where,
                "style": v.style,
                "weight": v.weight  # YO! WHAT DO WE *NOT* COPY?
            }
            for i, v in enumerate(self.partitions)
            if i < coalesce(self.limit, DEFAULT_QUERY_LIMIT)
        ]
    elif kwargs.depth == 1:
        # FLATTEN ONE LEVEL OF SUB-PARTITIONS, PREFIXING THE PARENT NAME
        partitions = FlatList()
        rownum = 0
        for i, part in enumerate(self.partitions):
            if i >= coalesce(self.limit, DEFAULT_QUERY_LIMIT):
                continue
            rownum += 1
            try:
                for j, subpart in enumerate(part.partitions):
                    partitions.append({
                        "name": join_field(split_field(subpart.parent.name) + [subpart.name]),
                        "value": subpart.value,
                        "where": subpart.where,
                        "style": coalesce(subpart.style, subpart.parent.style),
                        "weight": subpart.weight  # YO! WHAT DO WE *NOT* COPY?
                    })
            except Exception, e:
                Log.error("", e)
    # NOTE(review): no return statement is visible in this chunk — the method
    # appears truncated here; the remainder (building a domain from
    # `partitions`) is presumably outside this view
def _convert_edge(self, edge):
    # NORMALIZE AN EDGE CLAUSE TO Data(name, value, domain, ...) FORM
    if isinstance(edge, basestring):
        # BARE STRING: NAME AND VALUE ARE THE SAME
        return Data(
            name=edge,
            value=edge,
            domain=self._convert_domain()
        )
    else:
        edge = wrap(edge)
        if not edge.name and not isinstance(edge.value, basestring):
            Log.error("You must name compound edges: {{edge}}", edge=edge)

        if isinstance(edge.value, (Mapping, list)) and not edge.domain:
            # COMPLEX EDGE IS SHORT HAND
            domain = self._convert_domain()
            domain.dimension = Data(fields=edge.value)

            return Data(
                name=edge.name,
                allowNulls=False if edge.allowNulls is False else True,
                domain=domain
            )

        domain = self._convert_domain(edge.domain)
        return Data(
            name=coalesce(edge.name, edge.value),
            value=edge.value,
            range=edge.range,
            allowNulls=False if edge.allowNulls is False else True,
            domain=domain
        )
def test_complex_object(self):
    # EVERY PRIMITIVE PROPERTY IS WRAPPED IN A $value MARKER
    expected = u'{"$object": ".", "s": {"$value": 0}, "r": {"$value": 5}}'
    value = wrap({"s": 0, "r": 5})
    self.assertEqual(typed_encode(value), expected)
    self.assertEqual(json2typed(pypy_json_encode(value)), expected)
def append_query(self, es_query, start):
    # EACH PARTITION BECOMES A filters BUCKET; notty ACCUMULATES THE NEGATION
    # OF ALL PRIOR PARTITIONS SO THE BUCKETS ARE MUTUALLY EXCLUSIVE
    self.start = start
    parts = self.edge.domain.partitions
    filters = []
    notty = []

    for p in parts:
        filters.append(AndOp("and", [p.where] + notty).to_esfilter())
        notty.append(NotOp("not", p.where))

    missing_filter = None
    if self.edge.allowNulls:    # TODO: Use Expression.missing().esfilter() TO GET OPTIMIZED FILTER
        # _missing = NOT IN ANY PARTITION
        missing_filter = set_default(
            {"filter": AndOp("and", notty).to_esfilter()},
            es_query
        )

    return wrap({"aggs": {
        "_match": set_default(
            {"filters": {"filters": filters}},
            es_query
        ),
        "_missing": missing_filter
    }})
def _insert_loop(self, please_stop=None):
    # BACKGROUND THREAD: DRAIN self.queue INTO ES IN BATCHES UNTIL STOPPED
    bad_count = 0
    while not please_stop:
        try:
            Till(seconds=1).wait()  # COLLECT UP TO ONE SECOND OF MESSAGES
            messages = wrap(self.queue.pop_all())
            if not messages:
                continue

            for g, mm in jx.groupby(messages, size=self.batch_size):
                scrubbed = []
                try:
                    for i, message in enumerate(mm):
                        if message is Thread.STOP:
                            please_stop.go()
                            return
                        # LIMIT DEPTH TO KEEP DOCUMENTS SMALL
                        scrubbed.append(_deep_json_to_string(message, depth=3))
                finally:
                    # INSERT WHAT WE HAVE, EVEN IF STOPPED MID-BATCH
                    self.es.extend(scrubbed)
                bad_count = 0
        except Exception, e:
            Log.warning("Problem inserting logs into ES", cause=e)
            bad_count += 1
            if bad_count > MAX_BAD_COUNT:
                Log.warning("Given up trying to write debug logs to ES index {{index}}", index=self.es.settings.index)
            Till(seconds=30).wait()  # BACK OFF AFTER A FAILURE
def __init__(self, **desc):
    """
    RANGE DOMAIN: EITHER EXPLICIT partitions (min/max VERIFIED, NO OVERLAP),
    OR GENERATED FROM min, max, interval
    """
    Domain.__init__(self, **desc)
    self.type = "range"
    self.NULL = Null

    if self.partitions:
        # IGNORE THE min, max, interval
        if not self.key:
            Log.error("Must have a key value")

        parts = listwrap(self.partitions)
        for i, p in enumerate(parts):
            self.min = Math.min(self.min, p.min)
            self.max = Math.max(self.max, p.max)
            if p.dataIndex != None and p.dataIndex != i:
                Log.error("Expecting `dataIndex` to agree with the order of the parts")
            if p[self.key] == None:
                Log.error("Expecting all parts to have {{key}} as a property", key=self.key)
            p.dataIndex = i

        # VERIFY PARTITIONS DO NOT OVERLAP, HOLES ARE FINE
        for p, q in itertools.product(parts, parts):
            if p is q:
                # FIX: product() PAIRS EVERY PARTITION WITH ITSELF, AND A
                # SELF-PAIR ALWAYS SATISFIES THE OVERLAP TEST BELOW
                # (p.min <= p.min and p.min < p.max), SO SKIP SELF-PAIRS
                continue
            if p.min <= q.min and q.min < p.max:
                Log.error("partitions overlap!")

        self.partitions = parts
        return
    elif any([self.min == None, self.max == None, self.interval == None]):
        Log.error("Can not handle missing parameter")

    self.key = "min"
    self.partitions = wrap([
        {"min": v, "max": v + self.interval, "dataIndex": i}
        for i, v in enumerate(frange(self.min, self.max, self.interval))
    ])
def __init__(self, **desc):
    # GENERIC DOMAIN CONSTRUCTOR: NULL-OUT DECLARED SLOTS, THEN COPY desc IN
    desc = wrap(desc)
    self._set_slots_to_null(self.__class__)
    set_default(self, desc)
    # FALL BACK TO THE TYPE AS THE NAME
    self.name = coalesce(desc.name, desc.type)
    self.isFacet = coalesce(desc.isFacet, False)
    self.dimension = Null
def list2cube(rows, column_names=None):
    # PIVOT A LIST OF RECORDS INTO COLUMN-ORIENTED "cube" FORMAT
    # WITH A SINGLE rownum EDGE
    if column_names:
        keys = column_names
    else:
        seen = set()
        for row in rows:
            seen |= set(row.keys())
        keys = list(seen)

    data = {k: [] for k in keys}
    for row in rows:
        for k in keys:
            data[k].append(unwraplist(row[k]))

    return wrap({
        "meta": {"format": "cube"},
        "edges": [
            {
                "name": "rownum",
                "domain": {"type": "rownum", "min": 0, "max": len(rows), "interval": 1}
            }
        ],
        "data": data
    })
def error(
    cls,
    template,  # human readable template
    default_params={},  # parameters for template
    cause=None,  # pausible cause
    stack_depth=0,
    **more_params
):
    """
    raise an exception with a trace for the cause too

    :param template: *string* human readable string with placeholders for parameters
    :param default_params: *dict* parameters to fill in template
    :param cause: *Exception* for chaining
    :param stack_depth: *int* how many calls you want popped off the stack to report the *true* caller
    :param log_context: *dict* extra key:value pairs for your convenience
    :param more_params: *any more parameters (which will overwrite default_params)
    :return:
    """
    # ALLOW Log.error("msg", e) SHORTHAND: AN EXCEPTION PASSED POSITIONALLY
    # AS default_params IS TREATED AS THE cause
    if default_params and isinstance(listwrap(default_params)[0], BaseException):
        cause = default_params
        default_params = {}

    params = dict(unwrap(default_params), **more_params)

    # NOTE(review): add_to_trace is hard-coded False, so the
    # cause[0].trace.extend branch below is dead code — presumably a
    # deliberately disabled feature; confirm before removing
    add_to_trace = False
    cause = wrap(unwraplist([Except.wrap(c, stack_depth=1) for c in listwrap(cause)]))
    trace = exceptions.extract_stack(stack_depth + 1)

    if add_to_trace:
        cause[0].trace.extend(trace[1:])

    e = Except(exceptions.ERROR, template, params, cause, trace)
    raise e
def _normalize_groupby(groupby, schema=None):
    # None PASSES THROUGH; OTHERWISE NORMALIZE EVERY GROUPBY CLAUSE
    if groupby == None:
        return None

    normalized = wrap([_normalize_group(g, schema=schema) for g in listwrap(groupby)])
    if any(g == None for g in normalized):
        Log.error("not expected")
    return normalized
def json2value(json_string, params={}, flexible=False, leaves=False):
    """
    :param json_string: THE JSON
    :param params: STANDARD JSON PARAMS
    :param flexible: REMOVE COMMENTS
    :param leaves: ASSUME JSON KEYS ARE DOT-DELIMITED
    :return: Python value
    """
    if isinstance(json_string, str):
        # (PYTHON 2) ONLY unicode IS ACCEPTED; BYTES MUST BE DECODED FIRST
        Log.error("only unicode json accepted")

    try:
        if flexible:
            # REMOVE """COMMENTS""", # COMMENTS, //COMMENTS, AND \n \r
            # DERIVED FROM https://github.com/jeads/datasource/blob/master/datasource/bases/BaseHub.py# L58
            # NOTE(review): with MULTILINE (not DOTALL) `.` does not cross
            # newlines, so """comments""" spanning lines are not removed —
            # confirm this is intended
            json_string = re.sub(r"\"\"\".*?\"\"\"", r"\n", json_string, flags=re.MULTILINE)
            json_string = "\n".join(remove_line_comment(l) for l in json_string.split("\n"))
            # ALLOW DICTIONARY'S NAME:VALUE LIST TO END WITH COMMA
            json_string = re.sub(r",\s*\}", r"}", json_string)
            # ALLOW LISTS TO END WITH COMMA
            json_string = re.sub(r",\s*\]", r"]", json_string)

        if params:
            # LOOKUP REFERENCES
            json_string = expand_template(json_string, params)

        try:
            value = wrap(json_decoder(unicode(json_string)))
        except Exception, e:
            Log.error("can not decode\n{{content}}", content=json_string, cause=e)

        if leaves:
            # EXPAND DOT-DELIMITED KEYS INTO NESTED STRUCTURE
            value = wrap_leaves(value)

        return value
        # NOTE(review): the outer `try` has no matching `except` in this
        # chunk — the function appears truncated here
def forall(self, sql, param=None, _execute=None):
    # EXECUTE sql AND CALL _execute(row) FOR EACH RESULT ROW (AS wrapped dict)
    assert _execute
    num = 0

    self._execute_backlog()
    try:
        old_cursor = self.cursor
        if not old_cursor:  # ALLOW NON-TRANSACTIONAL READS
            self.cursor = self.db.cursor()

        if param:
            sql = expand_template(sql, self.quote_param(param))
        sql = self.preamble + outdent(sql)
        if self.debug:
            Log.note("Execute SQL:\n{{sql}}", sql=indent(sql))
        self.cursor.execute(sql)

        # DECODE COLUMN NAMES AND CELL VALUES FROM UTF-8
        columns = tuple([utf8_to_unicode(d[0]) for d in self.cursor.description])
        for r in self.cursor:
            num += 1
            _execute(wrap(dict(zip(columns, [utf8_to_unicode(c) for c in r]))))

        if not old_cursor:  # CLEANUP AFTER NON-TRANSACTIONAL READS
            self.cursor.close()
            self.cursor = None
    except Exception, e:
        Log.error("Problem executing SQL:\n{{sql|indent}}", sql=sql, cause=e, stack_depth=1)
    # NOTE(review): `num` is counted but never returned in this chunk —
    # possibly truncated; confirm against the full source
def _range_composer(edge, domain, es_query, to_float):
    # USE RANGES
    _min = coalesce(domain.min, MAX(domain.partitions.min))
    _max = coalesce(domain.max, MAX(domain.partitions.max))

    # PLAIN VARIABLE USES THE FIELD; ANYTHING ELSE IS A SCRIPT CALCULATION
    if isinstance(edge.value, Variable):
        calc = {"field": edge.value.var}
    else:
        calc = {"script_field": edge.value.to_ruby()}

    if edge.allowNulls:  # TODO: Use Expression.missing().esfilter() TO GET OPTIMIZED FILTER
        # _missing CATCHES VALUES OUTSIDE [_min, _max) PLUS TRULY MISSING VALUES
        missing_filter = set_default(
            {"filter": {"or": [
                OrOp("or", [
                    InequalityOp("lt", [edge.value, Literal(None, to_float(_min))]),
                    InequalityOp("gte", [edge.value, Literal(None, to_float(_max))]),
                ]).to_esfilter(),
                edge.value.missing().to_esfilter()
            ]}},
            es_query
        )
    else:
        missing_filter = None

    return wrap({"aggs": {
        "_match": set_default(
            {"range": calc},
            {"range": {"ranges": [{"from": to_float(p.min), "to": to_float(p.max)} for p in domain.partitions]}},
            es_query
        ),
        "_missing": missing_filter
    }})
def scrub(r):
    """
    REMOVE KEYS OF DEGENERATE VALUES (EMPTY STRINGS, EMPTY LISTS, AND NULLS)
    CONVERT STRINGS OF NUMBERS TO NUMBERS
    RETURNS **COPY**, DOES NOT CHANGE ORIGINAL
    """
    # _scrub DOES THE RECURSIVE WORK; wrap() RETURNS THE PROJECT'S Data FORM
    return wrap(_scrub(r))
def __init__(self, **desc):
    # TIME DOMAIN: PARTITIONS GENERATED FROM min, max, interval
    Domain.__init__(self, **desc)
    self.type = "time"
    self.NULL = Null
    self.min = Date(self.min)
    self.max = Date(self.max)
    self.interval = Duration(self.interval)

    if self.partitions:
        # IGNORE THE min, max, interval
        if not self.key:
            Log.error("Must have a key value")
        # NOTE(review): this raises unconditionally — explicit partitions are
        # not supported yet
        Log.error("not implemented yet")

        # VERIFY PARTITIONS DO NOT OVERLAP
        return
    elif not all([self.min, self.max, self.interval]):
        Log.error("Can not handle missing parameter")

    self.key = "min"
    self.partitions = wrap([
        {"min": v, "max": v + self.interval, "dataIndex": i}
        for i, v in enumerate(Date.range(self.min, self.max, self.interval))
    ])
def test_empty_list2(self):
    # AN EMPTY LIST IS EMITTED AS-IS; PRIMITIVES GET THE $value MARKER
    expected = u'{"$object": ".", "a": [], "b": {"$value": 1}}'
    value = wrap({"a": [], "b": 1})
    self.assertEqual(typed_encode(value), expected)
    self.assertEqual(json2typed(pypy_json_encode(value)), expected)