def __getitem__(self, key):
    if key == None:
        return Null
    if key == ".":
        output = self._internal_dict
        if isinstance(output, Mapping):
            return self
        else:
            return output

    key = text_type(key)
    d = self._internal_dict

    if key.find(".") >= 0:
        seq = _split_field(key)
        for n in seq:
            if isinstance(d, NullType):
                d = NullType(d, n)  # OH DEAR, Null TREATS n AS PATH, NOT LITERAL
            elif isinstance(d, list):
                d = [_getdefault(dd, n) for dd in d]
            else:
                d = _getdefault(d, n)  # EVERYTHING ELSE TREATS n AS LITERAL
        return wrap(d)
    else:
        o = d.get(key)
        if o == None:
            return NullType(d, key)
        return wrap(o)
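# A minimal usage sketch of the dotted-path lookup above, assuming the mo_dots
# package is importable; the data values are illustrative only.
from mo_dots import wrap

_d = wrap({"a": {"b": 1}, "c": [{"x": 1}, {"x": 2}]})
assert _d["a.b"] == 1             # A DOTTED KEY WALKS THE PATH
assert list(_d["c.x"]) == [1, 2]  # LISTS ARE MAPPED OVER, ELEMENT BY ELEMENT
assert _d["a.z"] == None          # MISSING PATHS RETURN A NullType, WHICH COMPARES EQUAL TO None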
def convert(self, expr):
    """
    EXPAND INSTANCES OF name TO value
    """
    if expr is True or expr == None or expr is False:
        return expr
    elif Math.is_number(expr):
        return expr
    elif expr == ".":
        return "."
    elif is_variable_name(expr):
        return coalesce(self.dimensions[expr], expr)
    elif isinstance(expr, text_type):
        Log.error("{{name|quote}} is not a valid variable name", name=expr)
    elif isinstance(expr, Date):
        return expr
    elif isinstance(expr, QueryOp):
        return self._convert_query(expr)
    elif isinstance(expr, Mapping):
        if expr["from"]:
            return self._convert_query(expr)
        elif len(expr) >= 2:
            # ASSUME WE HAVE A NAMED STRUCTURE, NOT AN EXPRESSION
            return wrap({name: self.convert(value) for name, value in expr.leaves()})
        else:
            # ASSUME SINGLE-CLAUSE EXPRESSION
            k, v = expr.items()[0]
            return converter_map.get(k, self._convert_bop)(self, k, v)
    elif isinstance(expr, (list, set, tuple)):
        return wrap([self.convert(value) for value in expr])
    else:
        return expr
def query(self, sql, param=None, stream=False, row_tuples=False):
    """
    RETURN A LIST OF dicts
    """
    if not self.cursor:  # NO CURSOR MEANS NO TRANSACTION; NON-TRANSACTIONAL READS ARE NOT ALLOWED
        Log.error("must perform all queries inside a transaction")
    self._execute_backlog()

    try:
        if param:
            sql = expand_template(sql, quote_param(param))
        sql = self.preamble + outdent(sql)
        self.debug and Log.note("Execute SQL:\n{{sql}}", sql=indent(sql))

        self.cursor.execute(sql)
        if row_tuples:
            if stream:
                result = self.cursor
            else:
                result = wrap(list(self.cursor))
        else:
            columns = [utf8_to_unicode(d[0]) for d in coalesce(self.cursor.description, [])]
            if stream:
                result = (wrap({c: utf8_to_unicode(v) for c, v in zip(columns, row)}) for row in self.cursor)
            else:
                result = wrap([{c: utf8_to_unicode(v) for c, v in zip(columns, row)} for row in self.cursor])
        return result
    except Exception as e:
        e = Except.wrap(e)
        if "InterfaceError" in e:
            Log.error("Did you close the db connection?", e)
        Log.error("Problem executing SQL:\n{{sql|indent}}", sql=sql, cause=e, stack_depth=1)
def __init__(self, value):
    try:
        self.scheme = None
        self.host = None
        self.port = None
        self.path = ""
        self.query = ""
        self.fragment = ""

        if value == None:
            return

        if value.startswith("file://") or value.startswith("//"):
            # urlparse DOES NOT WORK IN THESE CASES
            scheme, suffix = value.split("//", 1)  # SPLIT ON FIRST "//" ONLY; maxsplit=2 WOULD FAIL ON A SECOND "//" IN THE SUFFIX
            self.scheme = scheme.rstrip(":")
            parse(self, suffix, 0, 1)
            self.query = wrap(url_param2value(self.query))
        else:
            output = urlparse(value)
            self.scheme = output.scheme
            self.port = output.port
            self.host = output.netloc.split(":")[0]
            self.path = output.path
            self.query = wrap(url_param2value(output.query))
            self.fragment = output.fragment
    except Exception as e:
        Log.error("problem parsing {{value}} to URL", value=value, cause=e)
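# A sketch of what the constructor above is expected to produce; the import
# path is an assumption (this URL class has lived in mo_files.url in some
# versions of these libraries), and the example URL is illustrative only.
from mo_files.url import URL

_u = URL("http://example.com:8080/path/to?x=1#frag")
assert _u.scheme == "http"
assert _u.host == "example.com"
assert _u.port == 8080
assert _u.path == "/path/to"
assert _u.query.x == "1"   # query IS wrapped, SO DOT ACCESS WORKS
assert _u.fragment == "frag"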
def __init__(self, **desc):
    Domain.__init__(self, **desc)
    self.type = "range"
    self.NULL = Null

    if self.partitions:
        # IGNORE THE min, max, interval
        if not self.key:
            Log.error("Must have a key value")

        parts = listwrap(self.partitions)
        for i, p in enumerate(parts):
            self.min = MIN([self.min, p.min])
            self.max = MAX([self.max, p.max])
            if p.dataIndex != None and p.dataIndex != i:
                Log.error("Expecting `dataIndex` to agree with the order of the parts")
            if p[self.key] == None:
                Log.error("Expecting all parts to have {{key}} as a property", key=self.key)
            p.dataIndex = i

        # VERIFY PARTITIONS DO NOT OVERLAP, HOLES ARE FINE
        for p, q in itertools.product(parts, parts):
            if p.min <= q.min < p.max and unwrap(p) is not unwrap(q):
                Log.error("partitions overlap!")

        self.partitions = wrap(parts)
        return
    elif any([self.min == None, self.max == None, self.interval == None]):
        Log.error("Can not handle missing parameter")

    self.key = "min"
    self.partitions = wrap([
        {"min": v, "max": v + self.interval, "dataIndex": i}
        for i, v in enumerate(frange(self.min, self.max, self.interval))
    ])
def filter(data, where):
    """
    where - a function that accepts (record, rownum, rows) and returns boolean
    """
    if len(data) == 0 or where == None or where == TRUE:
        return data

    if isinstance(data, Container):
        return data.filter(where)

    if is_container(data):
        temp = jx_expression_to_function(where)
        dd = wrap(data)
        return wrap([unwrap(d) for i, d in enumerate(data) if temp(wrap(d), i, dd)])
    else:
        Log.error("Do not know how to handle type {{type}}", type=data.__class__.__name__)

    # NOTE: REACHED ONLY IF Log.error() DOES NOT RAISE
    try:
        return drill_filter(where, data)
    except Exception as _:
        # WOW! THIS IS INEFFICIENT!
        return wrap([unwrap(d) for d in drill_filter(where, [DataObject(d) for d in data])])
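# Hypothetical use of filter() above with a jx expression as the where clause,
# assuming jx_python is importable; {"gt": {"a": 3}} is standard jx syntax.
from jx_python import jx

_rows = [{"a": 1}, {"a": 5}, {"a": 9}]
_kept = jx.filter(_rows, {"gt": {"a": 3}})
assert [r.a for r in _kept] == [5, 9]  # ONLY RECORDS WITH a > 3 SURVIVE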
def __getitem__(self, key):
    if key == None:
        return Null
    if key == ".":
        output = _get(self, "_dict")
        if isinstance(output, Mapping):
            return self
        else:
            return output

    if isinstance(key, str):
        key = key.decode("utf8")
    elif not isinstance(key, unicode):
        get_logger().error("only string keys are supported")

    d = _get(self, "_dict")

    if key.find(".") >= 0:
        seq = _split_field(key)
        for n in seq:
            if isinstance(d, NullType):
                d = NullType(d, n)  # OH DEAR, Null TREATS n AS PATH, NOT LITERAL
            elif isinstance(d, list):
                d = [_getdefault(dd, n) for dd in d]
            else:
                d = _getdefault(d, n)  # EVERYTHING ELSE TREATS n AS LITERAL
        return wrap(d)
    else:
        o = d.get(key)
        if o == None:
            return NullType(d, key)
        return wrap(o)
def _select_a_field(field):
    if isinstance(field, basestring):
        return wrap({"name": field, "value": split_field(field)})
    elif isinstance(wrap(field).value, basestring):
        field = wrap(field)
        return wrap({"name": field.name, "value": split_field(field.value)})
    else:
        return wrap({"name": field.name, "value": field.value})
def _select_a_field(field):
    if is_text(field):
        return wrap({"name": field, "value": split_field(field)})
    elif is_text(wrap(field).value):
        field = wrap(field)
        return wrap({"name": field.name, "value": split_field(field.value)})
    else:
        return wrap({"name": field.name, "value": field.value})
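# Sketch of the normalization performed by both _select_a_field variants,
# assuming mo_dots; split_field breaks a dotted name into its path segments.
from mo_dots import split_field

assert split_field("a.b.c") == ["a", "b", "c"]
# SO "a.b"                          NORMALIZES TO {"name": "a.b", "value": ["a", "b"]}
# AND {"name": "x", "value": "a.b"} NORMALIZES TO {"name": "x",   "value": ["a", "b"]}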
def search(self, query):
    query = wrap(query)
    f = jx.get(query.query.filtered.filter)
    filtered = wrap([{"_id": i, "_source": d} for i, d in self.data.items() if f(d)])

    if query.fields:
        return wrap({"hits": {
            "total": len(filtered),
            "hits": [
                {"_id": d._id, "fields": unwrap(jx.select([unwrap(d._source)], query.fields)[0])}
                for d in filtered
            ]
        }})
    else:
        return wrap({"hits": {"total": len(filtered), "hits": filtered}})
def run(query, container=Null):
    """
    THIS FUNCTION IS SIMPLY SWITCHING BASED ON THE query["from"] CONTAINER,
    BUT IT IS ALSO PROCESSING A list CONTAINER; SEPARATE THAT INTO A ListContainer
    """
    if container == None:
        container = wrap(query)['from']
        query_op = QueryOp.wrap(query, container=container, namespace=container.schema)
    else:
        query_op = QueryOp.wrap(query, container, container.namespace)

    if container == None:
        from jx_python.containers.list_usingPythonList import DUAL
        return DUAL.query(query_op)
    elif isinstance(container, Container):
        return container.query(query_op)
    elif isinstance(container, (list, set) + generator_types):
        container = wrap(list(container))
    elif isinstance(container, Cube):
        if is_aggs(query_op):
            return cube_aggs(container, query_op)
    elif isinstance(container, QueryOp):
        container = run(container)
    else:
        Log.error("Do not know how to handle {{type}}", type=container.__class__.__name__)

    if is_aggs(query_op):
        container = list_aggs(container, query_op)
    else:  # SETOP
        if query_op.where is not TRUE:
            container = filter(container, query_op.where)
        if query_op.sort:
            container = sort(container, query_op.sort, already_normalized=True)
        if query_op.select:
            container = select(container, query_op.select)

    if query_op.window:
        if isinstance(container, Cube):
            container = list(container.values())
        for param in query_op.window:
            window(container, param)

    # AT THIS POINT frum IS IN LIST FORMAT, NOW PACKAGE RESULT
    if query_op.format == "cube":
        container = convert.list2cube(container)
    elif query_op.format == "table":
        container = convert.list2table(container)
        container.meta.format = "table"
    else:
        container = wrap({
            "meta": {"format": "list"},
            "data": container
        })

    return container
def iter(data, depth):
    if depth == 0:
        for v in data:
            yield wrap(v)
        return

    for v in data.values():
        for v1 in iter(v, depth - 1):
            yield wrap(v1)
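# A self-contained analog of iter() above (without the wrap() calls) to show
# the depth semantics: depth counts how many dict layers to descend before
# yielding the elements of whatever container is found there.
def _iter_demo(data, depth):
    if depth == 0:
        for v in data:
            yield v
        return
    for v in data.values():
        for v1 in _iter_demo(v, depth - 1):
            yield v1

assert list(_iter_demo({"x": {"p": [1, 2]}, "y": {"q": [3]}}, 2)) == [1, 2, 3]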
def simplify_esfilter(esfilter):
    try:
        output = wrap(_normalize(wrap(esfilter)))
        output.isNormal = None
        return output
    except Exception as e:
        from mo_logs import Log
        Log.unexpected("programmer error", cause=e)
def _normalize_group(edge, dim_index, schema=None):
    """
    :param edge: Not normalized groupby
    :param dim_index: Dimensions are ordered; this is this groupby's index into that order
    :param schema: for context
    :return: a normalized groupby
    """
    if isinstance(edge, basestring):
        if edge.endswith(".*"):
            prefix = edge[:-1]
            if schema:
                output = wrap([
                    {
                        "name": literal_field(k),
                        "value": jx_expression(k),
                        "allowNulls": True,
                        "domain": {"type": "default"}
                    }
                    for k, cs in schema.items()
                    if k.startswith(prefix)
                    for c in cs
                    if c.type not in STRUCT
                ])
                return output
            else:
                return wrap([{
                    "name": edge[:-2],
                    "value": jx_expression(edge[:-2]),
                    "allowNulls": True,
                    "dim": dim_index,
                    "domain": {"type": "default"}
                }])
        return wrap([{
            "name": edge,
            "value": jx_expression(edge),
            "allowNulls": True,
            "dim": dim_index,
            "domain": {"type": "default"}
        }])
    else:
        edge = wrap(edge)
        if (edge.domain and edge.domain.type != "default") or edge.allowNulls != None:
            Log.error("groupby does not accept complicated domains")
        if not edge.name and not isinstance(edge.value, basestring):
            Log.error("You must name compound edges: {{edge}}", edge=edge)

        return wrap([{
            "name": coalesce(edge.name, edge.value),
            "value": jx_expression(edge.value),
            "allowNulls": True,
            "dim": dim_index,
            "domain": {"type": "default"}
        }])
def __data__(self):
    if first(self.schema.columns).name == '.':
        return wrap({
            "meta": {"format": "list"},
            "data": self.data
        })
    else:
        return wrap({
            "meta": {"format": "list"},
            "data": [{k: unwraplist(v) for k, v in row.items()} for row in self.data]
        })
def __init__(self, select, edges, data, frum=None):
    """
    data IS EXPECTED TO BE A dict MAPPING NAMES TO MATRICES, BUT OTHER
    COLLECTIONS ARE ALLOWED, USING THE select AND edges TO DESCRIBE THE data
    """
    self.is_value = not is_list(select)
    self.select = select
    self.meta = Data(format="cube")  # PUT EXTRA MARKUP HERE
    self.is_none = False

    if not all(data.values()):
        self.is_none = True

    # ENSURE frum IS PROPER FORM
    if is_list(select):
        if edges and OR(not isinstance(v, Matrix) for v in data.values()):
            Log.error("Expecting data to be a dict with Matrix values")

    if not edges:
        if not data:
            if is_list(select):
                Log.error("not expecting a list of records")
            data = {select.name: Matrix.ZERO}
            self.edges = FlatList.EMPTY
        elif is_data(data):
            # EXPECTING NO MORE THAN ONE rownum EDGE IN THE DATA
            length = MAX([len(v) for v in data.values()])
            if length >= 1:
                self.edges = wrap([{"name": "rownum", "domain": {"type": "rownum"}}])
            else:
                self.edges = FlatList.EMPTY
        elif is_list(data):
            if is_list(select):
                Log.error("not expecting a list of records")
            data = {select.name: Matrix.wrap(data)}
            self.edges = wrap([{"name": "rownum", "domain": {"type": "rownum", "min": 0, "max": len(data), "interval": 1}}])
        elif isinstance(data, Matrix):
            if is_list(select):
                Log.error("not expecting a list of records")
            data = {select.name: data}
        else:
            if is_list(select):
                Log.error("not expecting a list of records")
            data = {select.name: Matrix(value=data)}
            self.edges = FlatList.EMPTY
    else:
        self.edges = wrap(edges)

    self.data = data
def monitor(self, please_stop):
    please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
    while not please_stop:
        try:
            if not self.todo:
                old_columns = [
                    c
                    for c in self.meta.columns
                    if (c.last_updated == None or c.last_updated < Date.now() - TOO_OLD)
                    and c.jx_type not in STRUCT
                ]
                if old_columns:
                    DEBUG and Log.note(
                        "Old columns {{names|json}} last updated {{dates|json}}",
                        names=wrap(old_columns).es_column,
                        dates=[Date(t).format() for t in wrap(old_columns).last_updated]
                    )
                    self.todo.extend(old_columns)
                    # TEST CONSISTENCY
                    for c, d in product(list(self.todo.queue), list(self.todo.queue)):
                        if c.es_column == d.es_column and c.es_index == d.es_index and c != d:
                            Log.error("duplicate column in the todo queue")
                else:
                    DEBUG and Log.note("no more metadata to update")

            column = self.todo.pop(Till(seconds=(10 * MINUTE).seconds))
            if column:
                if column is THREAD_STOP:
                    continue

                with Timer("update {{table}}.{{column}}", param={"table": column.es_index, "column": column.es_column}, silent=not DEBUG):
                    if column.es_index in self.index_does_not_exist:
                        self.meta.columns.update({
                            "clear": ".",
                            "where": {"eq": {"es_index": column.es_index}}
                        })
                        continue
                    if column.jx_type in STRUCT or column.es_column.endswith("." + EXISTS_TYPE):
                        column.last_updated = Date.now()
                        continue
                    elif column.last_updated >= Date.now() - TOO_OLD:
                        continue
                    try:
                        self._update_cardinality(column)
                        (DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX)) and Log.note("updated {{column.name}}", column=column)
                    except Exception as e:
                        if '"status":404' in e:
                            self.meta.columns.update({
                                "clear": ".",
                                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                            })
                        else:
                            Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
        except Exception as e:
            Log.warning("problem in cardinality monitor", cause=e)
def get_columns(data, leaves=False):
    # TODO: Split this into two functions
    if not leaves:
        return wrap([{"name": n} for n in UNION(set(d.keys()) for d in data)])
    else:
        return wrap([
            {"name": leaf}
            for leaf in set(leaf for row in data for leaf, _ in row.leaves())
        ])
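# Hypothetical call showing both modes of get_columns; rows must be wrapped
# (mo_dots Data) so the leaves=True branch can call row.leaves().
#
#     rows = wrap([{"a": 1, "b": {"c": 2}}, {"a": 3}])
#     get_columns(rows)               # -> [{"name": "a"}, {"name": "b"}]
#     get_columns(rows, leaves=True)  # -> [{"name": "a"}, {"name": "b.c"}]
#
# (COLUMN ORDER IS UNSPECIFIED, SINCE BOTH BRANCHES BUILD A set FIRST)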
def _normalize_group(edge, dim_index, limit, schema=None):
    """
    :param edge: Not normalized groupby
    :param dim_index: Dimensions are ordered; this is this groupby's index into that order
    :param schema: for context
    :return: a normalized groupby
    """
    if isinstance(edge, text_type):
        if edge.endswith(".*"):
            prefix = edge[:-2]
            if schema:
                output = wrap([
                    {
                        "name": concat_field(prefix, literal_field(relative_field(untype_path(c.names["."]), prefix))),
                        "put": {"name": literal_field(untype_path(c.names["."]))},
                        "value": jx_expression(c.es_column, schema=schema),
                        "allowNulls": True,
                        "domain": {"type": "default"}
                    }
                    for c in schema.leaves(prefix)
                ])
                return output
            else:
                return wrap([{
                    "name": untype_path(prefix),
                    "put": {"name": literal_field(untype_path(prefix))},
                    "value": jx_expression(prefix, schema=schema),
                    "allowNulls": True,
                    "dim": dim_index,
                    "domain": {"type": "default"}
                }])
        return wrap([{
            "name": edge,
            "value": jx_expression(edge, schema=schema),
            "allowNulls": True,
            "dim": dim_index,
            "domain": Domain(type="default", limit=limit)
        }])
    else:
        edge = wrap(edge)
        if (edge.domain and edge.domain.type != "default") or edge.allowNulls != None:
            Log.error("groupby does not accept complicated domains")
        if not edge.name and not isinstance(edge.value, text_type):
            Log.error("You must name compound edges: {{edge}}", edge=edge)

        return wrap([{
            "name": coalesce(edge.name, edge.value),
            "value": jx_expression(edge.value, schema=schema),
            "allowNulls": True,
            "dim": dim_index,
            "domain": {"type": "default"}
        }])
def list2tab(rows):
    columns = set()
    for r in wrap(rows):
        columns |= set(k for k, v in r.leaves())
    keys = list(columns)

    output = []
    for r in wrap(rows):
        output.append("\t".join(value2json(r[k]) for k in keys))

    return "\t".join(keys) + "\n" + "\n".join(output)
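# Expected shape of the output (column order is unspecified because columns
# is built as a set): one header row of keys, then one JSON-encoded value per
# cell.  For example, assuming the keys happen to come out as ["a", "b"]:
#
#     list2tab([{"a": 1, "b": "x"}, {"a": 2}])
#     # -> 'a\tb\n1\t"x"\n2\tnull'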
def monitor(self, please_stop):
    please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
    while not please_stop:
        try:
            if not self.todo:
                old_columns = [
                    c
                    for c in self.meta.columns
                    if ((c.last_updated < Date.now() - MAX_COLUMN_METADATA_AGE) or c.cardinality == None)
                    and c.jx_type not in STRUCT
                ]
                if old_columns:
                    DEBUG and Log.note(
                        "Old columns {{names|json}} last updated {{dates|json}}",
                        names=wrap(old_columns).es_column,
                        dates=[Date(t).format() for t in wrap(old_columns).last_updated]
                    )
                    self.todo.extend(old_columns)
                else:
                    DEBUG and Log.note("no more metadata to update")

            column = self.todo.pop(Till(seconds=(10 * MINUTE).seconds))
            if column:
                if column is THREAD_STOP:
                    continue

                with Timer("update {{table}}.{{column}}", param={"table": column.es_index, "column": column.es_column}, silent=not DEBUG):
                    if column.es_index in self.index_does_not_exist:
                        DEBUG and Log.note("{{column.es_column}} does not exist", column=column)
                        self.meta.columns.update({
                            "clear": ".",
                            "where": {"eq": {"es_index": column.es_index}}
                        })
                        continue
                    if column.jx_type in STRUCT or split_field(column.es_column)[-1] == EXISTS_TYPE:
                        DEBUG and Log.note("{{column.es_column}} is a struct", column=column)
                        column.last_updated = Date.now()
                        continue
                    elif column.last_updated > Date.now() - TOO_OLD and column.cardinality is not None:
                        # DO NOT UPDATE FRESH COLUMN METADATA
                        DEBUG and Log.note("{{column.es_column}} is still fresh ({{ago}} ago)", column=column, ago=(Date.now() - Date(column.last_updated)).seconds)
                        continue
                    try:
                        self._update_cardinality(column)
                        (DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX)) and Log.note("updated {{column.name}}", column=column)
                    except Exception as e:
                        if '"status":404' in e:
                            self.meta.columns.update({
                                "clear": ".",
                                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                            })
                        else:
                            Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
        except Exception as e:
            Log.warning("problem in cardinality monitor", cause=e)
def append_query(self, es_query, start):
    self.start = start
    if not isinstance(self.edge.value, Variable):
        if self.exists is TRUE:
            # IF True THEN WE DO NOT NEED THE _filter OR THE _missing (THIS RARELY HAPPENS THOUGH)
            output = wrap({"aggs": {
                "_match": set_default(
                    {"terms": {
                        "script": self.script.expr,
                        "size": self.domain.limit,
                        "order": self.es_order
                    }},
                    es_query
                )
            }})
        else:
            output = wrap({"aggs": {
                "_match": {  # _match AND _filter REVERSED SO _match LINES UP WITH _missing
                    "filter": self.exists.to_esfilter(self.schema),
                    "aggs": {
                        "_filter": set_default(
                            {"terms": {
                                "script": self.script.expr,
                                "size": self.domain.limit,
                                "order": self.es_order
                            }},
                            es_query
                        )
                    }
                },
                "_missing": set_default(
                    {"filter": self.missing.to_esfilter(self.schema)},
                    es_query
                )
            }})
        return output
    else:
        output = wrap({"aggs": {
            "_match": set_default(
                {"terms": {
                    "field": self.schema.leaves(self.edge.value.var)[0].es_column,
                    "size": self.domain.limit,
                    "order": self.es_order
                }},
                es_query
            ),
            "_missing": set_default(
                {"filter": self.missing.to_esfilter(self.schema)},
                es_query
            )
        }})
        return output
def __getitem__(self, item):
    # TODO: SOLVE FUNDAMENTAL QUESTION OF IF SELECTING A PART OF AN
    # EDGE REMOVES THAT EDGE FROM THIS RESULT, OR ADDS THE PART
    # AS A select {"name":edge.name, "value":edge.domain.partitions[coord]}
    # PROBABLY NOT, THE value IS IDENTICAL OVER THE REMAINING
    if is_data(item):
        coordinates = [None] * len(self.edges)

        # MAP DICT TO NUMERIC INDICES
        for name, v in item.items():
            ei, parts = wrap([(i, e.domain.partitions) for i, e in enumerate(self.edges) if e.name == name])[0]
            if not parts:
                Log.error("Can not find {{name}}=={{value|quote}} in list of edges, maybe this feature is not implemented yet", name=name, value=v)
            part = wrap([p for p in parts if p.value == v])[0]
            if not part:
                return Null
            else:
                coordinates[ei] = part.dataIndex

        edges = [e for e, v in zip(self.edges, coordinates) if v is None]
        if not edges:
            # ZERO DIMENSIONAL VALUE
            return wrap({k: v.__getitem__(coordinates) for k, v in self.data.items()})
        else:
            output = Cube(
                select=self.select,
                edges=wrap([e for e, v in zip(self.edges, coordinates) if v is None]),
                data={k: Matrix(values=c.__getitem__(coordinates)) for k, c in self.data.items()}
            )
            return output
    elif is_text(item):
        # RETURN A VALUE CUBE
        if self.is_value:
            if item != self.select.name:
                Log.error("{{name}} not found in cube", name=item)
            return self

        if item not in self.select.name:
            Log.error("{{name}} not found in cube", name=item)

        output = Cube(
            select=[s for s in self.select if s.name == item][0],
            edges=self.edges,
            data={item: self.data[item]}
        )
        return output
    else:
        Log.error("not implemented yet")
def __getitem__(self, key):
    try:
        _key = value2key(self._keys, key)
        if len(self._keys) == 1 or len(_key) == len(self._keys):
            d = self._data.get(_key)
            return wrap(d)
        else:
            output = wrap([
                d
                for d in self._data.values()
                if all(wrap(d)[k] == v for k, v in _key.items())
            ])
            return output
    except Exception as e:
        Log.error("something went wrong", e)
def _get_queue(self, row):
    row = wrap(row)
    if row.json:
        row.value, row.json = json2value(row.json), None
    timestamp = Date(self.rollover_field(row.value))
    if timestamp == None:
        return Null
    elif timestamp < Date.today() - self.rollover_max:
        return DATA_TOO_OLD

    rounded_timestamp = timestamp.floor(self.rollover_interval)
    with self.locker:
        queue = self.known_queues.get(rounded_timestamp.unix)
    if queue == None:
        candidates = jx.run({
            "from": ListContainer(".", self.cluster.get_aliases()),
            "where": {"regex": {"index": self.settings.index + r"\d\d\d\d\d\d\d\d_\d\d\d\d\d\d"}},
            "sort": "index"
        })
        best = None
        for c in candidates:
            c = wrap(c)
            c.date = unicode2Date(c.index[-15:], elasticsearch.INDEX_DATE_FORMAT)
            if timestamp > c.date:
                best = c
        if not best or rounded_timestamp > best.date:
            if rounded_timestamp < wrap(candidates[-1]).date:
                es = self.cluster.get_or_create_index(read_only=False, alias=best.alias, index=best.index, kwargs=self.settings)
            else:
                try:
                    es = self.cluster.create_index(create_timestamp=rounded_timestamp, kwargs=self.settings)
                    es.add_alias(self.settings.index)
                except Exception as e:
                    e = Except.wrap(e)
                    if "IndexAlreadyExistsException" not in e:
                        Log.error("Problem creating index", cause=e)
                    return self._get_queue(row)  # TRY AGAIN
        else:
            es = self.cluster.get_or_create_index(read_only=False, alias=best.alias, index=best.index, kwargs=self.settings)

        with suppress_exception:
            es.set_refresh_interval(seconds=60 * 5, timeout=5)

        self._delete_old_indexes(candidates)
        threaded_queue = es.threaded_queue(max_size=self.settings.queue_size, batch_size=self.settings.batch_size, silent=True)
        with self.locker:
            queue = self.known_queues[rounded_timestamp.unix] = threaded_queue
    return queue
def __getattribute__(self, key):
    if key == b"__class__":
        return NullType
    key = key.decode('utf8')

    d = _get(self, "__dict__")
    o = wrap(d["_obj"])
    k = d["__key__"]
    if o is None:
        return Null
    elif isinstance(o, NullType):
        return NullType(self, key)
    v = o.get(k)
    if v == None:
        return NullType(self, key)
    return wrap(v.get(key))
def find_holes(db_module, db, table_name, column_name, _range, filter=None):
    """
    FIND HOLES IN A DENSE COLUMN OF INTEGERS
    RETURNS A LIST OF {"min": min, "max": max} OBJECTS
    """
    if not filter:
        filter = {"match_all": {}}

    _range = wrap(_range)
    params = {
        "min": _range.min,
        "max": _range.max - 1,
        "column_name": db_module.quote_column(column_name),
        "table_name": db_module.quote_column(table_name),
        "filter": esfilter2sqlwhere(filter)
    }

    min_max = db.query("""
        SELECT
            min({{column_name}}) `min`,
            max({{column_name}})+1 `max`
        FROM
            {{table_name}} a
        WHERE
            a.{{column_name}} BETWEEN {{min}} AND {{max}} AND
            {{filter}}
    """, params)[0]

    db.execute("SET @last={{min}}-1", {"min": _range.min})
    ranges = db.query("""
        SELECT
            prev_rev+1 `min`,
            curr_rev `max`
        FROM (
            SELECT
                a.{{column_name}}-@last diff,
                @last prev_rev,
                @last:=a.{{column_name}} curr_rev
            FROM
                {{table_name}} a
            WHERE
                a.{{column_name}} BETWEEN {{min}} AND {{max}} AND
                {{filter}}
            ORDER BY
                a.{{column_name}}
        ) a
        WHERE
            diff>1
    """, params)

    if ranges:
        ranges.append({"min": min_max.max, "max": _range.max})
    else:
        if min_max.min:
            ranges.append({"min": _range.min, "max": min_max.min})
            ranges.append({"min": min_max.max, "max": _range.max})
        else:
            ranges.append(_range)

    return ranges
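# Hypothetical usage of find_holes, assuming a MySQL-flavored db_module (for
# quote_column) and an open db connection; the table and column names are
# illustrative only.
#
#     holes = find_holes(
#         db_module=mysql,                      # MODULE PROVIDING quote_column()
#         db=db,                                # CONNECTION WITH query()/execute()
#         table_name="revisions",
#         column_name="rev_id",
#         _range={"min": 0, "max": 1000000},
#     )
#     # -> LIST OF {"min": m, "max": M} GAPS (HALF-OPEN, max EXCLUSIVE) IN rev_id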
def _select_deep(v, field, depth, record):
    """
    field = {"name": name, "value": ["attribute", "path"]}
    r[field.name] = v[field.value], BUT WE MUST DEAL WITH POSSIBLE LIST IN field.value PATH
    """
    if hasattr(field.value, "__call__"):
        try:
            record[field.name] = field.value(wrap(v))
        except Exception as e:
            record[field.name] = None
        return 0, None

    for i, f in enumerate(field.value[depth:len(field.value) - 1]):
        v = v.get(f)
        if v is None:
            return 0, None
        if is_list(v):
            return depth + i + 1, v

    f = field.value.last()
    try:
        if not f:  # NO NAME FIELD INDICATES SELECT VALUE
            record[field.name] = v
        else:
            record[field.name] = v.get(f)
    except Exception as e:
        Log.error(
            "{{value}} does not have {{field}} property",
            value=v,
            field=f,
            cause=e
        )
    return 0, None
def _insert_loop(self, please_stop=None):
    bad_count = 0
    while not please_stop:
        try:
            Till(seconds=1).wait()
            messages = wrap(self.queue.pop_all())
            if not messages:
                continue

            for g, mm in jx.groupby(messages, size=self.batch_size):
                scrubbed = []
                try:
                    for i, message in enumerate(mm):
                        if message is THREAD_STOP:
                            please_stop.go()
                            return
                        scrubbed.append(_deep_json_to_string(message, depth=3))
                finally:
                    self.es.extend(scrubbed)
                bad_count = 0
        except Exception as e:
            Log.warning("Problem inserting logs into ES", cause=e)
            bad_count += 1
            if bad_count > MAX_BAD_COUNT:
                Log.warning("Given up trying to write debug logs to ES index {{index}}", index=self.es.settings.index)
            Till(seconds=30).wait()

    # CONTINUE TO DRAIN THIS QUEUE
    while not please_stop:
        try:
            Till(seconds=1).wait()
            self.queue.pop_all()
        except Exception as e:
            Log.warning("Should not happen", cause=e)
def _expand(template, seq):
    """
    seq IS A TUPLE OF OBJECTS IN PATH ORDER INTO THE DATA TREE
    """
    if is_text(template):
        return _simple_expand(template, seq)
    elif is_data(template):
        # EXPAND LISTS OF ITEMS USING THIS FORM
        # {"from": from, "template": template, "separator": separator}
        template = wrap(template)
        assert template["from"], "Expecting template to have 'from' attribute"
        assert template.template, "Expecting template to have 'template' attribute"

        data = seq[-1][template["from"]]
        output = []
        for d in data:
            s = seq + (d,)
            output.append(_expand(template.template, s))
        return coalesce(template.separator, "").join(output)
    elif is_list(template):
        return "".join(_expand(t, seq) for t in template)
    else:
        if not _Log:
            _late_import()
        _Log.error("can not handle")
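# A self-contained analog of the {"from", "template", "separator"} expansion
# handled above, using str.format as a stand-in for _simple_expand (the real
# function can also look back through earlier members of seq).
def _expand_demo(template, seq):
    if isinstance(template, str):
        return template.format(**seq[-1])  # STAND-IN FOR _simple_expand
    if isinstance(template, dict):
        items = seq[-1][template["from"]]
        sep = template.get("separator", "")
        return sep.join(_expand_demo(template["template"], seq + (d,)) for d in items)
    return "".join(_expand_demo(t, seq) for t in template)  # LIST OF TEMPLATES

_row = {"steps": [{"cmd": "configure"}, {"cmd": "make"}]}
assert _expand_demo({"from": "steps", "template": "{cmd}", "separator": ", "}, (_row,)) == "configure, make"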
def getDomain(self, **kwargs):
    # kwargs.depth IS MEANT TO REACH INTO SUB-PARTITIONS
    kwargs = wrap(kwargs)
    kwargs.depth = coalesce(
        kwargs.depth,
        len(self.fields) - 1 if isinstance(self.fields, list) else None
    )

    if not self.partitions and self.edges:
        # USE EACH EDGE AS A PARTITION, BUT isFacet==True SO IT ALLOWS THE OVERLAP
        partitions = [
            {
                "name": v.name,
                "value": v.name,
                "where": v.where,
                "style": v.style,
                "weight": v.weight  # YO! WHAT DO WE *NOT* COPY?
            }
            for i, v in enumerate(self.edges)
            if i < coalesce(self.limit, DEFAULT_QUERY_LIMIT) and v.where
        ]
        self.isFacet = True
    elif kwargs.depth == None:  # ASSUME self.fields IS A dict
        partitions = FlatList()
        for i, part in enumerate(self.partitions):
            if i >= coalesce(self.limit, DEFAULT_QUERY_LIMIT):
                break
            partitions.append({
                "name": part.name,
                "value": part.value,
                "where": part.where,
                "style": coalesce(part.style, part.parent.style),
                "weight": part.weight  # YO! WHAT DO WE *NOT* COPY?
            })
    elif kwargs.depth == 0:
        partitions = [
            {
                "name": v.name,
                "value": v.value,
                "where": v.where,
                "style": v.style,
                "weight": v.weight  # YO! WHAT DO WE *NOT* COPY?
            }
            for i, v in enumerate(self.partitions)
            if i < coalesce(self.limit, DEFAULT_QUERY_LIMIT)
        ]
    elif kwargs.depth == 1:
        partitions = FlatList()
        rownum = 0
        for i, part in enumerate(self.partitions):
            if i >= coalesce(self.limit, DEFAULT_QUERY_LIMIT):
                continue
            rownum += 1
            try:
                for j, subpart in enumerate(part.partitions):
                    partitions.append({
                        "name": join_field(split_field(subpart.parent.name) + [subpart.name]),
                        "value": subpart.value,
                        "where": subpart.where,
                        "style": coalesce(subpart.style, subpart.parent.style),
                        "weight": subpart.weight  # YO! WHAT DO WE *NOT* COPY?
                    })
            except Exception as e:
                Log.error("", e)
    else:
        Log.error("deeper than 2 is not supported yet")

    return Domain(
        type=self.type,
        name=self.name,
        partitions=wrap(partitions),
        min=self.min,
        max=self.max,
        interval=self.interval,
        # THE COMPLICATION IS THAT SOMETIMES WE WANT SIMPLE PARTITIONS, LIKE
        # STRINGS, DATES, OR NUMBERS.  OTHER TIMES WE WANT PARTITION OBJECTS
        # WITH NAME, VALUE, AND OTHER MARKUP.
        # USUALLY A "set" IS MEANT TO BE SIMPLE, BUT THE end() FUNCTION
        # OVERRIDES EVERYTHING AND IS EXPLICIT. - NOT A GOOD SOLUTION BECAUSE
        # end() IS USED BOTH TO INDICATE THE QUERY PARTITIONS *AND* DISPLAY
        # COORDINATES ON CHARTS
        # PLEASE SPLIT end() INTO value() (replacing the string value) AND
        # label() (for presentation)
        value="name" if not self.value and self.partitions else self.value,
        key="value",
        label=coalesce(self.label, (self.type == "set" and self.name)),
        end=coalesce(self.end, (self.type == "set" and self.name)),
        isFacet=self.isFacet,
        dimension=self
    )
def update(self, command):
    """
    EXPECTING command == {"set": term, "where": where}
    THE set CLAUSE IS A DICT MAPPING NAMES TO VALUES
    THE where CLAUSE IS AN ES FILTER
    """
    command = wrap(command)
    schema = self._es.get_properties()

    # GET IDS OF DOCUMENTS
    results = self._es.search({
        "stored_fields": listwrap(schema._routing.path),
        "query": {"bool": {
            "filter": jx_expression(command.where).to_esfilter(Null)
        }},
        "size": 10000
    })

    # SCRIPT IS SAME FOR ALL (CAN ONLY HANDLE ASSIGNMENT TO CONSTANT)
    scripts = FlatList()
    for k, v in command.set.items():
        if not is_variable_name(k):
            Log.error("Only support simple paths for now")
        if isinstance(v, Mapping) and v.doc:
            scripts.append({"doc": v.doc})
        else:
            v = scrub(v)
            scripts.append({"script": "ctx._source." + k + " = " + jx_expression(v).to_painless(schema).script(schema)})

    if results.hits.hits:
        updates = []
        for h in results.hits.hits:
            for s in scripts:
                updates.append({"update": {
                    "_id": h._id,
                    "_routing": unwraplist(h.fields[literal_field(schema._routing.path)])
                }})
                updates.append(s)
        content = ("\n".join(convert.value2json(c) for c in updates) + "\n").encode('utf-8')
        response = self._es.cluster.post(
            self._es.path + "/_bulk",
            data=content,
            headers={"Content-Type": "application/json"},
            timeout=self.settings.timeout,
            params={"wait_for_active_shards": self.settings.wait_for_active_shards}
        )
        if response.errors:
            Log.error(
                "could not update: {{error}}",
                error=[e.error for i in response["items"] for e in i.values() if e.status not in (200, 201)]
            )
"file": (f[0] if f[0] != "~" else "").replace("\\", "/"), "line": f[1], "method": f[2].lstrip("<").rstrip(">") } for f, d, in acc.stats.iteritems()] stats_file = File(profile_settings.filename, suffix=convert.datetime2string(datetime.now(), "_%Y%m%d_%H%M%S")) stats_file.write(convert.list2tab(stats)) # GET THE MACHINE METADATA machine_metadata = wrap({ "pid": os.getpid(), "python": text_type(platform.python_implementation()), "os": text_type(platform.system() + platform.release()).strip(), "name": text_type(platform.node()) }) def raise_from_none(e): raise e if PY3: exec("def raise_from_none(e):\n raise e from None\n", globals(), locals()) from mo_logs.log_usingFile import StructuredLogger_usingFile
def es_script(term):
    return wrap({"script": {"lang": "painless", "source": term}})
def append_query(self, es_query, start):
    self.start = start
    if not isinstance(self.edge.value, Variable):
        script_field = self.edge.value.to_ruby()
        missing = self.edge.value.missing()

        output = wrap({"aggs": {
            "_match": set_default(
                {"terms": {
                    "script_field": script_field,
                    "size": self.domain.limit,
                    "order": {"_term": self.sorted} if self.sorted else None
                }},
                es_query
            ),
            "_missing": set_default({"filter": missing.to_esfilter()}, es_query) if missing else None
        }})
        return output
    elif self.edge.value.var in [s.value.var for s in self.query.sort]:
        sort_dir = [s.sort for s in self.query.sort if s.value.var == self.edge.value.var][0]
        output = wrap({"aggs": {
            "_match": set_default(
                {"terms": {
                    "field": self.edge.value.var,
                    "size": self.domain.limit,
                    "order": {"_term": "asc" if sort_dir == 1 else "desc"}
                }},
                es_query
            ),
            "_missing": set_default(
                {"missing": {"field": self.edge.value}},
                es_query
            )  # TODO: Use Expression.missing().esfilter() TO GET OPTIMIZED FILTER
        }})
        return output
    else:
        output = wrap({"aggs": {
            "_match": set_default(
                {"terms": {
                    "field": self.edge.value.var,
                    "size": self.domain.limit
                }},
                es_query
            ),
            "_missing": set_default(
                {"missing": {"field": self.edge.value}},
                es_query
            )  # TODO: Use Expression.missing().esfilter() TO GET OPTIMIZED FILTER
        }})
        return output
def _output():
    for g, v in itertools.groupby(data, get_key):
        group = Data()
        for k, gg in zip(keys, g):
            group[k] = gg
        yield (group, wrap(list(v)))
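# Self-contained reminder of the itertools.groupby contract relied on above:
# it only groups ADJACENT items, so data must be pre-sorted by the same key.
import itertools

_data = sorted([("a", 1), ("b", 2), ("a", 3)], key=lambda t: t[0])
_groups = {g: [v for _, v in vs] for g, vs in itertools.groupby(_data, key=lambda t: t[0])}
assert _groups == {"a": [1, 3], "b": [2]}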
def __iter__(self):
    return (wrap(d) for d in self.data)
def request(method, url, headers=None, zip=None, retry=None, **kwargs):
    """
    JUST LIKE requests.request() BUT WITH DEFAULT HEADERS AND FIXES
    DEMANDS data IS ONE OF:
    * A JSON-SERIALIZABLE STRUCTURE, OR
    * LIST OF JSON-SERIALIZABLE STRUCTURES, OR
    * None

    Parameters
    * zip - ZIP THE REQUEST BODY, IF BIG ENOUGH
    * json - JSON-SERIALIZABLE STRUCTURE
    * retry - {"times": x, "sleep": y} STRUCTURE

    THE BYTE_STRINGS (b"") ARE NECESSARY TO PREVENT httplib.py FROM **FREAKING OUT**
    IT APPEARS requests AND httplib.py SIMPLY CONCATENATE STRINGS BLINDLY, WHICH
    INCLUDES url AND headers
    """
    global _warning_sent
    global request_count

    if not _warning_sent and not default_headers:
        Log.warning(text_type(
            "The pyLibrary.env.http module was meant to add extra " +
            "default headers to all requests, specifically the 'Referer' " +
            "header with a URL to the project. Use the `pyLibrary.debug.constants.set()` " +
            "function to set `pyLibrary.env.http.default_headers`"
        ))
    _warning_sent = True

    if is_list(url):
        # TRY MANY URLS
        failures = []
        for remaining, u in jx.countdown(url):
            try:
                response = request(method, u, retry=retry, **kwargs)
                if mo_math.round(response.status_code, decimal=-2) not in [400, 500]:
                    return response
                if not remaining:
                    return response
            except Exception as e:
                e = Except.wrap(e)
                failures.append(e)
        Log.error(u"Tried {{num}} urls", num=len(url), cause=failures)

    if 'session' in kwargs:
        session = kwargs['session']
        del kwargs['session']
        sess = Null
    else:
        sess = session = sessions.Session()

    with closing(sess):
        if PY2 and is_text(url):
            # httplib.py WILL **FREAK OUT** IF IT SEES ANY UNICODE
            url = url.encode('ascii')

        try:
            set_default(kwargs, {"zip": zip, "retry": retry}, DEFAULTS)
            _to_ascii_dict(kwargs)

            # HEADERS
            headers = kwargs['headers'] = unwrap(set_default(headers, session.headers, default_headers))
            _to_ascii_dict(headers)
            del kwargs['headers']

            # RETRY
            retry = wrap(kwargs['retry'])
            if isinstance(retry, Number):
                retry = set_default({"times": retry}, DEFAULTS['retry'])
            if isinstance(retry.sleep, Duration):
                retry.sleep = retry.sleep.seconds
            del kwargs['retry']

            # JSON
            if 'json' in kwargs:
                kwargs['data'] = value2json(kwargs['json']).encode('utf8')
                del kwargs['json']

            # ZIP
            set_default(headers, {'Accept-Encoding': 'compress, gzip'})
            if kwargs['zip'] and len(coalesce(kwargs.get('data'))) > 1000:
                compressed = convert.bytes2zip(kwargs['data'])
                headers['content-encoding'] = 'gzip'
                kwargs['data'] = compressed
            del kwargs['zip']
        except Exception as e:
            Log.error(u"Request setup failure on {{url}}", url=url, cause=e)

        errors = []
        for r in range(retry.times):
            if r:
                Till(seconds=retry.sleep).wait()

            try:
                DEBUG and Log.note(u"http {{method|upper}} to {{url}}", method=method, url=text_type(url))
                request_count += 1
                return session.request(method=method, headers=headers, url=str(url), **kwargs)
            except Exception as e:
                e = Except.wrap(e)
                if retry['http'] and str(url).startswith("https://") and "EOF occurred in violation of protocol" in e:
                    url = URL("http://" + str(url)[8:])
                    Log.note("Changed {{url}} to http due to SSL EOF violation.", url=str(url))
                errors.append(e)

        if " Read timed out." in errors[0]:
            Log.error(u"Tried {{times}} times: Timeout failure (timeout was {{timeout}})", timeout=kwargs['timeout'], times=retry.times, cause=errors[0])
        else:
            Log.error(u"Tried {{times}} times: Request failure of {{url}}", url=url, times=retry.times, cause=errors[0])
def es_setop(es, mvel, query):
    FromES = es09.util.build_es_query(query)
    select = listwrap(query.select)

    isDeep = len(split_field(query.frum.name)) > 1  # LOOKING INTO NESTED WILL REQUIRE A SCRIPT
    isComplex = OR([s.value == None and s.aggregate not in ("count", "none") for s in select])  # CONVERTING esfilter DEFINED PARTS WILL REQUIRE SCRIPT

    if not isDeep and not isComplex:
        if len(select) == 1 and not select[0].value or select[0].value == "*":
            FromES = wrap({
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": simplify_esfilter(jx_expression(query.where).to_esfilter())
                }},
                "sort": query.sort,
                "size": 1
            })
        elif all(isinstance(v, Variable) for v in select.value):
            FromES = wrap({
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": simplify_esfilter(query.where.to_esfilter())
                }},
                "fields": select.value,
                "sort": query.sort,
                "size": coalesce(query.limit, 200000)
            })
    elif not isDeep:
        simple_query = query.copy()
        simple_query.where = TRUE_FILTER  # THE FACET FILTER IS FASTER
        FromES.facets.mvel = {
            "terms": {
                "script_field": mvel.code(simple_query),
                "size": coalesce(simple_query.limit, 200000)
            },
            "facet_filter": simplify_esfilter(jx_expression(query.where).to_esfilter())
        }
    else:
        FromES.facets.mvel = {
            "terms": {
                "script_field": mvel.code(query),
                "size": coalesce(query.limit, 200000)
            },
            "facet_filter": simplify_esfilter(jx_expression(query.where).to_esfilter())
        }

    data = es09.util.post(es, FromES, query.limit)

    if len(select) == 1 and not select[0].value or select[0].value == "*":
        # SPECIAL CASE FOR SINGLE COUNT
        cube = wrap(data).hits.hits._source
    elif isinstance(select[0].value, Variable):
        # SPECIAL CASE FOR SINGLE TERM
        cube = wrap(data).hits.hits.fields
    else:
        data_list = unpack_terms(data.facets.mvel, select)
        if not data_list:
            cube = Cube(select, [], {s.name: Matrix.wrap([]) for s in select})
        else:
            output = zip(*data_list)
            cube = Cube(select, [], {s.name: Matrix(list=output[i]) for i, s in enumerate(select)})

    return Data(meta={"esquery": FromES}, data=cube)
def iteritems(self):
    # LOW LEVEL ITERATION, NO WRAPPING
    d = self._internal_dict
    return ((k, wrap(v)) for k, v in iteritems(d))
def items(self):
    d = self._internal_dict
    return [(k, wrap(v)) for k, v in d.items() if v != None or isinstance(v, Mapping)]
def pe_filter(filter, data, depth):
    """
    PARTIALLY EVALUATE THE filter BASED ON THE data GIVEN
    """
    if filter is TRUE_FILTER:
        return True
    if filter is FALSE_FILTER:
        return False

    filter = wrap(filter)

    if filter["and"]:
        result = True
        output = FlatList()
        for a in filter[u"and"]:
            f = pe_filter(a, data, depth)
            if f is False:
                result = False
            elif f is not True:
                output.append(f)
        if result and output:
            return {"and": output}
        else:
            return result
    elif filter["or"]:
        output = FlatList()
        for o in filter[u"or"]:
            f = pe_filter(o, data, depth)
            if f is True:
                return True
            elif f is not False:
                output.append(f)
        if output:
            return {"or": output}
        else:
            return False
    elif filter["not"]:
        f = pe_filter(filter["not"], data, depth)
        if f is True:
            return False
        elif f is False:
            return True
        else:
            return {"not": f}
    elif filter.term or filter.eq:
        eq = coalesce(filter.term, filter.eq)
        result = True
        output = {}
        for col, val in eq.items():
            first, rest = parse_field(col, data, depth)
            d = data[first]
            if not rest:
                if d != val:
                    result = False
            else:
                output[rest] = val
        if result and output:
            return {"term": output}
        else:
            return result
    elif filter.equal:
        a, b = filter["equal"]
        first_a, rest_a = parse_field(a, data, depth)
        first_b, rest_b = parse_field(b, data, depth)
        val_a = data[first_a]
        val_b = data[first_b]

        if not rest_a:
            if not rest_b:
                if val_a != val_b:
                    return False
                else:
                    return True
            else:
                return {"term": {rest_b: val_a}}
        else:
            if not rest_b:
                return {"term": {rest_a: val_b}}
            else:
                return {"equal": [rest_a, rest_b]}
    elif filter.terms:
        result = True
        output = {}
        for col, vals in filter["terms"].items():
            first, rest = parse_field(col, data, depth)
            d = data[first]
            if not rest:
                if d not in vals:
                    result = False
            else:
                output[rest] = vals
        if result and output:
            return {"terms": output}
        else:
            return result
    elif filter.range:
        result = True
        output = {}
        for col, ranges in filter["range"].items():
            first, rest = parse_field(col, data, depth)
            d = data[first]
            if not rest:
                for sign, val in ranges.items():
                    if sign in ("gt", ">") and d <= val:
                        result = False
                    if sign == "gte" and d < val:
                        result = False
                    if sign == "lte" and d > val:
                        result = False
                    if sign == "lt" and d >= val:
                        result = False
            else:
                output[rest] = ranges
        if result and output:
            return {"range": output}
        else:
            return result
    elif filter.missing:
        if isinstance(filter.missing, basestring):
            field = filter["missing"]
        else:
            field = filter["missing"]["field"]

        first, rest = parse_field(field, data, depth)
        d = data[first]
        if not rest:
            if d == None:
                return True
            return False
        else:
            return {"missing": rest}
    elif filter.prefix:
        result = True
        output = {}
        for col, val in filter["prefix"].items():
            first, rest = parse_field(col, data, depth)
            d = data[first]
            if not rest:
                if d == None or not d.startswith(val):
                    result = False
            else:
                output[rest] = val
        if result and output:
            return {"prefix": output}
        else:
            return result
    elif filter.exists:
        if isinstance(filter["exists"], basestring):
            field = filter["exists"]
        else:
            field = filter["exists"]["field"]

        first, rest = parse_field(field, data, depth)
        d = data[first]
        if not rest:
            if d != None:
                return True
            return False
        else:
            return {"exists": rest}
    else:
        Log.error(u"Can not interpret esfilter: {{esfilter}}", {u"esfilter": filter})
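# Illustration of the partial evaluation above (the shapes come from the code;
# the exact values are hypothetical, since they depend on how parse_field
# splits each column name at the current depth):
#
#     pe_filter({"term": {"a": 1}}, data, depth)    # FULLY RESOLVED -> True OR False
#     pe_filter({"term": {"b.c": 2}}, data, depth)  # ONLY "b" RESOLVED -> {"term": {"c": 2}},
#                                                   # A REMAINDER TO BE EVALUATED DEEPER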
def __iter__(self):
    temp = [wrap(v) for v in _get_list(self)]
    return iter(temp)
def filter(self, _filter):
    return FlatList(vals=[
        unwrap(u)
        for u in (wrap(v) for v in _get_list(self))
        if _filter(u)
    ])
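# Minimal sketch of FlatList.filter above, assuming mo_dots: the predicate
# sees wrapped elements, and the survivors are stored unwrapped.
from mo_dots import wrap

_rows = wrap([{"a": 1}, {"a": 5}])
_big = _rows.filter(lambda r: r.a > 3)
assert [r.a for r in _big] == [5]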
def selector(d):
    output = Data()
    for n, p in push_and_pull:
        output[n] = unwraplist(p(wrap(d)))
    return unwrap(output)
def append_query(self, es_query, start):
    self.start = start

    domain = self.domain
    field = self.edge.value

    if isinstance(field, Variable):
        key = domain.key
        if isinstance(key, (tuple, list)) and len(key) == 1:
            key = key[0]
        include = [p[key] for p in domain.partitions]

        if self.edge.allowNulls:
            return wrap({"aggs": {
                "_match": set_default(
                    {"terms": {
                        "field": field.var,
                        "size": self.limit,
                        "include": include,
                        "order": {"_term": self.sorted} if self.sorted else None
                    }},
                    es_query
                ),
                "_missing": set_default(
                    {"filter": {"or": [
                        field.missing().to_esfilter(),
                        {"not": {"terms": {field.var: include}}}
                    ]}},
                    es_query
                ),
            }})
        else:
            return wrap({"aggs": {
                "_match": set_default(
                    {"terms": {
                        "field": field.var,
                        "size": self.limit,
                        "include": include,
                        "order": {"_term": self.sorted} if self.sorted else None
                    }},
                    es_query
                )
            }})
    else:
        include = [p[domain.key] for p in domain.partitions]

        if self.edge.allowNulls:
            return wrap({"aggs": {
                "_match": set_default(
                    {"terms": {
                        "script_field": field.to_ruby(),
                        "size": self.limit,
                        "include": include
                    }},
                    es_query
                ),
                "_missing": set_default(
                    {"filter": {"or": [
                        field.missing().to_esfilter(),
                        NotOp("not", InOp("in", [field, Literal("literal", include)])).to_esfilter()
                    ]}},
                    es_query
                ),
            }})
        else:
            return wrap({"aggs": {
                "_match": set_default(
                    {"terms": {
                        "script_field": field.to_ruby(),
                        "size": self.limit,
                        "include": include
                    }},
                    es_query
                )
            }})
from mo_dots import wrap, Data, listwrap, is_data, FlatList
from mo_future import first
from mo_kwargs import override
from mo_logs import Log
from pyLibrary.sql import SQL_UPDATE, SQL_SET
from pyLibrary.sql.sqlite import sql_query, sql_create, sql_insert, quote_column, sql_eq, Sqlite

ROOT_USER = wrap({"_id": 1})
VERSION_TABLE = "security.version"
GROUP_TABLE = "security.groups"
PERMISSION_TABLE = "security.permissions"
RESOURCE_TABLE = "security.resources"
TABLE_OPERATIONS = ["insert", "update", "from"]
CREATE_TABLE = {"_id": 100, "table": ".", "operation": "insert", "owner": 1}


class Permissions:
    @override
    def __init__(self, db, kwargs):
        if is_data(db):
            self.db = Sqlite(db)
        elif isinstance(db, Sqlite):
            self.db = db
        else:
            Log.error("Bad db parameter")

        if not self.db.about(PERMISSION_TABLE):
            self.setup()
        self.next_id = id_generator(self.db)

    def setup(self):
def start(cls, settings=None):
    """
    RUN ME FIRST TO SET UP THE THREADED LOGGING
    http://victorlin.me/2012/08/good-logging-practice-in-python/

    log       - LIST OF PARAMETERS FOR LOGGER(S)
    trace     - SHOW MORE DETAILS IN EVERY LOG LINE (default False)
    cprofile  - True == ENABLE THE C-PROFILER THAT COMES WITH PYTHON (default False)
                USE THE LONG FORM TO SET THE FILENAME {"enabled": True, "filename": "cprofile.tab"}
    profile   - True == ENABLE pyLibrary SIMPLE PROFILING (default False) (eg with Profiler("some description"):)
                USE THE LONG FORM TO SET FILENAME {"enabled": True, "filename": "profile.tab"}
    constants - UPDATE MODULE CONSTANTS AT STARTUP (PRIMARILY INTENDED TO CHANGE DEBUG STATE)
    """
    global _Thread
    if not settings:
        return
    settings = wrap(settings)

    Log.stop()

    cls.settings = settings
    cls.trace = coalesce(settings.trace, False)
    if cls.trace:
        from mo_threads import Thread as _Thread
        _ = _Thread

    if settings.cprofile is False:
        settings.cprofile = {"enabled": False}
    elif settings.cprofile is True or (isinstance(settings.cprofile, Mapping) and settings.cprofile.enabled):
        if isinstance(settings.cprofile, bool):
            settings.cprofile = {"enabled": True, "filename": "cprofile.tab"}

        import cProfile
        cls.cprofiler = cProfile.Profile()
        cls.cprofiler.enable()

    if settings.profile is True or (isinstance(settings.profile, Mapping) and settings.profile.enabled):
        from mo_logs import profiles

        if isinstance(settings.profile, bool):
            profiles.ON = True
            settings.profile = {"enabled": True, "filename": "profile.tab"}

        if settings.profile.enabled:
            profiles.ON = True

    if settings.constants:
        constants.set(settings.constants)

    if settings.log:
        cls.logging_multi = StructuredLogger_usingMulti()
        from mo_logs.log_usingThread import StructuredLogger_usingThread

        cls.main_log = StructuredLogger_usingThread(cls.logging_multi)
        for log in listwrap(settings.log):
            Log.add_log(Log.new_instance(log))

    if settings.cprofile.enabled == True:
        Log.alert("cprofiling is enabled, writing to {{filename}}", filename=os.path.abspath(settings.cprofile.filename))
def get_table(self, name):
    if name == "meta.columns":
        return self.meta.columns

    with self.meta.tables.locker:
        return wrap([t for t in self.meta.tables.data if t.name == name])
#
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Author: Kyle Lahnakoski ([email protected])
#

from __future__ import absolute_import, division, unicode_literals

from jx_base.expressions import NULL
from mo_dots import wrap
from tests.test_jx import BaseTestCase, TEST_TABLE

lots_of_data = wrap([{"a": i} for i in range(30)])


class TestSetOps(BaseTestCase):
    def test_length(self):
        test = {
            "data": [
                {"v": "1"},
                {"v": "22"},
                {"v": "333"},
                {"v": "4444"},
                {"v": "55555"}
            ],
            "query": {
                "from": TEST_TABLE,
def query(self, query):
    """
    :param query: JSON Query Expression, SET `format="container"` TO MAKE A NEW TABLE OF THE RESULT
    :return:
    """
    if not startswith_field(query['from'], self.name):
        Log.error("Expecting table, or some nested table")
    frum, query['from'] = query['from'], self
    query = QueryOp.wrap(query, self.columns)

    # TYPE CONFLICTS MUST NOW BE RESOLVED DURING
    # TYPE-SPECIFIC QUERY NORMALIZATION
    # vars_ = query.vars(exclude_select=True)
    # type_map = {
    #     v: c.es_column
    #     for v in vars_
    #     if v in self.columns and len([c for c in self.columns[v] if c.type != "nested"]) == 1
    #     for c in self.columns[v]
    #     if c.type != "nested"
    # }
    #
    # sql_query = query.map(type_map)
    query = query

    new_table = "temp_" + unique_name()

    if query.format == "container":
        create_table = "CREATE TABLE " + quote_table(new_table) + " AS "
    else:
        create_table = ""

    if query.groupby:
        op, index_to_columns = self._groupby_op(query, frum)
        command = create_table + op
    elif query.edges or any(a != "none" for a in listwrap(query.select).aggregate):
        op, index_to_columns = self._edges_op(query, frum)
        command = create_table + op
    else:
        op = self._set_op(query, frum)
        return op

    if query.sort:
        command += "\nORDER BY " + ",\n".join(
            "(" + sql[t] + ") IS NULL" + (" DESC" if s.sort == -1 else "") + ",\n" + sql[t] + (" DESC" if s.sort == -1 else "")
            for s, sql in [(s, s.value.to_sql(self)[0].sql) for s in query.sort]
            for t in "bns"
            if sql[t]
        )

    result = self.db.query(command)

    column_names = query.edges.name + query.groupby.name + listwrap(query.select).name
    if query.format == "container":
        output = QueryTable(new_table, db=self.db, uid=self.uid, exists=True)
    elif query.format == "cube" or (not query.format and query.edges):
        if len(query.edges) == 0 and len(query.groupby) == 0:
            data = {n: Data() for n in column_names}
            for s in index_to_columns.values():
                data[s.push_name][s.push_child] = unwrap(s.pull(result.data[0]))
            return Data(data=unwrap(data), meta={"format": "cube"})

        if not result.data:
            edges = []
            dims = []
            for i, e in enumerate(query.edges + query.groupby):
                allowNulls = coalesce(e.allowNulls, True)

                if e.domain.type == "set" and e.domain.partitions:
                    domain = SimpleSetDomain(partitions=e.domain.partitions.name)
                elif e.domain.type == "range":
                    domain = e.domain
                elif isinstance(e.value, TupleOp):
                    pulls = jx.sort([c for c in index_to_columns.values() if c.push_name == e.name], "push_child").pull
                    parts = [tuple(p(d) for p in pulls) for d in result.data]
                    domain = SimpleSetDomain(partitions=jx.sort(set(parts)))
                else:
                    domain = SimpleSetDomain(partitions=[])

                dims.append(1 if allowNulls else 0)
                edges.append(Data(name=e.name, allowNulls=allowNulls, domain=domain))

            zeros = [
                0 if s.aggregate == "count" and index_to_columns[si].push_child == "." else Data
                for si, s in enumerate(listwrap(query.select))
            ]
            data = {
                s.name: Matrix(dims=dims, zeros=zeros[si])
                for si, s in enumerate(listwrap(query.select))
            }

            if isinstance(query.select, list):
                select = [{"name": s.name} for s in query.select]
            else:
                select = {"name": query.select.name}

            return Data(
                meta={"format": "cube"},
                edges=edges,
                select=select,
                data={k: v.cube for k, v in data.items()}
            )

        columns = None

        edges = []
        dims = []
        for g in query.groupby:
            g.is_groupby = True

        for i, e in enumerate(query.edges + query.groupby):
            allowNulls = coalesce(e.allowNulls, True)

            if e.domain.type == "set" and e.domain.partitions:
                domain = SimpleSetDomain(partitions=e.domain.partitions.name)
            elif e.domain.type == "range":
                domain = e.domain
            elif e.domain.type == "time":
                domain = wrap(mo_json.scrub(e.domain))
            elif e.domain.type == "duration":
                domain = wrap(mo_json.scrub(e.domain))
            elif isinstance(e.value, TupleOp):
                pulls = jx.sort([c for c in index_to_columns.values() if c.push_name == e.name], "push_child").pull
                parts = [tuple(p(d) for p in pulls) for d in result.data]
                domain = SimpleSetDomain(partitions=jx.sort(set(parts)))
            else:
                if not columns:
                    columns = zip(*result.data)
                parts = set(columns[i])
                if e.is_groupby and None in parts:
                    allowNulls = True
                parts -= {None}
                domain = SimpleSetDomain(partitions=jx.sort(parts))

            dims.append(len(domain.partitions) + (1 if allowNulls else 0))
            edges.append(Data(name=e.name, allowNulls=allowNulls, domain=domain))

        zeros = [
            0 if s.aggregate == "count" and index_to_columns[si].push_child == "." else Data
            for si, s in enumerate(listwrap(query.select))
        ]
        data_cubes = {
            s.name: Matrix(dims=dims, zeros=zeros[si])
            for si, s in enumerate(listwrap(query.select))
        }
        r2c = index_to_coordinate(dims)  # WORKS BECAUSE THE DATABASE SORTED THE EDGES TO CONFORM
        for rownum, row in enumerate(result.data):
            coord = r2c(rownum)

            for i, s in enumerate(index_to_columns.values()):
                if s.is_edge:
                    continue
                if s.push_child == ".":
                    data_cubes[s.push_name][coord] = s.pull(row)
                else:
                    data_cubes[s.push_name][coord][s.push_child] = s.pull(row)

        if isinstance(query.select, list):
            select = [{"name": s.name} for s in query.select]
        else:
            select = {"name": query.select.name}

        return Data(
            meta={"format": "cube"},
            edges=edges,
            select=select,
            data={k: v.cube for k, v in data_cubes.items()}
        )
    elif query.format == "table" or (not query.format and query.groupby):
        data = []
        for d in result.data:
            row = [None for _ in column_names]
            for s in index_to_columns.values():
                if s.push_child == ".":
                    row[s.push_column] = s.pull(d)
                elif s.num_push_columns:
                    tuple_value = row[s.push_column]
                    if tuple_value == None:
                        tuple_value = row[s.push_column] = [None] * s.num_push_columns
                    tuple_value[s.push_child] = s.pull(d)
                elif row[s.push_column] == None:
                    row[s.push_column] = Data()
                    row[s.push_column][s.push_child] = s.pull(d)
                else:
                    row[s.push_column][s.push_child] = s.pull(d)
            data.append(tuple(unwrap(r) for r in row))

        output = Data(
            meta={"format": "table"},
            header=column_names,
            data=data
        )
    elif query.format == "list" or (not query.edges and not query.groupby):
        if not query.edges and not query.groupby and any(listwrap(query.select).aggregate):
            if isinstance(query.select, list):
                data = Data()
                for c in index_to_columns.values():
                    if c.push_child == ".":
                        data[c.push_name] = c.pull(result.data[0])
                    else:
                        data[c.push_name][c.push_child] = c.pull(result.data[0])

                output = Data(
                    meta={"format": "value"},
                    data=data
                )
            else:
                data = Data()
                for s in index_to_columns.values():
                    data[s.push_child] = s.pull(result.data[0])

                output = Data(
                    meta={"format": "value"},
                    data=unwrap(data)
                )
        else:
            data = []
            for rownum in result.data:
                row = Data()
                for c in index_to_columns.values():
                    if c.push_child == ".":
                        row[c.push_name] = c.pull(rownum)
                    elif c.num_push_columns:
                        tuple_value = row[c.push_name]
                        if not tuple_value:
                            tuple_value = row[c.push_name] = [None] * c.num_push_columns
                        tuple_value[c.push_child] = c.pull(rownum)
                    else:
                        row[c.push_name][c.push_child] = c.pull(rownum)
                data.append(row)

            output = Data(
                meta={"format": "list"},
                data=data
            )
    else:
        Log.error("unknown format {{format}}", format=query.format)

    return output
def __init__(self, dim, parent, jx):
    dim = wrap(dim)

    self.name = dim.name
    self.parent = coalesce(parent)
    self.full_name = join_field(split_field(self.parent.full_name) + [self.name])
    self.edges = None  # FOR NOW
    dot.set_default(self, dim)
    self.where = dim.where
    self.type = coalesce(dim.type, "set")
    self.limit = coalesce(dim.limit, DEFAULT_QUERY_LIMIT)
    self.index = coalesce(dim.index, coalesce(parent, Null).index, jx.settings.index)

    if not self.index:
        Log.error("Expecting an index name")

    # ALLOW ACCESS TO SUB-PART BY NAME (IF ONLY THERE IS NO NAME COLLISION)
    self.edges = Data()
    for e in listwrap(dim.edges):
        new_e = Dimension(e, self, jx)
        self.edges[new_e.full_name] = new_e

    self.partitions = wrap(coalesce(dim.partitions, []))
    parse_partition(self)

    fields = coalesce(dim.field, dim.fields)
    if not fields:
        return  # NO FIELDS TO SEARCH
    elif isinstance(fields, Mapping):
        self.fields = wrap(fields)
        edges = wrap([
            {"name": k, "value": v, "allowNulls": False}
            for k, v in self.fields.items()
        ])
    else:
        self.fields = listwrap(fields)
        edges = wrap([
            {"name": f, "value": f, "index": i, "allowNulls": False}
            for i, f in enumerate(self.fields)
        ])

    if dim.partitions:
        return  # ALREADY HAVE PARTS
    if self.type not in KNOWN - ALGEBRAIC:
        return  # PARTS ARE TOO FUZZY (OR TOO NUMEROUS) TO FETCH

    jx.get_columns()
    with Timer("Get parts of {{name}}", {"name": self.name}):
        parts = jx.query({
            "from": self.index,
            "select": {"name": "count", "aggregate": "count"},
            "edges": edges,
            "where": self.where,
            "limit": self.limit
        })
        Log.note("{{name}} has {{num}} parts", name=self.name, num=len(parts))

    d = parts.edges[0].domain

    if dim.path:
        if len(edges) > 1:
            Log.error("Not supported yet")
        # EACH TERM RETURNED IS A PATH INTO A PARTITION TREE
        temp = Data(partitions=[])
        for i, count in enumerate(parts):
            a = dim.path(d.getEnd(d.partitions[i]))
            if not isinstance(a, list):
                Log.error("The path function on " + dim.name + " must return an ARRAY of parts")
            addParts(temp, dim.path(d.getEnd(d.partitions[i])), count, 0)
        self.value = coalesce(dim.value, "name")
        self.partitions = temp.partitions
    elif isinstance(fields, Mapping):
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS

        partitions = FlatList()
        for g, p in parts.groupby(edges):
            if p:
                partitions.append({
                    "value": g,
                    "where": {"and": [
                        {"term": {e.value: g[e.name]}}
                        for e in edges
                    ]},
                    "count": int(p)
                })
        self.partitions = partitions
    elif len(edges) == 1:
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS

        # SIMPLE LIST OF PARTS RETURNED, BE SURE TO INTERRELATE THEM
        self.partitions = wrap([
            {
                "name": str(d.partitions[i].name),  # CONVERT TO STRING
                "value": d.getEnd(d.partitions[i]),
                "where": {"term": {edges[0].value: d.partitions[i].value}},
                "count": count
            }
            for i, count in enumerate(parts)
        ])
        self.order = {p.value: i for i, p in enumerate(self.partitions)}
    elif len(edges) == 2:
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS
        d2 = parts.edges[1].domain

        # SIMPLE LIST OF PARTS RETURNED, BE SURE TO INTERRELATE THEM
        array = list(parts.data.values())[0].cube  # DIG DEEP INTO RESULT (ASSUME SINGLE VALUE CUBE, WITH NULL AT END); list() FOR py3 dict_values

        def edges2value(*values):
            if isinstance(fields, Mapping):
                output = Data()
                for e, v in transpose(edges, values):
                    output[e.name] = v
                return output
            else:
                return tuple(values)

        self.partitions = wrap([
            {
                "name": str(d.partitions[i].name),  # CONVERT TO STRING
                "value": d.getEnd(d.partitions[i]),
                "where": {"term": {edges[0].value: d.partitions[i].value}},
                "count": SUM(subcube),
                "partitions": [
                    {
                        "name": str(d2.partitions[j].name),  # CONVERT TO STRING
                        "value": edges2value(d.getEnd(d.partitions[i]), d2.getEnd(d2.partitions[j])),
                        "where": {"and": [
                            {"term": {edges[0].value: d.partitions[i].value}},
                            {"term": {edges[1].value: d2.partitions[j].value}}
                        ]},
                        "count": count2
                    }
                    for j, count2 in enumerate(subcube)
                    if count2 > 0  # ONLY INCLUDE PROPERTIES THAT EXIST
                ]
            }
            for i, subcube in enumerate(array)
        ])
    else:
        Log.error("Not supported")

    parse_partition(self)  # RELATE THE PARTS TO THE PARENTS
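# NOTE: standalone sketch of the edges2value() helper above: with a Mapping of
# fields, the two-edge partition value becomes a named structure; otherwise it
# is a plain tuple. Uses dict/zip stand-ins for Data/transpose; names are
# hypothetical.
def _example_edges2value(fields, edge_names, values):
    if isinstance(fields, dict):
        return {name: v for name, v in zip(edge_names, values)}
    return tuple(values)

# _example_edges2value({"a": "es.a", "b": "es.b"}, ["a", "b"], ["x", "y"])
#     == {"a": "x", "b": "y"}
# _example_edges2value(["a", "b"], ["a", "b"], ["x", "y"]) == ("x", "y")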
def json2value(json_string, params=Null, flexible=False, leaves=False):
    """
    :param json_string: THE JSON
    :param params: STANDARD JSON PARAMS
    :param flexible: REMOVE COMMENTS
    :param leaves: ASSUME JSON KEYS ARE DOT-DELIMITED
    :return: Python value
    """
    if not isinstance(json_string, text_type):
        Log.error("only unicode json accepted")

    try:
        if flexible:
            # REMOVE """COMMENTS""", # COMMENTS, //COMMENTS, AND \n \r
            # DERIVED FROM https://github.com/jeads/datasource/blob/master/datasource/bases/BaseHub.py#L58
            json_string = re.sub(r"\"\"\".*?\"\"\"", r"\n", json_string, flags=re.MULTILINE)
            json_string = "\n".join(remove_line_comment(l) for l in json_string.split("\n"))
            # ALLOW DICTIONARY'S NAME:VALUE LIST TO END WITH COMMA
            json_string = re.sub(r",\s*\}", r"}", json_string)
            # ALLOW LISTS TO END WITH COMMA
            json_string = re.sub(r",\s*\]", r"]", json_string)

        if params:
            # LOOKUP REFERENCES
            json_string = expand_template(json_string, params)

        try:
            value = wrap(json_decoder(text_type(json_string)))
        except Exception as e:
            Log.error("can not decode\n{{content}}", content=json_string, cause=e)

        if leaves:
            value = wrap_leaves(value)

        return value

    except Exception as e:
        e = Except.wrap(e)

        if not json_string.strip():
            Log.error("JSON string is only whitespace")

        c = e
        while "Expecting '" in c.cause and "' delimiter: line" in c.cause:
            c = c.cause

        if "Expecting '" in c and "' delimiter: line" in c:
            line_index = int(strings.between(c.message, " line ", " column ")) - 1
            column = int(strings.between(c.message, " column ", " ")) - 1
            line = json_string.split("\n")[line_index].replace("\t", " ")
            if column > 20:
                sample = "..." + line[column - 20:]
                pointer = " " + (" " * 20) + "^"
            else:
                sample = line
                pointer = (" " * column) + "^"

            if len(sample) > 43:
                sample = sample[:43] + "..."

            Log.error(CAN_NOT_DECODE_JSON + " at:\n\t{{sample}}\n\t{{pointer}}\n", sample=sample, pointer=pointer)

        base_str = strings.limit(json_string, 1000).encode('utf8')
        hexx_str = bytes2hex(base_str, " ")
        try:
            char_str = " " + " ".join((c.decode("latin1") if ord(c) >= 32 else ".") for c in base_str)
        except Exception:
            char_str = " "
        Log.error(CAN_NOT_DECODE_JSON + ":\n{{char_str}}\n{{hexx_str}}\n", char_str=char_str, hexx_str=hexx_str, cause=e)
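# NOTE: a minimal usage sketch (not in the original module); assumes
# remove_line_comment strips // comments as the header comment above says.
# `flexible` permits comments and trailing commas; `leaves` expands
# dot-delimited keys into nested structure.
def _example_json2value():
    value = json2value(u'{"a.b": 1, // COMMENT\n}', flexible=True, leaves=True)
    assert value.a.b == 1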
def pop(self, index=None):
    if index is None:
        return wrap(_get_list(self).pop())
    else:
        return wrap(_get_list(self).pop(index))
def to_sql(self, schema, not_null=False, boolean=False):
    return wrap([
        {"name": ".", "sql": SQLang[t].to_sql(schema)[0].sql}
        for t in self.terms
    ])
def __deepcopy__(self, memo):
    d = _get_list(self)
    return wrap(deepcopy(d, memo))
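# NOTE: small sketch (hypothetical values) of the convention shared by the
# FlatList methods pop() and __deepcopy__() above: the underlying list holds
# plain values, and everything leaving the FlatList passes through wrap() so
# callers receive Data/FlatList.
def _example_flatlist_pop():
    items = wrap([{"a": 1}, {"a": 2}])
    last = items.pop()    # wrap()ED ON THE WAY OUT
    assert last.a == 2    # ATTRIBUTE ACCESS WORKS BECAUSE IT IS Data
    assert len(items) == 1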
def copy(self, keys, source, sample_only_filter=None, sample_size=None, done_copy=None):
    """
    :param keys: THE KEYS TO LOAD FROM source
    :param source: THE SOURCE (USUALLY S3 BUCKET)
    :param sample_only_filter: SOME FILTER, IN CASE YOU DO NOT WANT TO SEND EVERYTHING
    :param sample_size: FOR RANDOM SAMPLE OF THE source DATA
    :param done_copy: CALLBACK, ADDED TO queue, TO FINISH THE TRANSACTION
    :return: LIST OF SUB-keys PUSHED INTO ES
    """
    num_keys = 0
    queue = None
    pending = []  # FOR WHEN WE DO NOT HAVE QUEUE YET
    for key in keys:
        timer = Timer("Process {{key}}", param={"key": key}, silent=not DEBUG)
        try:
            with timer:
                for rownum, line in enumerate(source.read_lines(strip_extension(key))):
                    if not line:
                        continue
                    if rownum > 0 and rownum % 1000 == 0:
                        Log.note("Ingested {{num}} records from {{key}} in bucket {{bucket}}", num=rownum, key=key, bucket=source.name)

                    insert_me, please_stop = fix(key, rownum, line, source, sample_only_filter, sample_size)
                    if insert_me == None:
                        continue
                    value = insert_me['value']

                    if '_id' not in value:
                        Log.warning("expecting an _id in all S3 records. If missing, there can be duplicates")

                    if queue == None:
                        queue = self._get_queue(insert_me)
                        if queue == None:
                            pending.append(insert_me)
                            if len(pending) > 1000:
                                if done_copy:
                                    done_copy()
                                Log.error("first 1000 (key={{key}}) records for {{alias}} have no indication what index to put data", key=tuple(keys)[0], alias=self.settings.index)
                            continue
                        elif queue is DATA_TOO_OLD:
                            break
                        if pending:
                            queue.extend(pending)
                            pending = []

                    num_keys += 1
                    queue.add(insert_me)

                    if please_stop:
                        break
        except Exception as e:
            if KEY_IS_WRONG_FORMAT in e:
                Log.warning("Could not process {{key}} because bad format. Never trying again.", key=key, cause=e)
            elif CAN_NOT_DECODE_JSON in e:
                Log.warning("Could not process {{key}} because of bad JSON. Never trying again.", key=key, cause=e)
            else:
                Log.warning("Could not process {{key}} after {{duration|round(places=2)}} seconds", key=key, duration=timer.duration.seconds, cause=e)
                done_copy = None

    if done_copy:
        if queue == None:
            done_copy()
        elif queue is DATA_TOO_OLD:
            done_copy()
        else:
            queue.add(done_copy)

    if [p for p in pending if wrap(p).value.task.state not in ('failed', 'exception')]:
        Log.error("Did not find an index for {{alias}} to place the data for key={{key}}", key=tuple(keys)[0], alias=self.settings.index)

    Log.note("{{num}} keys from {{key|json}} added", num=num_keys, key=keys)
    return num_keys
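# NOTE: minimal sketch of the buffering used by copy(): records are held in
# `pending` until a destination queue is known, then flushed ahead of the
# current record. The queue here is a plain list stand-in, and get_queue is a
# hypothetical callback that may return None until the destination is known.
def _example_buffer_until_ready(records, get_queue):
    queue = None
    pending = []
    for r in records:
        if queue is None:
            queue = get_queue(r)
            if queue is None:
                pending.append(r)  # NO DESTINATION YET; HOLD THE RECORD
                continue
            if pending:
                queue.extend(pending)  # FLUSH BUFFER BEFORE THE CURRENT RECORD
                pending = []
        queue.append(r)
    return queue, pending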
def update(self, command):
    """
    :param command: EXPECTING dict WITH {"set": s, "clear": c, "where": w} FORMAT
    """
    command = wrap(command)

    # REJECT DEEP UPDATES
    touched_columns = command.set.keys() | set(listwrap(command['clear']))
    for c in self.get_leaves():
        if c.name in touched_columns and c.nested_path and len(c.name) > len(c.nested_path[0]):
            Log.error("Deep update not supported")

    # ADD NEW COLUMNS
    where = jx_expression(command.where)
    _vars = where.vars()
    _map = {
        v: c.es_column
        for v in _vars
        for c in self.columns.get(v, Null)
        if c.type not in STRUCT
    }
    where_sql = where.map(_map).to_sql()
    new_columns = set(command.set.keys()) - set(self.columns.keys())
    for new_column_name in new_columns:
        nested_value = command.set[new_column_name]
        ctype = get_type(nested_value)
        column = Column(
            names={".": new_column_name},
            type=ctype,
            es_index=self.sf.fact,
            es_column=typed_column(new_column_name, ctype)
        )
        self.add_column(column)

    # UPDATE THE NESTED VALUES
    for nested_column_name, nested_value in command.set.items():
        if get_type(nested_value) == "nested":
            nested_table_name = concat_field(self.sf.fact, nested_column_name)
            nested_table = nested_tables[nested_column_name]
            self_primary_key = sql_list(quote_column(c.es_column) for u in self.uid for c in self.columns[u])
            extra_key_name = UID_PREFIX + "id" + text_type(len(self.uid))
            extra_key = [e for e in nested_table.columns[extra_key_name]][0]

            sql_command = (
                "DELETE" + SQL_FROM + quote_column(nested_table.name) +
                SQL_WHERE + "EXISTS (" +
                "\nSELECT 1 " + SQL_FROM + quote_column(nested_table.name) + " n" +
                SQL_INNER_JOIN + "(" +
                SQL_SELECT + self_primary_key +
                SQL_FROM + quote_column(self.sf.fact) +
                SQL_WHERE + where_sql +
                "\n) t ON " +
                SQL_AND.join(
                    "t." + quote_column(c.es_column) + " = n." + quote_column(c.es_column)
                    for u in self.uid
                    for c in self.columns[u]
                ) +
                ")"
            )
            self.db.execute(sql_command)

            # INSERT NEW RECORDS
            if not nested_value:
                continue

            doc_collection = {}
            for d in listwrap(nested_value):
                nested_table.flatten(d, Data(), doc_collection, path=nested_column_name)

            prefix = "INSERT INTO " + quote_column(nested_table.name) + sql_iso(sql_list(
                [self_primary_key] +
                [quote_column(extra_key)] +
                [quote_column(c.es_column) for c in doc_collection.get(".", Null).active_columns]
            ))

            # BUILD THE PARENT TABLES
            parent = (
                SQL_SELECT + self_primary_key +
                SQL_FROM + quote_column(self.sf.fact) +
                SQL_WHERE + jx_expression(command.where).to_sql()
            )

            # BUILD THE RECORDS
            children = SQL_UNION_ALL.join(
                SQL_SELECT +
                quote_value(i) + " " + quote_column(extra_key.es_column) + "," +
                sql_list(
                    quote_value(row[c.name]) + " " + quote_column(c.es_column)
                    for c in doc_collection.get(".", Null).active_columns
                )
                for i, row in enumerate(doc_collection.get(".", Null).rows)
            )

            sql_command = (
                prefix +
                SQL_SELECT +
                sql_list(
                    [join_column("p", c.es_column) for u in self.uid for c in self.columns[u]] +
                    [join_column("c", extra_key)] +
                    [join_column("c", c.es_column) for c in doc_collection.get(".", Null).active_columns]
                ) +
                SQL_FROM + sql_iso(parent) + " p" +
                SQL_INNER_JOIN + sql_iso(children) + " c" + " ON " + SQL_TRUE
            )
            self.db.execute(sql_command)

            # THE CHILD COLUMNS COULD HAVE EXPANDED
            # ADD COLUMNS TO SELF
            for n, cs in nested_table.columns.items():
                for c in cs:
                    column = Column(
                        names={".": c.name},
                        type=c.type,
                        es_index=c.es_index,
                        es_column=c.es_column,
                        nested_path=[nested_column_name] + c.nested_path
                    )
                    if c.name not in self.columns:
                        self.columns[column.name] = {column}
                    elif c.type not in [c.type for c in self.columns[c.name]]:
                        self.columns[column.name].add(column)

    command = (
        "UPDATE " + quote_column(self.sf.fact) + " SET " +
        sql_list(
            [
                quote_column(c) + "=" + quote_value(get_if_type(v, c.type))
                for k, v in command.set.items()
                if get_type(v) != "nested"
                for c in self.columns[k]
                if c.type != "nested" and len(c.nested_path) == 1
            ] +
            [
                quote_column(c) + "=" + SQL_NULL
                for k in listwrap(command['clear'])
                if k in self.columns
                for c in self.columns[k]
                if c.type != "nested" and len(c.nested_path) == 1
            ]
        ) +
        SQL_WHERE + where_sql
    )
    self.db.execute(command)