def quote_value(value):
    """
    convert value to equivalent mysql code
    mostly delegate directly to the mysql lib, but some exceptions exist
    """
    try:
        if value == None:
            return SQL_NULL
        elif isinstance(value, SQL):
            return quote_sql(value.template, value.param)
        elif is_text(value):
            return SQL("'" + "".join(ESCAPE_DCT.get(c, c) for c in value) + "'")
        elif is_data(value):
            return quote_value(json_encode(value))
        elif is_number(value):
            return SQL(text_type(value))
        elif isinstance(value, datetime):
            return SQL("str_to_date('" + value.strftime("%Y%m%d%H%M%S.%f") + "', '%Y%m%d%H%i%s.%f')")
        elif isinstance(value, Date):
            return SQL("str_to_date('" + value.format("%Y%m%d%H%M%S.%f") + "', '%Y%m%d%H%i%s.%f')")
        elif hasattr(value, '__iter__'):
            return quote_value(json_encode(value))
        else:
            return quote_value(text_type(value))
    except Exception as e:
        Log.error("problem quoting SQL {{value}}", value=repr(value), cause=e)
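# ILLUSTRATIVE USAGE (a hypothetical demo helper, not part of the library; it assumes the
# mo_sql/mo_logs-style imports used by this module, and the outputs shown in comments are
# approximate because the exact escaping is driven by ESCAPE_DCT):
def _demo_quote_value():
    print(quote_value(None))        # SQL NULL
    print(quote_value(42))          # 42
    print(quote_value("O'Reilly"))  # 'O\'Reilly' (escaped text literal)
    print(quote_value({"a": 1}))    # JSON-encoded, then quoted as text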
def insert_list(self, table_name, records):
    if not records:
        return

    columns = set()
    for r in records:
        columns |= set(r.keys())
    columns = jx.sort(columns)

    try:
        self.execute(
            "DELETE FROM " + self.quote_column(table_name) + " WHERE _id IN {{ids}}",
            {"ids": self.quote_column([r["_id"] for r in records])}
        )

        command = (
            "INSERT INTO " + self.quote_column(table_name) +
            "(" + ",".join([self.quote_column(k) for k in columns]) + ") VALUES " +
            ",\n".join([
                sql_iso(",".join([self.quote_value(r.get(k, None)) for k in columns]))
                for r in records
            ])
        )
        self.execute(command)
    except Exception as e:
        Log.error("problem with insert", e)
def jx_sort_to_es_sort(sort, schema):
    if not sort:
        return []

    output = []
    for s in sort:
        if isinstance(s.value, Variable):
            cols = schema.leaves(s.value.var)
            if s.sort == -1:
                types = OBJECT, STRING, NUMBER, BOOLEAN
            else:
                types = BOOLEAN, NUMBER, STRING, OBJECT

            for type in types:
                for c in cols:
                    if c.jx_type == type:
                        if s.sort == -1:
                            output.append({c.es_column: "desc"})
                        else:
                            output.append(c.es_column)
        else:
            from mo_logs import Log
            Log.error("do not know how to handle")
    return output
def _open(self):
    """ DO NOT USE THIS UNLESS YOU close() FIRST"""
    try:
        self.db = connect(
            host=self.settings.host,
            port=self.settings.port,
            user=coalesce(self.settings.username, self.settings.user),
            passwd=coalesce(self.settings.password, self.settings.passwd),
            db=coalesce(self.settings.schema, self.settings.db),
            read_timeout=coalesce(self.settings.read_timeout, (EXECUTE_TIMEOUT / 1000) - 10 if EXECUTE_TIMEOUT else None, 5 * 60),
            charset=u"utf8",
            use_unicode=True,
            ssl=coalesce(self.settings.ssl, None),
            cursorclass=cursors.SSCursor
        )
    except Exception as e:
        if self.settings.host.find("://") == -1:
            Log.error(
                u"Failure to connect to {{host}}:{{port}}",
                host=self.settings.host,
                port=self.settings.port,
                cause=e
            )
        else:
            Log.error(u"Failure to connect. PROTOCOL PREFIX IS PROBABLY BAD", e)

    self.cursor = None
    self.partial_rollback = False
    self.transaction_level = 0
    self.backlog = []  # accumulate the write commands so they are sent at once
    if self.readonly:
        self.begin()
def execute(
    self,
    command,
    param=None,
    retry=True  # IF command FAILS, JUST THROW ERROR
):
    if param:
        command = expand_template(command, self.quote_param(param))

    output = None
    done = False
    while not done:
        try:
            with self.locker:
                if not self.connection:
                    self._connect()

            with Closer(self.connection.cursor()) as curs:
                curs.execute(command)
                if curs.rowcount >= 0:
                    output = curs.fetchall()
            self.connection.commit()
            done = True
        except Exception as e:
            with suppress_exception:
                self.connection.rollback()
            # TODO: FIGURE OUT WHY rollback() DOES NOT HELP
            self.connection.close()
            self.connection = None
            self._connect()
            if not retry:
                Log.error("Problem with command:\n{{command|indent}}", command=command, cause=e)
    return output
def problem_serializing(value, e=None):
    """
    THROW ERROR ABOUT SERIALIZING
    """
    from mo_logs import Log

    try:
        typename = type(value).__name__
    except Exception:
        typename = "<error getting name>"

    try:
        rep = text_type(repr(value))
    except Exception as _:
        rep = None

    if rep == None:
        Log.error(
            "Problem turning value of type {{type}} to json",
            type=typename,
            cause=e
        )
    else:
        Log.error(
            "Problem turning value ({{value}}) of type {{type}} to json",
            value=rep,
            type=typename,
            cause=e
        )
def pypy_json_encode(value, pretty=False):
    """
    pypy DOES NOT OPTIMIZE GENERATOR CODE WELL
    """
    global _dealing_with_problem
    if pretty:
        return pretty_json(value)

    try:
        _buffer = UnicodeBuilder(2048)
        _value2json(value, _buffer)
        output = _buffer.build()
        return output
    except Exception as e:
        # THE PRETTY JSON WILL PROVIDE MORE DETAIL ABOUT THE SERIALIZATION CONCERNS
        from mo_logs import Log

        if _dealing_with_problem:
            Log.error("Serialization of JSON problems", e)
        else:
            Log.warning("Serialization of JSON problems", e)
        _dealing_with_problem = True
        try:
            return pretty_json(value)
        except Exception as f:
            Log.error("problem serializing object", f)
        finally:
            _dealing_with_problem = False
def _dict2json(value, sub_schema, path, net_new_properties, buffer):
    prefix = '{'
    for k, v in sort_using_key(value.items(), lambda r: r[0]):
        if v == None or v == '':
            continue
        append(buffer, prefix)
        prefix = COMMA
        if is_binary(k):
            k = utf82unicode(k)
        if not is_text(k):
            Log.error("Expecting property name to be a string")
        if k not in sub_schema:
            sub_schema[k] = {}
            net_new_properties.append(path + [k])
        append(buffer, encode_basestring(encode_property(k)))
        append(buffer, COLON)
        typed_encode(v, sub_schema[k], path + [k], net_new_properties, buffer)
    if prefix is COMMA:
        append(buffer, COMMA)
        append(buffer, QUOTED_EXISTS_TYPE)
        append(buffer, '1}')
    else:
        append(buffer, '{')
        append(buffer, QUOTED_EXISTS_TYPE)
        append(buffer, '1}')
def Stats2ZeroMoment(stats):
    # MODIFIED FROM http://statsmodels.sourceforge.net/devel/_modules/statsmodels/stats/moment_helpers.html
    # ADDED count
    mc0, mc1, mc2, skew, kurt = (
        stats.count,
        coalesce(stats.mean, 0),
        coalesce(stats.variance, 0),
        coalesce(stats.skew, 0),
        coalesce(stats.kurtosis, 0)
    )

    mz0 = mc0
    mz1 = mc1 * mc0
    mz2 = (mc2 + mc1 * mc1) * mc0
    mc3 = coalesce(skew, 0) * (mc2 ** 1.5)  # 3rd central moment
    mz3 = (mc3 + 3 * mc1 * mc2 + mc1 ** 3) * mc0  # 3rd non-central moment
    mc4 = (coalesce(kurt, 0) + 3.0) * (mc2 ** 2.0)  # 4th central moment
    mz4 = (mc4 + 4 * mc1 * mc3 + 6 * mc1 * mc1 * mc2 + mc1 ** 4) * mc0

    m = ZeroMoment(mz0, mz1, mz2, mz3, mz4)
    if DEBUG:
        from mo_testing.fuzzytestcase import assertAlmostEqualValue

        globals()["DEBUG"] = False
        try:
            v = ZeroMoment2Stats(m)
            assertAlmostEqualValue(v.count, stats.count, places=10)
            assertAlmostEqualValue(v.mean, stats.mean, places=10)
            assertAlmostEqualValue(v.variance, stats.variance, places=10)
            assertAlmostEqualValue(v.skew, stats.skew, places=10)
            assertAlmostEqualValue(v.kurtosis, stats.kurtosis, places=10)
        except Exception as e:
            v = ZeroMoment2Stats(m)
            Log.error("programmer error")
        globals()["DEBUG"] = True
    return m
def __init__(self, name, params, cwd=None, env=None, debug=False, shell=False, bufsize=-1):
    self.name = name
    self.service_stopped = Signal("stopped signal for " + strings.quote(name))
    self.stdin = Queue("stdin for process " + strings.quote(name), silent=True)
    self.stdout = Queue("stdout for process " + strings.quote(name), silent=True)
    self.stderr = Queue("stderr for process " + strings.quote(name), silent=True)

    try:
        self.debug = debug or DEBUG
        self.service = service = subprocess.Popen(
            params,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            bufsize=bufsize,
            cwd=cwd if isinstance(cwd, (basestring, NullType, NoneType)) else cwd.abspath,
            env=unwrap(set_default(env, os.environ)),
            shell=shell
        )

        self.please_stop = Signal()
        self.please_stop.on_go(self._kill)
        self.thread_locker = Lock()
        self.children = [
            Thread.run(self.name + " stdin", self._writer, service.stdin, self.stdin, please_stop=self.service_stopped, parent_thread=self),
            Thread.run(self.name + " stdout", self._reader, "stdout", service.stdout, self.stdout, please_stop=self.service_stopped, parent_thread=self),
            Thread.run(self.name + " stderr", self._reader, "stderr", service.stderr, self.stderr, please_stop=self.service_stopped, parent_thread=self),
            Thread.run(self.name + " waiter", self._monitor, parent_thread=self),
        ]
    except Exception as e:
        Log.error("Can not call", e)

    if self.debug:
        Log.note("{{process}} START: {{command}}", process=self.name, command=" ".join(map(strings.quote, params)))
def output(*args, **kwargs):
    if len(args):
        if len(kwargs.keys()):
            Log.error("Not allowed to use both args and kwargs")
        return self._execute({item: args})
    else:
        return self._execute({item: kwargs})
def filter(data, where):
    """
    where - a function that accepts (record, rownum, rows) and returns boolean
    """
    if len(data) == 0 or where == None or where == TRUE:
        return data

    if isinstance(data, Container):
        return data.filter(where)

    if is_container(data):
        temp = jx_expression_to_function(where)
        dd = wrap(data)
        return wrap([unwrap(d) for i, d in enumerate(data) if temp(wrap(d), i, dd)])
    else:
        Log.error(
            "Do not know how to handle type {{type}}",
            type=data.__class__.__name__
        )

    try:
        return drill_filter(where, data)
    except Exception as _:
        # WOW! THIS IS INEFFICIENT!
        return wrap(
            [unwrap(d) for d in drill_filter(where, [DataObject(d) for d in data])]
        )
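# ILLUSTRATIVE USAGE (a hypothetical demo helper; it assumes the jx_python-style imports in this
# module and that jx_expression_to_function passes callables through unchanged, so the printed
# result is approximate):
def _demo_filter():
    rows = [{"a": 1}, {"a": 2}, {"a": 3}]
    # per the docstring, a where-function receives (record, rownum, rows)
    evens = filter(rows, lambda record, rownum, rows: record["a"] % 2 == 0)
    print(evens)  # roughly [{"a": 2}]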
def parse_field(fieldname, data, depth):
    """
    RETURN (first, rest) OF fieldname
    """
    col = split_field(fieldname)
    d = data
    for i, c in enumerate(col):
        try:
            d = d[c]
        except Exception as e:
            Log.error("{{name}} does not exist", name=fieldname)
        if is_list(d) and len(col) > 1:
            if len(primary_column) <= depth + i:
                primary_nested.append(True)
                primary_column.append(c)
                primary_branch.append(d)
            elif primary_nested[depth] and primary_column[depth + i] != c:
                Log.error("only one branch of tree allowed")
            else:
                primary_nested[depth + i] = True
                primary_column[depth + i] = c
                primary_branch[depth + i] = d
            return c, join_field(col[i + 1:])
        else:
            if len(primary_column) <= depth + i:
                primary_nested.append(False)
                primary_column.append(c)
                primary_branch.append([d])
    return fieldname, None
def _select(template, data, fields, depth):
    output = FlatList()
    deep_path = []
    deep_fields = UniqueIndex(["name"])
    for d in data:
        if d.__class__ is Data:
            Log.error("programmer error, _select can not handle Data, only dict")

        record = template.copy()
        children = None
        for f in fields:
            index, c = _select_deep(d, f, depth, record)
            children = c if children is None else children
            if index:
                path = f.value[0:index:]
                if not deep_fields[f]:
                    deep_fields.add(f)  # KEEP TRACK OF WHICH FIELDS NEED DEEPER SELECT
                short = MIN([len(deep_path), len(path)])
                if path[:short:] != deep_path[:short:]:
                    Log.error("Dangerous to select into more than one branch at time")
                if len(deep_path) < len(path):
                    deep_path = path
        if not children:
            output.append(record)
        else:
            output.extend(_select(record, children, deep_fields, depth + 1))

    return output
def _select_deep(v, field, depth, record):
    """
    field = {"name":name, "value":["attribute", "path"]}
    r[field.name]=v[field.value], BUT WE MUST DEAL WITH POSSIBLE LIST IN field.value PATH
    """
    if hasattr(field.value, "__call__"):
        try:
            record[field.name] = field.value(wrap(v))
        except Exception as e:
            record[field.name] = None
        return 0, None

    for i, f in enumerate(field.value[depth:len(field.value) - 1:]):
        v = v.get(f)
        if v is None:
            return 0, None
        if is_list(v):
            return depth + i + 1, v

    f = field.value.last()
    try:
        if not f:  # NO NAME FIELD INDICATES SELECT VALUE
            record[field.name] = v
        else:
            record[field.name] = v.get(f)
    except Exception as e:
        Log.error(
            "{{value}} does not have {{field}} property",
            value=v,
            field=f,
            cause=e
        )
    return 0, None
def _decode_token(index, c, full_path, path, name2index, destination, expected_vars):
    if c == b'{':
        if not expected_vars:
            index = jump_to_end(index, c)
            value = None
        elif expected_vars[0] == ".":
            json.mark(index - 1)
            index = jump_to_end(index, c)
            value = json_decoder(json.release(index).decode("utf8"))
        else:
            count = 0
            for v, i in _decode_object(index, full_path, path, name2index, destination, expected_vars=expected_vars):
                index = i
                value = v
                count += 1
            if count != 1:
                Log.error("Expecting object, nothing nested")
    elif c == b'[':
        if not expected_vars:
            index = jump_to_end(index, c)
            value = None
        else:
            json.mark(index - 1)
            index = jump_to_end(index, c)
            value = json_decoder(json.release(index).decode("utf8"))
    else:
        if expected_vars and expected_vars[0] == ".":
            value, index = simple_token(index, c)
        else:
            index = jump_to_end(index, c)
            value = None

    return value, index
def tuple(data, field_name):
    """
    RETURN LIST OF TUPLES
    """
    if isinstance(data, Cube):
        Log.error("not supported yet")

    if isinstance(data, FlatList):
        Log.error("not supported yet")

    if is_data(field_name) and "value" in field_name:
        # SIMPLIFY {"value":value} AS STRING
        field_name = field_name["value"]

    # SIMPLE PYTHON ITERABLE ASSUMED
    if is_text(field_name):
        if len(split_field(field_name)) == 1:
            return [(d[field_name],) for d in data]
        else:
            path = split_field(field_name)
            output = []
            flat_list._tuple1(data, path, 0, output)
            return output
    elif is_list(field_name):
        paths = [_select_a_field(f) for f in field_name]
        output = FlatList()
        _tuple((), unwrap(data), paths, 0, output)
        return output
    else:
        paths = [_select_a_field(field_name)]
        output = FlatList()
        _tuple((), data, paths, 0, output)
        return output
def simple_token(index, c):
    if c == b'"':
        json.mark(index - 1)
        while True:
            c = json[index]
            index += 1
            if c == b"\\":
                index += 1
            elif c == b'"':
                break
        return json_decoder(json.release(index).decode("utf8")), index
    elif c in b"{[":
        Log.error("Expecting a primitive value")
    elif c == b"t" and json.slice(index, index + 3) == b"rue":
        return True, index + 3
    elif c == b"n" and json.slice(index, index + 3) == b"ull":
        return None, index + 3
    elif c == b"f" and json.slice(index, index + 4) == b"alse":
        return False, index + 4
    else:
        json.mark(index - 1)
        while True:
            c = json[index]
            if c in b',]}':
                break
            index += 1
        return float(json.release(index)), index
def _decode(index, parent_path, path, name2index, expected_vars=NO_VARS):
    c, index = skip_whitespace(index)

    if not path:
        if c != b"[":
            # TREAT VALUE AS SINGLE-VALUE ARRAY
            yield _decode_token(index, c, parent_path, path, name2index, None, expected_vars)
        else:
            c, index = skip_whitespace(index)
            if c == b']':
                return  # EMPTY ARRAY

            while True:
                value, index = _decode_token(index, c, parent_path, path, name2index, None, expected_vars)
                c, index = skip_whitespace(index)
                if c == b']':
                    yield value, index
                    return
                elif c == b',':
                    c, index = skip_whitespace(index)
                    yield value, index
    else:
        if c != b'{':
            Log.error("Expecting all objects to at least have {{path}}", path=path[0])

        for j, i in _decode_object(index, parent_path, path, name2index, expected_vars=expected_vars):
            yield j, i
def replacePrefix(value, prefix, new_prefix):
    try:
        if value.startswith(prefix):
            return new_prefix + value[len(prefix)::]
        return value
    except Exception as e:
        Log.error("can not replace prefix", e)
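# ILLUSTRATIVE USAGE (a hypothetical demo helper; only the mo_logs Log used above is assumed):
def _demo_replacePrefix():
    print(replacePrefix("src/module/file.py", "src/", "lib/"))  # lib/module/file.py
    print(replacePrefix("README.md", "src/", "lib/"))           # README.md (prefix absent, unchanged)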
def __init__(self, logger):
    if not isinstance(logger, StructuredLogger):
        Log.error("Expecting a StructuredLogger")

    self.queue = Queue("Queue for " + self.__class__.__name__, max=10000, silent=True, allow_add_after_close=True)
    self.logger = logger

    def worker(logger, please_stop):
        try:
            while not please_stop:
                logs = self.queue.pop_all()
                if not logs:
                    (Till(seconds=1) | please_stop).wait()
                    continue
                for log in logs:
                    if log is THREAD_STOP:
                        please_stop.go()
                    else:
                        logger.write(**log)
        except Exception as e:
            print("problem in " + StructuredLogger_usingThread.__name__ + ": " + str(e))
        finally:
            Log.note("stop the child")
            logger.stop()

    self.thread = Thread("Thread for " + self.__class__.__name__, worker, logger)
    self.thread.parent.remove_child(self.thread)  # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
    self.thread.start()
def utf82unicode(value):
    """
    WITH EXPLANATION FOR FAILURE
    """
    try:
        return value.decode("utf8")
    except Exception as e:
        if not _Log:
            _late_import()

        if not is_binary(value):
            _Log.error("Can not convert {{type}} to unicode because it's not bytes", type=type(value).__name__)

        e = _Except.wrap(e)
        for i, c in enumerate(value):
            try:
                c.decode("utf8")
            except Exception as f:
                _Log.error("Can not convert charcode {{c}} in string index {{i}}", i=i, c=ord(c), cause=[e, _Except.wrap(f)])

        try:
            latin1 = text_type(value.decode("latin1"))
            _Log.error("Can not explain conversion failure, but seems to be latin1", e)
        except Exception:
            pass

        try:
            a = text_type(value.decode("latin1"))
            _Log.error("Can not explain conversion failure, but seems to be latin1", e)
        except Exception:
            pass

        _Log.error("Can not explain conversion failure of " + type(value).__name__ + "!", e)
def __getitem__(self, index):
    offset = index - self.start
    if offset < len(self.buffer):
        return self.buffer[offset:offset + 1]

    if offset < 0:
        Log.error("Can not go in reverse on stream index=={{index}} (offset={{offset}})", index=index, offset=offset)

    if self._mark == -1:
        self.start += self.buffer_length
        offset = index - self.start
        self.buffer = self.get_more()
        self.buffer_length = len(self.buffer)
        while self.buffer_length <= offset:
            more = self.get_more()
            self.buffer += more
            self.buffer_length = len(self.buffer)
        return self.buffer[offset:offset + 1]

    needless_bytes = self._mark - self.start
    if needless_bytes:
        self.start = self._mark
        offset = index - self.start
        self.buffer = self.buffer[needless_bytes:]
        self.buffer_length = len(self.buffer)

    while self.buffer_length <= offset:
        more = self.get_more()
        self.buffer += more
        self.buffer_length = len(self.buffer)

    try:
        return self.buffer[offset:offset + 1]
    except Exception as e:
        Log.error("error", cause=e)
def _decode_object_items(index, c, parent_path, query_path, expected_vars):
    """
    ITERATE THROUGH THE PROPERTIES OF AN OBJECT
    """
    c, index = skip_whitespace(index)
    num_items = 0
    while True:
        if c == b',':
            c, index = skip_whitespace(index)
        elif c == b'"':
            name, index = simple_token(index, c)
            if "name" in expected_vars:
                for i, e in enumerate(expected_vars):
                    if e == "name":
                        destination[i] = name

            c, index = skip_whitespace(index)
            if c != b':':
                Log.error("Expecting colon")
            c, index = skip_whitespace(index)

            child_expected = needed("value", expected_vars)
            index = _assign_token(index, c, child_expected)
            c, index = skip_whitespace(index)
            DEBUG and not num_items % 1000 and Log.note("{{num}} items iterated", num=num_items)
            yield index
            num_items += 1
        elif c == b"}":
            break
def simple_token(index, c):
    if c == b'"':
        json.mark(index - 1)
        while True:
            c = json[index]
            index += 1
            if c == b"\\":
                index += 1
            elif c == b'"':
                break
        return json_decoder(json.release(index).decode("utf8")), index
    elif c in b"{[":
        json.mark(index - 1)
        index = jump_to_end(index, c)
        value = wrap(json_decoder(json.release(index).decode("utf8")))
        return value, index
    elif c == b"t" and json.slice(index, index + 3) == b"rue":
        return True, index + 3
    elif c == b"n" and json.slice(index, index + 3) == b"ull":
        return None, index + 3
    elif c == b"f" and json.slice(index, index + 4) == b"alse":
        return False, index + 4
    else:
        json.mark(index - 1)
        while True:
            c = json[index]
            if c in b',]}':
                break
            index += 1
        text = json.release(index)
        try:
            return float(text), index
        except Exception:
            Log.error("Not a known JSON primitive: {{text|quote}}", text=text)
def _exec(code):
    try:
        # exec() CAN NOT REBIND A LOCAL VARIABLE IN PYTHON 3, SO CAPTURE THE RESULT IN A DICT
        scope = {}
        exec("temp = " + code, globals(), scope)
        return scope["temp"]
    except Exception as e:
        Log.error("Could not execute {{code|quote}}", code=code, cause=e)
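# ILLUSTRATIVE USAGE (a hypothetical demo helper; _exec evaluates a Python expression held in a
# string and returns its value):
def _demo_exec():
    print(_exec("1 + 2"))                      # 3
    print(_exec("[x * x for x in range(3)]"))  # [0, 1, 4]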
def _normalize_groupby(groupby, limit, schema=None):
    if groupby == None:
        return None
    output = wrap([
        n
        for ie, e in enumerate(listwrap(groupby))
        for n in _normalize_group(e, ie, limit, schema=schema)
    ])
    if any(o == None for o in output):
        Log.error("not expected")
    return output
def _expand(template, seq):
    """
    seq IS TUPLE OF OBJECTS IN PATH ORDER INTO THE DATA TREE
    """
    if is_text(template):
        return _simple_expand(template, seq)
    elif is_data(template):
        # EXPAND LISTS OF ITEMS USING THIS FORM
        # {"from":from, "template":template, "separator":separator}
        template = wrap(template)
        assert template["from"], "Expecting template to have 'from' attribute"
        assert template.template, "Expecting template to have 'template' attribute"

        data = seq[-1][template["from"]]
        output = []
        for d in data:
            s = seq + (d,)
            output.append(_expand(template.template, s))
        return coalesce(template.separator, "").join(output)
    elif is_list(template):
        return "".join(_expand(t, seq) for t in template)
    else:
        if not _Log:
            _late_import()
        _Log.error("can not handle")
def format_cube(decoders, aggs, start, query, select):
    # decoders = sorted(decoders, key=lambda d: -d.edge.dim)  # REVERSE DECODER ORDER, BECAUSE ES QUERY WAS BUILT IN REVERSE ORDER
    new_edges = count_dim(aggs, decoders)

    dims = []
    for e in new_edges:
        if isinstance(e.value, TupleOp):
            e.allowNulls = False

        extra = 0 if e.allowNulls is False else 1
        dims.append(len(e.domain.partitions) + extra)

    dims = tuple(dims)
    matricies = [(s, Matrix(dims=dims, zeros=s.default)) for s in select]
    for row, coord, agg in aggs_iterator(aggs, decoders):
        for s, m in matricies:
            try:
                v = _pull(s, agg)
                m[coord] = v
            except Exception as e:
                Log.error("", e)

    cube = Cube(
        query.select,
        sorted(new_edges, key=lambda e: e.dim),  # ENSURE EDGES ARE IN SAME ORDER AS QUERY
        {s.name: m for s, m in matricies}
    )
    cube.frum = query
    return cube
def select(self, selectList, fromPath, varName, sourceVar):
    path = split_field(fromPath)
    is_deep = len(path) > 1
    heads = []
    list = []
    for s in selectList:
        if is_deep:
            if s.value and is_variable_name(s.value):
                shortForm = self._translate(s.value)
                list.append("Value2Pipe(" + shortForm + ")\n")
            else:
                Log.error("do not know how to handle yet")
        else:
            if s.value and is_variable_name(s.value):
                list.append("Value2Pipe(getDocValue(" + value2MVEL(s.value) + "))\n")
            elif s.value:
                shortForm = self._translate(s.value)
                list.append("Value2Pipe(" + shortForm + ")\n")
            else:
                code, decode = self.Parts2Term(s.domain)
                heads.append(code.head)
                list.append("Value2Pipe(" + code.body + ")\n")

    if len(split_field(fromPath)) > 1:
        output = 'if (' + varName + ' != "") ' + varName + '+="|";\n' + varName + '+=' + '+"|"+'.join(["Value2Pipe(" + v + ")\n" for v in list]) + ';\n'
    else:
        output = varName + ' = ' + '+"|"+'.join(["Value2Pipe(" + v + ")\n" for v in list]) + ';\n'

    return Data(
        head="".join(heads),
        body=output
    )
def insert(self, docs):
    if not is_many(docs):
        Log.error("Expecting a list of documents")
    doc_collection = self.flatten_many(docs)
    self._insert(doc_collection)
def __init__( self, hg=None, # CONNECT TO hg repo=None, # CONNECTION INFO FOR ES CACHE use_cache=False, # True IF WE WILL USE THE ES FOR DOWNLOADING BRANCHES timeout=30 * SECOND, kwargs=None, ): if not _hg_branches: _late_imports() if not is_text(repo.index): Log.error("Expecting 'index' parameter") self.repo_locker = Lock() self.moves_locker = Lock() self.todo = mo_threads.Queue("todo for hg daemon", max=DAEMON_QUEUE_SIZE) self.settings = kwargs self.timeout = Duration(timeout) self.last_cache_miss = Date.now() # VERIFY CONNECTIVITY with Explanation("Test connect with hg"): http.head(self.settings.hg.url) set_default(repo, { "type": "revision", "schema": revision_schema, }) kwargs.branches = set_default( { "index": repo.index + "-branches", "type": "branch", }, repo, ) moves = set_default( { "index": repo.index + "-moves", }, repo, ) self.branches = _hg_branches.get_branches(kwargs=kwargs) cluster = elasticsearch.Cluster(kwargs=repo) self.repo = cluster.get_or_create_index(kwargs=repo) self.moves = cluster.get_or_create_index(kwargs=moves) def setup_es(please_stop): with suppress_exception: self.repo.add_alias() with suppress_exception: self.moves.add_alias() with suppress_exception: self.repo.set_refresh_interval(seconds=1) with suppress_exception: self.moves.set_refresh_interval(seconds=1) Thread.run("setup_es", setup_es) Thread.run("hg daemon", self._daemon)
def _get_push(self, branch, changeset_id): if self.repo.cluster.version.startswith("1.7."): query = { "query": { "filtered": { "query": { "match_all": {} }, "filter": { "and": [ { "term": { "branch.name": branch.name } }, { "prefix": { "changeset.id": changeset_id[0:12] } }, ] }, } }, "size": 1, } else: query = { "query": { "bool": { "must": [ { "term": { "branch.name": branch.name } }, { "prefix": { "changeset.id": changeset_id[0:12] } }, ] } }, "size": 1, } try: # ALWAYS TRY ES FIRST with self.repo_locker: response = self.repo.search(query) json_push = response.hits.hits[0]._source.push if json_push: return json_push except Exception: pass url = branch.url.rstrip( "/") + "/json-pushes?full=1&changeset=" + changeset_id with Explanation("Pulling pushlog from {{url}}", url=url, debug=DEBUG): Log.note("Reading pushlog from {{url}}", url=url, changeset=changeset_id) data = self._get_and_retry(url, branch) # QUEUE UP THE OTHER CHANGESETS IN THE PUSH self.todo.add( (branch, [c.node for cs in data.values().changesets for c in cs], None)) pushes = [ Push(id=int(index), date=_push.date, user=_push.user) for index, _push in data.items() ] if len(pushes) == 0: return Null elif len(pushes) == 1: return pushes[0] else: Log.error("do not know what to do")
def get_revision(self, revision, locale=None, get_diff=False, get_moves=True, after=None): """ EXPECTING INCOMPLETE revision OBJECT RETURNS revision """ rev = revision.changeset.id if not rev: return Null elif rev == "None": return Null elif revision.branch.name == None: return Null locale = coalesce(locale, revision.branch.locale, DEFAULT_LOCALE) output = self._get_from_elasticsearch(revision, locale=locale, get_diff=get_diff, get_moves=get_moves, after=after) if output: if not get_diff: # DIFF IS BIG, DO NOT KEEP IT IF NOT NEEDED output.changeset.diff = None if not get_moves: output.changeset.moves = None DEBUG and Log.note( "Got hg ({{branch}}, {{locale}}, {{revision}}) from ES", branch=output.branch.name, locale=locale, revision=output.changeset.id, ) if output.push.date >= Date.now() - MAX_TODO_AGE: self.todo.add((output.branch, listwrap(output.parents), None)) self.todo.add((output.branch, listwrap(output.children), None)) if output.push.date: return output # RATE LIMIT CALLS TO HG (CACHE MISSES) next_cache_miss = self.last_cache_miss + ( Random.float(WAIT_AFTER_CACHE_MISS * 2) * SECOND) self.last_cache_miss = Date.now() if next_cache_miss > self.last_cache_miss: Log.note( "delaying next hg call for {{seconds|round(decimal=1)}} seconds", seconds=next_cache_miss - self.last_cache_miss, ) Till(till=next_cache_miss.unix).wait() found_revision = copy(revision) if isinstance(found_revision.branch, (text, binary_type)): lower_name = found_revision.branch.lower() else: lower_name = found_revision.branch.name.lower() if not lower_name: Log.error("Defective revision? {{rev|json}}", rev=found_revision.branch) b = found_revision.branch = self.branches[(lower_name, locale)] if not b: b = found_revision.branch = self.branches[(lower_name, DEFAULT_LOCALE)] if not b: Log.warning( "can not find branch ({{branch}}, {{locale}})", branch=lower_name, locale=locale, ) return Null if Date.now() - Date(b.etl.timestamp) > _hg_branches.OLD_BRANCH: self.branches = _hg_branches.get_branches(kwargs=self.settings) push = self._get_push(found_revision.branch, found_revision.changeset.id) id12 = found_revision.changeset.id[0:12] url1 = found_revision.branch.url.rstrip( "/") + "/json-info?node=" + id12 url2 = found_revision.branch.url.rstrip("/") + "/json-rev/" + id12 url3 = (found_revision.branch.url.rstrip("/") + "/json-automationrelevance/" + id12) with Explanation("get revision from {{url}}", url=url1, debug=DEBUG): raw_rev2 = Null automation_details = Null try: raw_rev1 = self._get_raw_json_info(url1, found_revision.branch) raw_rev2 = self._get_raw_json_rev(url2, found_revision.branch) automation_details = self._get_raw_json_rev( url3, found_revision.branch) except Exception as e: if "Hg denies it exists" in e: raw_rev1 = Data(node=revision.changeset.id) else: raise e raw_rev3_changeset = first(r for r in automation_details.changesets if r.node[:12] == id12) if last(automation_details.changesets) != raw_rev3_changeset: Log.note("interesting") output = self._normalize_revision( set_default(raw_rev1, raw_rev2, raw_rev3_changeset), found_revision, push, get_diff, get_moves, ) if output.push.date >= Date.now() - MAX_TODO_AGE: self.todo.add((output.branch, listwrap(output.parents), None)) self.todo.add((output.branch, listwrap(output.children), None)) self.todo.add((output.branch, listwrap(output.backsoutnodes), output.push.date)) if not get_diff: # DIFF IS BIG, DO NOT KEEP IT IF NOT NEEDED output.changeset.diff = None if not get_moves: output.changeset.moves = None return output
def milli(self, value):
    if not isinstance(value, float):
        from mo_logs import Log
        Log.error("not allowed")
    self._milli = value
def select(self, fields): if is_data(fields): fields = fields.value if is_text(fields): # RETURN LIST OF VALUES if len(split_field(fields)) == 1: if self.path[0] == fields: return [d[1] for d in self.data] else: return [d[0][fields] for d in self.data] else: keys = split_field(fields) depth = coalesce( MIN([ i for i, (k, p) in enumerate(zip(keys, self.path)) if k != p ]), len(self.path)) # LENGTH OF COMMON PREFIX short_key = keys[depth:] output = FlatList() _select1((wrap(d[depth]) for d in self.data), short_key, 0, output) return output if is_list(fields): output = FlatList() meta = [] for f in fields: if hasattr(f.value, "__call__"): meta.append((f.name, f.value)) else: meta.append( (f.name, functools.partial(lambda v, d: d[v], f.value))) for row in self._values(): agg = Data() for name, f in meta: agg[name] = f(row) output.append(agg) return output # meta = [] # for f in fields: # keys = split_field(f.value) # depth = coalesce(MIN([i for i, (k, p) in enumerate(zip(keys, self.path)) if k != p]), len(self.path)) # LENGTH OF COMMON PREFIX # short_key = join_field(keys[depth:]) # # meta.append((f.name, depth, short_key)) # # for row in self._data: # agg = Data() # for name, depth, short_key in meta: # if short_key: # agg[name] = row[depth][short_key] # else: # agg[name] = row[depth] # output.append(agg) # return output Log.error("multiselect over FlatList not supported")
def update(self, command): """ :param command: EXPECTING dict WITH {"set": s, "clear": c, "where": w} FORMAT """ command = wrap(command) clear_columns = set(listwrap(command['clear'])) # REJECT DEEP UPDATES touched_columns = command.set.keys() | clear_columns for c in self.schema.columns: if c.name in touched_columns and len(c.nested_path) > 1: Log.error("Deep update not supported") # ADD NEW COLUMNS where = jx_expression(command.where) or TRUE _vars = where.vars() _map = { v: c.es_column for v in _vars for c in self.columns.get(v, Null) if c.jx_type not in STRUCT } where_sql = where.map(_map).to_sql(self.schema)[0].sql.b new_columns = set(command.set.keys()) - set( c.name for c in self.schema.columns) for new_column_name in new_columns: nested_value = command.set[new_column_name] ctype = get_jx_type(nested_value) column = Column(name=new_column_name, jx_type=ctype, es_index=self.name, es_type=json_type_to_sqlite_type(ctype), es_column=typed_column(new_column_name, ctype), last_updated=Date.now()) self.add_column(column) # UPDATE THE NESTED VALUES for nested_column_name, nested_value in command.set.items(): if get_jx_type(nested_value) == "nested": nested_table_name = concat_field(self.name, nested_column_name) nested_table = nested_tables[nested_column_name] self_primary_key = sql_list( quote_column(c.es_column) for u in self.uid for c in self.columns[u]) extra_key_name = UID + text(len(self.uid)) extra_key = [e for e in nested_table.columns[extra_key_name]][0] sql_command = ( SQL_DELETE + SQL_FROM + quote_column(nested_table.name) + SQL_WHERE + "EXISTS" + sql_iso(SQL_SELECT + SQL_ONE + SQL_FROM + sql_alias(quote_column(nested_table.name), "n") + SQL_INNER_JOIN + sql_iso(SQL_SELECT + self_primary_key + SQL_FROM + quote_column(abs_schema.fact) + SQL_WHERE + where_sql) + " t ON " + SQL_AND.join( quote_column("t", c.es_column) + SQL_EQ + quote_column("n", c.es_column) for u in self.uid for c in self.columns[u]))) self.db.execute(sql_command) # INSERT NEW RECORDS if not nested_value: continue doc_collection = {} for d in listwrap(nested_value): nested_table.flatten(d, Data(), doc_collection, path=nested_column_name) prefix = SQL_INSERT + quote_column(nested_table.name) + sql_iso( sql_list([self_primary_key] + [quote_column(extra_key)] + [ quote_column(c.es_column) for c in doc_collection.get(".", Null).active_columns ])) # BUILD THE PARENT TABLES parent = (SQL_SELECT + self_primary_key + SQL_FROM + quote_column(abs_schema.fact) + SQL_WHERE + jx_expression(command.where).to_sql(schema)) # BUILD THE RECORDS children = SQL_UNION_ALL.join( SQL_SELECT + sql_alias(quote_value(i), extra_key.es_column) + SQL_COMMA + sql_list( sql_alias(quote_value(row[c.name]), quote_column(c.es_column)) for c in doc_collection.get(".", Null).active_columns) for i, row in enumerate( doc_collection.get(".", Null).rows)) sql_command = (prefix + SQL_SELECT + sql_list([ quote_column("p", c.es_column) for u in self.uid for c in self.columns[u] ] + [quote_column("c", extra_key)] + [ quote_column("c", c.es_column) for c in doc_collection.get(".", Null).active_columns ]) + SQL_FROM + sql_iso(parent) + " p" + SQL_INNER_JOIN + sql_iso(children) + " c" + SQL_ON + SQL_TRUE) self.db.execute(sql_command) # THE CHILD COLUMNS COULD HAVE EXPANDED # ADD COLUMNS TO SELF for n, cs in nested_table.columns.items(): for c in cs: column = Column(name=c.name, jx_type=c.jx_type, es_type=c.es_type, es_index=c.es_index, es_column=c.es_column, nested_path=[nested_column_name] + c.nested_path, last_updated=Date.now()) if c.name not in 
self.columns: self.columns[column.name] = {column} elif c.jx_type not in [ c.jx_type for c in self.columns[c.name] ]: self.columns[column.name].add(column) command = ConcatSQL( SQL_UPDATE, quote_column(self.name), SQL_SET, sql_list([ quote_column(c.es_column) + SQL_EQ + quote_value(get_if_type(v, c.jx_type)) for c in self.schema.columns if c.jx_type != NESTED and len(c.nested_path) == 1 for v in [command.set[c.name]] if v != None ] + [ quote_column(c.es_column) + SQL_EQ + SQL_NULL for c in self.schema.columns if (c.name in clear_columns and command.set[c.name] != None and c.jx_type != NESTED and len(c.nested_path) == 1) ]), SQL_WHERE, where_sql) with self.db.transaction() as t: t.execute(command)
def es_deepop(es, query): schema = query.frum.schema columns = schema.columns query_path = schema.query_path # TODO: FIX THE GREAT SADNESS CAUSED BY EXECUTING post_expressions # THE EXPRESSIONS SHOULD BE PUSHED TO THE CONTAINER: ES ALLOWS # {"inner_hit":{"script_fields":[{"script":""}...]}}, BUT THEN YOU # LOOSE "_source" BUT GAIN "fields", FORCING ALL FIELDS TO BE EXPLICIT post_expressions = {} es_query, es_filters = es_query_template(query_path) # SPLIT WHERE CLAUSE BY DEPTH wheres = split_expression_by_depth(query.where, schema) for i, f in enumerate(es_filters): script = AndOp("and", wheres[i]).partial_eval().to_esfilter(schema) set_default(f, script) if not wheres[1]: more_filter = { "bool": { "must": [AndOp("and", wheres[0]).partial_eval().to_esfilter(schema)], "must_not": { "nested": { "path": query_path, "query": { "match_all": {} } } } } } else: more_filter = None es_query.size = coalesce(query.limit, DEFAULT_LIMIT) # es_query.sort = jx_sort_to_es_sort(query.sort) map_to_es_columns = schema.map_to_es() # {c.names["."]: c.es_column for c in schema.leaves(".")} query_for_es = query.map(map_to_es_columns) es_query.sort = jx_sort_to_es_sort(query_for_es.sort, schema) es_query.stored_fields = [] is_list = isinstance(query.select, list) new_select = FlatList() i = 0 for s in listwrap(query.select): if isinstance(s.value, LeavesOp) and isinstance( s.value.term, Variable): # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS leaves = schema.leaves(s.value.term.var) col_names = set() for c in leaves: if c.nested_path[0] == ".": if c.type == NESTED: continue es_query.stored_fields += [c.es_column] c_name = untype_path(c.names[query_path]) col_names.add(c_name) new_select.append({ "name": concat_field(s.name, c_name), "nested_path": c.nested_path[0], "put": { "name": concat_field(s.name, literal_field(c_name)), "index": i, "child": "." }, "pull": get_pull_function(c) }) i += 1 # REMOVE DOTS IN PREFIX IF NAME NOT AMBIGUOUS for n in new_select: if n.name.startswith("..") and n.name.lstrip( ".") not in col_names: n.put.name = n.name = n.name.lstrip(".") col_names.add(n.name) elif isinstance(s.value, Variable): net_columns = schema.leaves(s.value.var) if not net_columns: new_select.append({ "name": s.name, "nested_path": ".", "put": { "name": s.name, "index": i, "child": "." 
}, "pull": NULL }) else: for n in net_columns: pull = get_pull_function(n) if n.nested_path[0] == ".": if n.type == NESTED: continue es_query.stored_fields += [n.es_column] # WE MUST FIGURE OUT WHICH NAMESSPACE s.value.var IS USING SO WE CAN EXTRACT THE child for np in n.nested_path: c_name = untype_path(n.names[np]) if startswith_field(c_name, s.value.var): child = relative_field(c_name, s.value.var) break else: child = relative_field( untype_path(n.names[n.nested_path[0]]), s.value.var) new_select.append({ "name": s.name, "pull": pull, "nested_path": n.nested_path[0], "put": { "name": s.name, "index": i, "child": child } }) i += 1 else: expr = s.value for v in expr.vars(): for c in schema[v]: if c.nested_path[0] == ".": es_query.stored_fields += [c.es_column] # else: # Log.error("deep field not expected") pull_name = EXPRESSION_PREFIX + s.name map_to_local = { untype_path(k): get_pull(cc) for k, c in schema.lookup.items() for cc in c if cc.type not in STRUCT } pull = jx_expression_to_function(pull_name) post_expressions[pull_name] = compile_expression( expr.map(map_to_local).to_python()) new_select.append({ "name": s.name if is_list else ".", "pull": pull, "value": expr.__data__(), "put": { "name": s.name, "index": i, "child": "." } }) i += 1 # <COMPLICATED> ES needs two calls to get all documents more = [] def get_more(please_stop): more.append( es_post( es, Data(query=more_filter, stored_fields=es_query.stored_fields), query.limit)) if more_filter: need_more = Thread.run("get more", target=get_more) with Timer("call to ES") as call_timer: data = es_post(es, es_query, query.limit) # EACH A HIT IS RETURNED MULTIPLE TIMES FOR EACH INNER HIT, WITH INNER HIT INCLUDED def inners(): for t in data.hits.hits: for i in t.inner_hits[literal_field(query_path)].hits.hits: t._inner = i._source for k, e in post_expressions.items(): t[k] = e(t) yield t if more_filter: Thread.join(need_more) for t in more[0].hits.hits: yield t #</COMPLICATED> try: formatter, groupby_formatter, mime_type = format_dispatch[query.format] output = formatter(inners(), new_select, query) output.meta.timing.es = call_timer.duration output.meta.content_type = mime_type output.meta.es_query = es_query return output except Exception as e: Log.error("problem formatting", e)
def __init__(self, **desc): Domain.__init__(self, **desc) desc = wrap(desc) self.type = "set" self.order = {} self.NULL = Null self.partitions = FlatList() self.primitive = True # True IF DOMAIN IS A PRIMITIVE VALUE SET if isinstance(self.key, set): Log.error("problem") if not desc.key and (len(desc.partitions) == 0 or isinstance(desc.partitions[0], (basestring, Number, tuple))): # ASSUME PARTS ARE STRINGS, CONVERT TO REAL PART OBJECTS self.key = "value" self.map = {} self.order[None] = len(desc.partitions) for i, p in enumerate(desc.partitions): part = {"name": p, "value": p, "dataIndex": i} self.partitions.append(part) self.map[p] = part self.order[p] = i self.label = coalesce(self.label, "name") self.primitive = True return if desc.partitions and desc.dimension.fields and len( desc.dimension.fields) > 1: self.key = desc.key self.map = UniqueIndex(keys=desc.dimension.fields) elif desc.partitions and isinstance(desc.key, (list, set)): # TODO: desc.key CAN BE MUCH LIKE A SELECT, WHICH UniqueIndex CAN NOT HANDLE self.key = desc.key self.map = UniqueIndex(keys=desc.key) elif desc.partitions and isinstance(desc.partitions[0][desc.key], Mapping): self.key = desc.key self.map = UniqueIndex(keys=desc.key) # self.key = UNION(set(d[desc.key].keys()) for d in desc.partitions) # self.map = UniqueIndex(keys=self.key) elif len(desc.partitions) == 0: # CREATE AN EMPTY DOMAIN self.key = "value" self.map = {} self.order[None] = 0 self.label = coalesce(self.label, "name") return elif desc.key == None: if desc.partitions and all(desc.partitions.where) or all( desc.partitions.esfilter): if not all(desc.partitions.name): Log.error("Expecting all partitions to have a name") from pyLibrary.queries.expressions import jx_expression self.key = "name" self.map = dict() self.map[None] = self.NULL self.order[None] = len(desc.partitions) for i, p in enumerate(desc.partitions): self.partitions.append({ "where": jx_expression(coalesce(p.where, p.esfilter)), "name": p.name, "dataIndex": i }) self.map[p.name] = p self.order[p.name] = i return elif desc.partitions and len(set(desc.partitions.value) - {None}) == len(desc.partitions): # TRY A COMMON KEY CALLED "value". IT APPEARS UNIQUE self.key = "value" self.map = dict() self.map[None] = self.NULL self.order[None] = len(desc.partitions) for i, p in enumerate(desc.partitions): self.map[p[self.key]] = p self.order[p[self.key]] = i self.primitive = False else: Log.error("Domains must have keys, or partitions") elif self.key: self.key = desc.key self.map = dict() self.map[None] = self.NULL self.order[None] = len(desc.partitions) for i, p in enumerate(desc.partitions): self.map[p[self.key]] = p self.order[p[self.key]] = i self.primitive = False else: Log.error("Can not hanldle") self.label = coalesce(self.label, "name") if hasattr(desc.partitions, "__iter__"): self.partitions = wrap(list(desc.partitions)) else: Log.error("expecting a list of partitions")
def _flatten(data, uid, parent_id, order, full_path, nested_path, row=None, guid=None): """ :param data: the data we are pulling apart :param uid: the uid we are giving this doc :param parent_id: the parent id of this (sub)doc :param order: the number of siblings before this one :param full_path: path to this (sub)doc :param nested_path: list of paths, deepest first :param row: we will be filling this :return: """ table = concat_field(self.name, nested_path[0]) insertion = doc_collection[nested_path[0]] if not row: row = {GUID: guid, UID: uid, PARENT: parent_id, ORDER: order} insertion.rows.append(row) if is_data(data): items = [(concat_field(full_path, k), v) for k, v in wrap(data).leaves()] else: # PRIMITIVE VALUES items = [(full_path, data)] for cname, v in items: jx_type = get_jx_type(v) if jx_type is None: continue insertion = doc_collection[nested_path[0]] if jx_type == NESTED: c = first(cc for cc in insertion.active_columns + snowflake.columns if cc.jx_type in STRUCT and untyped_column(cc.name)[0] == cname) else: c = first(cc for cc in insertion.active_columns + snowflake.columns if cc.jx_type == jx_type and cc.name == cname) if isinstance(c, list): Log.error("confused") if not c: # WHAT IS THE NESTING LEVEL FOR THIS PATH? deeper_nested_path = "." for path in snowflake.query_paths: if startswith_field(cname, path[0]) and len( deeper_nested_path) < len(path): deeper_nested_path = path c = Column(name=cname, jx_type=jx_type, es_type=json_type_to_sqlite_type.get( jx_type, jx_type), es_column=typed_column( cname, json_type_to_sql_type.get(jx_type)), es_index=table, nested_path=nested_path, last_updated=Date.now()) if jx_type == NESTED: snowflake.query_paths.append(c.es_column) required_changes.append({'nest': c}) else: insertion.active_columns.add(c) required_changes.append({"add": c}) elif c.jx_type == NESTED and jx_type == OBJECT: # ALWAYS PROMOTE OBJECTS TO NESTED jx_type = NESTED v = [v] elif len(c.nested_path) < len(nested_path): from_doc = doc_collection.get(c.nested_path[0], None) column = c.es_column from_doc.active_columns.remove(c) snowflake._remove_column(c) required_changes.append({"nest": c}) deep_c = Column(name=cname, jx_type=jx_type, es_type=json_type_to_sqlite_type.get( jx_type, jx_type), es_column=typed_column( cname, json_type_to_sql_type.get(jx_type)), es_index=table, nested_path=nested_path, last_updated=Date.now()) snowflake._add_column(deep_c) snowflake._drop_column(c) from_doc.active_columns.remove(c) for r in from_doc.rows: r1 = unwrap(r) if column in r1: row1 = { UID: self.container.next_uid(), PARENT: r1["__id__"], ORDER: 0, column: r1[column] } insertion.rows.append(row1) elif len(c.nested_path) > len(nested_path): insertion = doc_collection[c.nested_path[0]] row = { UID: self.container.next_uid(), PARENT: uid, ORDER: order } insertion.rows.append(row) # BE SURE TO NEST VALUES, IF NEEDED if jx_type == NESTED: deeper_nested_path = [cname] + nested_path if not doc_collection.get(cname): doc_collection[cname] = Data(active_columns=Queue(), rows=[]) for i, r in enumerate(v): child_uid = self.container.next_uid() _flatten(r, child_uid, uid, i, cname, deeper_nested_path) elif jx_type == OBJECT: _flatten(v, uid, parent_id, order, cname, nested_path, row=row) elif c.jx_type: row[c.es_column] = v
def bytes2sha1(value):
    if is_text(value):
        Log.error("can not convert unicode to sha1")
    sha = hashlib.sha1(value)
    return sha.hexdigest()
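# ILLUSTRATIVE USAGE (a hypothetical demo helper; hashlib is standard library, only Log/is_text
# come from the surrounding module):
def _demo_bytes2sha1():
    print(bytes2sha1(b"hello"))  # aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d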
def getDomain(self):
    Log.error("Not implemented")


def error(self, message):
    Log.error("argparse error: {{error}}", error=message)


def __bool__(self):
    Log.error("Detecting truthiness of NullOp is too confusing to be allowed")
            assertAlmostEqual(a, b, msg=msg, digits=digits, places=places, delta=delta)
        else:
            assertAlmostEqualValue(test, expected, msg=msg, digits=digits, places=places, delta=delta)
    except Exception as e:
        Log.error(
            "{{test|json}} does not match expected {{expected|json}}",
            test=test if show_detail else "[can not show]",
            expected=expected if show_detail else "[can not show]",
            cause=e
        )


def assertAlmostEqualValue(test, expected, digits=None, places=None, msg=None, delta=None):
    """
    Snagged from unittest/case.py, then modified (Aug2014)
    """
    if expected.__class__.__name__ == "NullOp":
        if test == None:
            return
def login(self, please_stop=None): """ WILL REGISTER THIS DEVICE, AND SHOW A QR-CODE TO LOGIN WILL POLL THE SERVICE ENDPOINT UNTIL LOGIN IS COMPLETED, OR FAILED :param please_stop: SIGNAL TO STOP EARLY :return: SESSION THAT CAN BE USED TO SEND AUTHENTICATED REQUESTS """ # SEND PUBLIC KEY now = Date.now().unix login_session = requests.session() signed = rsa_crypto.sign( Data(public_key=self.public_key, timestamp=now), self.private_key) DEBUG and Log.note("register (unsigned)\n{{request|json}}", request=rsa_crypto.verify(signed, self.public_key)) DEBUG and Log.note("register (signed)\n{{request|json}}", request=signed) try: response = login_session.request( "POST", str(URL(self.config.service) / self.config.endpoints.register), data=value2json(signed)) except Exception as e: raise Log.error("problem registering device", cause=e) device = wrap(response.json()) DEBUG and Log.note("response:\n{{response}}", response=device) device.interval = parse(device.interval).seconds expires = Till(till=parse(device.expires).unix) session_id = self.session_id = device.session_id if not session_id: Log.error("expecting a session cookie") # SHOW URL AS QR CODE image = text2QRCode(device.url) sys.stdout.write("\n\nLogin using thie URL:\n") sys.stdout.write(device.url + CR) sys.stdout.write(image) while not please_stop and not expires: Log.note("waiting for login...") try: now = Date.now() signed = rsa_crypto.sign( Data(timestamp=now, session_id=session_id), self.private_key) url = URL(self.config.service) / self.config.endpoints.status DEBUG and Log.note("ping (unsigned) {{url}}\n{{request|json}}", url=url, request=rsa_crypto.verify( signed, self.public_key)) response = login_session.request("POST", url, data=value2json(signed)) ping = wrap(response.json()) DEBUG and Log.note("response\n{{response|json}}", response=ping) if ping.status == "verified": return self if not ping.try_again: Log.note("Failed to login {{reason}}", reason=ping.status) return except Exception as e: Log.warning( "problem calling {{url}}", url=URL(self.config.service) / self.config.endpoints.status, cause=e, ) (Till(seconds=device.interval) | please_stop | expires).wait() return self
def query(self, query=None): """ :param query: JSON Query Expression, SET `format="container"` TO MAKE NEW TABLE OF RESULT :return: """ if not query: query = {} if not query.get('from'): query['from'] = self.name elif not startswith_field(query['from'], self.name): Log.error("Expecting table, or some nested table") query = QueryOp.wrap(query, self.container, self.namespace) new_table = "temp_" + unique_name() if query.format == "container": create_table = SQL_CREATE + quote_column(new_table) + SQL_AS else: create_table = "" if query.groupby and query.format != "cube": op, index_to_columns = self._groupby_op(query, self.schema) command = create_table + op elif query.groupby: query.edges, query.groupby = query.groupby, query.edges op, index_to_columns = self._edges_op(query, self.schema) command = create_table + op query.edges, query.groupby = query.groupby, query.edges elif query.edges or any(a != "none" for a in listwrap(query.select).aggregate): op, index_to_columns = self._edges_op(query, query.frum.schema) command = create_table + op else: op = self._set_op(query) return op result = self.db.query(command) if query.format == "container": output = QueryTable(new_table, db=self.db, uid=self.uid, exists=True) elif query.format == "cube" or (not query.format and query.edges): column_names = [None] * (max(c.push_column for c in index_to_columns.values()) + 1) for c in index_to_columns.values(): column_names[c.push_column] = c.push_column_name if len(query.edges) == 0 and len(query.groupby) == 0: data = {n: Data() for n in column_names} for s in index_to_columns.values(): data[s.push_name][s.push_child] = unwrap(s.pull(result.data[0])) if is_list(query.select): select = [{"name": s.name} for s in query.select] else: select = {"name": query.select.name} return Data( data=unwrap(data), select=select, meta={"format": "cube"} ) if not result.data: edges = [] dims = [] for i, e in enumerate(query.edges + query.groupby): allowNulls = coalesce(e.allowNulls, True) if e.domain.type == "set" and e.domain.partitions: domain = SimpleSetDomain(partitions=e.domain.partitions.name) elif e.domain.type == "range": domain = e.domain elif is_op(e.value, TupleOp): pulls = jx.sort([c for c in index_to_columns.values() if c.push_name == e.name], "push_child").pull parts = [tuple(p(d) for p in pulls) for d in result.data] domain = SimpleSetDomain(partitions=jx.sort(set(parts))) else: domain = SimpleSetDomain(partitions=[]) dims.append(1 if allowNulls else 0) edges.append(Data( name=e.name, allowNulls=allowNulls, domain=domain )) data = {} for si, s in enumerate(listwrap(query.select)): if s.aggregate == "count": data[s.name] = Matrix(dims=dims, zeros=0) else: data[s.name] = Matrix(dims=dims) if is_list(query.select): select = [{"name": s.name} for s in query.select] else: select = {"name": query.select.name} return Data( meta={"format": "cube"}, edges=edges, select=select, data={k: v.cube for k, v in data.items()} ) columns = None edges = [] dims = [] for g in query.groupby: g.is_groupby = True for i, e in enumerate(query.edges + query.groupby): allowNulls = coalesce(e.allowNulls, True) if e.domain.type == "set" and e.domain.partitions: domain = SimpleSetDomain(partitions=e.domain.partitions.name) elif e.domain.type == "range": domain = e.domain elif e.domain.type == "time": domain = wrap(mo_json.scrub(e.domain)) elif e.domain.type == "duration": domain = wrap(mo_json.scrub(e.domain)) elif is_op(e.value, TupleOp): pulls = jx.sort([c for c in index_to_columns.values() if c.push_name == e.name], "push_child").pull 
parts = [tuple(p(d) for p in pulls) for d in result.data] domain = SimpleSetDomain(partitions=jx.sort(set(parts))) else: if not columns: columns = transpose(*result.data) parts = set(columns[i]) if e.is_groupby and None in parts: allowNulls = True parts -= {None} if query.sort[i].sort == -1: domain = SimpleSetDomain(partitions=wrap(sorted(parts, reverse=True))) else: domain = SimpleSetDomain(partitions=jx.sort(parts)) dims.append(len(domain.partitions) + (1 if allowNulls else 0)) edges.append(Data( name=e.name, allowNulls=allowNulls, domain=domain )) data_cubes = {} for si, s in enumerate(listwrap(query.select)): if s.aggregate == "count": data_cubes[s.name] = Matrix(dims=dims, zeros=0) else: data_cubes[s.name] = Matrix(dims=dims) r2c = index_to_coordinate(dims) # WORKS BECAUSE THE DATABASE SORTED THE EDGES TO CONFORM for rownum, row in enumerate(result.data): coord = r2c(rownum) for i, s in enumerate(index_to_columns.values()): if s.is_edge: continue if s.push_child == ".": data_cubes[s.push_name][coord] = s.pull(row) else: data_cubes[s.push_name][coord][s.push_child] = s.pull(row) if query.select == None: select = Null elif is_list(query.select): select = [{"name": s.name} for s in query.select] else: select = {"name": query.select.name} return Data( meta={"format": "cube"}, edges=edges, select=select, data={k: v.cube for k, v in data_cubes.items()} ) elif query.format == "table" or (not query.format and query.groupby): column_names = [None] * (max(c.push_column for c in index_to_columns.values()) + 1) for c in index_to_columns.values(): column_names[c.push_column] = c.push_column_name data = [] for d in result.data: row = [None for _ in column_names] for s in index_to_columns.values(): if s.push_child == ".": row[s.push_column] = s.pull(d) elif s.num_push_columns: tuple_value = row[s.push_column] if tuple_value == None: tuple_value = row[s.push_column] = [None] * s.num_push_columns tuple_value[s.push_child] = s.pull(d) elif row[s.push_column] == None: row[s.push_column] = Data() row[s.push_column][s.push_child] = s.pull(d) else: row[s.push_column][s.push_child] = s.pull(d) data.append(tuple(unwrap(r) for r in row)) output = Data( meta={"format": "table"}, header=column_names, data=data ) elif query.format == "list" or (not query.edges and not query.groupby): if not query.edges and not query.groupby and any(listwrap(query.select).aggregate): if is_list(query.select): data = Data() for c in index_to_columns.values(): if c.push_child == ".": if data[c.push_name] == None: data[c.push_name] = c.pull(result.data[0]) elif is_list(data[c.push_name]): data[c.push_name].append(c.pull(result.data[0])) else: data[c.push_name] = [data[c.push_name], c.pull(result.data[0])] else: data[c.push_name][c.push_child] = c.pull(result.data[0]) output = Data( meta={"format": "value"}, data=data ) else: data = Data() for s in index_to_columns.values(): if not data[s.push_child]: data[s.push_child] = s.pull(result.data[0]) else: data[s.push_child] += [s.pull(result.data[0])] output = Data( meta={"format": "value"}, data=unwrap(data) ) else: data = [] for rownum in result.data: row = Data() for c in index_to_columns.values(): if c.push_child == ".": row[c.push_name] = c.pull(rownum) elif c.num_push_columns: tuple_value = row[c.push_name] if not tuple_value: tuple_value = row[c.push_name] = [None] * c.num_push_columns tuple_value[c.push_child] = c.pull(rownum) else: row[c.push_name][c.push_child] = c.pull(rownum) data.append(row) output = Data( meta={"format": "list"}, data=data ) else: Log.error("unknown format 
{{format}}", format=query.format) return output
def assertAlmostEqual(test, expected, digits=None, places=None, msg=None, delta=None):
    show_detail = True
    test = unwrap(test)
    expected = unwrap(expected)
    try:
        if test is None and expected is None:
            return
        elif isinstance(test, UniqueIndex):
            if test ^ expected:
                Log.error("Sets do not match")
        elif isinstance(expected, Mapping) and isinstance(test, Mapping):
            for k, v2 in unwrap(expected).items():
                v1 = test.get(k)
                assertAlmostEqual(v1, v2, msg=msg, digits=digits, places=places, delta=delta)
        elif isinstance(expected, Mapping):
            for k, v2 in expected.items():
                if isinstance(k, basestring):
                    v1 = mo_dots.get_attr(test, literal_field(k))
                else:
                    v1 = test[k]
                assertAlmostEqual(v1, v2, msg=msg, digits=digits, places=places, delta=delta)
        elif isinstance(test, (set, list)) and isinstance(expected, set):
            test = set(test)
            if len(test) != len(expected):
                Log.error(
                    "Sets do not match, element count different:\n{{test|json|indent}}\nexpecting{{expected|json|indent}}",
                    test=test,
                    expected=expected
                )
            for e in expected:
                for t in test:
                    try:
                        assertAlmostEqual(t, e, msg=msg, digits=digits, places=places, delta=delta)
                        break
                    except Exception:
                        pass
                else:
                    Log.error("Sets do not match. {{value|json}} not found in {{test|json}}", value=e, test=test)
        elif isinstance(expected, types.FunctionType):
            return expected(test)
def __init__(self, value):
    SQL.__init__(self)
    if DEBUG and isinstance(value, SQL):
        Log.error("Expecting text, not SQL")
    self.value = value


def add(self, record):
    if isinstance(record, list):
        Log.error("no longer accepting lists, use extend()")
    return self.extend([record])
def __new__(cls, e=None, query=None, *args, **kwargs): e.allowNulls = coalesce(e.allowNulls, True) if e.value and e.domain.type == "default": # if query.groupby: # return object.__new__(DefaultDecoder, e) if is_text(e.value): Log.error("Expecting Variable or Expression, not plain string") if is_op(e.value, LeavesOp): return object.__new__(ObjectDecoder) elif is_op(e.value, TupleOp): # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields # JUST PULL THE FIELDS if not all(is_op(t, Variable) for t in e.value.terms): Log.error("Can only handle variables in tuples") e.domain = Data(dimension={"fields": e.value.terms}) return object.__new__(DimFieldListDecoder) elif is_op(e.value, Variable): schema = query.frum.schema cols = schema.leaves(e.value.var) if not cols: return object.__new__(DefaultDecoder) if len(cols) != 1: return object.__new__(ObjectDecoder) col = first(cols) limit = coalesce(e.domain.limit, query.limit, DEFAULT_LIMIT) if col.cardinality == None: DEBUG and Log.warning( "metadata for column {{name|quote}} (id={{id}}) is not ready", name=concat_field(col.es_index, col.es_column), id=id(col)) e.domain = set_default(DefaultDomain(limit=limit), e.domain.__data__()) return object.__new__(DefaultDecoder) elif col.partitions == None: e.domain = set_default(DefaultDomain(limit=limit), e.domain.__data__()) return object.__new__(DefaultDecoder) else: DEBUG and Log.note("id={{id}} has parts!!!", id=id(col)) if col.multi > 1: return object.__new__(MultivalueDecoder) partitions = col.partitions[:limit:] if e.domain.sort == -1: partitions = list(reversed(sorted(partitions))) else: partitions = sorted(partitions) e.domain = SimpleSetDomain(partitions=partitions, limit=limit) else: return object.__new__(DefaultDecoder) if e.value and e.domain.type in PARTITION: return object.__new__(SetDecoder) if isinstance(e.domain.dimension, Dimension): e.domain = e.domain.dimension.getDomain() return object.__new__(SetDecoder) if e.value and e.domain.type == "time": return object.__new__(TimeDecoder) if e.range: return object.__new__(GeneralRangeDecoder) if e.value and e.domain.type == "duration": return object.__new__(DurationDecoder) elif e.value and e.domain.type == "range": return object.__new__(RangeDecoder) elif not e.value and e.domain.dimension.fields: # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields # JUST PULL THE FIELDS fields = e.domain.dimension.fields if is_data(fields): Log.error("No longer allowed: All objects are expressions") else: return object.__new__(DimFieldListDecoder) elif not e.value and all(e.domain.partitions.where): return object.__new__(GeneralSetDecoder) else: Log.error("domain type of {{type}} is not supported yet", type=e.domain.type)
def query_metadata(self, query): frum, query['from'] = query['from'], self schema = self.snowflake.tables["."].schema query = QueryOp.wrap(query, schema) columns = self.snowflake.columns where = query.where table_name = None column_name = None if query.edges or query.groupby: Log.error("Aggregates (groupby or edge) are not supported") if where.op == "eq" and where.lhs.var == "table": table_name = mo_json.json2value(where.rhs.json) elif where.op == "eq" and where.lhs.var == "name": column_name = mo_json.json2value(where.rhs.json) else: Log.error("Only simple filters are expected like: \"eq\" on table and column name") tables = [concat_field(self.snowflake.fact_name, i) for i in self.tables.keys()] metadata = [] if columns[-1].es_column != GUID: columns.append(Column( name=GUID, jx_type=STRING, es_column=GUID, es_index=self.snowflake.fact_name, nested_path=["."] )) for tname, table in zip(self.tables.keys(), tables): if table_name != None and table_name != table: continue for col in columns: cname, ctype = untyped_column(col.es_column) if column_name != None and column_name != cname: continue metadata.append((table, relative_field(col.name, tname), col.jx_type, unwraplist(col.nested_path))) if query.format == "cube": num_rows = len(metadata) header = ["table", "name", "type", "nested_path"] temp_data = dict(zip(header, zip(*metadata))) return Data( meta={"format": "cube"}, data=temp_data, edges=[{ "name": "rownum", "domain": { "type": "rownum", "min": 0, "max": num_rows, "interval": 1 } }] ) elif query.format == "table": header = ["table", "name", "type", "nested_path"] return Data( meta={"format": "table"}, header=header, data=metadata ) else: header = ["table", "name", "type", "nested_path"] return Data( meta={"format": "list"}, data=[dict(zip(header, r)) for r in metadata] )
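A hedged sketch of the metadata query this method accepts; `container` and the table name are placeholders, and only simple "eq" filters on table or column name are allowed, as the code above enforces:

result = container.query_metadata({
    "from": container,                       # swapped for `self` on entry
    "where": {"eq": {"table": "testdata"}},  # or {"eq": {"name": "<column>"}}
    "format": "table",                       # "cube", "table", or "list"
})
# result.header -> ["table", "name", "type", "nested_path"]
# result.data   -> one row per matching column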
def append_query(self, query_path, es_query): Log.error("Not supported")
def __iter__(self): raise Log.error("not implemented")
def verify_attributes_not_null(self, attribute_names): for name in attribute_names: if getattr(self, name) == None: Log.error('{{type}} domain expects a {{name|quote}} parameter', type=self.type, name=name)
def get_index(self, row, es_query=None, index=None): try: key = row[0].get('key') return self.domain.getIndexByKey(key) except Exception as e: Log.error("problem", cause=e)
def __init__( self, host, index, # THE NAME OF THE SNOWFLAKE (IF WRITING) alias=None, # THE NAME OF THE SNOWFLAKE (FOR READING) type=None, name=None, # THE FULL NAME OF THE TABLE (THE NESTED PATH INTO THE SNOWFLAKE) port=9200, read_only=True, timeout=None, # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests) wait_for_active_shards=1, # ES WRITE CONSISTENCY (https://www.elastic.co/guide/en/elasticsearch/reference/1.7/docs-index_.html#index-consistency) typed=None, kwargs=None, ): Container.__init__(self) if not container.config.default: container.config.default = { "type": "elasticsearch", "settings": unwrap(kwargs), } self.edges = Data() # SET EARLY, SO OTHER PROCESSES CAN REQUEST IT self.worker = None self.settings = kwargs self._namespace = ElasticsearchMetadata(kwargs=kwargs) self.name = name = self._namespace._find_alias( coalesce(alias, index, name)) if read_only: self.es = elasticsearch.Alias(alias=name, index=None, kwargs=kwargs) else: self.es = (elasticsearch.Cluster(kwargs=kwargs).get_index( read_only=read_only, kwargs=kwargs)) self._ensure_max_result_window_set(name) self.settings.type = self.es.settings.type self.stats = QueryStats(self.es.cluster) columns = self.snowflake.columns # ABSOLUTE COLUMNS is_typed = any(c.es_column == EXISTS_TYPE for c in columns) if typed == None: # SWITCH ON TYPED MODE self.typed = is_typed else: if is_typed != typed: Log.error( "Expecting given typed {{typed}} to match {{is_typed}}", typed=typed, is_typed=is_typed, ) self.typed = typed if not typed: # ADD EXISTENCE COLUMNS all_paths = {".": None} # MAP FROM path TO parent TO MAKE A TREE def nested_path_of(v): if v == ".": return (".", ) return (v, ) + nested_path_of(all_paths[v]) query_paths = sort_using_key( set(step for path in self.snowflake.query_paths for step in path), key=lambda p: len(split_field(p)), ) for step in query_paths: if step in all_paths: continue else: best = "." for candidate in all_paths.keys(): if startswith_field(step, candidate): if startswith_field(candidate, best): best = candidate all_paths[step] = best for p in all_paths.keys(): if p == ".": nested_path = (".", ) else: nested_path = nested_path_of(p)[1:] jx_type = OBJECT if p == "." else NESTED self.namespace.meta.columns.add( Column( name=p, es_column=p, es_index=self.name, es_type=jx_type, jx_type=jx_type, cardinality=1, nested_path=nested_path, multi=1001 if jx_type is NESTED else 1, last_updated=Date.now(), ))
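A hedged sketch of the kwargs this constructor documents (all values are placeholders; the owning class is not named in this excerpt):

settings = {
    "host": "http://localhost",
    "port": 9200,
    "index": "task",              # snowflake name used when writing
    "alias": "task",              # alias used when reading
    "read_only": True,
    "timeout": 30,                # seconds, passed through to requests
    "wait_for_active_shards": 1,  # ES write consistency
    "typed": None,                # None -> detect typed storage from the existing index
}
# Construction would pass these as keyword arguments to whichever container
# class defines the __init__ above (class name intentionally left unspecified).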
def extract(self, settings, force, restart, start, merge): if not settings.extractor.app_name: Log.error("Expecting an extractor.app_name in config file") # SETUP DESTINATION destination = bigquery.Dataset( dataset=settings.extractor.app_name, kwargs=settings.destination).get_or_create_table( settings.destination) try: if merge: with Timer("merge shards"): destination.merge_shards() # RECOVER LAST SQL STATE redis = Redis.from_url(REDIS_URL) state = redis.get(settings.extractor.key) if start: state = start, 0 elif restart or not state: state = (0, 0) redis.set(settings.extractor.key, value2json(state).encode("utf8")) else: state = json2value(state.decode("utf8")) last_modified, job_id = state # SCAN SCHEMA, GENERATE EXTRACTION SQL extractor = MySqlSnowflakeExtractor(settings.source) canonical_sql = extractor.get_sql(SQL("SELECT 0")) # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN old_sql = redis.get(settings.extractor.sql) if old_sql and old_sql.decode("utf8") != canonical_sql.sql: if force: Log.warning("Schema has changed") else: Log.error("Schema has changed") redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8")) # SETUP SOURCE source = MySQL(settings.source.database) while True: Log.note( "Extracting jobs for last_modified={{last_modified|datetime|quote}}, job.id={{job_id}}", last_modified=last_modified, job_id=job_id, ) # Example: job.id ==283890114 # get_ids = ConcatSQL( # (SQL_SELECT, sql_alias(quote_value(283890114), "id")) # ) get_ids = sql_query({ "from": "job", "select": ["id"], "where": { "or": [ { "gt": { "last_modified": Date(last_modified) } }, { "and": [ { "eq": { "last_modified": Date(last_modified) } }, { "gt": { "id": job_id } }, ] }, ] }, "sort": ["last_modified", "id"], "limit": settings.extractor.chunk_size, }) sql = extractor.get_sql(get_ids) # PULL FROM source, AND PUSH TO destination acc = [] with source.transaction(): cursor = source.query(sql, stream=True, row_tuples=True) extractor.construct_docs(cursor, acc.append, False) if not acc: break # SOME LIMITS PLACES ON STRING SIZE for fl in jx.drill(acc, "job_log.failure_line"): fl.message = strings.limit(fl.message, 10000) destination.extend(acc) # RECORD THE STATE last_doc = acc[-1] last_modified, job_id = last_doc.last_modified, last_doc.id redis.set( settings.extractor.key, value2json((last_modified, job_id)).encode("utf8"), ) if len(acc) < settings.extractor.chunk_size: break except Exception as e: Log.warning("problem with extraction", cause=e) Log.note("done job extraction") try: with Timer("merge shards"): destination.merge_shards() except Exception as e: Log.warning("problem with merge", cause=e) Log.note("done job merge")
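A small sketch of the Redis checkpoint round trip used by the loop above; the key name and URL are placeholders, while value2json/json2value are the same helpers the function relies on:

from redis import Redis

redis = Redis.from_url("redis://localhost:6379")        # placeholder URL
state = (1577836800.0, 283890114)                       # (last_modified, job.id)
redis.set("extractor.jobs", value2json(state).encode("utf8"))

raw = redis.get("extractor.jobs")                       # None on a fresh start
last_modified, job_id = json2value(raw.decode("utf8"))  # resume point for the next run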
def update(self, command): """ EXPECTING command == {"set":term, "where":where} THE set CLAUSE IS A DICT MAPPING NAMES TO VALUES THE where CLAUSE IS AN ES FILTER """ command = to_data(command) table = self.get_table(command["update"]) es_index = self.es.cluster.get_index(read_only=False, alias=None, kwargs=self.es.settings) schema = table.schema # GET IDS OF DOCUMENTS query = { "from": command["update"], "select": [{ "value": "_id" }] + [{ "name": k, "value": v } for k, v in command.set.items()], "where": command.where, "format": "list", "limit": 10000, } results = self.query(query) if results.data: content = "".join(t for r in results.data for _id, row in [(r._id, r)] for _ in [row.__setitem__("_id", None) ] # WARNING! DESTRUCTIVE TO row for update in map(value2json, ({ "update": { "_id": _id } }, { "doc": row })) for t in (update, "\n")) response = self.es.cluster.post( es_index.path + "/" + "_bulk", data=content, timeout=self.settings.timeout, params={ "wait_for_active_shards": self.settings.wait_for_active_shards }, ) if response.errors: Log.error( "could not update: {{error}}", error=[ e.error for i in response["items"] for e in i.values() if e.status not in (200, 201) ], ) # DELETE BY QUERY, IF NEEDED if "." in listwrap(command["clear"]): es_filter = (ES52Lang[jx_expression( command.where)].partial_eval().to_es(schema)) self.es.delete_record(es_filter) return es_index.refresh()
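A hedged example of the command shape the docstring describes; the table, field, and filter are hypothetical, and the "set" values are jx expressions (a bare number is treated as a literal):

container.update({
    "update": "task",                         # table/alias to update
    "set": {"priority": 10},                  # name -> new value
    "where": {"eq": {"build.id": 283890114}}, # filter selecting the documents
})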
def parse(json, query_path, expected_vars=NO_VARS): """ INTENDED TO TREAT JSON AS A STREAM; USING MINIMAL MEMORY WHILE IT ITERATES THROUGH THE STRUCTURE. ASSUMING THE JSON IS LARGE, AND HAS A HIGH LEVEL ARRAY STRUCTURE, IT WILL yield EACH OBJECT IN THAT ARRAY. NESTED ARRAYS ARE HANDLED BY REPEATING THE PARENT PROPERTIES FOR EACH MEMBER OF THE NESTED ARRAY. DEEPER NESTED PROPERTIES ARE TREATED AS PRIMITIVE VALUES; THE STANDARD JSON DECODER IS USED. LARGE MANY-PROPERTY OBJECTS CAN BE HANDLED BY `items()` :param json: SOME STRING-LIKE STRUCTURE THAT CAN ASSUME WE LOOK AT ONE CHARACTER AT A TIME, IN ORDER :param query_path: A DOT-SEPARATED STRING INDICATING THE PATH TO THE NESTED ARRAY OPTIONALLY, {"items":query_path} TO FURTHER ITERATE OVER PROPERTIES OF OBJECTS FOUND AT query_path :param expected_vars: REQUIRED PROPERTY NAMES, USED TO DETERMINE IF MORE-THAN-ONE PASS IS REQUIRED :return: RETURNS AN ITERATOR OVER ALL OBJECTS FROM ARRAY LOCATED AT query_path """ if hasattr(json, "read"): # ASSUME IT IS A STREAM temp = json def get_more(): return temp.read(MIN_READ_SIZE) json = List_usingStream(get_more) elif hasattr(json, "__call__"): json = List_usingStream(json) elif isinstance(json, GeneratorType): json = List_usingStream(json.next) else: Log.error( "Expecting json to be a stream, or a function that will return more bytes" ) def _iterate_list(index, c, parent_path, path, expected_vars): c, index = skip_whitespace(index) if c == b']': yield index return while True: if not path: index = _assign_token(index, c, expected_vars) c, index = skip_whitespace(index) if c == b']': yield index _done(parent_path) return elif c == b',': yield index c, index = skip_whitespace(index) else: for index in _decode_token(index, c, parent_path, path, expected_vars): c, index = skip_whitespace(index) if c == b']': yield index _done(parent_path) return elif c == b',': yield index c, index = skip_whitespace(index) def _done(parent_path): if len(parent_path) < len(done[0]): done[0] = parent_path def _decode_object(index, c, parent_path, query_path, expected_vars): if "." 
in expected_vars: if len(done[0]) <= len(parent_path) and all( d == p for d, p in zip(done[0], parent_path)): Log.error("Can not pick up more variables, iterator is done") if query_path: Log.error("Can not extract objects that contain the iteration", var=join_field(query_path)) index = _assign_token(index, c, expected_vars) # c, index = skip_whitespace(index) yield index return did_yield = False while True: c, index = skip_whitespace(index) if c == b',': continue elif c == b'"': name, index = simple_token(index, c) c, index = skip_whitespace(index) if c != b':': Log.error("Expecting colon") c, index = skip_whitespace(index) child_expected = needed(name, expected_vars) child_path = parent_path + [name] if any(child_expected): if not query_path: index = _assign_token(index, c, child_expected) elif query_path[0] == name: for index in _decode_token(index, c, child_path, query_path[1:], child_expected): did_yield = True yield index else: if len(done[0]) <= len(child_path): Log.error( "Can not pick up more variables, iterator over {{path}} is done", path=join_field(done[0])) index = _assign_token(index, c, child_expected) elif query_path and query_path[0] == name: for index in _decode_token(index, c, child_path, query_path[1:], child_expected): yield index else: index = jump_to_end(index, c) elif c == b"}": if not did_yield: yield index break def set_destination(expected_vars, value): for i, e in enumerate(expected_vars): if e is None: pass elif e == ".": destination[i] = value elif isinstance(value, Mapping): destination[i] = value[e] else: destination[i] = Null def _decode_object_items(index, c, parent_path, query_path, expected_vars): """ ITERATE THROUGH THE PROPERTIES OF AN OBJECT """ c, index = skip_whitespace(index) num_items = 0 while True: if c == b',': c, index = skip_whitespace(index) elif c == b'"': name, index = simple_token(index, c) if "name" in expected_vars: for i, e in enumerate(expected_vars): if e == "name": destination[i] = name c, index = skip_whitespace(index) if c != b':': Log.error("Expecting colon") c, index = skip_whitespace(index) child_expected = needed("value", expected_vars) index = _assign_token(index, c, child_expected) c, index = skip_whitespace(index) DEBUG and not num_items % 1000 and Log.note( "{{num}} items iterated", num=num_items) yield index num_items += 1 elif c == b"}": break def _decode_token(index, c, parent_path, query_path, expected_vars): if c == b'{': if query_path and query_path[0] == "$items": if any(expected_vars): for index in _decode_object_items(index, c, parent_path, query_path[1:], expected_vars): yield index else: index = jump_to_end(index, c) yield index elif not any(expected_vars): index = jump_to_end(index, c) yield index else: for index in _decode_object(index, c, parent_path, query_path, expected_vars): yield index elif c == b'[': for index in _iterate_list(index, c, parent_path, query_path, expected_vars): yield index else: index = _assign_token(index, c, expected_vars) yield index def _assign_token(index, c, expected_vars): if not any(expected_vars): return jump_to_end(index, c) value, index = simple_token(index, c) set_destination(expected_vars, value) return index def jump_to_end(index, c): """ DO NOT PROCESS THIS JSON OBJECT, JUST RETURN WHERE IT ENDS """ if c == b'"': while True: c = json[index] index += 1 if c == b'\\': index += 1 elif c == b'"': break return index elif c not in b"[{": while True: c = json[index] index += 1 if c in b',]}': break return index - 1 # OBJECTS AND ARRAYS ARE MORE INVOLVED stack = [None] * 1024 
stack[0] = CLOSE[c] i = 0 # FOR INDEXING THE STACK while True: c = json[index] index += 1 if c == b'"': while True: c = json[index] index += 1 if c == b'\\': index += 1 elif c == b'"': break elif c in b'[{': i += 1 stack[i] = CLOSE[c] elif c == stack[i]: i -= 1 if i == -1: return index # FOUND THE MATCH! RETURN elif c in b']}': Log.error("expecting {{symbol}}", symbol=stack[i]) def simple_token(index, c): if c == b'"': json.mark(index - 1) while True: c = json[index] index += 1 if c == b"\\": index += 1 elif c == b'"': break return json_decoder(json.release(index).decode("utf8")), index elif c in b"{[": json.mark(index - 1) index = jump_to_end(index, c) value = wrap(json_decoder(json.release(index).decode("utf8"))) return value, index elif c == b"t" and json.slice(index, index + 3) == b"rue": return True, index + 3 elif c == b"n" and json.slice(index, index + 3) == b"ull": return None, index + 3 elif c == b"f" and json.slice(index, index + 4) == b"alse": return False, index + 4 else: json.mark(index - 1) while True: c = json[index] if c in b',]}': break index += 1 text = json.release(index) try: return float(text), index except Exception: Log.error("Not a known JSON primitive: {{text|quote}}", text=text) def skip_whitespace(index): """ RETURN NEXT NON-WHITESPACE CHAR, AND ITS INDEX """ c = json[index] while c in WHITESPACE: index += 1 c = json[index] return c, index + 1 if isinstance(query_path, Mapping) and query_path.get("items"): path_list = split_field(query_path.get("items")) + [ "$items" ] # INSERT A MARKER SO THAT OBJECT IS STREAM DECODED else: path_list = split_field(query_path) destination = [None] * len(expected_vars) c, index = skip_whitespace(0) done = [path_list + [None]] for _ in _decode_token(index, c, [], path_list, expected_vars): output = Data() for i, e in enumerate(expected_vars): output[e] = destination[i] yield output
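A short usage sketch for the streaming parser above; the file and property names are placeholders, and the file is opened in binary mode because the tokenizer compares byte values:

with open("builds.json", "rb") as f:
    # Assumes a document shaped like {"builds": [{"id": ..., "result": ...}, ...]}
    for doc in parse(f, query_path="builds", expected_vars=["id", "result"]):
        print(doc.id, doc.result)  # each yielded Data holds only the requested properties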