class StructuredLogger_usingQueue(StructuredLogger):
    def __init__(self, name=None):
        queue_name = "log messages to queue"
        if name:
            queue_name += " " + name
        self.queue = Queue(queue_name)

    def write(self, template, params):
        self.queue.add(expand_template(template, params))

    def stop(self):
        self.queue.close()

    def pop(self):
        lines = self.queue.pop()
        output = []
        for l in lines.split(CR):
            # REMOVE FIRST PART, THE TIMESTAMP
            # 0123456789012345678901234567890
            # 2019-01-06 19:13:49.937542 -
            prefix = re.match(DATE_PATTERN, l)
            if prefix:
                l = l[len(prefix.group(0)):]
            if not l.strip():
                continue
            if l.strip().startswith("File"):
                continue
            output.append(l)
        return CR.join(output).strip()
def _find_revision(self, revision):
    please_stop = False
    locker = Lock()
    output = []
    queue = Queue("branches", max=2000)
    queue.extend(
        b
        for b in self.branches
        if b.locale == DEFAULT_LOCALE and b.name in ["try", "mozilla-inbound", "autoland"]
    )
    queue.add(THREAD_STOP)
    problems = []

    def _find(please_stop):
        for b in queue:
            if please_stop:
                return
            try:
                url = b.url + "json-info?node=" + revision
                rev = self.get_revision(Revision(branch=b, changeset={"id": revision}))
                with locker:
                    output.append(rev)
                Log.note("Revision found at {{url}}", url=url)
            except Exception as f:
                problems.append(f)

    threads = []
    for i in range(3):
        threads.append(Thread.run("find changeset " + text_type(i), _find, please_stop=please_stop))
    for t in threads:
        with assert_no_exception:
            t.join()
    return output
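# The method above shows the queue-based fan-out idiom used throughout this code: fill a
# bounded Queue, append the THREAD_STOP sentinel, and let a few worker threads drain it.
# Below is a minimal, hedged sketch of that idiom with the hg-specific details removed.
# ASSUMPTIONS: Queue, Thread, Lock and THREAD_STOP are importable from mo_threads, as in
# the surrounding modules; work_on() is a hypothetical per-item helper, not a real API.
from mo_threads import Lock, Queue, THREAD_STOP, Thread


def fan_out(items, num_workers=3):
    queue = Queue("work items", max=2000)
    queue.extend(items)
    queue.add(THREAD_STOP)              # SENTINEL: ITERATION OVER queue STOPS HERE

    locker = Lock()
    output = []

    def _worker(please_stop):           # please_stop IS SUPPLIED BY Thread.run
        for item in queue:
            if please_stop:
                return
            result = work_on(item)      # HYPOTHETICAL PER-ITEM WORK
            with locker:
                output.append(result)

    threads = [Thread.run("worker " + str(i), _worker) for i in range(num_workers)]
    for t in threads:
        t.join()
    return output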
class StructuredLogger_usingThread(StructuredLogger):
    def __init__(self, logger, period=PERIOD):
        if not isinstance(logger, StructuredLogger):
            Log.error("Expecting a StructuredLogger")

        self.logger = logger
        self.queue = Queue(
            "Queue for " + self.__class__.__name__,
            max=10000,
            silent=True,
            allow_add_after_close=True,
        )
        self.thread = Thread(
            "Thread for " + self.__class__.__name__, worker, logger, self.queue, period
        )
        # worker WILL BE RESPONSIBLE FOR THREAD stop()
        self.thread.parent.remove_child(self.thread)
        self.thread.start()

    def write(self, template, params):
        try:
            self.queue.add({"template": template, "params": params})
            return self
        except Exception as e:
            e = Except.wrap(e)
            raise e  # OH NO!

    def stop(self):
        try:
            self.queue.add(THREAD_STOP)  # BE PATIENT, LET REST OF MESSAGE BE SENT
            self.thread.join()
        except Exception as e:
            Log.note("problem in threaded logger" + str(e))
class StructuredLogger_usingThreadedStream(StructuredLogger):
    # stream CAN BE AN OBJECT WITH write() METHOD, OR A STRING
    # WHICH WILL eval() TO ONE
    def __init__(self, stream):
        assert stream
        use_UTF8 = False

        if isinstance(stream, basestring):
            if stream.startswith("sys."):
                use_UTF8 = True  # sys.* ARE OLD AND CAN NOT HANDLE unicode
            self.stream = eval(stream)
            name = stream
        else:
            self.stream = stream
            name = "stream"

        # WRITE TO STREAMS CAN BE *REALLY* SLOW, WE WILL USE A THREAD
        from mo_threads import Queue

        if use_UTF8:
            def utf8_appender(value):
                if isinstance(value, unicode):
                    value = value.encode('utf8')
                self.stream.write(value)

            appender = utf8_appender
        else:
            appender = self.stream.write

        self.queue = Queue("queue for " + self.__class__.__name__ + "(" + name + ")", max=10000, silent=True)
        self.thread = Thread(
            "log to " + self.__class__.__name__ + "(" + name + ")",
            time_delta_pusher,
            appender=appender,
            queue=self.queue,
            interval=0.3
        )
        self.thread.parent.remove_child(self.thread)  # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
        self.thread.start()

    def write(self, template, params):
        try:
            self.queue.add({"template": template, "params": params})
            return self
        except Exception as e:
            raise e  # OH NO!

    def stop(self):
        try:
            self.queue.add(THREAD_STOP)  # BE PATIENT, LET REST OF MESSAGE BE SENT
            self.thread.join()
        except Exception as e:
            if DEBUG_LOGGING:
                raise e
        try:
            self.queue.close()
        except Exception as f:
            if DEBUG_LOGGING:
                raise f
class StructuredLogger_usingThread(StructuredLogger):
    def __init__(self, logger):
        if not isinstance(logger, StructuredLogger):
            Log.error("Expecting a StructuredLogger")

        self.queue = Queue(
            "Queue for " + self.__class__.__name__,
            max=10000,
            silent=True,
            allow_add_after_close=True
        )
        self.logger = logger

        def worker(logger, please_stop):
            try:
                while not please_stop:
                    logs = self.queue.pop_all()
                    if not logs:
                        (Till(seconds=1) | please_stop).wait()
                        continue
                    for log in logs:
                        if log is THREAD_STOP:
                            please_stop.go()
                        else:
                            logger.write(**log)
            except Exception as e:
                print("problem in " + StructuredLogger_usingThread.__name__ + ": " + str(e))
            finally:
                Log.note("stop the child")
                logger.stop()

        self.thread = Thread("Thread for " + self.__class__.__name__, worker, logger)
        self.thread.parent.remove_child(self.thread)  # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
        self.thread.start()

    def write(self, template, params):
        try:
            self.queue.add({"template": template, "params": params})
            return self
        except Exception as e:
            e = Except.wrap(e)
            raise e  # OH NO!

    def stop(self):
        try:
            self.queue.add(THREAD_STOP)  # BE PATIENT, LET REST OF MESSAGE BE SENT
            self.thread.join()
            Log.note("joined on thread")
        except Exception as e:
            Log.note("problem in threaded logger" + str(e))
        with suppress_exception:
            self.queue.close()
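# In use, this class wraps a slow, blocking logger so that write() returns immediately and
# the actual delivery happens on the background thread. A hedged usage sketch follows;
# StructuredLogger_usingStream as the destination is an assumption for illustration - any
# object satisfying the StructuredLogger interface would do.
import sys

slow_destination = StructuredLogger_usingStream(sys.stdout)   # ASSUMED CONCRETE DESTINATION
log = StructuredLogger_usingThread(slow_destination)

log.write("hello {{name}}", {"name": "world"})   # RETURNS IMMEDIATELY; MESSAGE IS QUEUED
log.stop()                                       # SENDS THREAD_STOP, JOINS, CLOSES THE QUEUE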
class StructuredLogger_usingThreadedStream(StructuredLogger):
    # stream CAN BE AN OBJECT WITH write() METHOD, OR A STRING
    # WHICH WILL eval() TO ONE
    def __init__(self, stream):
        assert stream

        if is_text(stream):
            name = stream
            stream = self.stream = eval(stream)
            if name.startswith("sys.") and PY3:
                self.stream = Data(write=lambda d: stream.write(d.decode('utf8')))
        else:
            name = "stream"
            self.stream = stream

        # WRITE TO STREAMS CAN BE *REALLY* SLOW, WE WILL USE A THREAD
        from mo_threads import Queue

        def utf8_appender(value):
            if is_text(value):
                value = value.encode('utf8')
            self.stream.write(value)

        appender = utf8_appender

        self.queue = Queue("queue for " + self.__class__.__name__ + "(" + name + ")", max=10000, silent=True)
        self.thread = Thread(
            "log to " + self.__class__.__name__ + "(" + name + ")",
            time_delta_pusher,
            appender=appender,
            queue=self.queue,
            interval=0.3
        )
        self.thread.parent.remove_child(self.thread)  # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
        self.thread.start()

    def write(self, template, params):
        try:
            self.queue.add({"template": template, "params": params})
            return self
        except Exception as e:
            raise e  # OH NO!

    def stop(self):
        try:
            self.queue.add(THREAD_STOP)  # BE PATIENT, LET REST OF MESSAGE BE SENT
            self.thread.join()
        except Exception as e:
            if DEBUG_LOGGING:
                raise e
        try:
            self.queue.close()
        except Exception as f:
            if DEBUG_LOGGING:
                raise f
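# Because the constructor accepts either a stream object or a string to eval(), the common
# case is to hand it "sys.stdout". A minimal sketch of both accepted forms; the file name
# below is purely illustrative, and the binary mode matches the utf8-encoding appender above.
log_a = StructuredLogger_usingThreadedStream("sys.stdout")        # STRING IS eval()'d
log_b = StructuredLogger_usingThreadedStream(open("app_log.txt", "wb"))  # ANY OBJECT WITH write()

log_a.write("processed {{num}} records", {"num": 42})
log_a.stop()   # FLUSHES PENDING MESSAGES, JOINS THE PUSHER THREAD
log_b.stop()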
class StructuredLogger_usingQueue(StructuredLogger):
    def __init__(self, name=None):
        queue_name = "log messages to queue"
        if name:
            queue_name += " " + name
        self.queue = Queue(queue_name)

    def write(self, template, params):
        self.queue.add(expand_template(template, params))

    def stop(self):
        self.queue.close()

    def pop(self):
        lines = self.queue.pop()
        output = []
        for l in lines.split("\n"):
            if l[19:22] == " - ":
                l = l[22:]
            if l.strip().startswith("File"):
                continue
            output.append(l)
        return "\n".join(output).strip()
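# StructuredLogger_usingQueue is mostly useful in tests: write() expands the template
# eagerly, and pop() returns the queued text with any timestamp prefix and "File..."
# traceback lines stripped, so assertions can compare plain strings. A small sketch of
# that round trip (nothing here beyond the methods defined above):
log = StructuredLogger_usingQueue("test")
log.write("{{n}} errors found", {"n": 3})

line = log.pop()        # TIMESTAMP PREFIX AND "File..." LINES ARE REMOVED
assert line == "3 errors found"
log.stop()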
class Sqlite(DB): """ Allows multi-threaded access Loads extension functions (like SQRT) """ canonical = None def __init__(self, filename=None, db=None, upgrade=True): """ :param db: Optional, wrap a sqlite db in a thread :return: Multithread-safe database """ if upgrade and not _upgraded: _upgrade() self.filename = filename self.db = db self.queue = Queue( "sql commands") # HOLD (command, result, signal) PAIRS self.worker = Thread.run("sqlite db thread", self._worker) self.get_trace = DEBUG self.upgrade = upgrade def _enhancements(self): def regex(pattern, value): return 1 if re.match(pattern + "$", value) else 0 con = self.db.create_function("regex", 2, regex) class Percentile(object): def __init__(self, percentile): self.percentile = percentile self.acc = [] def step(self, value): self.acc.append(value) def finalize(self): return percentile(self.acc, self.percentile) con.create_aggregate("percentile", 2, Percentile) def execute(self, command): """ COMMANDS WILL BE EXECUTED IN THE ORDER THEY ARE GIVEN BUT CAN INTERLEAVE WITH OTHER TREAD COMMANDS :param command: COMMAND FOR SQLITE :return: None """ if DEBUG: # EXECUTE IMMEDIATELY FOR BETTER STACK TRACE return self.query(command) if self.get_trace: trace = extract_stack(1) else: trace = None self.queue.add((command, None, None, trace)) def query(self, command): """ WILL BLOCK CALLING THREAD UNTIL THE command IS COMPLETED :param command: COMMAND FOR SQLITE :return: list OF RESULTS """ if not self.worker: self.worker = Thread.run("sqlite db thread", self._worker) signal = Signal() result = Data() self.queue.add((command, result, signal, None)) signal.wait() if result.exception: Log.error("Problem with Sqlite call", cause=result.exception) return result def _worker(self, please_stop): global _load_extension_warning_sent if DEBUG: Log.note("Sqlite version {{version}}", version=sqlite3.sqlite_version) if Sqlite.canonical: self.db = Sqlite.canonical else: self.db = sqlite3.connect(coalesce(self.filename, ':memory:')) library_loc = File.new_instance(sys.modules[__name__].__file__, "../..") full_path = File.new_instance( library_loc, "vendor/sqlite/libsqlitefunctions.so").abspath try: trace = extract_stack(0)[0] if self.upgrade: if os.name == 'nt': file = File.new_instance( trace["file"], "../../vendor/sqlite/libsqlitefunctions.so") else: file = File.new_instance( trace["file"], "../../vendor/sqlite/libsqlitefunctions") full_path = file.abspath self.db.enable_load_extension(True) self.db.execute("SELECT load_extension(" + self.quote_value(full_path) + ")") except Exception as e: if not _load_extension_warning_sent: _load_extension_warning_sent = True Log.warning( "Could not load {{file}}}, doing without. 
(no SQRT for you!)", file=full_path, cause=e) try: while not please_stop: command, result, signal, trace = self.queue.pop( till=please_stop) if DEBUG_INSERT and command.strip().lower().startswith( "insert"): Log.note("Running command\n{{command|indent}}", command=command) if DEBUG and not command.strip().lower().startswith("insert"): Log.note("Running command\n{{command|indent}}", command=command) with Timer("Run command", debug=DEBUG): if signal is not None: try: curr = self.db.execute(command) self.db.commit() result.meta.format = "table" result.header = [d[0] for d in curr.description ] if curr.description else None result.data = curr.fetchall() if DEBUG and result.data: text = convert.table2csv(list(result.data)) Log.note("Result:\n{{data}}", data=text) except Exception as e: e = Except.wrap(e) result.exception = Except( ERROR, "Problem with\n{{command|indent}}", command=command, cause=e) finally: signal.go() else: try: self.db.execute(command) self.db.commit() except Exception as e: e = Except.wrap(e) e.cause = Except(type=ERROR, template="Bad call to Sqlite", trace=trace) Log.warning("Failure to execute", cause=e) except Exception as e: if not please_stop: Log.error("Problem with sql thread", e) finally: if DEBUG: Log.note("Database is closed") self.db.commit() self.db.close() def quote_column(self, column_name, table=None): return quote_column(column_name, table) def quote_value(self, value): return quote_value(value)
class Cache(object): """ For Caching hg.mo requests """ @override def __init__(self, rate=None, amortization_period=None, source=None, database=None, kwargs=None): self.amortization_period = coalesce(amortization_period, AMORTIZATION_PERIOD) self.rate = coalesce(rate, HG_REQUEST_PER_SECOND) self.cache_locker = Lock() self.cache = {} # MAP FROM url TO (ready, headers, response, timestamp) PAIR self.no_cache = {} # VERY SHORT TERM CACHE self.workers = [] self.todo = Queue(APP_NAME+" todo") self.requests = Queue(APP_NAME + " requests", max=int(self.rate * self.amortization_period.seconds)) self.url = URL(source.url) self.db = Sqlite(database) self.inbound_rate = RateLogger("Inbound") self.outbound_rate = RateLogger("hg.mo") if not self.db.query("SELECT name FROM sqlite_master WHERE type='table'").data: with self.db.transaction() as t: t.execute( "CREATE TABLE cache (" " path TEXT PRIMARY KEY, " " headers TEXT, " " response TEXT, " " timestamp REAL " ")" ) self.threads = [ Thread.run(APP_NAME+" worker" + text_type(i), self._worker) for i in range(CONCURRENCY) ] self.limiter = Thread.run(APP_NAME+" limiter", self._rate_limiter) self.cleaner = Thread.run(APP_NAME+" cleaner", self._cache_cleaner) def _rate_limiter(self, please_stop): try: max_requests = self.requests.max recent_requests = [] while not please_stop: now = Date.now() too_old = now - self.amortization_period recent_requests = [t for t in recent_requests if t > too_old] num_recent = len(recent_requests) if num_recent >= max_requests: space_free_at = recent_requests[0] + self.amortization_period (please_stop | Till(till=space_free_at.unix)).wait() continue for _ in xrange(num_recent, max_requests): request = self.todo.pop() now = Date.now() recent_requests.append(now) self.requests.add(request) except Exception as e: Log.warning("failure", cause=e) def _cache_cleaner(self, please_stop): while not please_stop: now = Date.now() too_old = now-CACHE_RETENTION remove = set() with self.cache_locker: for path, (ready, headers, response, timestamp) in self.cache: if timestamp < too_old: remove.add(path) for r in remove: del self.cache[r] (please_stop | Till(seconds=CACHE_RETENTION.seconds / 2)).wait() def please_cache(self, path): """ :return: False if `path` is not to be cached """ if path.endswith("/tip"): return False if any(k in path for k in ["/json-annotate/", "/json-info/", "/json-log/", "/json-rev/", "/rev/", "/raw-rev/", "/raw-file/", "/json-pushes", "/pushloghtml", "/file/"]): return True return False def request(self, method, path, headers): now = Date.now() self.inbound_rate.add(now) ready = Signal(path) # TEST CACHE with self.cache_locker: pair = self.cache.get(path) if pair is None: self.cache[path] = (ready, None, None, now) if pair is not None: # REQUEST IS IN THE QUEUE ALREADY, WAIT ready, headers, response, then = pair if response is None: ready.wait() with self.cache_locker: ready, headers, response, timestamp = self.cache.get(path) with self.db.transaction() as t: t.execute("UPDATE cache SET timestamp=" + quote_value(now) + " WHERE path=" + quote_value(path) + " AND timestamp<" + quote_value(now)) return Response( response, status=200, headers=json.loads(headers) ) # TEST DB db_response = self.db.query("SELECT headers, response FROM cache WHERE path=" + quote_value(path)).data if db_response: headers, response = db_response[0] with self.db.transaction() as t: t.execute("UPDATE cache SET timestamp=" + quote_value(now) + " WHERE path=" + quote_value(path) + " AND timestamp<" + quote_value(now)) with self.cache_locker: 
self.cache[path] = (ready, headers, response.encode('latin1'), now) ready.go() return Response( response, status=200, headers=json.loads(headers) ) # MAKE A NETWORK REQUEST self.todo.add((ready, method, path, headers, now)) ready.wait() with self.cache_locker: ready, headers, response, timestamp = self.cache[path] return Response( response, status=200, headers=json.loads(headers) ) def _worker(self, please_stop): while not please_stop: pair = self.requests.pop(till=please_stop) if please_stop: break ready, method, path, req_headers, timestamp = pair try: url = self.url / path self.outbound_rate.add(Date.now()) response = http.request(method, url, req_headers) del response.headers['transfer-encoding'] resp_headers = value2json(response.headers) resp_content = response.raw.read() please_cache = self.please_cache(path) if please_cache: with self.db.transaction() as t: t.execute("INSERT INTO cache (path, headers, response, timestamp) VALUES" + quote_list((path, resp_headers, resp_content.decode('latin1'), timestamp))) with self.cache_locker: self.cache[path] = (ready, resp_headers, resp_content, timestamp) except Exception as e: Log.warning("problem with request to {{path}}", path=path, cause=e) with self.cache_locker: ready, headers, response = self.cache[path] del self.cache[path] finally: ready.go()
class Sqlite(DB): """ Allows multi-threaded access Loads extension functions (like SQRT) """ canonical = None @override def __init__(self, filename=None, db=None, upgrade=True, load_functions=False, kwargs=None): """ :param db: Optional, wrap a sqlite db in a thread :return: Multithread-safe database """ if upgrade and not _upgraded: _upgrade() self.settings = kwargs self.filename = File(filename).abspath self.db = db self.queue = Queue("sql commands") # HOLD (command, result, signal) PAIRS self.worker = Thread.run("sqlite db thread", self._worker) self.get_trace = TRACE self.upgrade = upgrade self.closed = False if DEBUG: Log.note("Sqlite version {{version}}", version=self.query("select sqlite_version()").data[0][0]) def _enhancements(self): def regex(pattern, value): return 1 if re.match(pattern+"$", value) else 0 con = self.db.create_function("regex", 2, regex) class Percentile(object): def __init__(self, percentile): self.percentile=percentile self.acc=[] def step(self, value): self.acc.append(value) def finalize(self): return percentile(self.acc, self.percentile) con.create_aggregate("percentile", 2, Percentile) def execute(self, command): """ COMMANDS WILL BE EXECUTED IN THE ORDER THEY ARE GIVEN BUT CAN INTERLEAVE WITH OTHER TREAD COMMANDS :param command: COMMAND FOR SQLITE :return: Signal FOR IF YOU WANT TO BE NOTIFIED WHEN DONE """ if self.closed: Log.error("database is closed") if DEBUG_EXECUTE: # EXECUTE IMMEDIATELY FOR BETTER STACK TRACE self.query(command) return DONE if self.get_trace: trace = extract_stack(1) else: trace = None is_done = Signal() self.queue.add((command, None, is_done, trace)) return is_done def commit(self): """ WILL BLOCK CALLING THREAD UNTIL ALL PREVIOUS execute() CALLS ARE COMPLETED :return: """ if self.closed: Log.error("database is closed") signal = _allocate_lock() signal.acquire() self.queue.add((COMMIT, None, signal, None)) signal.acquire() return def query(self, command): """ WILL BLOCK CALLING THREAD UNTIL THE command IS COMPLETED :param command: COMMAND FOR SQLITE :return: list OF RESULTS """ if self.closed: Log.error("database is closed") if not self.worker: self.worker = Thread.run("sqlite db thread", self._worker) signal = _allocate_lock() signal.acquire() result = Data() self.queue.add((command, result, signal, None)) signal.acquire() if result.exception: Log.error("Problem with Sqlite call", cause=result.exception) return result def close(self): """ OPTIONAL COMMIT-AND-CLOSE IF THIS IS NOT DONE, THEN THE THREAD THAT SPAWNED THIS INSTANCE :return: """ self.closed = True signal = _allocate_lock() signal.acquire() self.queue.add((COMMIT, None, signal, None)) signal.acquire() self.worker.please_stop.go() return def __enter__(self): pass def __exit__(self, exc_type, exc_val, exc_tb): self.close() def _worker(self, please_stop): global _load_extension_warning_sent try: if DEBUG: Log.note("Sqlite version {{version}}", version=sqlite3.sqlite_version) try: if Sqlite.canonical: self.db = Sqlite.canonical else: self.db = sqlite3.connect(coalesce(self.filename, ':memory:'), check_same_thread=False) except Exception as e: Log.error("could not open file {{filename}}", filename=self.filename) if self.settings.load_functions: self._load_functions() while not please_stop: quad = self.queue.pop(till=please_stop) if quad is None: break command, result, signal, trace = quad show_timing = False if DEBUG_INSERT and command.strip().lower().startswith("insert"): Log.note("Running command\n{{command|limit(100)|indent}}", command=command) show_timing = True if DEBUG 
and not command.strip().lower().startswith("insert"): Log.note("Running command\n{{command|limit(100)|indent}}", command=command) show_timing = True with Timer("SQL Timing", silent=not show_timing): if command is COMMIT: self.db.commit() signal.release() elif signal is not None: try: curr = self.db.execute(command) if result is not None: result.meta.format = "table" result.header = [d[0] for d in curr.description] if curr.description else None result.data = curr.fetchall() if DEBUG and result.data: text = convert.table2csv(list(result.data)) Log.note("Result:\n{{data}}", data=text) except Exception as e: e = Except.wrap(e) e.cause = Except( type=ERROR, template="Bad call to Sqlite", trace=trace ) if result is None: Log.error("Problem with\n{{command|indent}}", command=command, cause=e) else: result.exception = Except(ERROR, "Problem with\n{{command|indent}}", command=command, cause=e) finally: if isinstance(signal, Signal): signal.go() else: signal.release() else: try: self.db.execute(command) except Exception as e: e = Except.wrap(e) e.cause = Except( type=ERROR, template="Bad call to Sqlite", trace=trace ) Log.warning("Failure to execute", cause=e) except Exception as e: if not please_stop: Log.warning("Problem with sql thread", cause=e) finally: self.closed = True if DEBUG: Log.note("Database is closed") self.db.close() def _load_functions(self): global _load_extension_warning_sent library_loc = File.new_instance(sys.modules[__name__].__file__, "../..") full_path = File.new_instance(library_loc, "vendor/sqlite/libsqlitefunctions.so").abspath try: trace = extract_stack(0)[0] if self.upgrade: if os.name == 'nt': file = File.new_instance(trace["file"], "../../vendor/sqlite/libsqlitefunctions.so") else: file = File.new_instance(trace["file"], "../../vendor/sqlite/libsqlitefunctions") full_path = file.abspath self.db.enable_load_extension(True) self.db.execute(SQL_SELECT + "load_extension" + sql_iso(quote_value(full_path))) except Exception as e: if not _load_extension_warning_sent: _load_extension_warning_sent = True Log.warning("Could not load {{file}}, doing without. (no SQRT for you!)", file=full_path, cause=e) def create_new_functions(self): def regexp(pattern, item): reg = re.compile(pattern) return reg.search(item) is not None self.db.create_function("REGEXP", 2, regexp)
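# The later revision of Sqlite above adds explicit commit() and close(), and execute() now
# returns a Signal that can be waited on. A hedged sketch of the intended calling pattern;
# the file name and table are illustrative, and waiting on the returned Signal is optional.
db = Sqlite(filename="example.sqlite", load_functions=False)   # HYPOTHETICAL SETTINGS

done = db.execute("CREATE TABLE IF NOT EXISTS points (x REAL, y REAL)")
done.wait()                      # OPTIONAL: execute() IS OTHERWISE FIRE-AND-FORGET

db.execute("INSERT INTO points VALUES (1.0, 2.0)")
db.commit()                      # BLOCKS UNTIL ALL PRIOR COMMANDS ARE APPLIED

result = db.query("SELECT count(*) FROM points")
print(result.data[0][0])

db.close()                       # COMMIT-AND-CLOSE; FURTHER CALLS REPORT "database is closed"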
class ElasticsearchMetadata(Namespace): """ MANAGE SNOWFLAKE SCHEMAS FOR EACH OF THE ALIASES FOUND IN THE CLUSTER """ @override def __new__(cls, kwargs, *args, **_kwargs): es_cluster = elasticsearch.Cluster(kwargs) output = known_clusters.get(id(es_cluster)) if output is None: output = object.__new__(cls) known_clusters[id(es_cluster)] = output return output @override def __init__(self, host, index, sql_file='metadata.sqlite', alias=None, name=None, port=9200, kwargs=None): if hasattr(self, "settings"): return self.too_old = TOO_OLD self.settings = kwargs self.default_name = coalesce(name, alias, index) self.es_cluster = elasticsearch.Cluster(kwargs=kwargs) self.index_does_not_exist = set() self.todo = Queue("refresh metadata", max=100000, unique=True) self.index_to_alias = {} self.es_metadata = Null self.metadata_last_updated = Date.now() - OLD_METADATA self.meta = Data() self.meta.columns = ColumnList(URL(self.es_cluster.settings.host).host) self.alias_to_query_paths = { "meta.columns": [ROOT_PATH], "meta.tables": [ROOT_PATH] } self.alias_last_updated = { "meta.columns": Date.now(), "meta.tables": Date.now() } table_columns = metadata_tables() self.meta.tables = ListContainer( "meta.tables", [], jx_base.Schema(".", table_columns) ) self.meta.columns.extend(table_columns) # TODO: fix monitor so it does not bring down ES if ENABLE_META_SCAN: self.worker = Thread.run("refresh metadata", self.monitor) else: self.worker = Thread.run("not refresh metadata", self.not_monitor) return @property def namespace(self): return self.meta.columns.namespace @property def url(self): return self.es_cluster.url / self.default_name.replace(".", "/") def _reload_columns(self, table_desc): """ :param alias: A REAL ALIAS (OR NAME OF INDEX THAT HAS NO ALIAS) :return: """ # FIND ALL INDEXES OF ALIAS es_last_updated = self.es_cluster.metatdata_last_updated alias = table_desc.name canonical_index = self.es_cluster.get_best_matching_index(alias).index es_metadata_update_required = not (table_desc.timestamp < es_last_updated) metadata = self.es_cluster.get_metadata(force=es_metadata_update_required) props = [ (self.es_cluster.get_index(index=i, type=t, debug=DEBUG), t, m.properties) for i, d in metadata.indices.items() if alias in d.aliases for t, m in [_get_best_type_from_mapping(d.mappings)] ] # CONFIRM ALL COLUMNS ARE SAME, FIX IF NOT dirty = False all_comparisions = list(jx.pairwise(props)) + list(jx.pairwise(jx.reverse(props))) # NOTICE THE SAME (index, type, properties) TRIPLE FROM ABOVE for (i1, t1, p1), (i2, t2, p2) in all_comparisions: diff = elasticsearch.diff_schema(p2, p1) if not self.settings.read_only: for d in diff: dirty = True i1.add_property(*d) meta = self.es_cluster.get_metadata(force=dirty).indices[canonical_index] data_type, mapping = _get_best_type_from_mapping(meta.mappings) mapping.properties["_id"] = {"type": "string", "index": "not_analyzed"} columns = self._parse_properties(alias, mapping) table_desc.timestamp = es_last_updated return columns def _parse_properties(self, alias, mapping): abs_columns = elasticsearch.parse_properties(alias, ".", ROOT_PATH, mapping.properties) if DEBUG and any(c.cardinality == 0 and c.name != '_id' for c in abs_columns): Log.warning( "Some columns are not stored in {{url}} {{index|quote}} table:\n{{names}}", url=self.es_cluster.url, index=alias, names=[ ".".join((c.es_index, c.name)) for c in abs_columns if c.cardinality == 0 ] ) with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, silent=not DEBUG): # LIST OF EVERY NESTED PATH query_paths = 
[[c.es_column] for c in abs_columns if c.es_type == "nested"] for a, b in itertools.product(query_paths, query_paths): aa = a[0] bb = b[0] if aa and bb.startswith(aa): for i, b_prefix in enumerate(b): if len(b_prefix) > len(aa): continue if aa == b_prefix: break # SPLIT ALREADY FOUND b.insert(i, aa) break for q in query_paths: q.append(".") query_paths.append(ROOT_PATH) # ENSURE ALL TABLES HAVE THE QUERY PATHS SET self.alias_to_query_paths[alias] = query_paths for i, a in self.index_to_alias.items(): if a == alias: self.alias_to_query_paths[i] = query_paths # ENSURE COLUMN HAS CORRECT jx_type # PICK DEEPEST NESTED PROPERTY AS REPRESENTATIVE output = [] best = {} for abs_column in abs_columns: abs_column.jx_type = jx_type(abs_column) if abs_column.jx_type not in STRUCT: clean_name = unnest_path(abs_column.name) other = best.get(clean_name) if other: if len(other.nested_path) < len(abs_column.nested_path): output.remove(other) self.meta.columns.update({"clear": ".", "where": {"eq": {"es_column": other.es_column, "es_index": other.es_index}}}) else: continue best[clean_name] = abs_column output.append(abs_column) # REGISTER ALL COLUMNS canonicals = [] for abs_column in output: canonical = self.meta.columns.add(abs_column) canonicals.append(canonical) self.todo.extend(canonicals) return canonicals def query(self, _query): return self.meta.columns.query(QueryOp(set_default( { "from": self.meta.columns, "sort": ["table", "name"] }, _query.__data__() ))) def _find_alias(self, name): if self.metadata_last_updated < self.es_cluster.metatdata_last_updated: for a in self.es_cluster.get_aliases(): self.index_to_alias[a.index] = coalesce(a.alias, a.index) self.alias_last_updated.setdefault(a.alias, Date.MIN) if name in self.alias_last_updated: return name else: return self.index_to_alias.get(name) def get_columns(self, table_name, column_name=None, after=None, timeout=None): """ RETURN METADATA COLUMNS :param table_name: TABLE WE WANT COLUMNS FOR :param column_name: OPTIONAL NAME, IF INTERESTED IN ONLY ONE COLUMN :param after: FORCE LOAD, WAITING FOR last_updated TO BE AFTER THIS TIME :param timeout: Signal; True when should give up :return: """ DEBUG and after and Log.note("getting columns for after {{time}}", time=after) table_path = split_field(table_name) root_table_name = table_path[0] alias = self._find_alias(root_table_name) if not alias: self.es_cluster.get_metadata(force=True) alias = self._find_alias(root_table_name) if not alias: Log.error("{{table|quote}} does not exist", table=table_name) try: table = self.get_table(alias)[0] # LAST TIME WE GOT INFO FOR THIS TABLE if not table: table = TableDesc( name=alias, url=None, query_path=["."], timestamp=Date.MIN ) with self.meta.tables.locker: self.meta.tables.add(table) columns = self._reload_columns(table) DEBUG and Log.note("columns from reload") elif after or table.timestamp < self.es_cluster.metatdata_last_updated: columns = self._reload_columns(table) DEBUG and Log.note("columns from reload") else: columns = self.meta.columns.find(alias, column_name) DEBUG and Log.note("columns from find()") DEBUG and Log.note("columns are {{ids}}", ids=[id(c) for c in columns]) columns = jx.sort(columns, "name") if after is None: return columns # DO NOT WAIT FOR COMPLETE COLUMNS # WAIT FOR THE COLUMNS TO UPDATE while True: pending = [c for c in columns if after >= c.last_updated or (c.cardinality == None and c.jx_type not in STRUCT)] if not pending: break if timeout: Log.error("trying to gets columns timed out") if DEBUG: if len(pending) > 10: 
Log.note("waiting for {{num}} columns to update by {{timestamp}}", num=len(pending), timestamp=after) else: Log.note("waiting for columns to update by {{timestamp}}; {{columns|json}}", timestamp=after, columns=[c.es_index + "." + c.es_column + " id="+text_type(id(c)) for c in pending]) Till(seconds=1).wait() return columns except Exception as e: Log.error("Failure to get columns for {{table}}", table=table_name, cause=e) return [] def _update_cardinality(self, column): """ QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN """ now = Date.now() if column.es_index in self.index_does_not_exist: return if column.jx_type in STRUCT: Log.error("not supported") try: if column.es_index == "meta.columns": partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.columns, column.es_column) if g[column.es_column] != None]) self.meta.columns.update({ "set": { "partitions": partitions, "count": len(self.meta.columns), "cardinality": len(partitions), "multi": 1, "last_updated": now }, "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}} }) return if column.es_index == "meta.tables": partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.tables, column.es_column) if g[column.es_column] != None]) self.meta.columns.update({ "set": { "partitions": partitions, "count": len(self.meta.tables), "cardinality": len(partitions), "multi": 1, "last_updated": now }, "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}} }) return es_index = column.es_index.split(".")[0] is_text = [cc for cc in self.meta.columns if cc.es_column == column.es_column and cc.es_type == "text"] if is_text: # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED result = self.es_cluster.post("/" + es_index + "/_search", data={ "aggs": { "count": {"filter": {"match_all": {}}} }, "size": 0 }) count = result.hits.total cardinality = max(1001, count) multi = 1001 elif column.es_column == "_id": result = self.es_cluster.post("/" + es_index + "/_search", data={ "query": {"match_all": {}}, "size": 0 }) count = cardinality = result.hits.total multi = 1 elif column.es_type == BOOLEAN: result = self.es_cluster.post("/" + es_index + "/_search", data={ "aggs": { "count": _counting_query(column) }, "size": 0 }) count = result.hits.total cardinality = 2 DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality) self.meta.columns.update({ "set": { "count": count, "cardinality": cardinality, "partitions": [False, True], "multi": 1, "last_updated": now }, "clear": ["partitions"], "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}} }) return else: es_query = { "aggs": { "count": _counting_query(column), "_filter": { "aggs": {"multi": {"max": {"script": "doc[" + quote(column.es_column) + "].values.size()"}}}, "filter": {"bool": {"should": [ {"range": {"etl.timestamp.~n~": {"gte": (Date.today() - WEEK)}}}, {"bool": {"must_not": {"exists": {"field": "etl.timestamp.~n~"}}}} ]}} } }, "size": 0 } result = self.es_cluster.post("/" + es_index + "/_search", data=es_query) agg_results = result.aggregations count = result.hits.total cardinality = coalesce(agg_results.count.value, agg_results.count._nested.value, agg_results.count.doc_count) multi = int(coalesce(agg_results._filter.multi.value, 1)) if cardinality == None: Log.error("logic error") query = Data(size=0) if column.es_column == "_id": self.meta.columns.update({ "set": { "count": cardinality, "cardinality": cardinality, 
"multi": 1, "last_updated": now }, "clear": ["partitions"], "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}} }) return elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99): DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality) self.meta.columns.update({ "set": { "count": count, "cardinality": cardinality, "multi": multi, "last_updated": now }, "clear": ["partitions"], "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}} }) return elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30: DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality) self.meta.columns.update({ "set": { "count": count, "cardinality": cardinality, "multi": multi, "last_updated": now }, "clear": ["partitions"], "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}} }) return elif len(column.nested_path) != 1: query.aggs["_"] = { "nested": {"path": column.nested_path[0]}, "aggs": {"_nested": {"terms": {"field": column.es_column}}} } elif cardinality == 0: # WHEN DOES THIS HAPPEN? query.aggs["_"] = {"terms": {"field": column.es_column}} else: query.aggs["_"] = {"terms": {"field": column.es_column, "size": cardinality}} result = self.es_cluster.post("/" + es_index + "/_search", data=query) aggs = result.aggregations._ if aggs._nested: parts = jx.sort(aggs._nested.buckets.key) else: parts = jx.sort(aggs.buckets.key) DEBUG and Log.note("update metadata for {{column.es_index}}.{{column.es_column}} (id={{id}}) at {{time}}", id=id(column), column=column, time=now) self.meta.columns.update({ "set": { "count": count, "cardinality": cardinality, "multi": multi, "partitions": parts, "last_updated": now }, "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}} }) except Exception as e: # CAN NOT IMPORT: THE TEST MODULES SETS UP LOGGING # from tests.test_jx import TEST_TABLE e = Except.wrap(e) TEST_TABLE = "testdata" is_missing_index = any(w in e for w in ["IndexMissingException", "index_not_found_exception"]) is_test_table = column.es_index.startswith((TEST_TABLE_PREFIX, TEST_TABLE)) if is_missing_index: # WE EXPECT TEST TABLES TO DISAPPEAR Log.warning("Missing index {{col.es_index}}", col=column, cause=e) self.meta.columns.update({ "clear": ".", "where": {"eq": {"es_index": column.es_index}} }) self.index_does_not_exist.add(column.es_index) elif "No field found for" in e: self.meta.columns.update({ "clear": ".", "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}} }) Log.warning("Could not get column {{col.es_index}}.{{col.es_column}} info", col=column, cause=e) else: self.meta.columns.update({ "set": { "last_updated": now }, "clear": [ "count", "cardinality", "multi", "partitions", ], "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}} }) Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e) def monitor(self, please_stop): please_stop.on_go(lambda: self.todo.add(THREAD_STOP)) while not please_stop: try: if not self.todo: old_columns = [ c for c in self.meta.columns if ((c.last_updated < Date.now() - MAX_COLUMN_METADATA_AGE) or c.cardinality == None) and c.jx_type not in STRUCT ] if old_columns: DEBUG and Log.note( "Old columns {{names|json}} last updated {{dates|json}}", names=wrap(old_columns).es_column, dates=[Date(t).format() 
for t in wrap(old_columns).last_updated] ) self.todo.extend(old_columns) else: DEBUG and Log.note("no more metatdata to update") column = self.todo.pop(Till(seconds=(10*MINUTE).seconds)) if column: if column is THREAD_STOP: continue with Timer("update {{table}}.{{column}}", param={"table": column.es_index, "column": column.es_column}, silent=not DEBUG): if column.es_index in self.index_does_not_exist: DEBUG and Log.note("{{column.es_column}} does not exist", column=column) self.meta.columns.update({ "clear": ".", "where": {"eq": {"es_index": column.es_index}} }) continue if column.jx_type in STRUCT or split_field(column.es_column)[-1] == EXISTS_TYPE: DEBUG and Log.note("{{column.es_column}} is a struct", column=column) column.last_updated = Date.now() continue elif column.last_updated > Date.now() - TOO_OLD and column.cardinality is not None: # DO NOT UPDATE FRESH COLUMN METADATA DEBUG and Log.note("{{column.es_column}} is still fresh ({{ago}} ago)", column=column, ago=(Date.now()-Date(column.last_updated)).seconds) continue try: self._update_cardinality(column) (DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX)) and Log.note("updated {{column.name}}", column=column) except Exception as e: if '"status":404' in e: self.meta.columns.update({ "clear": ".", "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}} }) else: Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e) except Exception as e: Log.warning("problem in cardinality monitor", cause=e) def not_monitor(self, please_stop): Log.alert("metadata scan has been disabled") please_stop.on_go(lambda: self.todo.add(THREAD_STOP)) while not please_stop: column = self.todo.pop() if column == THREAD_STOP: break if column.jx_type in STRUCT or split_field(column.es_column)[-1] == EXISTS_TYPE: DEBUG and Log.note("{{column.es_column}} is a struct", column=column) column.last_updated = Date.now() continue elif column.last_updated > Date.now() - TOO_OLD and column.cardinality is not None: # DO NOT UPDATE FRESH COLUMN METADATA DEBUG and Log.note("{{column.es_column}} is still fresh ({{ago}} ago)", column=column, ago=(Date.now()-Date(column.last_updated)).seconds) continue with Timer("Update {{col.es_index}}.{{col.es_column}}", param={"col": column}, silent=not DEBUG, too_long=0.05): if untype_path(column.name) in ["build.type", "run.type"]: try: self._update_cardinality(column) except Exception as e: Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e) else: column.last_updated = Date.now() def get_table(self, name): if name == "meta.columns": return self.meta.columns with self.meta.tables.locker: return wrap([t for t in self.meta.tables.data if t.name == name]) def get_snowflake(self, fact_table_name): return Snowflake(fact_table_name, self) def get_schema(self, name): if name == "meta.columns": return self.meta.columns.schema if name == "meta.tables": return self.meta.tables root, rest = tail_field(name) return self.get_snowflake(root).get_schema(rest)
class ColumnList(Table, jx_base.Container): """ OPTIMIZED FOR THE PARTICULAR ACCESS PATTERNS USED """ def __init__(self, es_cluster): Table.__init__(self, META_COLUMNS_NAME) self.data = {} # MAP FROM ES_INDEX TO (abs_column_name to COLUMNS) self.locker = Lock() self._schema = None self.dirty = False self.es_cluster = es_cluster self.es_index = None self.last_load = Null self.todo = Queue( "update columns to es" ) # HOLD (action, column) PAIR, WHERE action in ['insert', 'update'] self._db_load() Thread.run("update " + META_COLUMNS_NAME, self._update_from_es, parent_thread=MAIN_THREAD) def _query(self, query): result = Data() curr = self.es_cluster.execute(query) result.meta.format = "table" result.header = [d[0] for d in curr.description ] if curr.description else None result.data = curr.fetchall() return result def _db_create(self): schema = { "settings": { "index.number_of_shards": 1, "index.number_of_replicas": 6 }, "mappings": { META_COLUMNS_TYPE_NAME: {} }, } self.es_index = self.es_cluster.create_index(id=ID, index=META_COLUMNS_NAME, schema=schema) self.es_index.add_alias(META_COLUMNS_NAME) for c in META_COLUMNS_DESC.columns: self._add(c) self.es_index.add({"value": c.__dict__()}) def _db_load(self): self.last_load = Date.now() try: self.es_index = self.es_cluster.get_index( id=ID, index=META_COLUMNS_NAME, type=META_COLUMNS_TYPE_NAME, read_only=False) result = self.es_index.search({ "query": { "bool": { "should": [ { "bool": { "must_not": { "exists": { "field": "cardinality.~n~" } } } }, { # ASSUME UNUSED COLUMNS DO NOT EXIST "range": { "cardinality.~n~": { "gt": 0 } } }, ] } }, "sort": ["es_index.~s~", "name.~s~", "es_column.~s~"], "size": 10000, }) Log.note("{{num}} columns loaded", num=result.hits.total) with self.locker: for r in result.hits.hits._source: self._add(doc_to_column(r)) except Exception as e: Log.warning("no {{index}} exists, making one", index=META_COLUMNS_NAME, cause=e) self._db_create() def _update_from_es(self, please_stop): try: last_extract = Date.now() while not please_stop: now = Date.now() try: if (now - last_extract).seconds > COLUMN_EXTRACT_PERIOD: result = self.es_index.search({ "query": { "range": { "last_updated.~n~": { "gte": self.last_load } } }, "sort": ["es_index.~s~", "name.~s~", "es_column.~s~"], "from": 0, "size": 10000, }) last_extract = now with self.locker: for r in result.hits.hits._source: c = doc_to_column(r) self._add(c) self.last_load = MAX( (self.last_load, c.last_updated)) while not please_stop: updates = self.todo.pop_all() if not updates: break DEBUG and updates and Log.note( "{{num}} columns to push to db", num=len(updates)) self.es_index.extend([{ "value": column.__dict__() } for column in updates]) except Exception as e: Log.warning("problem updating database", cause=e) (Till(seconds=COLUMN_LOAD_PERIOD) | please_stop).wait() finally: Log.note("done") def find(self, es_index, abs_column_name=None): with self.locker: if es_index.startswith("meta."): self._update_meta() if not abs_column_name: return [ c for cs in self.data.get(es_index, {}).values() for c in cs ] else: return self.data.get(es_index, {}).get(abs_column_name, []) def extend(self, columns): self.dirty = True with self.locker: for column in columns: self._add(column) def add(self, column): self.dirty = True with self.locker: canonical = self._add(column) if canonical == None: return column # ALREADY ADDED self.todo.add(canonical) return canonical def remove_table(self, table_name): del self.data[table_name] def _add(self, column): """ :param column: ANY COLUMN OBJECT 
:return: None IF column IS canonical ALREADY (NET-ZERO EFFECT) """ columns_for_table = self.data.setdefault(column.es_index, {}) existing_columns = columns_for_table.setdefault(column.name, []) for canonical in existing_columns: if canonical is column: return None if canonical.es_type == column.es_type: if column.last_updated > canonical.last_updated: for key in Column.__slots__: old_value = canonical[key] new_value = column[key] if new_value == old_value: pass # NO NEED TO UPDATE WHEN NO CHANGE MADE (COMMON CASE) else: canonical[key] = new_value return canonical existing_columns.append(column) return column def _update_meta(self): if not self.dirty: return now = Date.now() for mc in META_COLUMNS_DESC.columns: count = 0 values = set() objects = 0 multi = 1 for column in self._all_columns(): value = column[mc.name] if value == None: pass else: count += 1 if is_list(value): multi = max(multi, len(value)) try: values |= set(value) except Exception: objects += len(value) elif is_data(value): objects += 1 else: values.add(value) mc.count = count mc.cardinality = len(values) + objects mc.partitions = jx.sort(values) mc.multi = multi mc.last_updated = now META_COLUMNS_DESC.last_updated = now self.dirty = False def _all_columns(self): return [ column for t, cs in self.data.items() for _, css in cs.items() for column in css ] def __iter__(self): with self.locker: self._update_meta() return iter(self._all_columns()) def __len__(self): return self.data[META_COLUMNS_NAME]["es_index"].count def update(self, command): self.dirty = True try: command = wrap(command) DEBUG and Log.note( "Update {{timestamp}}: {{command|json}}", command=command, timestamp=Date(command["set"].last_updated), ) eq = command.where.eq if eq.es_index: if len(eq) == 1: if unwraplist(command.clear) == ".": d = self.data i = eq.es_index with self.locker: cols = d[i] del d[i] for c in cols: mark_as_deleted(c) self.todo.add(c) return # FASTEST all_columns = self.data.get(eq.es_index, {}).values() with self.locker: columns = [c for cs in all_columns for c in cs] elif eq.es_column and len(eq) == 2: # FASTER all_columns = self.data.get(eq.es_index, {}).values() with self.locker: columns = [ c for cs in all_columns for c in cs if c.es_column == eq.es_column ] else: # SLOWER all_columns = self.data.get(eq.es_index, {}).values() with self.locker: columns = [ c for cs in all_columns for c in cs if all(c[k] == v for k, v in eq.items()) # THIS LINE IS VERY SLOW ] else: columns = list(self) columns = jx.filter(columns, command.where) with self.locker: for col in columns: DEBUG and Log.note( "update column {{table}}.{{column}}", table=col.es_index, column=col.es_column, ) for k in command["clear"]: if k == ".": mark_as_deleted(col) self.todo.add(col) lst = self.data[col.es_index] cols = lst[col.name] cols.remove(col) if len(cols) == 0: del lst[col.name] if len(lst) == 0: del self.data[col.es_index] break else: col[k] = None else: # DID NOT DELETE COLUMNM ("."), CONTINUE TO SET PROPERTIES for k, v in command.set.items(): col[k] = v self.todo.add(col) except Exception as e: Log.error("should not happen", cause=e) def query(self, query): # NOT EXPECTED TO BE RUN Log.error("not") with self.locker: self._update_meta() if not self._schema: self._schema = Schema(".", [ c for cs in self.data[META_COLUMNS_NAME].values() for c in cs ]) snapshot = self._all_columns() from jx_python.containers.list_usingPythonList import ListContainer query.frum = ListContainer(META_COLUMNS_NAME, snapshot, self._schema) return jx.run(query) def groupby(self, keys): with 
self.locker: self._update_meta() return jx.groupby(self.__iter__(), keys) def window(self, window): raise NotImplemented() @property def schema(self): if not self._schema: with self.locker: self._update_meta() self._schema = Schema(".", [ c for cs in self.data[META_COLUMNS_NAME].values() for c in cs ]) return self._schema @property def namespace(self): return self def get_table(self, table_name): if table_name != META_COLUMNS_NAME: Log.error("this container has only the " + META_COLUMNS_NAME) return self def get_columns(self, table_name): if table_name != META_COLUMNS_NAME: Log.error("this container has only the " + META_COLUMNS_NAME) return self._all_columns() def denormalized(self): """ THE INTERNAL STRUCTURE FOR THE COLUMN METADATA IS VERY DIFFERENT FROM THE DENORMALIZED PERSPECITVE. THIS PROVIDES THAT PERSPECTIVE FOR QUERIES """ with self.locker: self._update_meta() output = [ { "table": c.es_index, "name": untype_path(c.name), "cardinality": c.cardinality, "es_column": c.es_column, "es_index": c.es_index, "last_updated": c.last_updated, "count": c.count, "nested_path": [unnest_path(n) for n in c.nested_path], "es_type": c.es_type, "type": c.jx_type, } for tname, css in self.data.items() for cname, cs in css.items() for c in cs if c.jx_type not in STRUCT # and c.es_column != "_id" ] from jx_python.containers.list_usingPythonList import ListContainer return ListContainer( self.name, data=output, schema=jx_base.Schema(META_COLUMNS_NAME, SIMPLE_METADATA_COLUMNS), )
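# The todo queue plus pop_all() loop in ColumnList._update_from_es above is the batching
# idiom used here: producers add() single column updates, and a background thread drains
# everything available in one pass so it can issue a single bulk write. A stripped-down,
# hedged sketch of that idiom; push_batch() is a hypothetical bulk writer, while Queue,
# Thread, and Till come from mo_threads as in the surrounding code.
from mo_threads import Queue, Thread, Till

todo = Queue("pending column updates")


def _pusher(please_stop):
    while not please_stop:
        updates = todo.pop_all()          # EVERYTHING CURRENTLY QUEUED, MAY BE EMPTY
        if updates:
            push_batch(updates)           # ONE BULK WRITE INSTEAD OF MANY SMALL ONES
        (Till(seconds=10) | please_stop).wait()


Thread.run("column pusher", _pusher)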
class Clogger: # Singleton of the look-ahead scanner Clogger SINGLE_CLOGGER = None def __new__(cls, *args, **kwargs): if cls.SINGLE_CLOGGER is None: cls.SINGLE_CLOGGER = object.__new__(cls) return cls.SINGLE_CLOGGER def __init__(self, conn=None, tuid_service=None, start_workers=True, new_table=False, kwargs=None): try: self.config = kwargs self.conn = conn if conn else sql.Sql(self.config.database.name) self.hg_cache = HgMozillaOrg( kwargs=self.config.hg_cache, use_cache=True) if self.config.hg_cache else Null self.tuid_service = tuid_service if tuid_service else tuid.service.TUIDService( kwargs=self.config.tuid, conn=self.conn, clogger=self) self.rev_locker = Lock() self.working_locker = Lock() if new_table: with self.conn.transaction() as t: t.execute("DROP TABLE IF EXISTS csetLog") self.init_db() self.next_revnum = coalesce( self.conn.get_one("SELECT max(revnum)+1 FROM csetLog")[0], 1) self.csets_todo_backwards = Queue( name="Clogger.csets_todo_backwards") self.deletions_todo = Queue(name="Clogger.deletions_todo") self.maintenance_signal = Signal(name="Clogger.maintenance_signal") if 'tuid' in self.config: self.config = self.config.tuid self.disable_backfilling = False self.disable_tipfilling = False self.disable_deletion = False self.disable_maintenance = False self.backfill_thread = None self.tipfill_thread = None self.deletion_thread = None self.maintenance_thread = None # Make sure we are filled before allowing queries numrevs = self.conn.get_one("SELECT count(revnum) FROM csetLog")[0] if numrevs < MINIMUM_PERMANENT_CSETS: Log.note("Filling in csets to hold {{minim}} csets.", minim=MINIMUM_PERMANENT_CSETS) oldest_rev = 'tip' with self.conn.transaction() as t: tmp = t.query( "SELECT min(revnum), revision FROM csetLog").data[0][1] if tmp: oldest_rev = tmp self._fill_in_range(MINIMUM_PERMANENT_CSETS - numrevs, oldest_rev, timestamp=False) Log.note("Table is filled with atleast {{minim}} entries.", minim=MINIMUM_PERMANENT_CSETS) if start_workers: self.start_workers() except Exception as e: Log.warning("Cannot setup clogger: {{cause}}", cause=str(e)) def start_backfilling(self): if not self.backfill_thread: self.backfill_thread = Thread.run('clogger-backfill', self.fill_backward_with_list) def start_tipfillling(self): if not self.tipfill_thread: self.tipfill_thread = Thread.run('clogger-tip', self.fill_forward_continuous) def start_maintenance(self): if not self.maintenance_thread: self.maintenance_thread = Thread.run('clogger-maintenance', self.csetLog_maintenance) def start_deleter(self): if not self.deletion_thread: self.deletion_thread = Thread.run('clogger-deleter', self.csetLog_deleter) def start_workers(self): self.start_tipfillling() self.start_backfilling() self.start_maintenance() self.start_deleter() Log.note("Started clogger workers.") def init_db(self): with self.conn.transaction() as t: t.execute(''' CREATE TABLE IF NOT EXISTS csetLog ( revnum INTEGER PRIMARY KEY, revision CHAR(12) NOT NULL, timestamp INTEGER );''') def disable_all(self): self.disable_tipfilling = True self.disable_backfilling = True self.disable_maintenance = True self.disable_deletion = True def revnum(self): """ :return: max revnum that was added """ return coalesce( self.conn.get_one("SELECT max(revnum) as revnum FROM csetLog")[0], 0) def get_tip(self, transaction): return transaction.get_one( "SELECT max(revnum) as revnum, revision FROM csetLog") def get_tail(self, transaction): return transaction.get_one( "SELECT min(revnum) as revnum, revision FROM csetLog") def _get_clog(self, clog_url): try: 
Log.note("Searching through changelog {{url}}", url=clog_url) clog_obj = http.get_json(clog_url, retry=RETRY) return clog_obj except Exception as e: Log.error( "Unexpected error getting changset-log for {{url}}: {{error}}", url=clog_url, error=e) def _get_one_revision(self, transaction, cset_entry): # Returns a single revision if it exists _, rev, _ = cset_entry return transaction.get_one( "SELECT revision FROM csetLog WHERE revision=?", (rev, )) def _get_one_revnum(self, transaction, rev): # Returns a single revnum if it exists return transaction.get_one( "SELECT revnum FROM csetLog WHERE revision=?", (rev, )) def _get_revnum_range(self, transaction, revnum1, revnum2): # Returns a range of revision numbers (that is inclusive) high_num = max(revnum1, revnum2) low_num = min(revnum1, revnum2) return transaction.query("SELECT revnum, revision FROM csetLog WHERE " "revnum >= " + str(low_num) + " AND revnum <= " + str(high_num)).data def recompute_table_revnums(self): ''' Recomputes the revnums for the csetLog table by creating a new table, and copying csetLog to it. The INTEGER PRIMARY KEY in the temp table auto increments as rows are added. IMPORTANT: Only call this after acquiring the lock `self.working_locker`. :return: ''' with self.conn.transaction() as t: t.execute(''' CREATE TABLE temp ( revnum INTEGER PRIMARY KEY, revision CHAR(12) NOT NULL, timestamp INTEGER );''') t.execute( "INSERT INTO temp (revision, timestamp) " "SELECT revision, timestamp FROM csetlog ORDER BY revnum ASC") t.execute("DROP TABLE csetLog;") t.execute("ALTER TABLE temp RENAME TO csetLog;") def check_for_maintenance(self): ''' Returns True if the maintenance worker should be run now, and False otherwise. :return: ''' numrevs = self.conn.get_one("SELECT count(revnum) FROM csetLog")[0] Log.note("Number of csets in csetLog table: {{num}}", num=numrevs) if numrevs >= SIGNAL_MAINTENANCE_CSETS: return True return False def add_cset_entries(self, ordered_rev_list, timestamp=False, number_forward=True): ''' Adds a list of revisions to the table. Assumes ordered_rev_list is an ordered based on how changesets are found in the changelog. Going forwards or backwards is dealt with by flipping the list :param ordered_cset_list: Order given from changeset log searching. :param timestamp: If false, records are kept indefinitely but if holes exist: (delete, None, delete, None) those delete's with None's around them will not be deleted. 
:param numbered: If True, this function will number the revision list by going forward from max(revNum), else it'll go backwards from revNum, then add X to all revnums and self.next_revnum where X is the length of ordered_rev_list :return: ''' with self.conn.transaction() as t: current_min = t.get_one("SELECT min(revnum) FROM csetlog")[0] current_max = t.get_one("SELECT max(revnum) FROM csetlog")[0] if not current_min or not current_max: current_min = 0 current_max = 0 direction = -1 start = current_min - 1 if number_forward: direction = 1 start = current_max + 1 ordered_rev_list = ordered_rev_list[::-1] insert_list = [(start + direction * count, rev, int(time.time()) if timestamp else -1) for count, rev in enumerate(ordered_rev_list)] # In case of overlapping requests fmt_insert_list = [] for cset_entry in insert_list: tmp = self._get_one_revision(t, cset_entry) if not tmp: fmt_insert_list.append(cset_entry) for _, tmp_insert_list in jx.groupby(fmt_insert_list, size=SQL_CSET_BATCH_SIZE): t.execute( "INSERT INTO csetLog (revnum, revision, timestamp)" + " VALUES " + sql_list( quote_set((revnum, revision, timestamp)) for revnum, revision, timestamp in tmp_insert_list)) # Move the revision numbers forward if needed self.recompute_table_revnums() # Start a maintenance run if needed if self.check_for_maintenance(): Log.note("Scheduling maintenance run on clogger.") self.maintenance_signal.go() def _fill_in_range(self, parent_cset, child_cset, timestamp=False, number_forward=True): ''' Fills cset logs in a certain range. 'parent_cset' can be an int and in that case, we get that many changesets instead. If parent_cset is an int, then we consider that we are going backwards (number_forward is False) and we ignore the first changeset of the first log, and we ignore the setting for number_forward. Otherwise, we continue until we find the given 'parent_cset'. :param parent_cset: :param child_cset: :param timestamp: :param number_forward: :return: ''' csets_to_add = [] found_parent = False find_parent = False if type(parent_cset) != int: find_parent = True elif parent_cset >= MAX_BACKFILL_CLOGS * CHANGESETS_PER_CLOG: Log.warning( "Requested number of new changesets {{num}} is too high. " "Max number that can be requested is {{maxnum}}.", num=parent_cset, maxnum=MAX_BACKFILL_CLOGS * CHANGESETS_PER_CLOG) return None csets_found = 0 clogs_seen = 0 final_rev = child_cset while not found_parent and clogs_seen < MAX_BACKFILL_CLOGS: clog_url = str( HG_URL ) + "/" + self.config.hg.branch + "/json-log/" + final_rev clog_obj = self._get_clog(clog_url) clog_csets_list = list(clog_obj['changesets']) for clog_cset in clog_csets_list[:-1]: if not number_forward and csets_found <= 0: # Skip this entry it already exists csets_found += 1 continue nodes_cset = clog_cset['node'][:12] if find_parent: if nodes_cset == parent_cset: found_parent = True if not number_forward: # When going forward this entry is # the given parent csets_to_add.append(nodes_cset) break else: if csets_found + 1 > parent_cset: found_parent = True if not number_forward: # When going forward this entry is # the given parent (which is supposed # to already exist) csets_to_add.append(nodes_cset) break csets_found += 1 csets_to_add.append(nodes_cset) if found_parent == True: break clogs_seen += 1 final_rev = clog_csets_list[-1]['node'][:12] if found_parent: self.add_cset_entries(csets_to_add, timestamp=timestamp, number_forward=number_forward) else: Log.warning( "Couldn't find the end of the request for {{request}}. 
" "Max number that can be requested through _fill_in_range is {{maxnum}}.", request={ 'parent_cset': parent_cset, 'child_cset': child_cset, 'number_forward': number_forward }, maxnum=MAX_BACKFILL_CLOGS * CHANGESETS_PER_CLOG) return None return csets_to_add def initialize_to_range(self, old_rev, new_rev, delete_old=True): ''' Used in service testing to get to very old changesets quickly. :param old_rev: The oldest revision to keep :param new_rev: The revision to start searching from :return: ''' old_settings = [ self.disable_tipfilling, self.disable_backfilling, self.disable_maintenance, self.disable_deletion ] self.disable_tipfilling = True self.disable_backfilling = True self.disable_maintenance = True self.disable_deletion = True old_rev = old_rev[:12] new_rev = new_rev[:12] with self.working_locker: if delete_old: with self.conn.transaction() as t: t.execute("DELETE FROM csetLog") with self.conn.transaction() as t: t.execute("INSERT INTO csetLog (revision, timestamp) VALUES " + quote_set((new_rev, -1))) self._fill_in_range(old_rev, new_rev, timestamp=True, number_forward=False) self.disable_tipfilling = old_settings[0] self.disable_backfilling = old_settings[1] self.disable_maintenance = old_settings[2] self.disable_deletion = old_settings[3] def fill_backward_with_list(self, please_stop=None): ''' Expects requests of the tuple form: (parent_cset, timestamp) parent_cset can be an int X to go back by X changesets, or a string to search for going backwards in time. If timestamp is false, no timestamps will be added to the entries. :param please_stop: :return: ''' while not please_stop: try: request = self.csets_todo_backwards.pop(till=please_stop) if please_stop: break # If backfilling is disabled, all requests # are ignored. if self.disable_backfilling: Till(till=CSET_BACKFILL_WAIT_TIME).wait() continue if request: parent_cset, timestamp = request else: continue with self.working_locker: with self.conn.transaction() as t: parent_revnum = self._get_one_revnum(t, parent_cset) if parent_revnum: continue with self.conn.transaction() as t: _, oldest_revision = self.get_tail(t) self._fill_in_range(parent_cset, oldest_revision, timestamp=timestamp, number_forward=False) Log.note("Finished {{cset}}", cset=parent_cset) except Exception as e: Log.warning("Unknown error occurred during backfill: ", cause=e) def update_tip(self): ''' Returns False if the tip is already at the newest, or True if an update has taken place. :return: ''' clog_obj = self._get_clog( str(HG_URL) + "/" + self.config.hg.branch + "/json-log/tip") # Get current tip in DB with self.conn.transaction() as t: _, newest_known_rev = self.get_tip(t) # If we are still at the newest, wait for CSET_TIP_WAIT_TIME seconds # before checking again. first_clog_entry = clog_obj['changesets'][0]['node'][:12] if newest_known_rev == first_clog_entry: return False csets_to_gather = None if not newest_known_rev: Log.note( "No revisions found in table, adding {{minim}} entries...", minim=MINIMUM_PERMANENT_CSETS) csets_to_gather = MINIMUM_PERMANENT_CSETS found_newest_known = False csets_to_add = [] csets_found = 0 clogs_seen = 0 Log.note("Found new revisions. 
Updating csetLog tip to {{rev}}...", rev=first_clog_entry) while not found_newest_known and clogs_seen < MAX_TIPFILL_CLOGS: clog_csets_list = list(clog_obj['changesets']) for clog_cset in clog_csets_list[:-1]: nodes_cset = clog_cset['node'][:12] if not csets_to_gather: if nodes_cset == newest_known_rev: found_newest_known = True break else: if csets_found >= csets_to_gather: found_newest_known = True break csets_found += 1 csets_to_add.append(nodes_cset) if not found_newest_known: # Get the next page clogs_seen += 1 final_rev = clog_csets_list[-1]['node'][:12] clog_url = str( HG_URL ) + "/" + self.config.hg.branch + "/json-log/" + final_rev clog_obj = self._get_clog(clog_url) if clogs_seen >= MAX_TIPFILL_CLOGS: Log.error( "Too many changesets, can't find last tip or the number is too high: {{rev}}. " "Maximum possible to request is {{maxnum}}", rev=coalesce(newest_known_rev, csets_to_gather), maxnum=MAX_TIPFILL_CLOGS * CHANGESETS_PER_CLOG) return False with self.working_locker: Log.note("Adding {{csets}}", csets=csets_to_add) self.add_cset_entries(csets_to_add, timestamp=False) return True def fill_forward_continuous(self, please_stop=None): while not please_stop: try: while not please_stop and not self.disable_tipfilling and self.update_tip( ): pass (please_stop | Till(seconds=CSET_TIP_WAIT_TIME)).wait() except Exception as e: Log.warning("Unknown error occurred during tip filling:", cause=e) def csetLog_maintenance(self, please_stop=None): ''' Handles deleting old csetLog entries and timestamping revisions once they pass the length for permanent storage for deletion later. :param please_stop: :return: ''' while not please_stop: try: # Wait until something signals the maintenance cycle # to begin (or end). (self.maintenance_signal | please_stop).wait() if please_stop: break if self.disable_maintenance: continue Log.warning( "Starting clog maintenance. Since this doesn't start often, " "we need to explicitly see when it's started with this warning." ) # Reset signal so we don't request # maintenance infinitely. with self.maintenance_signal.lock: self.maintenance_signal._go = False with self.working_locker: all_data = None with self.conn.transaction() as t: all_data = sorted(t.get( "SELECT revnum, revision, timestamp FROM csetLog"), key=lambda x: int(x[0])) # Restore maximum permanents (if overflowing) new_data = [] modified = False for count, (revnum, revision, timestamp) in enumerate(all_data[::-1]): if count < MINIMUM_PERMANENT_CSETS: if timestamp != -1: modified = True new_data.append((revnum, revision, -1)) else: new_data.append((revnum, revision, timestamp)) elif type(timestamp) != int or timestamp == -1: modified = True new_data.append( (revnum, revision, int(time.time()))) else: new_data.append((revnum, revision, timestamp)) # Delete annotations at revisions with timestamps # that are too old. The csetLog entries will have # their timestamps reset here. new_data1 = [] annrevs_to_del = [] current_time = time.time() for count, (revnum, revision, timestamp) in enumerate(new_data[::-1]): new_timestamp = timestamp if timestamp != -1: if current_time >= timestamp + TIME_TO_KEEP_ANNOTATIONS.seconds: modified = True new_timestamp = current_time annrevs_to_del.append(revision) new_data1.append((revnum, revision, new_timestamp)) if len(annrevs_to_del) > 0: # Delete any latestFileMod and annotation entries # that are too old. 
Log.note( "Deleting annotations and latestFileMod for revisions for being " "older than {{oldest}}: {{revisions}}", oldest=TIME_TO_KEEP_ANNOTATIONS, revisions=annrevs_to_del) with self.conn.transaction() as t: t.execute( "DELETE FROM latestFileMod WHERE revision IN " + quote_set(annrevs_to_del)) t.execute( "DELETE FROM annotations WHERE revision IN " + quote_set(annrevs_to_del)) # Delete any overflowing entries new_data2 = new_data1 reved_all_data = all_data[::-1] deleted_data = reved_all_data[MAXIMUM_NONPERMANENT_CSETS:] delete_overflowing_revstart = None if len(deleted_data) > 0: _, delete_overflowing_revstart, _ = deleted_data[0] new_data2 = set(all_data) - set(deleted_data) # Update old frontiers if requested, otherwise # they will all get deleted by the csetLog_deleter # worker if UPDATE_VERY_OLD_FRONTIERS: _, max_revision, _ = all_data[-1] for _, revision, _ in deleted_data: with self.conn.transaction() as t: old_files = t.get( "SELECT file FROM latestFileMod WHERE revision=?", (revision, )) if old_files is None or len(old_files) <= 0: continue self.tuid_service.get_tuids_from_files( old_files, max_revision, going_forward=True, ) still_exist = True while still_exist and not please_stop: Till(seconds=TUID_EXISTENCE_WAIT_TIME ).wait() with self.conn.transaction() as t: old_files = t.get( "SELECT file FROM latestFileMod WHERE revision=?", (revision, )) if old_files is None or len( old_files) <= 0: still_exist = False # Update table and schedule a deletion if modified: with self.conn.transaction() as t: insert_into_db_chunked( t, new_data2, "INSERT OR REPLACE INTO csetLog (revnum, revision, timestamp) VALUES " ) if not deleted_data: continue Log.note("Scheduling {{num_csets}} for deletion", num_csets=len(deleted_data)) self.deletions_todo.add(delete_overflowing_revstart) except Exception as e: Log.warning( "Unexpected error occured while maintaining csetLog, continuing to try: ", cause=e) return def csetLog_deleter(self, please_stop=None): ''' Deletes changesets from the csetLog table and also changesets from the annotation table that have revisions matching the given changesets. Accepts lists of csets from self.deletions_todo. :param please_stop: :return: ''' while not please_stop: try: request = self.deletions_todo.pop(till=please_stop) if please_stop: break # If deletion is disabled, ignore the current # request - it will need to be re-requested. if self.disable_deletion: Till(till=CSET_DELETION_WAIT_TIME).wait() continue with self.working_locker: first_cset = request # Since we are deleting and moving stuff around in the # TUID tables, we need everything to be contained in # one transaction with no interruptions. with self.conn.transaction() as t: revnum = self._get_one_revnum(t, first_cset)[0] csets_to_del = t.get( "SELECT revnum, revision FROM csetLog WHERE revnum <= ?", (revnum, )) csets_to_del = [cset for _, cset in csets_to_del] existing_frontiers = t.query( "SELECT revision FROM latestFileMod WHERE revision IN " + quote_set(csets_to_del)).data existing_frontiers = [ existing_frontiers[i][0] for i, _ in enumerate(existing_frontiers) ] Log.note( "Deleting all annotations and changeset log entries with revisions in the list: {{csets}}", csets=csets_to_del) if len(existing_frontiers) > 0: # This handles files which no longer exist anymore in # the main branch. 
Log.note( "Deleting existing frontiers for revisions: {{revisions}}", revisions=existing_frontiers) t.execute( "DELETE FROM latestFileMod WHERE revision IN " + quote_set(existing_frontiers)) Log.note("Deleting annotations...") t.execute( "DELETE FROM annotations WHERE revision IN " + quote_set(csets_to_del)) Log.note("Deleting {{num_entries}} csetLog entries...", num_entries=len(csets_to_del)) t.execute("DELETE FROM csetLog WHERE revision IN " + quote_set(csets_to_del)) # Recalculate the revnums self.recompute_table_revnums() except Exception as e: Log.warning( "Unexpected error occured while deleting from csetLog:", cause=e) Till(seconds=CSET_DELETION_WAIT_TIME).wait() return def get_old_cset_revnum(self, revision): self.csets_todo_backwards.add((revision, True)) revnum = None timeout = Till(seconds=BACKFILL_REVNUM_TIMEOUT) while not timeout: with self.conn.transaction() as t: revnum = self._get_one_revnum(t, revision) if revnum and revnum[0] >= 0: break elif revnum[0] < 0: Log.note("Waiting for table to recompute...") else: Log.note("Waiting for backfill to complete...") Till(seconds=CSET_BACKFILL_WAIT_TIME).wait() if timeout: Log.error( "Cannot find revision {{rev}} after waiting {{timeout}} seconds", rev=revision, timeout=BACKFILL_REVNUM_TIMEOUT) return revnum def get_revnnums_from_range(self, revision1, revision2): with self.conn.transaction() as t: revnum1 = self._get_one_revnum(t, revision1) revnum2 = self._get_one_revnum(t, revision2) if not revnum1 or not revnum2: did_an_update = self.update_tip() if did_an_update: with self.conn.transaction() as t: revnum1 = self._get_one_revnum(t, revision1) revnum2 = self._get_one_revnum(t, revision2) if not revnum1: revnum1 = self.get_old_cset_revnum(revision1) # Refresh the second entry with self.conn.transaction() as t: revnum2 = self._get_one_revnum(t, revision2) if not revnum2: revnum2 = self.get_old_cset_revnum(revision2) # The first revnum might change also with self.conn.transaction() as t: revnum1 = self._get_one_revnum(t, revision1) with self.conn.transaction() as t: result = self._get_revnum_range(t, revnum1[0], revnum2[0]) return sorted(result, key=lambda x: int(x[0]))
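# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the service code): recompute_table_revnums()
# relies on SQLite assigning consecutive INTEGER PRIMARY KEY values while the
# rows are copied, in revnum order, into a fresh table.  The stand-alone
# snippet below demonstrates that behaviour on an in-memory database; the
# sample revisions are made up.
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute(
    "CREATE TABLE csetLog (revnum INTEGER PRIMARY KEY, revision CHAR(12) NOT NULL, timestamp INTEGER)"
)
# revnums with gaps and negative values, as left behind by backfills/deletions
conn.executemany(
    "INSERT INTO csetLog (revnum, revision, timestamp) VALUES (?, ?, ?)",
    [(-3, "aaaaaaaaaaaa", -1), (5, "bbbbbbbbbbbb", -1), (9, "cccccccccccc", -1)],
)
# Copy into a temp table whose primary key auto-increments, then swap tables.
conn.executescript("""
    CREATE TABLE temp (revnum INTEGER PRIMARY KEY, revision CHAR(12) NOT NULL, timestamp INTEGER);
    INSERT INTO temp (revision, timestamp) SELECT revision, timestamp FROM csetLog ORDER BY revnum ASC;
    DROP TABLE csetLog;
    ALTER TABLE temp RENAME TO csetLog;
""")
print(conn.execute("SELECT revnum, revision FROM csetLog").fetchall())
# -> [(1, 'aaaaaaaaaaaa'), (2, 'bbbbbbbbbbbb'), (3, 'cccccccccccc')]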
class StructuredLogger_usingElasticSearch(StructuredLogger): @override def __init__(self, host, index, port=9200, type="log", max_size=1000, batch_size=100, kwargs=None): """ settings ARE FOR THE ELASTICSEARCH INDEX """ self.es = Cluster(kwargs).get_or_create_index( schema=mo_json.json2value(value2json(SCHEMA), leaves=True), limit_replicas=True, tjson=True, kwargs=kwargs) self.batch_size = batch_size self.es.add_alias(coalesce(kwargs.alias, kwargs.index)) self.queue = Queue("debug logs to es", max=max_size, silent=True) self.es.settings.retry.times = coalesce(self.es.settings.retry.times, 3) self.es.settings.retry.sleep = Duration( coalesce(self.es.settings.retry.sleep, MINUTE)) Thread.run("add debug logs to es", self._insert_loop) def write(self, template, params): if params.get("template"): # DETECTED INNER TEMPLATE, ASSUME TRACE IS ON, SO DO NOT NEED THE OUTER TEMPLATE self.queue.add({"value": params}) else: template = strings.limit(template, 2000) self.queue.add({"value": { "template": template, "params": params }}, timeout=3 * MINUTE) return self def _insert_loop(self, please_stop=None): bad_count = 0 while not please_stop: try: Till(seconds=1).wait() messages = wrap(self.queue.pop_all()) if not messages: continue for g, mm in jx.groupby(messages, size=self.batch_size): scrubbed = [] try: for i, message in enumerate(mm): if message is THREAD_STOP: please_stop.go() return scrubbed.append( _deep_json_to_string(message, depth=3)) finally: self.es.extend(scrubbed) bad_count = 0 except Exception as e: Log.warning("Problem inserting logs into ES", cause=e) bad_count += 1 if bad_count > MAX_BAD_COUNT: Log.warning( "Given up trying to write debug logs to ES index {{index}}", index=self.es.settings.index) Till(seconds=30).wait() # CONTINUE TO DRAIN THIS QUEUE while not please_stop: try: Till(seconds=1).wait() self.queue.pop_all() except Exception as e: Log.warning("Should not happen", cause=e) def stop(self): with suppress_exception: self.queue.add( THREAD_STOP) # BE PATIENT, LET REST OF MESSAGE BE SENT with suppress_exception: self.queue.close()
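# ---------------------------------------------------------------------------
# Simplified sketch of the _insert_loop pattern above (stdlib only, not the
# real class): drain everything that is queued, ship it in fixed-size batches,
# and after too many consecutive failures stop trying so producers are never
# blocked.  sink(), batch_size and max_bad_count are placeholders here.
import queue

def insert_loop(q, sink, batch_size=100, max_bad_count=5):
    bad_count = 0
    while True:
        # take whatever is queued right now, without blocking (like pop_all)
        messages = []
        try:
            while True:
                messages.append(q.get_nowait())
        except queue.Empty:
            pass
        if not messages:
            break                                  # the real worker sleeps and retries
        try:
            for i in range(0, len(messages), batch_size):
                sink(messages[i:i + batch_size])   # e.g. a bulk index request
            bad_count = 0                          # a good batch resets the counter
        except Exception:
            bad_count += 1
            if bad_count > max_bad_count:
                break                              # give up; caller keeps draining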
class ColumnList(Table, jx_base.Container): """ OPTIMIZED FOR THE PARTICULAR ACCESS PATTERNS USED """ def __init__(self, name): Table.__init__(self, "meta.columns") self.db_file = File("metadata." + name + ".sqlite") self.data = {} # MAP FROM ES_INDEX TO (abs_column_name to COLUMNS) self.locker = Lock() self._schema = None self.db = sqlite3.connect( database=self.db_file.abspath, check_same_thread=False, isolation_level=None ) self.last_load = Null self.todo = Queue( "update columns to db" ) # HOLD (action, column) PAIR, WHERE action in ['insert', 'update'] self._db_load() Thread.run("update " + name, self._db_worker) @contextmanager def _db_transaction(self): self.db.execute(str("BEGIN")) try: yield self.db.execute(str("COMMIT")) except Exception as e: e = Except.wrap(e) self.db.execute(str("ROLLBACK")) Log.error("Transaction failed", cause=e) def _query(self, query): result = Data() curr = self.db.execute(query) result.meta.format = "table" result.header = [d[0] for d in curr.description] if curr.description else None result.data = curr.fetchall() return result def _db_create(self): with self._db_transaction(): self.db.execute( "CREATE TABLE " + db_table_name + sql_iso( sql_list( [ quote_column(c.name) + " " + json_type_to_sqlite_type[c.jx_type] for c in METADATA_COLUMNS ] + [ "PRIMARY KEY" + sql_iso( sql_list(map(quote_column, ["es_index", "es_column"])) ) ] ) ) ) for c in METADATA_COLUMNS: self._add(c) self._db_insert_column(c) def _db_load(self): self.last_load = Date.now() result = self._query( SQL_SELECT + "name" + SQL_FROM + "sqlite_master" + SQL_WHERE + SQL_AND.join(["name=" + db_table_name, "type=" + quote_value("table")]) ) if not result.data: self._db_create() return result = self._query( SQL_SELECT + all_columns + SQL_FROM + db_table_name + SQL_ORDERBY + sql_list(map(quote_column, ["es_index", "name", "es_column"])) ) with self.locker: for r in result.data: c = row_to_column(result.header, r) self._add(c) def _db_worker(self, please_stop): while not please_stop: try: with self._db_transaction(): result = self._query( SQL_SELECT + all_columns + SQL_FROM + db_table_name + SQL_WHERE + "last_updated > " + quote_value(self.last_load) + SQL_ORDERBY + sql_list(map(quote_column, ["es_index", "name", "es_column"])) ) with self.locker: for r in result.data: c = row_to_column(result.header, r) self._add(c) if c.last_updated > self.last_load: self.last_load = c.last_updated updates = self.todo.pop_all() DEBUG and updates and Log.note( "{{num}} columns to push to db", num=len(updates) ) for action, column in updates: while not please_stop: try: with self._db_transaction(): DEBUG and Log.note( "{{action}} db for {{table}}.{{column}}", action=action, table=column.es_index, column=column.es_column, ) if action is EXECUTE: self.db.execute(column) elif action is UPDATE: self.db.execute( "UPDATE" + db_table_name + "SET" + sql_list( [ "count=" + quote_value(column.count), "cardinality=" + quote_value(column.cardinality), "multi=" + quote_value(column.multi), "partitions=" + quote_value( value2json(column.partitions) ), "last_updated=" + quote_value(column.last_updated), ] ) + SQL_WHERE + SQL_AND.join( [ "es_index = " + quote_value(column.es_index), "es_column = " + quote_value(column.es_column), "last_updated < " + quote_value(column.last_updated), ] ) ) elif action is DELETE: self.db.execute( "DELETE FROM" + db_table_name + SQL_WHERE + SQL_AND.join( [ "es_index = " + quote_value(column.es_index), "es_column = " + quote_value(column.es_column), ] ) ) else: self._db_insert_column(column) break 
except Exception as e: e = Except.wrap(e) if "database is locked" in e: Log.note("metadata database is locked") Till(seconds=1).wait() break else: Log.warning("problem updataing database", cause=e) except Exception as e: Log.warning("problem updating database", cause=e) (Till(seconds=10) | please_stop).wait() def _db_insert_column(self, column): try: self.db.execute( "INSERT INTO" + db_table_name + sql_iso(all_columns) + "VALUES" + sql_iso( sql_list( [ quote_value(column[c.name]) if c.name not in ("nested_path", "partitions") else quote_value(value2json(column[c.name])) for c in METADATA_COLUMNS ] ) ) ) except Exception as e: e = Except.wrap(e) if "UNIQUE constraint failed" in e or " are not unique" in e: # THIS CAN HAPPEN BECAUSE todo HAS OLD COLUMN DATA self.todo.add((UPDATE, column), force=True) else: Log.error("do not know how to handle", cause=e) def __copy__(self): output = object.__new__(ColumnList) Table.__init__(output, "meta.columns") output.data = { t: {c: list(cs) for c, cs in dd.items()} for t, dd in self.data.items() } output.locker = Lock() output._schema = None return output def find(self, es_index, abs_column_name=None): with self.locker: if es_index.startswith("meta."): self._update_meta() if not abs_column_name: return [c for cs in self.data.get(es_index, {}).values() for c in cs] else: return self.data.get(es_index, {}).get(abs_column_name, []) def extend(self, columns): self.dirty = True with self.locker: for column in columns: self._add(column) def add(self, column): self.dirty = True with self.locker: canonical = self._add(column) if canonical == None: return column # ALREADY ADDED self.todo.add((INSERT if canonical is column else UPDATE, canonical)) return canonical def remove_table(self, table_name): del self.data[table_name] def _add(self, column): """ :param column: ANY COLUMN OBJECT :return: None IF column IS canonical ALREADY (NET-ZERO EFFECT) """ columns_for_table = self.data.setdefault(column.es_index, {}) existing_columns = columns_for_table.setdefault(column.name, []) for canonical in existing_columns: if canonical is column: return None if canonical.es_type == column.es_type: if column.last_updated > canonical.last_updated: for key in Column.__slots__: old_value = canonical[key] new_value = column[key] if new_value == None: pass # DO NOT BOTHER CLEARING OLD VALUES (LIKE cardinality AND paritiions) elif new_value == old_value: pass # NO NEED TO UPDATE WHEN NO CHANGE MADE (COMMON CASE) else: canonical[key] = new_value return canonical existing_columns.append(column) return column def _update_meta(self): if not self.dirty: return for mcl in self.data.get("meta.columns").values(): for mc in mcl: count = 0 values = set() objects = 0 multi = 1 for column in self._all_columns(): value = column[mc.name] if value == None: pass else: count += 1 if is_list(value): multi = max(multi, len(value)) try: values |= set(value) except Exception: objects += len(value) elif is_data(value): objects += 1 else: values.add(value) mc.count = count mc.cardinality = len(values) + objects mc.partitions = jx.sort(values) mc.multi = multi mc.last_updated = Date.now() self.dirty = False def _all_columns(self): return [ column for t, cs in self.data.items() for _, css in cs.items() for column in css ] def __iter__(self): with self.locker: self._update_meta() return iter(self._all_columns()) def __len__(self): return self.data["meta.columns"]["es_index"].count def update(self, command): self.dirty = True try: command = wrap(command) DEBUG and Log.note( "Update {{timestamp}}: 
{{command|json}}", command=command, timestamp=Date(command["set"].last_updated), ) eq = command.where.eq if eq.es_index: if len(eq) == 1: if unwraplist(command.clear) == ".": with self.locker: del self.data[eq.es_index] self.todo.add( ( EXECUTE, "DELETE FROM " + db_table_name + SQL_WHERE + " es_index=" + quote_value(eq.es_index), ) ) return # FASTEST all_columns = self.data.get(eq.es_index, {}).values() with self.locker: columns = [c for cs in all_columns for c in cs] elif eq.es_column and len(eq) == 2: # FASTER all_columns = self.data.get(eq.es_index, {}).values() with self.locker: columns = [ c for cs in all_columns for c in cs if c.es_column == eq.es_column ] else: # SLOWER all_columns = self.data.get(eq.es_index, {}).values() with self.locker: columns = [ c for cs in all_columns for c in cs if all( c[k] == v for k, v in eq.items() ) # THIS LINE IS VERY SLOW ] else: columns = list(self) columns = jx.filter(columns, command.where) with self.locker: for col in columns: DEBUG and Log.note( "update column {{table}}.{{column}}", table=col.es_index, column=col.es_column, ) for k in command["clear"]: if k == ".": self.todo.add((DELETE, col)) lst = self.data[col.es_index] cols = lst[col.name] cols.remove(col) if len(cols) == 0: del lst[col.name] if len(lst) == 0: del self.data[col.es_index] break else: col[k] = None else: # DID NOT DELETE COLUMNM ("."), CONTINUE TO SET PROPERTIES for k, v in command.set.items(): col[k] = v self.todo.add((UPDATE, col)) except Exception as e: Log.error("should not happen", cause=e) def query(self, query): # NOT EXPECTED TO BE RUN Log.error("not") with self.locker: self._update_meta() if not self._schema: self._schema = Schema( ".", [c for cs in self.data["meta.columns"].values() for c in cs] ) snapshot = self._all_columns() from jx_python.containers.list_usingPythonList import ListContainer query.frum = ListContainer("meta.columns", snapshot, self._schema) return jx.run(query) def groupby(self, keys): with self.locker: self._update_meta() return jx.groupby(self.__iter__(), keys) @property def schema(self): if not self._schema: with self.locker: self._update_meta() self._schema = Schema( ".", [c for cs in self.data["meta.columns"].values() for c in cs] ) return self._schema @property def namespace(self): return self def get_table(self, table_name): if table_name != "meta.columns": Log.error("this container has only the meta.columns") return self def denormalized(self): """ THE INTERNAL STRUCTURE FOR THE COLUMN METADATA IS VERY DIFFERENT FROM THE DENORMALIZED PERSPECITVE. THIS PROVIDES THAT PERSPECTIVE FOR QUERIES """ with self.locker: self._update_meta() output = [ { "table": c.es_index, "name": untype_path(c.name), "cardinality": c.cardinality, "es_column": c.es_column, "es_index": c.es_index, "last_updated": c.last_updated, "count": c.count, "nested_path": [unnest_path(n) for n in c.nested_path], "es_type": c.es_type, "type": c.jx_type, } for tname, css in self.data.items() for cname, cs in css.items() for c in cs if c.jx_type not in STRUCT # and c.es_column != "_id" ] from jx_python.containers.list_usingPythonList import ListContainer return ListContainer( self.name, data=output, schema=jx_base.Schema("meta.columns", SIMPLE_METADATA_COLUMNS), )
class StructuredLogger_usingElasticSearch(StructuredLogger): @override def __init__( self, host, index, port=9200, type="log", queue_size=1000, batch_size=100, kwargs=None, ): """ settings ARE FOR THE ELASTICSEARCH INDEX """ kwargs.timeout = Duration(coalesce(kwargs.timeout, "30second")).seconds kwargs.retry.times = coalesce(kwargs.retry.times, 3) kwargs.retry.sleep = Duration(coalesce(kwargs.retry.sleep, MINUTE)).seconds kwargs.host = Random.sample(listwrap(host), 1)[0] rollover_interval = coalesce(kwargs.rollover.interval, kwargs.rollover.max, "year") rollover_max = coalesce(kwargs.rollover.max, kwargs.rollover.interval, "year") schema = set_default( kwargs.schema, {"mappings": {kwargs.type: {"properties": {"~N~": {"type": "nested"}}}}}, json2value(value2json(SCHEMA), leaves=True) ) self.es = RolloverIndex( rollover_field={"get": [{"first": "."}, {"literal": "timestamp"}]}, rollover_interval=rollover_interval, rollover_max=rollover_max, schema=schema, limit_replicas=True, typed=True, read_only=False, kwargs=kwargs, ) self.batch_size = batch_size self.queue = Queue("debug logs to es", max=queue_size, silent=True) self.worker = Thread.run("add debug logs to es", self._insert_loop) def write(self, template, params): try: params.template = strings.limit(params.template, 2000) params.format = None self.queue.add({"value": _deep_json_to_string(params, 3)}, timeout=3 * 60) except Exception as e: sys.stdout.write(text(Except.wrap(e))) return self def _insert_loop(self, please_stop=None): bad_count = 0 while not please_stop: try: messages = wrap(self.queue.pop_all()) if not messages: Till(seconds=PAUSE_AFTER_GOOD_INSERT).wait() continue for g, mm in jx.chunk(messages, size=self.batch_size): scrubbed = [] for i, message in enumerate(mm): if message is THREAD_STOP: please_stop.go() continue try: chain = flatten_causal_chain(message.value) scrubbed.append( { "value": [ _deep_json_to_string(link, depth=3) for link in chain ] } ) except Exception as e: Log.warning("Problem adding to scrubbed list", cause=e) self.es.extend(scrubbed) bad_count = 0 except Exception as f: Log.warning("Problem inserting logs into ES", cause=f) bad_count += 1 if bad_count > MAX_BAD_COUNT: Log.warning( "Given up trying to write debug logs to ES index {{index}}", index=self.es.settings.index, ) break Till(seconds=PAUSE_AFTER_BAD_INSERT).wait() # CONTINUE TO DRAIN THIS QUEUE while not please_stop: try: Till(seconds=PAUSE_AFTER_GOOD_INSERT).wait() self.queue.pop_all() except Exception as e: Log.warning("Should not happen", cause=e) def stop(self): with suppress_exception: self.queue.add(THREAD_STOP) # BE PATIENT, LET REST OF MESSAGE BE SENT with suppress_exception: self.queue.close() self.worker.join()
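# ---------------------------------------------------------------------------
# flatten_causal_chain() is not shown in this listing.  Judging only from how
# it is used above (one log record becomes a list of linked errors), a
# plausible shape for such a helper, ASSUMING each error object carries an
# optional `cause` attribute, is sketched below; this is an illustration, not
# the actual mo_logs implementation.
def flatten_causal_chain_sketch(error, max_depth=10):
    chain = []
    while error is not None and len(chain) < max_depth:
        chain.append(error)
        error = getattr(error, "cause", None)      # follow the cause link, if any
    return chain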
class Clogger: # Singleton of the look-ahead scanner Clogger SINGLE_CLOGGER = None def __new__(cls, *args, **kwargs): if cls.SINGLE_CLOGGER is None: cls.SINGLE_CLOGGER = object.__new__(cls) return cls.SINGLE_CLOGGER def __init__(self, conn=None, tuid_service=None, start_workers=True, new_table=False, kwargs=None): try: self.config = kwargs self.conn = conn if conn else sql.Sql(self.config.database.name) self.hg_cache = HgMozillaOrg(kwargs=self.config.hg_cache, use_cache=True) if self.config.hg_cache else Null self.tuid_service = tuid_service if tuid_service else tuid.service.TUIDService( kwargs=self.config.tuid, conn=self.conn, clogger=self ) self.rev_locker = Lock() self.working_locker = Lock() if new_table: with self.conn.transaction() as t: t.execute("DROP TABLE IF EXISTS csetLog") self.init_db() self.next_revnum = coalesce(self.conn.get_one("SELECT max(revnum)+1 FROM csetLog")[0], 1) self.csets_todo_backwards = Queue(name="Clogger.csets_todo_backwards") self.deletions_todo = Queue(name="Clogger.deletions_todo") self.maintenance_signal = Signal(name="Clogger.maintenance_signal") if 'tuid' in self.config: self.config = self.config.tuid self.disable_backfilling = False self.disable_tipfilling = False self.disable_deletion = False self.disable_maintenance = False self.backfill_thread = None self.tipfill_thread = None self.deletion_thread = None self.maintenance_thread = None # Make sure we are filled before allowing queries numrevs = self.conn.get_one("SELECT count(revnum) FROM csetLog")[0] if numrevs < MINIMUM_PERMANENT_CSETS: Log.note("Filling in csets to hold {{minim}} csets.", minim=MINIMUM_PERMANENT_CSETS) oldest_rev = 'tip' with self.conn.transaction() as t: tmp = t.query("SELECT min(revnum), revision FROM csetLog").data[0][1] if tmp: oldest_rev = tmp self._fill_in_range( MINIMUM_PERMANENT_CSETS - numrevs, oldest_rev, timestamp=False ) Log.note( "Table is filled with atleast {{minim}} entries.", minim=MINIMUM_PERMANENT_CSETS ) if start_workers: self.start_workers() except Exception as e: Log.warning("Cannot setup clogger: {{cause}}", cause=str(e)) def start_backfilling(self): if not self.backfill_thread: self.backfill_thread = Thread.run('clogger-backfill', self.fill_backward_with_list) def start_tipfillling(self): if not self.tipfill_thread: self.tipfill_thread = Thread.run('clogger-tip', self.fill_forward_continuous) def start_maintenance(self): if not self.maintenance_thread: self.maintenance_thread = Thread.run('clogger-maintenance', self.csetLog_maintenance) def start_deleter(self): if not self.deletion_thread: self.deletion_thread = Thread.run('clogger-deleter', self.csetLog_deleter) def start_workers(self): self.start_tipfillling() self.start_backfilling() self.start_maintenance() self.start_deleter() Log.note("Started clogger workers.") def init_db(self): with self.conn.transaction() as t: t.execute(''' CREATE TABLE IF NOT EXISTS csetLog ( revnum INTEGER PRIMARY KEY, revision CHAR(12) NOT NULL, timestamp INTEGER );''') def disable_all(self): self.disable_tipfilling = True self.disable_backfilling = True self.disable_maintenance = True self.disable_deletion = True def revnum(self): """ :return: max revnum that was added """ return coalesce(self.conn.get_one("SELECT max(revnum) as revnum FROM csetLog")[0], 0) def get_tip(self, transaction): return transaction.get_one( "SELECT max(revnum) as revnum, revision FROM csetLog" ) def get_tail(self, transaction): return transaction.get_one( "SELECT min(revnum) as revnum, revision FROM csetLog" ) def _get_clog(self, clog_url): try: 
Log.note("Searching through changelog {{url}}", url=clog_url) clog_obj = http.get_json(clog_url, retry=RETRY) return clog_obj except Exception as e: Log.error( "Unexpected error getting changset-log for {{url}}: {{error}}", url=clog_url, error=e ) def _get_one_revision(self, transaction, cset_entry): # Returns a single revision if it exists _, rev, _ = cset_entry return transaction.get_one("SELECT revision FROM csetLog WHERE revision=?", (rev,)) def _get_one_revnum(self, transaction, rev): # Returns a single revnum if it exists return transaction.get_one("SELECT revnum FROM csetLog WHERE revision=?", (rev,)) def _get_revnum_range(self, transaction, revnum1, revnum2): # Returns a range of revision numbers (that is inclusive) high_num = max(revnum1, revnum2) low_num = min(revnum1, revnum2) return transaction.query( "SELECT revnum, revision FROM csetLog WHERE " "revnum >= " + str(low_num) + " AND revnum <= " + str(high_num) ).data def recompute_table_revnums(self): ''' Recomputes the revnums for the csetLog table by creating a new table, and copying csetLog to it. The INTEGER PRIMARY KEY in the temp table auto increments as rows are added. IMPORTANT: Only call this after acquiring the lock `self.working_locker`. :return: ''' with self.conn.transaction() as t: t.execute(''' CREATE TABLE temp ( revnum INTEGER PRIMARY KEY, revision CHAR(12) NOT NULL, timestamp INTEGER );''') t.execute( "INSERT INTO temp (revision, timestamp) " "SELECT revision, timestamp FROM csetlog ORDER BY revnum ASC" ) t.execute("DROP TABLE csetLog;") t.execute("ALTER TABLE temp RENAME TO csetLog;") def check_for_maintenance(self): ''' Returns True if the maintenance worker should be run now, and False otherwise. :return: ''' numrevs = self.conn.get_one("SELECT count(revnum) FROM csetLog")[0] Log.note("Number of csets in csetLog table: {{num}}", num=numrevs) if numrevs >= SIGNAL_MAINTENANCE_CSETS: return True return False def add_cset_entries(self, ordered_rev_list, timestamp=False, number_forward=True): ''' Adds a list of revisions to the table. Assumes ordered_rev_list is an ordered based on how changesets are found in the changelog. Going forwards or backwards is dealt with by flipping the list :param ordered_cset_list: Order given from changeset log searching. :param timestamp: If false, records are kept indefinitely but if holes exist: (delete, None, delete, None) those delete's with None's around them will not be deleted. 
:param numbered: If True, this function will number the revision list by going forward from max(revNum), else it'll go backwards from revNum, then add X to all revnums and self.next_revnum where X is the length of ordered_rev_list :return: ''' with self.conn.transaction() as t: current_min = t.get_one("SELECT min(revnum) FROM csetlog")[0] current_max = t.get_one("SELECT max(revnum) FROM csetlog")[0] if not current_min or not current_max: current_min = 0 current_max = 0 direction = -1 start = current_min - 1 if number_forward: direction = 1 start = current_max + 1 ordered_rev_list = ordered_rev_list[::-1] insert_list = [ ( start + direction * count, rev, int(time.time()) if timestamp else -1 ) for count, rev in enumerate(ordered_rev_list) ] # In case of overlapping requests fmt_insert_list = [] for cset_entry in insert_list: tmp = self._get_one_revision(t, cset_entry) if not tmp: fmt_insert_list.append(cset_entry) for _, tmp_insert_list in jx.groupby(fmt_insert_list, size=SQL_CSET_BATCH_SIZE): t.execute( "INSERT INTO csetLog (revnum, revision, timestamp)" + " VALUES " + sql_list( quote_set((revnum, revision, timestamp)) for revnum, revision, timestamp in tmp_insert_list ) ) # Move the revision numbers forward if needed self.recompute_table_revnums() # Start a maintenance run if needed if self.check_for_maintenance(): Log.note("Scheduling maintenance run on clogger.") self.maintenance_signal.go() def _fill_in_range(self, parent_cset, child_cset, timestamp=False, number_forward=True): ''' Fills cset logs in a certain range. 'parent_cset' can be an int and in that case, we get that many changesets instead. If parent_cset is an int, then we consider that we are going backwards (number_forward is False) and we ignore the first changeset of the first log, and we ignore the setting for number_forward. Otherwise, we continue until we find the given 'parent_cset'. :param parent_cset: :param child_cset: :param timestamp: :param number_forward: :return: ''' csets_to_add = [] found_parent = False find_parent = False if type(parent_cset) != int: find_parent = True elif parent_cset >= MAX_BACKFILL_CLOGS * CHANGESETS_PER_CLOG: Log.warning( "Requested number of new changesets {{num}} is too high. " "Max number that can be requested is {{maxnum}}.", num=parent_cset, maxnum=MAX_BACKFILL_CLOGS * CHANGESETS_PER_CLOG ) return None csets_found = 0 clogs_seen = 0 final_rev = child_cset while not found_parent and clogs_seen < MAX_BACKFILL_CLOGS: clog_url = str(HG_URL) + "/" + self.config.hg.branch + "/json-log/" + final_rev clog_obj = self._get_clog(clog_url) clog_csets_list = list(clog_obj['changesets']) for clog_cset in clog_csets_list[:-1]: if not number_forward and csets_found <= 0: # Skip this entry it already exists csets_found += 1 continue nodes_cset = clog_cset['node'][:12] if find_parent: if nodes_cset == parent_cset: found_parent = True if not number_forward: # When going forward this entry is # the given parent csets_to_add.append(nodes_cset) break else: if csets_found + 1 > parent_cset: found_parent = True if not number_forward: # When going forward this entry is # the given parent (which is supposed # to already exist) csets_to_add.append(nodes_cset) break csets_found += 1 csets_to_add.append(nodes_cset) if found_parent == True: break clogs_seen += 1 final_rev = clog_csets_list[-1]['node'][:12] if found_parent: self.add_cset_entries(csets_to_add, timestamp=timestamp, number_forward=number_forward) else: Log.warning( "Couldn't find the end of the request for {{request}}. 
" "Max number that can be requested through _fill_in_range is {{maxnum}}.", request={ 'parent_cset': parent_cset, 'child_cset':child_cset, 'number_forward': number_forward }, maxnum=MAX_BACKFILL_CLOGS * CHANGESETS_PER_CLOG ) return None return csets_to_add def initialize_to_range(self, old_rev, new_rev, delete_old=True): ''' Used in service testing to get to very old changesets quickly. :param old_rev: The oldest revision to keep :param new_rev: The revision to start searching from :return: ''' old_settings = [ self.disable_tipfilling, self.disable_backfilling, self.disable_maintenance, self.disable_deletion ] self.disable_tipfilling = True self.disable_backfilling = True self.disable_maintenance = True self.disable_deletion = True old_rev = old_rev[:12] new_rev = new_rev[:12] with self.working_locker: if delete_old: with self.conn.transaction() as t: t.execute("DELETE FROM csetLog") with self.conn.transaction() as t: t.execute( "INSERT INTO csetLog (revision, timestamp) VALUES " + quote_set((new_rev, -1)) ) self._fill_in_range(old_rev, new_rev, timestamp=True, number_forward=False) self.disable_tipfilling = old_settings[0] self.disable_backfilling = old_settings[1] self.disable_maintenance = old_settings[2] self.disable_deletion = old_settings[3] def fill_backward_with_list(self, please_stop=None): ''' Expects requests of the tuple form: (parent_cset, timestamp) parent_cset can be an int X to go back by X changesets, or a string to search for going backwards in time. If timestamp is false, no timestamps will be added to the entries. :param please_stop: :return: ''' while not please_stop: try: request = self.csets_todo_backwards.pop(till=please_stop) if please_stop: break # If backfilling is disabled, all requests # are ignored. if self.disable_backfilling: Till(till=CSET_BACKFILL_WAIT_TIME).wait() continue if request: parent_cset, timestamp = request else: continue with self.working_locker: with self.conn.transaction() as t: parent_revnum = self._get_one_revnum(t, parent_cset) if parent_revnum: continue with self.conn.transaction() as t: _, oldest_revision = self.get_tail(t) self._fill_in_range( parent_cset, oldest_revision, timestamp=timestamp, number_forward=False ) Log.note("Finished {{cset}}", cset=parent_cset) except Exception as e: Log.warning("Unknown error occurred during backfill: ", cause=e) def update_tip(self): ''' Returns False if the tip is already at the newest, or True if an update has taken place. :return: ''' clog_obj = self._get_clog( str(HG_URL) + "/" + self.config.hg.branch + "/json-log/tip" ) # Get current tip in DB with self.conn.transaction() as t: _, newest_known_rev = self.get_tip(t) # If we are still at the newest, wait for CSET_TIP_WAIT_TIME seconds # before checking again. first_clog_entry = clog_obj['changesets'][0]['node'][:12] if newest_known_rev == first_clog_entry: return False csets_to_gather = None if not newest_known_rev: Log.note( "No revisions found in table, adding {{minim}} entries...", minim=MINIMUM_PERMANENT_CSETS ) csets_to_gather = MINIMUM_PERMANENT_CSETS found_newest_known = False csets_to_add = [] csets_found = 0 clogs_seen = 0 Log.note("Found new revisions. 
Updating csetLog tip to {{rev}}...", rev=first_clog_entry) while not found_newest_known and clogs_seen < MAX_TIPFILL_CLOGS: clog_csets_list = list(clog_obj['changesets']) for clog_cset in clog_csets_list[:-1]: nodes_cset = clog_cset['node'][:12] if not csets_to_gather: if nodes_cset == newest_known_rev: found_newest_known = True break else: if csets_found >= csets_to_gather: found_newest_known = True break csets_found += 1 csets_to_add.append(nodes_cset) if not found_newest_known: # Get the next page clogs_seen += 1 final_rev = clog_csets_list[-1]['node'][:12] clog_url = str(HG_URL) + "/" + self.config.hg.branch + "/json-log/" + final_rev clog_obj = self._get_clog(clog_url) if clogs_seen >= MAX_TIPFILL_CLOGS: Log.error( "Too many changesets, can't find last tip or the number is too high: {{rev}}. " "Maximum possible to request is {{maxnum}}", rev=coalesce(newest_known_rev, csets_to_gather), maxnum=MAX_TIPFILL_CLOGS * CHANGESETS_PER_CLOG ) return False with self.working_locker: Log.note("Adding {{csets}}", csets=csets_to_add) self.add_cset_entries(csets_to_add, timestamp=False) return True def fill_forward_continuous(self, please_stop=None): while not please_stop: try: while not please_stop and not self.disable_tipfilling and self.update_tip(): pass (please_stop | Till(seconds=CSET_TIP_WAIT_TIME)).wait() except Exception as e: Log.warning("Unknown error occurred during tip filling:", cause=e) def csetLog_maintenance(self, please_stop=None): ''' Handles deleting old csetLog entries and timestamping revisions once they pass the length for permanent storage for deletion later. :param please_stop: :return: ''' while not please_stop: try: # Wait until something signals the maintenance cycle # to begin (or end). (self.maintenance_signal | please_stop).wait() if please_stop: break if self.disable_maintenance: continue Log.warning( "Starting clog maintenance. Since this doesn't start often, " "we need to explicitly see when it's started with this warning." ) # Reset signal so we don't request # maintenance infinitely. with self.maintenance_signal.lock: self.maintenance_signal._go = False with self.working_locker: all_data = None with self.conn.transaction() as t: all_data = sorted( t.get("SELECT revnum, revision, timestamp FROM csetLog"), key=lambda x: int(x[0]) ) # Restore maximum permanents (if overflowing) new_data = [] modified = False for count, (revnum, revision, timestamp) in enumerate(all_data[::-1]): if count < MINIMUM_PERMANENT_CSETS: if timestamp != -1: modified = True new_data.append((revnum, revision, -1)) else: new_data.append((revnum, revision, timestamp)) elif type(timestamp) != int or timestamp == -1: modified = True new_data.append((revnum, revision, int(time.time()))) else: new_data.append((revnum, revision, timestamp)) # Delete annotations at revisions with timestamps # that are too old. The csetLog entries will have # their timestamps reset here. new_data1 = [] annrevs_to_del = [] current_time = time.time() for count, (revnum, revision, timestamp) in enumerate(new_data[::-1]): new_timestamp = timestamp if timestamp != -1: if current_time >= timestamp + TIME_TO_KEEP_ANNOTATIONS.seconds: modified = True new_timestamp = current_time annrevs_to_del.append(revision) new_data1.append((revnum, revision, new_timestamp)) if len(annrevs_to_del) > 0: # Delete any latestFileMod and annotation entries # that are too old. 
Log.note( "Deleting annotations and latestFileMod for revisions for being " "older than {{oldest}}: {{revisions}}", oldest=TIME_TO_KEEP_ANNOTATIONS, revisions=annrevs_to_del ) with self.conn.transaction() as t: t.execute( "DELETE FROM latestFileMod WHERE revision IN " + quote_set(annrevs_to_del) ) t.execute( "DELETE FROM annotations WHERE revision IN " + quote_set(annrevs_to_del) ) # Delete any overflowing entries new_data2 = new_data1 reved_all_data = all_data[::-1] deleted_data = reved_all_data[MAXIMUM_NONPERMANENT_CSETS:] delete_overflowing_revstart = None if len(deleted_data) > 0: _, delete_overflowing_revstart, _ = deleted_data[0] new_data2 = set(all_data) - set(deleted_data) # Update old frontiers if requested, otherwise # they will all get deleted by the csetLog_deleter # worker if UPDATE_VERY_OLD_FRONTIERS: _, max_revision, _ = all_data[-1] for _, revision, _ in deleted_data: with self.conn.transaction() as t: old_files = t.get( "SELECT file FROM latestFileMod WHERE revision=?", (revision,) ) if old_files is None or len(old_files) <= 0: continue self.tuid_service.get_tuids_from_files( old_files, max_revision, going_forward=True, ) still_exist = True while still_exist and not please_stop: Till(seconds=TUID_EXISTENCE_WAIT_TIME).wait() with self.conn.transaction() as t: old_files = t.get( "SELECT file FROM latestFileMod WHERE revision=?", (revision,) ) if old_files is None or len(old_files) <= 0: still_exist = False # Update table and schedule a deletion if modified: with self.conn.transaction() as t: insert_into_db_chunked( t, new_data2, "INSERT OR REPLACE INTO csetLog (revnum, revision, timestamp) VALUES " ) if not deleted_data: continue Log.note("Scheduling {{num_csets}} for deletion", num_csets=len(deleted_data)) self.deletions_todo.add(delete_overflowing_revstart) except Exception as e: Log.warning("Unexpected error occured while maintaining csetLog, continuing to try: ", cause=e) return def csetLog_deleter(self, please_stop=None): ''' Deletes changesets from the csetLog table and also changesets from the annotation table that have revisions matching the given changesets. Accepts lists of csets from self.deletions_todo. :param please_stop: :return: ''' while not please_stop: try: request = self.deletions_todo.pop(till=please_stop) if please_stop: break # If deletion is disabled, ignore the current # request - it will need to be re-requested. if self.disable_deletion: Till(till=CSET_DELETION_WAIT_TIME).wait() continue with self.working_locker: first_cset = request # Since we are deleting and moving stuff around in the # TUID tables, we need everything to be contained in # one transaction with no interruptions. with self.conn.transaction() as t: revnum = self._get_one_revnum(t, first_cset)[0] csets_to_del = t.get( "SELECT revnum, revision FROM csetLog WHERE revnum <= ?", (revnum,) ) csets_to_del = [cset for _, cset in csets_to_del] existing_frontiers = t.query( "SELECT revision FROM latestFileMod WHERE revision IN " + quote_set(csets_to_del) ).data existing_frontiers = [existing_frontiers[i][0] for i, _ in enumerate(existing_frontiers)] Log.note( "Deleting all annotations and changeset log entries with revisions in the list: {{csets}}", csets=csets_to_del ) if len(existing_frontiers) > 0: # This handles files which no longer exist anymore in # the main branch. 
Log.note( "Deleting existing frontiers for revisions: {{revisions}}", revisions=existing_frontiers ) t.execute( "DELETE FROM latestFileMod WHERE revision IN " + quote_set(existing_frontiers) ) Log.note("Deleting annotations...") t.execute( "DELETE FROM annotations WHERE revision IN " + quote_set(csets_to_del) ) Log.note( "Deleting {{num_entries}} csetLog entries...", num_entries=len(csets_to_del) ) t.execute( "DELETE FROM csetLog WHERE revision IN " + quote_set(csets_to_del) ) # Recalculate the revnums self.recompute_table_revnums() except Exception as e: Log.warning("Unexpected error occured while deleting from csetLog:", cause=e) Till(seconds=CSET_DELETION_WAIT_TIME).wait() return def get_old_cset_revnum(self, revision): self.csets_todo_backwards.add((revision, True)) revnum = None timeout = Till(seconds=BACKFILL_REVNUM_TIMEOUT) while not timeout: with self.conn.transaction() as t: revnum = self._get_one_revnum(t, revision) if revnum and revnum[0] >= 0: break elif revnum[0] < 0: Log.note("Waiting for table to recompute...") else: Log.note("Waiting for backfill to complete...") Till(seconds=CSET_BACKFILL_WAIT_TIME).wait() if timeout: Log.error( "Cannot find revision {{rev}} after waiting {{timeout}} seconds", rev=revision, timeout=BACKFILL_REVNUM_TIMEOUT ) return revnum def get_revnnums_from_range(self, revision1, revision2): with self.conn.transaction() as t: revnum1 = self._get_one_revnum(t, revision1) revnum2 = self._get_one_revnum(t, revision2) if not revnum1 or not revnum2: did_an_update = self.update_tip() if did_an_update: with self.conn.transaction() as t: revnum1 = self._get_one_revnum(t, revision1) revnum2 = self._get_one_revnum(t, revision2) if not revnum1: revnum1 = self.get_old_cset_revnum(revision1) # Refresh the second entry with self.conn.transaction() as t: revnum2 = self._get_one_revnum(t, revision2) if not revnum2: revnum2 = self.get_old_cset_revnum(revision2) # The first revnum might change also with self.conn.transaction() as t: revnum1 = self._get_one_revnum(t, revision1) with self.conn.transaction() as t: result = self._get_revnum_range(t, revnum1[0], revnum2[0]) return sorted( result, key=lambda x: int(x[0]) )
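# ---------------------------------------------------------------------------
# Simplified model of the csetLog_maintenance() policy above (illustrative
# only; the real constants come from configuration, and the real code also
# resets annotation timestamps).  Walking the table newest-first:
#   1. the newest `minimum_permanent` entries stay permanent (timestamp -1),
#   2. older kept entries receive a real timestamp the first time they are seen,
#   3. anything beyond `maximum_nonpermanent` is scheduled for deletion.
import time

def classify_csets(rows, minimum_permanent=5, maximum_nonpermanent=10, now=None):
    """rows: list of (revnum, revision, timestamp), oldest first."""
    now = int(now if now is not None else time.time())
    keep, to_delete = [], []
    for age, (revnum, revision, timestamp) in enumerate(reversed(rows)):
        if age < minimum_permanent:
            keep.append((revnum, revision, -1))                  # permanent window
        elif age < maximum_nonpermanent:
            ts = timestamp if isinstance(timestamp, int) and timestamp != -1 else now
            keep.append((revnum, revision, ts))                  # timestamped, kept for now
        else:
            to_delete.append((revnum, revision, timestamp))      # overflow, deleted later
    return list(reversed(keep)), list(reversed(to_delete))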
class StructuredLogger_usingElasticSearch(StructuredLogger): @override def __init__( self, host, index, port=9200, type="log", queue_size=1000, batch_size=100, kwargs=None, ): """ settings ARE FOR THE ELASTICSEARCH INDEX """ kwargs.timeout = Duration(coalesce(kwargs.timeout, "30second")).seconds kwargs.retry.times = coalesce(kwargs.retry.times, 3) kwargs.retry.sleep = Duration(coalesce(kwargs.retry.sleep, MINUTE)).seconds self.es = Cluster(kwargs).get_or_create_index( schema=json2value(value2json(SCHEMA), leaves=True), limit_replicas=True, typed=True, kwargs=kwargs, ) self.batch_size = batch_size self.es.add_alias(coalesce(kwargs.alias, kwargs.index)) self.queue = Queue("debug logs to es", max=queue_size, silent=True) self.worker = Thread.run("add debug logs to es", self._insert_loop) def write(self, template, params): try: params.template = strings.limit(params.template, 2000) params.format = None self.queue.add({"value": _deep_json_to_string(params, 3)}, timeout=3 * 60) except Exception as e: sys.stdout.write(text_type(Except.wrap(e))) return self def _insert_loop(self, please_stop=None): bad_count = 0 while not please_stop: try: messages = wrap(self.queue.pop_all()) if not messages: Till(seconds=PAUSE_AFTER_GOOD_INSERT).wait() continue for g, mm in jx.groupby(messages, size=self.batch_size): scrubbed = [] for i, message in enumerate(mm): if message is THREAD_STOP: please_stop.go() continue try: messages = flatten_causal_chain(message.value) scrubbed.append( {"value": [_deep_json_to_string(m, depth=3) for m in messages]} ) except Exception as e: Log.warning("Problem adding to scrubbed list", cause=e) self.es.extend(scrubbed) bad_count = 0 except Exception as f: Log.warning("Problem inserting logs into ES", cause=f) bad_count += 1 if bad_count > MAX_BAD_COUNT: Log.warning( "Given up trying to write debug logs to ES index {{index}}", index=self.es.settings.index, ) Till(seconds=PAUSE_AFTER_BAD_INSERT).wait() self.es.flush() # CONTINUE TO DRAIN THIS QUEUE while not please_stop: try: Till(seconds=PAUSE_AFTER_GOOD_INSERT).wait() self.queue.pop_all() except Exception as e: Log.warning("Should not happen", cause=e) def stop(self): with suppress_exception: self.queue.add(THREAD_STOP) # BE PATIENT, LET REST OF MESSAGE BE SENT with suppress_exception: self.queue.close() self.worker.join()
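# ---------------------------------------------------------------------------
# coalesce(...) is used throughout the constructors above to layer defaults
# over optional settings.  Its effective behaviour is "first argument that is
# not missing", roughly like SQL COALESCE.  A minimal stand-in (the real
# mo_dots version also treats its Null object as missing):
def coalesce_sketch(*values):
    for v in values:
        if v is not None:
            return v
    return None

assert coalesce_sketch(None, None, 3, 5) == 3
assert coalesce_sketch(None) is None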
class ColumnList(jx_base.Table, jx_base.Container): """ OPTIMIZED FOR fact column LOOKUP """ def __new__(cls, db): output = CACHE.get(id(db)) if not output: output = CACHE[id(db)] = object.__new__(cls) return output def __init__(self, db): Table.__init__(self, META_COLUMNS_NAME) self.data = {} # MAP FROM fact_name TO (abs_column_name to COLUMNS) self.locker = Lock() self._schema = None self.dirty = False self.db = db self.es_index = None self.last_load = Null self.todo = Queue( "update columns to es" ) # HOLD (action, column) PAIR, WHERE action in ['insert', 'update'] self._snowflakes = Data() self._load_from_database() def _query(self, query): result = Data() curr = self.es_cluster.execute(query) result.meta.format = "table" result.header = [d[0] for d in curr.description] if curr.description else None result.data = curr.fetchall() return result def _load_from_database(self): # FIND ALL TABLES result = self.db.query(sql_query({ "from": "sqlite_master", "where": {"eq": {"type": "table"}}, "orderby": "name" })) tables = wrap([{k: d for k, d in zip(result.header, row)} for row in result.data]) last_nested_path = ["."] for table in tables: if table.name.startswith("__"): continue base_table, nested_path = tail_field(table.name) # FIND COMMON NESTED PATH SUFFIX if nested_path == ".": last_nested_path = [] else: for i, p in enumerate(last_nested_path): if startswith_field(nested_path, p): last_nested_path = last_nested_path[i:] break else: last_nested_path = [] full_nested_path = [nested_path] + last_nested_path self._snowflakes[literal_field(base_table)] += [full_nested_path] # LOAD THE COLUMNS details = self.db.about(table.name) for cid, name, dtype, notnull, dfft_value, pk in details: if name.startswith("__"): continue cname, ctype = untyped_column(name) self.add(Column( name=cname, jx_type=coalesce(sql_type_to_json_type.get(ctype), IS_NULL), nested_path=full_nested_path, es_type=dtype, es_column=name, es_index=table.name, last_updated=Date.now() )) last_nested_path = full_nested_path def find(self, es_index, abs_column_name=None): with self.locker: if es_index.startswith("meta."): self._update_meta() if not abs_column_name: return [c for cs in self.data.get(es_index, {}).values() for c in cs] else: return self.data.get(es_index, {}).get(abs_column_name, []) def extend(self, columns): self.dirty = True with self.locker: for column in columns: self._add(column) def add(self, column): self.dirty = True with self.locker: canonical = self._add(column) if canonical == None: return column # ALREADY ADDED self.todo.add(canonical) return canonical def remove(self, column): self.dirty = True with self.locker: canonical = self._remove(column) def remove_table(self, table_name): del self.data[table_name] def _add(self, column): """ :param column: ANY COLUMN OBJECT :return: None IF column IS canonical ALREADY (NET-ZERO EFFECT) """ columns_for_table = self.data.setdefault(column.es_index, {}) existing_columns = columns_for_table.setdefault(column.name, []) for canonical in existing_columns: if canonical is column: return None if canonical.es_type == column.es_type: if column.last_updated > canonical.last_updated: for key in Column.__slots__: old_value = canonical[key] new_value = column[key] if new_value == None: pass # DO NOT BOTHER CLEARING OLD VALUES (LIKE cardinality AND paritiions) elif new_value == old_value: pass # NO NEED TO UPDATE WHEN NO CHANGE MADE (COMMON CASE) else: canonical[key] = new_value return canonical existing_columns.append(column) return column def _remove(self, column): """ :param 
column: ANY COLUMN OBJECT """ columns_for_table = self.data.setdefault(column.es_index, {}) existing_columns = columns_for_table.setdefault(column.name, []) for i, canonical in enumerate(existing_columns): if canonical is column: del existing_columns[i] return def _update_meta(self): if not self.dirty: return now = Date.now() for mc in META_COLUMNS_DESC.columns: count = 0 values = set() objects = 0 multi = 1 for column in self._all_columns(): value = column[mc.name] if value == None: pass else: count += 1 if is_list(value): multi = max(multi, len(value)) try: values |= set(value) except Exception: objects += len(value) elif is_data(value): objects += 1 else: values.add(value) mc.count = count mc.cardinality = len(values) + objects mc.partitions = jx.sort(values) mc.multi = multi mc.last_updated = now META_COLUMNS_DESC.last_updated = now self.dirty = False def _all_columns(self): return [ column for t, cs in self.data.items() for _, css in cs.items() for column in css ] def __iter__(self): with self.locker: self._update_meta() return iter(self._all_columns()) def __len__(self): return self.data[META_COLUMNS_NAME]["es_index"].count def update(self, command): self.dirty = True try: command = wrap(command) DEBUG and Log.note( "Update {{timestamp}}: {{command|json}}", command=command, timestamp=Date(command["set"].last_updated), ) eq = command.where.eq if eq.es_index: if len(eq) == 1: if unwraplist(command.clear) == ".": d = self.data i = eq.es_index with self.locker: cols = d[i] del d[i] for c in cols: mark_as_deleted(c) self.todo.add(c) return # FASTEST all_columns = self.data.get(eq.es_index, {}).values() with self.locker: columns = [c for cs in all_columns for c in cs] elif eq.es_column and len(eq) == 2: # FASTER all_columns = self.data.get(eq.es_index, {}).values() with self.locker: columns = [ c for cs in all_columns for c in cs if c.es_column == eq.es_column ] else: # SLOWER all_columns = self.data.get(eq.es_index, {}).values() with self.locker: columns = [ c for cs in all_columns for c in cs if all( c[k] == v for k, v in eq.items() ) # THIS LINE IS VERY SLOW ] else: columns = list(self) columns = jx.filter(columns, command.where) with self.locker: for col in columns: DEBUG and Log.note( "update column {{table}}.{{column}}", table=col.es_index, column=col.es_column, ) for k in command["clear"]: if k == ".": mark_as_deleted(col) self.todo.add(col) lst = self.data[col.es_index] cols = lst[col.name] cols.remove(col) if len(cols) == 0: del lst[col.name] if len(lst) == 0: del self.data[col.es_index] break else: col[k] = None else: # DID NOT DELETE COLUMNM ("."), CONTINUE TO SET PROPERTIES for k, v in command.set.items(): col[k] = v self.todo.add(col) except Exception as e: Log.error("should not happen", cause=e) def query(self, query): # NOT EXPECTED TO BE RUN Log.error("not") with self.locker: self._update_meta() if not self._schema: self._schema = Schema( ".", [c for cs in self.data[META_COLUMNS_NAME].values() for c in cs] ) snapshot = self._all_columns() from jx_python.containers.list_usingPythonList import ListContainer query.frum = ListContainer(META_COLUMNS_NAME, snapshot, self._schema) return jx.run(query) def groupby(self, keys): with self.locker: self._update_meta() return jx.groupby(self.__iter__(), keys) def window(self, window): raise NotImplemented() @property def schema(self): if not self._schema: with self.locker: self._update_meta() self._schema = Schema( ".", [c for cs in self.data[META_COLUMNS_NAME].values() for c in cs] ) return self._schema @property def namespace(self): 
        return self

    def get_table(self, table_name):
        if table_name != META_COLUMNS_NAME:
            Log.error("this container has only the " + META_COLUMNS_NAME)
        return self

    def get_columns(self, table_name):
        if table_name != META_COLUMNS_NAME:
            Log.error("this container has only the " + META_COLUMNS_NAME)
        return self._all_columns()

    def denormalized(self):
        """
        THE INTERNAL STRUCTURE FOR THE COLUMN METADATA IS VERY DIFFERENT FROM
        THE DENORMALIZED PERSPECTIVE. THIS PROVIDES THAT PERSPECTIVE FOR QUERIES
        """
        with self.locker:
            self._update_meta()
            output = [
                {
                    "table": c.es_index,
                    "name": untyped_column(c.name)[0],
                    "cardinality": c.cardinality,
                    "es_column": c.es_column,
                    "es_index": c.es_index,
                    "last_updated": c.last_updated,
                    "count": c.count,
                    "nested_path": [unnest_path(n) for n in c.nested_path],
                    "es_type": c.es_type,
                    "type": c.jx_type,
                }
                for tname, css in self.data.items()
                for cname, cs in css.items()
                for c in cs
                if c.jx_type not in STRUCT  # and c.es_column != "_id"
            ]
        from jx_python.containers.list_usingPythonList import ListContainer

        return ListContainer(
            self.name,
            data=output,
            schema=jx_base.Schema(META_COLUMNS_NAME, SIMPLE_METADATA_COLUMNS),
        )
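# ColumnList hands back one shared instance per database handle: __new__ checks
# a module-level CACHE keyed by id(db) before building a new object.  A minimal,
# self-contained sketch of that per-key singleton pattern (the class and names
# below are illustrative, not part of the library):
class PerKeySingleton(object):
    _cache = {}  # id(key object) -> instance; beware id() reuse once the key is garbage-collected

    def __new__(cls, key):
        existing = cls._cache.get(id(key))
        if existing is not None:
            return existing
        instance = object.__new__(cls)
        cls._cache[id(key)] = instance
        return instance

    def __init__(self, key):
        # __init__ runs on every construction, even when __new__ returned the
        # cached instance, so guard against re-initialization
        if getattr(self, "_initialized", False):
            return
        self._initialized = True
        self.key = key

# usage: PerKeySingleton(db) is PerKeySingleton(db)  ->  True for the same db handle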
class StructuredLogger_usingElasticSearch(StructuredLogger): @override def __init__(self, host, index, port=9200, type="log", max_size=1000, batch_size=100, kwargs=None): """ settings ARE FOR THE ELASTICSEARCH INDEX """ self.es = Cluster(kwargs).get_or_create_index( schema=mo_json.json2value(value2json(SCHEMA), leaves=True), limit_replicas=True, tjson=True, kwargs=kwargs ) self.batch_size = batch_size self.es.add_alias(coalesce(kwargs.alias, kwargs.index)) self.queue = Queue("debug logs to es", max=max_size, silent=True) self.es.settings.retry.times = coalesce(self.es.settings.retry.times, 3) self.es.settings.retry.sleep = Duration(coalesce(self.es.settings.retry.sleep, MINUTE)) Thread.run("add debug logs to es", self._insert_loop) def write(self, template, params): if params.get("template"): # DETECTED INNER TEMPLATE, ASSUME TRACE IS ON, SO DO NOT NEED THE OUTER TEMPLATE self.queue.add({"value": params}) else: template = strings.limit(template, 2000) self.queue.add({"value": {"template": template, "params": params}}, timeout=3 * MINUTE) return self def _insert_loop(self, please_stop=None): bad_count = 0 while not please_stop: try: Till(seconds=1).wait() messages = wrap(self.queue.pop_all()) if not messages: continue for g, mm in jx.groupby(messages, size=self.batch_size): scrubbed = [] try: for i, message in enumerate(mm): if message is THREAD_STOP: please_stop.go() return scrubbed.append(_deep_json_to_string(message, depth=3)) finally: self.es.extend(scrubbed) bad_count = 0 except Exception as e: Log.warning("Problem inserting logs into ES", cause=e) bad_count += 1 if bad_count > MAX_BAD_COUNT: Log.warning("Given up trying to write debug logs to ES index {{index}}", index=self.es.settings.index) Till(seconds=30).wait() # CONTINUE TO DRAIN THIS QUEUE while not please_stop: try: Till(seconds=1).wait() self.queue.pop_all() except Exception as e: Log.warning("Should not happen", cause=e) def stop(self): with suppress_exception: self.queue.add(THREAD_STOP) # BE PATIENT, LET REST OF MESSAGE BE SENT with suppress_exception: self.queue.close()
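# StructuredLogger_usingElasticSearch drains its queue on a one-second cadence,
# writes the accumulated messages in fixed-size batches, and after too many
# consecutive failures gives up on writing while still draining the queue.  A
# hedged standard-library sketch of that batching loop; send_batch() stands in
# for self.es.extend() and the thresholds mirror batch_size / MAX_BAD_COUNT above:
import queue
import threading
import time

def batch_writer(log_queue, send_batch, batch_size=100, max_bad=10, stop=None):
    stop = stop or threading.Event()
    bad_count = 0
    while not stop.is_set():
        time.sleep(1)                       # let messages accumulate
        messages = []
        while True:                         # pop_all() equivalent
            try:
                messages.append(log_queue.get_nowait())
            except queue.Empty:
                break
        if not messages:
            continue
        try:
            for i in range(0, len(messages), batch_size):
                send_batch(messages[i:i + batch_size])
            bad_count = 0                   # any success resets the counter
        except Exception:
            bad_count += 1
            if bad_count > max_bad:
                return                      # give up; caller may keep draining elsewhere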
class ElasticsearchMetadata(Namespace): """ MANAGE SNOWFLAKE SCHEMAS FOR EACH OF THE ALIASES FOUND IN THE CLUSTER """ def __new__(cls, *args, **kwargs): if jx_base_meta.singlton: return jx_base_meta.singlton else: jx_base_meta.singlton = object.__new__(cls) return jx_base_meta.singlton @override def __init__(self, host, index, sql_file='metadata.sqlite', alias=None, name=None, port=9200, kwargs=None): if hasattr(self, "settings"): return self.too_old = TOO_OLD self.settings = kwargs self.default_name = coalesce(name, alias, index) self.es_cluster = elasticsearch.Cluster(kwargs=kwargs) self.index_does_not_exist = set() self.todo = Queue("refresh metadata", max=100000, unique=True) self.index_to_alias = Relation_usingList() self.es_metadata = Null self.metadata_last_updated = Date.now() - OLD_METADATA self.meta = Data() self.meta.columns = ColumnList() self.alias_to_query_paths = { "meta.columns": [['.']], "meta.tables": [['.']] } self.alias_last_updated = { "meta.columns": Date.now(), "meta.tables": Date.now() } table_columns = metadata_tables() self.meta.tables = ListContainer("meta.tables", [], jx_base.Schema(".", table_columns)) self.meta.columns.extend(table_columns) # TODO: fix monitor so it does not bring down ES if ENABLE_META_SCAN: self.worker = Thread.run("refresh metadata", self.monitor) else: self.worker = Thread.run("refresh metadata", self.not_monitor) return @property def url(self): return self.es_cluster.path + "/" + self.default_name.replace(".", "/") def _reload_columns(self, table_desc): """ :param alias: A REAL ALIAS (OR NAME OF INDEX THAT HAS NO ALIAS) :return: """ # FIND ALL INDEXES OF ALIAS es_last_updated = self.es_cluster.metatdata_last_updated alias = table_desc.name canonical_index = self.es_cluster.get_best_matching_index(alias).index update_required = not (table_desc.timestamp < es_last_updated) metadata = self.es_cluster.get_metadata(force=update_required) indexes = self.index_to_alias.get_domain(alias) props = [(self.es_cluster.get_index(index=i, type=t, debug=DEBUG), t, m.properties) for i, d in metadata.indices.items() if i in indexes for t, m in [_get_best_type_from_mapping(d.mappings)]] # CONFIRM ALL COLUMNS ARE SAME, FIX IF NOT dirty = False all_comparisions = list(jx.pairwise(props)) + list( jx.pairwise(jx.reverse(props))) # NOTICE THE SAME (index, type, properties) TRIPLE FROM ABOVE for (i1, t1, p1), (i2, t2, p2) in all_comparisions: diff = elasticsearch.diff_schema(p2, p1) if not self.settings.read_only: for d in diff: dirty = True i1.add_property(*d) meta = self.es_cluster.get_metadata( force=dirty).indices[canonical_index] data_type, mapping = _get_best_type_from_mapping(meta.mappings) mapping.properties["_id"] = {"type": "string", "index": "not_analyzed"} self._parse_properties(alias, mapping, meta) table_desc.timestamp = es_last_updated def _parse_properties(self, alias, mapping, meta): abs_columns = elasticsearch.parse_properties(alias, None, mapping.properties) with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, debug=DEBUG): # LIST OF EVERY NESTED PATH query_paths = [[c.es_column] for c in abs_columns if c.es_type == "nested"] for a, b in itertools.product(query_paths, query_paths): aa = a[0] bb = b[0] if aa and bb.startswith(aa): for i, b_prefix in enumerate(b): if len(b_prefix) > len(aa): continue if aa == b_prefix: break # SPLIT ALREADY FOUND b.insert(i, aa) break for q in query_paths: q.append(SELF_PATH) query_paths.append(ROOT_PATH) self.alias_to_query_paths[alias] = query_paths # ADD RELATIVE NAMES for abs_column in 
abs_columns: abs_column.last_updated = None abs_column.jx_type = es_type_to_json_type[abs_column.es_type] for query_path in query_paths: abs_column.names[query_path[0]] = relative_field( abs_column.names["."], query_path[0]) self.todo.add(self.meta.columns.add(abs_column)) pass def query(self, _query): return self.meta.columns.query( QueryOp( set_default( { "from": self.meta.columns, "sort": ["table", "name"] }, _query.__data__()))) def _find_alias(self, name): if self.metadata_last_updated < self.es_cluster.metatdata_last_updated: for a in self.es_cluster.get_aliases(): self.index_to_alias[a.index] = coalesce(a.alias, a.index) self.alias_last_updated.setdefault(a.alias, Date.MIN) if name in self.alias_last_updated: return name else: return self.index_to_alias[name] def get_columns(self, table_name, column_name=None, force=False): """ RETURN METADATA COLUMNS """ table_path = split_field(table_name) root_table_name = table_path[0] alias = self._find_alias(root_table_name) if not alias: self.es_cluster.get_metadata(force=True) alias = self._find_alias(root_table_name) if not alias: Log.error("{{table|quote}} does not exist", table=table_name) try: last_update = MAX([ self.es_cluster.index_last_updated[i] for i in self.index_to_alias.get_domain(alias) ]) table = self.get_table(alias)[0] # LAST TIME WE GOT INFO FOR THIS TABLE if not table: table = TableDesc(name=alias, url=None, query_path=['.'], timestamp=Date.MIN) with self.meta.tables.locker: self.meta.tables.add(table) self._reload_columns(table) elif force or table.timestamp < last_update: self._reload_columns(table) columns = self.meta.columns.find(alias, column_name) columns = jx.sort(columns, "names.\\.") # AT LEAST WAIT FOR THE COLUMNS TO UPDATE while len(self.todo) and not all(columns.get("last_updated")): if DEBUG: if len(columns) > 10: Log.note("waiting for {{num}} columns to update", num=len([ c for c in columns if not c.last_updated ])) else: Log.note( "waiting for columns to update {{columns|json}}", columns=[ c.es_index + "." 
+ c.es_column for c in columns if not c.last_updated ]) Till(seconds=1).wait() return columns except Exception as e: Log.error("Not expected", cause=e) return [] def _update_cardinality(self, column): """ QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN """ if column.es_index in self.index_does_not_exist: return if column.jx_type in STRUCT: Log.error("not supported") try: if column.es_index == "meta.columns": partitions = jx.sort([ g[column.es_column] for g, _ in jx.groupby(self.meta.columns, column.es_column) if g[column.es_column] != None ]) self.meta.columns.update({ "set": { "partitions": partitions, "count": len(self.meta.columns), "cardinality": len(partitions), "multi": 1, "last_updated": Date.now() }, "where": { "eq": { "es_index": column.es_index, "es_column": column.es_column } } }) return if column.es_index == "meta.tables": partitions = jx.sort([ g[column.es_column] for g, _ in jx.groupby(self.meta.tables, column.es_column) if g[column.es_column] != None ]) self.meta.columns.update({ "set": { "partitions": partitions, "count": len(self.meta.tables), "cardinality": len(partitions), "multi": 1, "last_updated": Date.now() }, "where": { "eq": { "es_index": column.es_index, "es_column": column.es_column } } }) return es_index = column.es_index.split(".")[0] is_text = [ cc for cc in self.meta.columns if cc.es_column == column.es_column and cc.es_type == "text" ] if is_text: # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED result = self.es_cluster.post("/" + es_index + "/_search", data={ "aggs": { "count": { "filter": { "match_all": {} } } }, "size": 0 }) count = result.hits.total cardinality = 1001 multi = 1001 elif column.es_column == "_id": result = self.es_cluster.post("/" + es_index + "/_search", data={ "query": { "match_all": {} }, "size": 0 }) count = cardinality = result.hits.total multi = 1 elif column.es_type == BOOLEAN: result = self.es_cluster.post("/" + es_index + "/_search", data={ "aggs": { "count": _counting_query(column) }, "size": 0 }) count = result.hits.total cardinality = 2 multi = 1 else: result = self.es_cluster.post( "/" + es_index + "/_search", data={ "aggs": { "count": _counting_query(column), "multi": { "max": { "script": "doc[" + quote(column.es_column) + "].values.size()" } } }, "size": 0 }) agg_results = result.aggregations count = result.hits.total cardinality = coalesce(agg_results.count.value, agg_results.count._nested.value, agg_results.count.doc_count) multi = int(coalesce(agg_results.multi.value, 1)) if cardinality == None: Log.error("logic error") query = Data(size=0) if column.es_column == "_id": self.meta.columns.update({ "set": { "count": cardinality, "cardinality": cardinality, "multi": 1, "last_updated": Date.now() }, "clear": ["partitions"], "where": { "eq": { "es_index": column.es_index, "es_column": column.es_column } } }) return elif cardinality > 1000 or (count >= 30 and cardinality == count ) or (count >= 1000 and cardinality / count > 0.99): DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality) self.meta.columns.update({ "set": { "count": count, "cardinality": cardinality, "multi": multi, "last_updated": Date.now() }, "clear": ["partitions"], "where": { "eq": { "es_index": column.es_index, "es_column": column.es_column } } }) return elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30: DEBUG and Log.note("{{field}} has {{num}} parts", field=column.es_index, num=cardinality) self.meta.columns.update({ "set": { "count": count, 
"cardinality": cardinality, "multi": multi, "last_updated": Date.now() }, "clear": ["partitions"], "where": { "eq": { "es_index": column.es_index, "es_column": column.es_column } } }) return elif len(column.nested_path) != 1: query.aggs["_"] = { "nested": { "path": column.nested_path[0] }, "aggs": { "_nested": { "terms": { "field": column.es_column } } } } elif cardinality == 0: query.aggs["_"] = {"terms": {"field": column.es_column}} else: query.aggs["_"] = { "terms": { "field": column.es_column, "size": cardinality } } result = self.es_cluster.post("/" + es_index + "/_search", data=query) aggs = result.aggregations._ if aggs._nested: parts = jx.sort(aggs._nested.buckets.key) else: parts = jx.sort(aggs.buckets.key) self.meta.columns.update({ "set": { "count": count, "cardinality": cardinality, "multi": multi, "partitions": parts, "last_updated": Date.now() }, "where": { "eq": { "es_index": column.es_index, "es_column": column.es_column } } }) except Exception as e: # CAN NOT IMPORT: THE TEST MODULES SETS UP LOGGING # from tests.test_jx import TEST_TABLE TEST_TABLE = "testdata" is_missing_index = any( w in e for w in ["IndexMissingException", "index_not_found_exception"]) is_test_table = any( column.es_index.startswith(t) for t in [TEST_TABLE_PREFIX, TEST_TABLE]) if is_missing_index and is_test_table: # WE EXPECT TEST TABLES TO DISAPPEAR self.meta.columns.update({ "clear": ".", "where": { "eq": { "es_index": column.es_index } } }) self.index_does_not_exist.add(column.es_index) else: self.meta.columns.update({ "set": { "last_updated": Date.now() }, "clear": [ "count", "cardinality", "multi", "partitions", ], "where": { "eq": { "names.\\.": ".", "es_index": column.es_index, "es_column": column.es_column } } }) Log.warning( "Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e) def monitor(self, please_stop): please_stop.on_go(lambda: self.todo.add(THREAD_STOP)) while not please_stop: try: if not self.todo: old_columns = [ c for c in self.meta.columns if (c.last_updated == None or c.last_updated < Date.now() - TOO_OLD) and c.jx_type not in STRUCT ] if old_columns: DEBUG and Log.note( "Old columns {{names|json}} last updated {{dates|json}}", names=wrap(old_columns).es_column, dates=[ Date(t).format() for t in wrap(old_columns).last_updated ]) self.todo.extend(old_columns) # TEST CONSISTENCY for c, d in product(list(self.todo.queue), list(self.todo.queue)): if c.es_column == d.es_column and c.es_index == d.es_index and c != d: Log.error("") else: DEBUG and Log.note("no more metatdata to update") column = self.todo.pop(Till(seconds=(10 * MINUTE).seconds)) if column: if column is THREAD_STOP: continue DEBUG and Log.note("update {{table}}.{{column}}", table=column.es_index, column=column.es_column) if column.es_index in self.index_does_not_exist: self.meta.columns.update({ "clear": ".", "where": { "eq": { "es_index": column.es_index } } }) continue if column.jx_type in STRUCT or column.es_column.endswith( "." 
+ EXISTS_TYPE): column.last_updated = Date.now() continue elif column.last_updated >= Date.now() - TOO_OLD: continue try: self._update_cardinality(column) (DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX) ) and Log.note("updated {{column.name}}", column=column) except Exception as e: Log.warning( "problem getting cardinality for {{column.name}}", column=column, cause=e) except Exception as e: Log.warning("problem in cardinality monitor", cause=e) def not_monitor(self, please_stop): Log.alert("metadata scan has been disabled") please_stop.on_go(lambda: self.todo.add(THREAD_STOP)) while not please_stop: c = self.todo.pop() if c == THREAD_STOP: break if c.last_updated >= Date.now() - TOO_OLD: continue self.meta.columns.update({ "set": { "last_updated": Date.now() }, "clear": [ "count", "cardinality", "multi", "partitions", ], "where": { "eq": { "es_index": c.es_index, "es_column": c.es_column } } }) DEBUG and Log.note( "Did not get {{col.es_index}}.{{col.es_column}} info", col=c) def get_table(self, alias_name): with self.meta.tables.locker: return wrap( [t for t in self.meta.tables.data if t.name == alias_name]) def get_snowflake(self, fact_table_name): return Snowflake(fact_table_name, self) def get_schema(self, name): if name == "meta.columns": return self.meta.columns.schema query_path = split_field(name) root, rest = query_path[0], join_field(query_path[1:]) return self.get_snowflake(root).get_schema(rest)
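# _update_cardinality() only stores an explicit partition list when the column
# has a small domain; otherwise it records count/cardinality/multi and clears
# the partitions.  The decision rule, pulled out of the branches above into a
# standalone helper (thresholds copied from the code; the helper name is mine):
def should_store_partitions(count, cardinality, es_type, numeric_types=("long", "integer", "double", "float")):
    if cardinality > 1000:
        return False                        # too many distinct values to keep
    if count >= 30 and cardinality == count:
        return False                        # effectively unique per document
    if count >= 1000 and cardinality / count > 0.99:
        return False                        # nearly unique
    if es_type in numeric_types and cardinality > 30:
        return False                        # numeric column with many parts
    return True                             # small domain: fetch and store the terms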
class ElasticsearchMetadata(Namespace): """ MANAGE SNOWFLAKE SCHEMAS FOR EACH OF THE ALIASES FOUND IN THE CLUSTER """ @override def __new__(cls, kwargs, *args, **_kwargs): es_cluster = elasticsearch.Cluster(kwargs) output = known_clusters.get(id(es_cluster)) if output is None: output = object.__new__(cls) known_clusters[id(es_cluster)] = output return output @override def __init__(self, host, index, sql_file='metadata.sqlite', alias=None, name=None, port=9200, kwargs=None): if hasattr(self, "settings"): return self.too_old = TOO_OLD self.settings = kwargs self.default_name = coalesce(name, alias, index) self.es_cluster = elasticsearch.Cluster(kwargs=kwargs) self.index_does_not_exist = set() self.todo = Queue("refresh metadata", max=100000, unique=True) self.index_to_alias = Relation_usingList() self.es_metadata = Null self.metadata_last_updated = Date.now() - OLD_METADATA self.meta = Data() self.meta.columns = ColumnList() self.alias_to_query_paths = { "meta.columns": [['.']], "meta.tables": [['.']] } self.alias_last_updated = { "meta.columns": Date.now(), "meta.tables": Date.now() } table_columns = metadata_tables() self.meta.tables = ListContainer( "meta.tables", [ # TableDesc("meta.columns", None, ".", Date.now()), # TableDesc("meta.tables", None, ".", Date.now()) ], jx_base.Schema(".", table_columns) ) self.meta.columns.extend(table_columns) # TODO: fix monitor so it does not bring down ES if ENABLE_META_SCAN: self.worker = Thread.run("refresh metadata", self.monitor) else: self.worker = Thread.run("refresh metadata", self.not_monitor) return @property def namespace(self): return self.meta.columns.namespace @property def url(self): return self.es_cluster.url / self.default_name.replace(".", "/") def _reload_columns(self, table_desc): """ :param alias: A REAL ALIAS (OR NAME OF INDEX THAT HAS NO ALIAS) :return: """ # FIND ALL INDEXES OF ALIAS es_last_updated = self.es_cluster.metatdata_last_updated alias = table_desc.name canonical_index = self.es_cluster.get_best_matching_index(alias).index update_required = not (table_desc.timestamp < es_last_updated) metadata = self.es_cluster.get_metadata(force=update_required) indexes = self.index_to_alias.get_domain(alias) props = [ (self.es_cluster.get_index(index=i, type=t, debug=DEBUG), t, m.properties) for i, d in metadata.indices.items() if i in indexes for t, m in [_get_best_type_from_mapping(d.mappings)] ] # CONFIRM ALL COLUMNS ARE SAME, FIX IF NOT dirty = False all_comparisions = list(jx.pairwise(props)) + list(jx.pairwise(jx.reverse(props))) # NOTICE THE SAME (index, type, properties) TRIPLE FROM ABOVE for (i1, t1, p1), (i2, t2, p2) in all_comparisions: diff = elasticsearch.diff_schema(p2, p1) if not self.settings.read_only: for d in diff: dirty = True i1.add_property(*d) meta = self.es_cluster.get_metadata(force=dirty).indices[canonical_index] data_type, mapping = _get_best_type_from_mapping(meta.mappings) mapping.properties["_id"] = {"type": "string", "index": "not_analyzed"} self._parse_properties(alias, mapping, meta) table_desc.timestamp = es_last_updated def _parse_properties(self, alias, mapping, meta): abs_columns = elasticsearch.parse_properties(alias, None, mapping.properties) if any(c.cardinality == 0 and c.names['.'] != '_id' for c in abs_columns): Log.warning( "Some columns are not stored {{names}}", names=[ ".".join((c.es_index, c.names['.'])) for c in abs_columns if c.cardinality == 0 ] ) with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, silent=not DEBUG): # LIST OF EVERY NESTED PATH query_paths = 
[[c.es_column] for c in abs_columns if c.es_type == "nested"] for a, b in itertools.product(query_paths, query_paths): aa = a[0] bb = b[0] if aa and bb.startswith(aa): for i, b_prefix in enumerate(b): if len(b_prefix) > len(aa): continue if aa == b_prefix: break # SPLIT ALREADY FOUND b.insert(i, aa) break for q in query_paths: q.append(SELF_PATH) query_paths.append(ROOT_PATH) self.alias_to_query_paths[alias] = query_paths for i in self.index_to_alias.get_domain(alias): self.alias_to_query_paths[i] = query_paths # ADD RELATIVE NAMES for abs_column in abs_columns: abs_column.last_updated = None abs_column.jx_type = jx_type(abs_column) for query_path in query_paths: abs_column.names[query_path[0]] = relative_field(abs_column.names["."], query_path[0]) self.todo.add(self.meta.columns.add(abs_column)) pass def query(self, _query): return self.meta.columns.query(QueryOp(set_default( { "from": self.meta.columns, "sort": ["table", "name"] }, _query.__data__() ))) def _find_alias(self, name): if self.metadata_last_updated < self.es_cluster.metatdata_last_updated: for a in self.es_cluster.get_aliases(): self.index_to_alias[a.index] = coalesce(a.alias, a.index) self.alias_last_updated.setdefault(a.alias, Date.MIN) if name in self.alias_last_updated: return name else: return self.index_to_alias[name] def get_columns(self, table_name, column_name=None, force=False): """ RETURN METADATA COLUMNS """ table_path = split_field(table_name) root_table_name = table_path[0] alias = self._find_alias(root_table_name) if not alias: self.es_cluster.get_metadata(force=True) alias = self._find_alias(root_table_name) if not alias: Log.error("{{table|quote}} does not exist", table=table_name) try: last_update = MAX([ self.es_cluster.index_last_updated[i] for i in self.index_to_alias.get_domain(alias) ]) table = self.get_table(alias)[0] # LAST TIME WE GOT INFO FOR THIS TABLE if not table: table = TableDesc( name=alias, url=None, query_path=['.'], timestamp=Date.MIN ) with self.meta.tables.locker: self.meta.tables.add(table) self._reload_columns(table) elif force or table.timestamp < last_update: self._reload_columns(table) columns = self.meta.columns.find(alias, column_name) columns = jx.sort(columns, "names.\\.") # AT LEAST WAIT FOR THE COLUMNS TO UPDATE while len(self.todo) and not all(columns.get("last_updated")): if DEBUG: if len(columns) > 10: Log.note("waiting for {{num}} columns to update", num=len([c for c in columns if not c.last_updated])) else: Log.note("waiting for columns to update {{columns|json}}", columns=[c.es_index+"."+c.es_column for c in columns if not c.last_updated]) Till(seconds=1).wait() return columns except Exception as e: Log.error("Not expected", cause=e) return [] def _update_cardinality(self, column): """ QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN """ if column.es_index in self.index_does_not_exist: return if column.jx_type in STRUCT: Log.error("not supported") try: if column.es_index == "meta.columns": partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.columns, column.es_column) if g[column.es_column] != None]) self.meta.columns.update({ "set": { "partitions": partitions, "count": len(self.meta.columns), "cardinality": len(partitions), "multi": 1, "last_updated": Date.now() }, "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}} }) return if column.es_index == "meta.tables": partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.tables, column.es_column) if g[column.es_column] != None]) 
self.meta.columns.update({ "set": { "partitions": partitions, "count": len(self.meta.tables), "cardinality": len(partitions), "multi": 1, "last_updated": Date.now() }, "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}} }) return es_index = column.es_index.split(".")[0] is_text = [cc for cc in self.meta.columns if cc.es_column == column.es_column and cc.es_type == "text"] if is_text: # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED result = self.es_cluster.post("/" + es_index + "/_search", data={ "aggs": { "count": {"filter": {"match_all": {}}} }, "size": 0 }) count = result.hits.total cardinality = max(1001, count) multi = 1001 elif column.es_column == "_id": result = self.es_cluster.post("/" + es_index + "/_search", data={ "query": {"match_all": {}}, "size": 0 }) count = cardinality = result.hits.total multi = 1 elif column.es_type == BOOLEAN: result = self.es_cluster.post("/" + es_index + "/_search", data={ "aggs": { "count": _counting_query(column) }, "size": 0 }) count = result.hits.total cardinality = 2 multi = 1 else: result = self.es_cluster.post("/" + es_index + "/_search", data={ "aggs": { "count": _counting_query(column), "multi": {"max": {"script": "doc[" + quote(column.es_column) + "].values.size()"}} }, "size": 0 }) agg_results = result.aggregations count = result.hits.total cardinality = coalesce(agg_results.count.value, agg_results.count._nested.value, agg_results.count.doc_count) multi = int(coalesce(agg_results.multi.value, 1)) if cardinality == None: Log.error("logic error") query = Data(size=0) if column.es_column == "_id": self.meta.columns.update({ "set": { "count": cardinality, "cardinality": cardinality, "multi": 1, "last_updated": Date.now() }, "clear": ["partitions"], "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}} }) return elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99): DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality) self.meta.columns.update({ "set": { "count": count, "cardinality": cardinality, "multi": multi, "last_updated": Date.now() }, "clear": ["partitions"], "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}} }) return elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30: DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality) self.meta.columns.update({ "set": { "count": count, "cardinality": cardinality, "multi": multi, "last_updated": Date.now() }, "clear": ["partitions"], "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}} }) return elif len(column.nested_path) != 1: query.aggs["_"] = { "nested": {"path": column.nested_path[0]}, "aggs": {"_nested": {"terms": {"field": column.es_column}}} } elif cardinality == 0: query.aggs["_"] = {"terms": {"field": column.es_column}} else: query.aggs["_"] = {"terms": {"field": column.es_column, "size": cardinality}} result = self.es_cluster.post("/" + es_index + "/_search", data=query) aggs = result.aggregations._ if aggs._nested: parts = jx.sort(aggs._nested.buckets.key) else: parts = jx.sort(aggs.buckets.key) self.meta.columns.update({ "set": { "count": count, "cardinality": cardinality, "multi": multi, "partitions": parts, "last_updated": Date.now() }, "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}} }) except Exception as e: # CAN NOT IMPORT: 
THE TEST MODULES SETS UP LOGGING # from tests.test_jx import TEST_TABLE e = Except.wrap(e) TEST_TABLE = "testdata" is_missing_index = any(w in e for w in ["IndexMissingException", "index_not_found_exception"]) is_test_table = column.es_index.startswith((TEST_TABLE_PREFIX, TEST_TABLE)) if is_missing_index and is_test_table: # WE EXPECT TEST TABLES TO DISAPPEAR self.meta.columns.update({ "clear": ".", "where": {"eq": {"es_index": column.es_index}} }) self.index_does_not_exist.add(column.es_index) else: self.meta.columns.update({ "set": { "last_updated": Date.now() }, "clear": [ "count", "cardinality", "multi", "partitions", ], "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}} }) Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e) def monitor(self, please_stop): please_stop.on_go(lambda: self.todo.add(THREAD_STOP)) while not please_stop: try: if not self.todo: old_columns = [ c for c in self.meta.columns if (c.last_updated == None or c.last_updated < Date.now()-TOO_OLD) and c.jx_type not in STRUCT ] if old_columns: DEBUG and Log.note( "Old columns {{names|json}} last updated {{dates|json}}", names=wrap(old_columns).es_column, dates=[Date(t).format() for t in wrap(old_columns).last_updated] ) self.todo.extend(old_columns) # TEST CONSISTENCY for c, d in product(list(self.todo.queue), list(self.todo.queue)): if c.es_column == d.es_column and c.es_index == d.es_index and c != d: Log.error("") else: DEBUG and Log.note("no more metatdata to update") column = self.todo.pop(Till(seconds=(10*MINUTE).seconds)) if column: if column is THREAD_STOP: continue with Timer("update {{table}}.{{column}}", param={"table": column.es_index, "column": column.es_column}, silent=not DEBUG): if column.es_index in self.index_does_not_exist: self.meta.columns.update({ "clear": ".", "where": {"eq": {"es_index": column.es_index}} }) continue if column.jx_type in STRUCT or column.es_column.endswith("." 
+ EXISTS_TYPE): column.last_updated = Date.now() continue elif column.last_updated >= Date.now()-TOO_OLD: continue try: self._update_cardinality(column) (DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX)) and Log.note("updated {{column.name}}", column=column) except Exception as e: if '"status":404' in e: self.meta.columns.update({ "clear": ".", "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}} }) else: Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e) except Exception as e: Log.warning("problem in cardinality monitor", cause=e) def not_monitor(self, please_stop): Log.alert("metadata scan has been disabled") please_stop.on_go(lambda: self.todo.add(THREAD_STOP)) while not please_stop: c = self.todo.pop() if c == THREAD_STOP: break if c.last_updated >= Date.now()-TOO_OLD: continue with Timer("Update {{col.es_index}}.{{col.es_column}}", param={"col": c}, silent=not DEBUG, too_long=0.05): self.meta.columns.update({ "set": { "last_updated": Date.now() }, "clear": [ "count", "cardinality", "multi", "partitions", ], "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}} }) def get_table(self, name): if name == "meta.columns": return self.meta.columns # return self.meta.columns with self.meta.tables.locker: return wrap([t for t in self.meta.tables.data if t.name == name]) def get_snowflake(self, fact_table_name): return Snowflake(fact_table_name, self) def get_schema(self, name): if name == "meta.columns": return self.meta.columns.schema query_path = split_field(name) root, rest = query_path[0], join_field(query_path[1:]) return self.get_snowflake(root).get_schema(rest)
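# Both monitor() and not_monitor() share the same shape: a worker thread pops
# columns off the "todo" queue, skips anything refreshed within TOO_OLD, and
# exits on a THREAD_STOP sentinel.  A minimal standard-library sketch of that
# loop; refresh() stands in for _update_cardinality(), columns are plain dicts
# here purely for illustration, and the ten-minute window is an assumption:
import queue
import threading
import time

THREAD_STOP = object()
TOO_OLD = 10 * 60  # seconds

def metadata_monitor(todo, refresh, please_stop):
    while not please_stop.is_set():
        try:
            column = todo.get(timeout=10 * 60)
        except queue.Empty:
            continue                        # nothing stale; poll again
        if column is THREAD_STOP:
            break
        if column.get("last_updated", 0) >= time.time() - TOO_OLD:
            continue                        # refreshed recently
        try:
            refresh(column)
            column["last_updated"] = time.time()
        except Exception as cause:
            print("problem getting cardinality for %s: %s" % (column.get("name"), cause))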
class StructuredLogger_usingElasticSearch(StructuredLogger): @override def __init__( self, host, index, port=9200, type="log", queue_size=1000, batch_size=100, kwargs=None, ): """ settings ARE FOR THE ELASTICSEARCH INDEX """ kwargs.timeout = Duration(coalesce(kwargs.timeout, "30second")).seconds kwargs.retry.times = coalesce(kwargs.retry.times, 3) kwargs.retry.sleep = Duration(coalesce(kwargs.retry.sleep, MINUTE)).seconds kwargs.host = Random.sample(listwrap(host), 1)[0] schema = json2value(value2json(SCHEMA), leaves=True) schema.mappings[type].properties["~N~"].type = "nested" self.es = Cluster(kwargs).get_or_create_index( schema=schema, limit_replicas=True, typed=True, kwargs=kwargs, ) self.batch_size = batch_size self.es.add_alias(coalesce(kwargs.alias, kwargs.index)) self.queue = Queue("debug logs to es", max=queue_size, silent=True) self.worker = Thread.run("add debug logs to es", self._insert_loop) def write(self, template, params): try: params.template = strings.limit(params.template, 2000) params.format = None self.queue.add({"value": _deep_json_to_string(params, 3)}, timeout=3 * 60) except Exception as e: sys.stdout.write(text_type(Except.wrap(e))) return self def _insert_loop(self, please_stop=None): bad_count = 0 while not please_stop: try: messages = wrap(self.queue.pop_all()) if not messages: Till(seconds=PAUSE_AFTER_GOOD_INSERT).wait() continue for g, mm in jx.groupby(messages, size=self.batch_size): scrubbed = [] for i, message in enumerate(mm): if message is THREAD_STOP: please_stop.go() continue try: messages = flatten_causal_chain(message.value) scrubbed.append( { "value": [ _deep_json_to_string(m, depth=3) for m in messages ] } ) except Exception as e: Log.warning("Problem adding to scrubbed list", cause=e) self.es.extend(scrubbed) bad_count = 0 except Exception as f: Log.warning("Problem inserting logs into ES", cause=f) bad_count += 1 if bad_count > MAX_BAD_COUNT: Log.warning( "Given up trying to write debug logs to ES index {{index}}", index=self.es.settings.index, ) Till(seconds=PAUSE_AFTER_BAD_INSERT).wait() self.es.flush() # CONTINUE TO DRAIN THIS QUEUE while not please_stop: try: Till(seconds=PAUSE_AFTER_GOOD_INSERT).wait() self.queue.pop_all() except Exception as e: Log.warning("Should not happen", cause=e) def stop(self): with suppress_exception: self.queue.add(THREAD_STOP) # BE PATIENT, LET REST OF MESSAGE BE SENT with suppress_exception: self.queue.close() self.worker.join()
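# This logger's write() truncates the template and runs the params through
# _deep_json_to_string() so that one oversized or deeply nested log record
# cannot blow up the index mapping.  A hedged sketch of such a depth-limited
# scrub; the real _deep_json_to_string() is not shown in this excerpt and may
# differ in detail:
import json

def deep_to_string(value, depth=3):
    if depth <= 0:
        return json.dumps(value, default=str)   # stringify everything below the cutoff
    if isinstance(value, dict):
        return {k: deep_to_string(v, depth - 1) for k, v in value.items()}
    if isinstance(value, (list, tuple)):
        return [deep_to_string(v, depth - 1) for v in value]
    return value

# deep_to_string({"a": {"b": {"c": {"d": 1}}}}, depth=2)
# -> {"a": {"b": '{"c": {"d": 1}}'}}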
class Celery(object): _fixups = None _pool = None def __init__( self, name, broker=None, include=None, **kwargs ): self.Task = MethodCaller self.name = name self.request_queue = Queue(name=name+" requests") self.response_queue = Queue(name=name+" responses") self.kwargs = kwargs self.include = include self.broker = broker self._config = {} self._tasks = {} self.on_init() self.response_worker = Thread.run("response worker", self._response_worker) self.responses = {} self.responses_lock = Lock() self.id_lock = Lock() self.next_id = 1 self.worker = Worker(self.request_queue, self.response_queue, celery=self) def _response_worker(self, please_stop): while not please_stop: try: encoded_mail = self.response_queue.pop(till=please_stop) mail = json2value(encoded_mail) Log.note("got response for {{id}}", id=mail.request.id) with self.responses_lock: try: async_response = self.responses[mail.request.id] async_response.mail = set_default(mail, async_response.mail) if mail.status in states.READY_STATES: async_response._ready.go() except Exception as e: Log.warning("not expected", cause=e) except Exception as e: Log.warning("not expected", cause=e) def __enter__(self): return self def __exit__(self, *exc_info): self.close() def close(self): self.response_worker.stop() def on_init(self): """Optional callback called at init.""" pass def start(self, argv=None): pass @property def conf(self): return self._config def task(self, **opts): opts = wrap(opts) this = self def dec(fun): # GET THE PARAMETER NAMES FOR args arg_names = fun.func_code.co_varnames[:fun.func_code.co_argcount] if arg_names and arg_names[0] == 'self': arg_names = arg_names[1:] this._tasks[opts.name] = fun def async(args, kwargs=None, *_args, **_kwargs): kwargs = set_default(kwargs, dict(zip(arg_names, args))) with self.id_lock: id = self.next_id self.next_id += 1 mail = deepcopy(Data( status=states.PENDING, caller={ # "stack": extract_stack(1) }, sender=set_default(_kwargs, opts), message=kwargs, request=set_default({"id": id}) )) output = AsyncResult(id, mail=mail, app=self) with self.responses_lock: self.responses[id] = output self.request_queue.add(value2json(mail)) Log.note("Added {{id}} ({{name}}) to request queue\n{{request}}", id=id, name=opts.name, request=mail) return output def send_message(*args, **kwargs): return async(args, kwargs) def revoke(terminate=True, signal='SIGINT'): pass setattr(send_message, "delay", send_message) setattr(send_message, "revoke", revoke) setattr(send_message, "apply_async", async) return send_message return dec def get_result(self, id): with self.responses_lock: response, self.responses[id] = self.responses[id], None response._ready.wait() if response.status in states.EXCEPTION_STATES: Log.error("bad response", cause=response.mail.result) else: return response.mail.result
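# The Celery stand-in correlates requests and responses by a numeric id: every
# call allocates an AsyncResult holding a ready-event, the request goes onto one
# queue, and a response worker looks the id up and fires the event when the
# answer arrives on the other queue.  A compact sketch of that pattern with the
# standard library (all names below are illustrative):
import itertools
import threading

class AsyncResult(object):
    def __init__(self, id):
        self.id = id
        self.value = None
        self._ready = threading.Event()

    def wait(self, timeout=None):
        self._ready.wait(timeout)
        return self.value

class Dispatcher(object):
    def __init__(self, request_queue):
        self.request_queue = request_queue
        self._ids = itertools.count(1)
        self._pending = {}                  # id -> AsyncResult
        self._lock = threading.Lock()

    def send(self, message):
        result = AsyncResult(next(self._ids))
        with self._lock:
            self._pending[result.id] = result
        self.request_queue.put({"id": result.id, "message": message})
        return result

    def on_response(self, response):
        # called by the response worker for each message it pops
        with self._lock:
            result = self._pending.pop(response["id"], None)
        if result is not None:
            result.value = response.get("result")
            result._ready.set()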
class Sqlite(DB): """ Allows multi-threaded access Loads extension functions (like SQRT) """ canonical = None def __init__(self, filename=None, db=None, upgrade=True): """ :param db: Optional, wrap a sqlite db in a thread :return: Multithread-safe database """ if upgrade and not _upgraded: _upgrade() self.filename = filename self.db = db self.queue = Queue("sql commands") # HOLD (command, result, signal) PAIRS self.worker = Thread.run("sqlite db thread", self._worker) self.get_trace = DEBUG self.upgrade = upgrade def _enhancements(self): def regex(pattern, value): return 1 if re.match(pattern+"$", value) else 0 con = self.db.create_function("regex", 2, regex) class Percentile(object): def __init__(self, percentile): self.percentile=percentile self.acc=[] def step(self, value): self.acc.append(value) def finalize(self): return percentile(self.acc, self.percentile) con.create_aggregate("percentile", 2, Percentile) def execute(self, command): """ COMMANDS WILL BE EXECUTED IN THE ORDER THEY ARE GIVEN BUT CAN INTERLEAVE WITH OTHER TREAD COMMANDS :param command: COMMAND FOR SQLITE :return: None """ if DEBUG: # EXECUTE IMMEDIATELY FOR BETTER STACK TRACE return self.query(command) if self.get_trace: trace = extract_stack(1) else: trace = None self.queue.add((command, None, None, trace)) def query(self, command): """ WILL BLOCK CALLING THREAD UNTIL THE command IS COMPLETED :param command: COMMAND FOR SQLITE :return: list OF RESULTS """ if not self.worker: self.worker = Thread.run("sqlite db thread", self._worker) signal = Signal() result = Data() self.queue.add((command, result, signal, None)) signal.wait() if result.exception: Log.error("Problem with Sqlite call", cause=result.exception) return result def _worker(self, please_stop): global _load_extension_warning_sent if DEBUG: Log.note("Sqlite version {{version}}", version=sqlite3.sqlite_version) if Sqlite.canonical: self.db = Sqlite.canonical else: self.db = sqlite3.connect(coalesce(self.filename, ':memory:')) library_loc = File.new_instance(sys.modules[__name__].__file__, "../..") full_path = File.new_instance(library_loc, "vendor/sqlite/libsqlitefunctions.so").abspath try: trace = extract_stack(0)[0] if self.upgrade: if os.name == 'nt': file = File.new_instance(trace["file"], "../../vendor/sqlite/libsqlitefunctions.so") else: file = File.new_instance(trace["file"], "../../vendor/sqlite/libsqlitefunctions") full_path = file.abspath self.db.enable_load_extension(True) self.db.execute("SELECT load_extension(" + self.quote_value(full_path) + ")") except Exception as e: if not _load_extension_warning_sent: _load_extension_warning_sent = True Log.warning("Could not load {{file}}}, doing without. 
(no SQRT for you!)", file=full_path, cause=e) try: while not please_stop: command, result, signal, trace = self.queue.pop(till=please_stop) if DEBUG_INSERT and command.strip().lower().startswith("insert"): Log.note("Running command\n{{command|indent}}", command=command) if DEBUG and not command.strip().lower().startswith("insert"): Log.note("Running command\n{{command|indent}}", command=command) with Timer("Run command", debug=DEBUG): if signal is not None: try: curr = self.db.execute(command) self.db.commit() result.meta.format = "table" result.header = [d[0] for d in curr.description] if curr.description else None result.data = curr.fetchall() if DEBUG and result.data: text = convert.table2csv(list(result.data)) Log.note("Result:\n{{data}}", data=text) except Exception as e: e = Except.wrap(e) result.exception = Except(ERROR, "Problem with\n{{command|indent}}", command=command, cause=e) finally: signal.go() else: try: self.db.execute(command) self.db.commit() except Exception as e: e = Except.wrap(e) e.cause = Except( type=ERROR, template="Bad call to Sqlite", trace=trace ) Log.warning("Failure to execute", cause=e) except Exception as e: if not please_stop: Log.error("Problem with sql thread", e) finally: if DEBUG: Log.note("Database is closed") self.db.commit() self.db.close() def quote_column(self, column_name, table=None): return quote_column(column_name, table) def quote_value(self, value): return quote_value(value)
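# The Sqlite wrapper serializes every statement through a single worker thread,
# since a sqlite3 connection is only safe on the thread that created it; callers
# needing rows enqueue a (command, result, signal) tuple and block on the signal,
# exactly as query() does above.  A self-contained, runnable sketch of that shape:
import queue
import sqlite3
import threading

class ThreadedSqlite(object):
    def __init__(self, filename=":memory:"):
        self._queue = queue.Queue()
        self._worker = threading.Thread(target=self._run, args=(filename,), daemon=True)
        self._worker.start()

    def _run(self, filename):
        db = sqlite3.connect(filename)      # connection lives on this thread only
        while True:
            command, result, signal = self._queue.get()
            if command is None:             # stop sentinel
                break
            try:
                cursor = db.execute(command)
                db.commit()
                if result is not None:
                    result["data"] = cursor.fetchall()
            except Exception as cause:
                if result is not None:
                    result["exception"] = cause
            finally:
                if signal is not None:
                    signal.set()
        db.close()

    def query(self, command):
        result, signal = {}, threading.Event()
        self._queue.put((command, result, signal))
        signal.wait()
        if "exception" in result:
            raise result["exception"]
        return result["data"]

    def execute(self, command):
        self._queue.put((command, None, None))   # fire-and-forget

    def close(self):
        self._queue.put((None, None, None))
        self._worker.join()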
class Sqlite(DB): """ Allows multi-threaded access Loads extension functions (like SQRT) """ @override def __init__( self, filename=None, db=None, get_trace=None, upgrade=False, load_functions=False, debug=False, kwargs=None, ): """ :param filename: FILE TO USE FOR DATABASE :param db: AN EXISTING sqlite3 DB YOU WOULD LIKE TO USE (INSTEAD OF USING filename) :param get_trace: GET THE STACK TRACE AND THREAD FOR EVERY DB COMMAND (GOOD FOR DEBUGGING) :param upgrade: REPLACE PYTHON sqlite3 DLL WITH MORE RECENT ONE, WITH MORE FUNCTIONS (NOT WORKING) :param load_functions: LOAD EXTENDED MATH FUNCTIONS (MAY REQUIRE upgrade) :param kwargs: """ global _upgraded global _sqlite3 self.settings = kwargs if not _upgraded: if upgrade: _upgrade() _upgraded = True import sqlite3 as _sqlite3 _ = _sqlite3 self.filename = File(filename).abspath if filename else None if known_databases.get(self.filename): Log.error( "Not allowed to create more than one Sqlite instance for {{file}}", file=self.filename, ) self.debug = debug | DEBUG # SETUP DATABASE self.debug and Log.note("Sqlite version {{version}}", version=_sqlite3.sqlite_version) try: if db == None: self.db = _sqlite3.connect( database=coalesce(self.filename, ":memory:"), check_same_thread=False, isolation_level=None, ) else: self.db = db except Exception as e: Log.error("could not open file {{filename}}", filename=self.filename, cause=e) self.upgrade = upgrade load_functions and self._load_functions() self.locker = Lock() self.available_transactions = [ ] # LIST OF ALL THE TRANSACTIONS BEING MANAGED self.queue = Queue( "sql commands" ) # HOLD (command, result, signal, stacktrace) TUPLES self.get_trace = coalesce(get_trace, TRACE) self.closed = False # WORKER VARIABLES self.transaction_stack = [ ] # THE TRANSACTION OBJECT WE HAVE PARTIALLY RUN self.last_command_item = ( None ) # USE THIS TO HELP BLAME current_transaction FOR HANGING ON TOO LONG self.too_long = None self.delayed_queries = [] self.delayed_transactions = [] self.worker = Thread.run("sqlite db thread", self._worker) self.debug and Log.note( "Sqlite version {{version}}", version=self.query("select sqlite_version()").data[0][0], ) def _enhancements(self): def regex(pattern, value): return 1 if re.match(pattern + "$", value) else 0 con = self.db.create_function("regex", 2, regex) class Percentile(object): def __init__(self, percentile): self.percentile = percentile self.acc = [] def step(self, value): self.acc.append(value) def finalize(self): return percentile(self.acc, self.percentile) con.create_aggregate("percentile", 2, Percentile) def transaction(self): thread = Thread.current() parent = None with self.locker: for t in self.available_transactions: if t.thread is thread: parent = t output = Transaction(self, parent=parent, thread=thread) self.available_transactions.append(output) return output def about(self, table_name): """ :param table_name: TABLE IF INTEREST :return: SOME INFORMATION ABOUT THE TABLE (cid, name, dtype, notnull, dfft_value, pk) tuples """ details = self.query("PRAGMA table_info" + sql_iso(quote_column(table_name))) return details.data def query(self, command): """ WILL BLOCK CALLING THREAD UNTIL THE command IS COMPLETED :param command: COMMAND FOR SQLITE :return: list OF RESULTS """ if self.closed: Log.error("database is closed") signal = _allocate_lock() signal.acquire() result = Data() trace = get_stacktrace(1) if self.get_trace else None if self.get_trace: current_thread = Thread.current() with self.locker: for t in self.available_transactions: if t.thread is 
current_thread: Log.error(DOUBLE_TRANSACTION_ERROR) self.queue.add(CommandItem(command, result, signal, trace, None)) signal.acquire() if result.exception: Log.error("Problem with Sqlite call", cause=result.exception) return result def close(self): """ OPTIONAL COMMIT-AND-CLOSE IF THIS IS NOT DONE, THEN THE THREAD THAT SPAWNED THIS INSTANCE :return: """ self.closed = True signal = _allocate_lock() signal.acquire() self.queue.add(CommandItem(COMMIT, None, signal, None, None)) signal.acquire() self.worker.please_stop.go() return def __enter__(self): pass def __exit__(self, exc_type, exc_val, exc_tb): self.close() def _load_functions(self): global _load_extension_warning_sent library_loc = File.new_instance(sys.modules[__name__].__file__, "../..") full_path = File.new_instance( library_loc, "vendor/sqlite/libsqlitefunctions.so").abspath try: trace = get_stacktrace(0)[0] if self.upgrade: if os.name == "nt": file = File.new_instance( trace["file"], "../../vendor/sqlite/libsqlitefunctions.so") else: file = File.new_instance( trace["file"], "../../vendor/sqlite/libsqlitefunctions") full_path = file.abspath self.db.enable_load_extension(True) self.db.execute( text(SQL_SELECT + "load_extension" + sql_iso(quote_value(full_path)))) except Exception as e: if not _load_extension_warning_sent: _load_extension_warning_sent = True Log.warning( "Could not load {{file}}, doing without. (no SQRT for you!)", file=full_path, cause=e, ) def create_new_functions(self): def regexp(pattern, item): reg = re.compile(pattern) return reg.search(item) is not None self.db.create_function("REGEXP", 2, regexp) def show_transactions_blocked_warning(self): blocker = self.last_command_item blocked = (self.delayed_queries + self.delayed_transactions)[0] Log.warning( "Query on thread {{blocked_thread|json}} at\n" "{{blocked_trace|indent}}" "is blocked by {{blocker_thread|json}} at\n" "{{blocker_trace|indent}}" "this message brought to you by....", blocker_trace=format_trace(blocker.trace), blocked_trace=format_trace(blocked.trace), blocker_thread=blocker.transaction.thread.name if blocker.transaction is not None else None, blocked_thread=blocked.transaction.thread.name if blocked.transaction is not None else None, ) def _close_transaction(self, command_item): query, result, signal, trace, transaction = command_item transaction.end_of_life = True with self.locker: self.available_transactions.remove(transaction) assert transaction not in self.available_transactions old_length = len(self.transaction_stack) old_trans = self.transaction_stack[-1] del self.transaction_stack[-1] assert old_length - 1 == len(self.transaction_stack) assert old_trans assert old_trans not in self.transaction_stack if not self.transaction_stack: # NESTED TRANSACTIONS NOT ALLOWED IN sqlite3 self.debug and Log.note(FORMAT_COMMAND, command=query) self.db.execute(query) has_been_too_long = False with self.locker: if self.too_long is not None: self.too_long, too_long = None, self.too_long # WE ARE CHEATING HERE: WE REACH INTO THE Signal MEMBERS AND REMOVE WHAT WE ADDED TO THE INTERNAL job_queue with too_long.lock: has_been_too_long = bool(too_long) too_long.job_queue = None # PUT delayed BACK ON THE QUEUE, IN THE ORDER FOUND, BUT WITH QUERIES FIRST if self.delayed_transactions: for c in reversed(self.delayed_transactions): self.queue.push(c) del self.delayed_transactions[:] if self.delayed_queries: for c in reversed(self.delayed_queries): self.queue.push(c) del self.delayed_queries[:] if has_been_too_long: Log.note("Transaction blockage cleared") def 
_worker(self, please_stop): try: # MAIN EXECUTION LOOP while not please_stop: command_item = self.queue.pop(till=please_stop) if command_item is None: break try: self._process_command_item(command_item) except Exception as e: Log.warning("worker can not execute command", cause=e) except Exception as e: e = Except.wrap(e) if not please_stop: Log.warning("Problem with sql", cause=e) finally: self.closed = True self.debug and Log.note("Database is closed") self.db.close() def _process_command_item(self, command_item): query, result, signal, trace, transaction = command_item with Timer("SQL Timing", verbose=self.debug): if transaction is None: # THIS IS A TRANSACTIONLESS QUERY, DELAY IT IF THERE IS A CURRENT TRANSACTION if self.transaction_stack: with self.locker: if self.too_long is None: self.too_long = Till( seconds=TOO_LONG_TO_HOLD_TRANSACTION) self.too_long.then( self.show_transactions_blocked_warning) self.delayed_queries.append(command_item) return elif self.transaction_stack and self.transaction_stack[-1] not in [ transaction, transaction.parent, ]: # THIS TRANSACTION IS NOT THE CURRENT TRANSACTION, DELAY IT with self.locker: if self.too_long is None: self.too_long = Till( seconds=TOO_LONG_TO_HOLD_TRANSACTION) self.too_long.then( self.show_transactions_blocked_warning) self.delayed_transactions.append(command_item) return else: # ENSURE THE CURRENT TRANSACTION IS UP TO DATE FOR THIS query if not self.transaction_stack: # sqlite3 ALLOWS ONLY ONE TRANSACTION AT A TIME self.debug and Log.note(FORMAT_COMMAND, command=BEGIN) self.db.execute(BEGIN) self.transaction_stack.append(transaction) elif transaction is not self.transaction_stack[-1]: self.transaction_stack.append(transaction) elif transaction.exception and query is not ROLLBACK: result.exception = Except( context=ERROR, template= "Not allowed to continue using a transaction that failed", cause=transaction.exception, trace=trace, ) signal.release() return try: transaction.do_all() except Exception as e: # DEAL WITH ERRORS IN QUEUED COMMANDS # WE WILL UNWRAP THE OUTER EXCEPTION TO GET THE CAUSE err = Except( context=ERROR, template="Bad call to Sqlite3 while " + FORMAT_COMMAND, params={"command": e.params.current.command}, cause=e.cause, trace=e.params.current.trace, ) transaction.exception = result.exception = err if query in [COMMIT, ROLLBACK]: self._close_transaction( CommandItem(ROLLBACK, result, signal, trace, transaction)) signal.release() return try: # DEAL WITH END-OF-TRANSACTION MESSAGES if query in [COMMIT, ROLLBACK]: self._close_transaction(command_item) return # EXECUTE QUERY self.last_command_item = command_item self.debug and Log.note(FORMAT_COMMAND, command=query) curr = self.db.execute(text(query)) result.meta.format = "table" result.header = ([d[0] for d in curr.description] if curr.description else None) result.data = curr.fetchall() if self.debug and result.data: csv = convert.table2csv(list(result.data)) Log.note("Result:\n{{data|limit(100)|indent}}", data=csv) except Exception as e: e = Except.wrap(e) err = Except( context=ERROR, template="Bad call to Sqlite while " + FORMAT_COMMAND, params={"command": query}, trace=trace, cause=e, ) result.exception = err if transaction: transaction.exception = err finally: signal.release()
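# The second Sqlite class layers transactions on top of that single worker.
# sqlite3 allows only one open BEGIN, so _process_command_item parks any command
# that does not belong to the currently open transaction (plain queries on
# delayed_queries, foreign transactions on delayed_transactions) and replays
# them when the owner commits or rolls back.  A schematic of just that routing
# decision, with an illustrative CommandItem type (no database involved):
import collections

CommandItem = collections.namedtuple("CommandItem", "command transaction")

def route_command(item, transaction_stack):
    """Return 'run', 'delay_query', or 'delay_transaction' for a command item."""
    current = transaction_stack[-1] if transaction_stack else None
    if item.transaction is None:
        # transactionless query: must wait while any transaction holds the connection
        return "run" if current is None else "delay_query"
    if current is not None and current is not item.transaction \
            and current is not getattr(item.transaction, "parent", None):
        # a different transaction currently owns the connection
        return "delay_transaction"
    return "run"

# route_command(CommandItem("SELECT 1", None), transaction_stack=[])      -> "run"
# route_command(CommandItem("SELECT 1", None), transaction_stack=["t1"])  -> "delay_query"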
class FromESMetadata(Schema): """ QUERY THE METADATA """ def __new__(cls, *args, **kwargs): global singlton if singlton: return singlton else: singlton = object.__new__(cls) return singlton @override def __init__(self, host, index, alias=None, name=None, port=9200, kwargs=None): global _elasticsearch if hasattr(self, "settings"): return from pyLibrary.queries.containers.list_usingPythonList import ListContainer from pyLibrary.env import elasticsearch as _elasticsearch self.settings = kwargs self.default_name = coalesce(name, alias, index) self.default_es = _elasticsearch.Cluster(kwargs=kwargs) self.todo = Queue("refresh metadata", max=100000, unique=True) self.es_metadata = Null self.last_es_metadata = Date.now()-OLD_METADATA self.meta=Data() table_columns = metadata_tables() column_columns = metadata_columns() self.meta.tables = ListContainer("meta.tables", [], wrap({c.names["."]: c for c in table_columns})) self.meta.columns = ColumnList() self.meta.columns.insert(column_columns) self.meta.columns.insert(table_columns) # TODO: fix monitor so it does not bring down ES if ENABLE_META_SCAN: self.worker = Thread.run("refresh metadata", self.monitor) else: self.worker = Thread.run("refresh metadata", self.not_monitor) return @property def query_path(self): return None @property def url(self): return self.default_es.path + "/" + self.default_name.replace(".", "/") def get_table(self, table_name): with self.meta.tables.locker: return wrap([t for t in self.meta.tables.data if t.name == table_name]) def _upsert_column(self, c): # ASSUMING THE self.meta.columns.locker IS HAD existing_columns = self.meta.columns.find(c.es_index, c.names["."]) if not existing_columns: self.meta.columns.add(c) self.todo.add(c) if ENABLE_META_SCAN: if DEBUG: Log.note("todo: {{table}}::{{column}}", table=c.es_index, column=c.es_column) # MARK meta.columns AS DIRTY TOO cols = self.meta.columns.find("meta.columns", None) for cc in cols: cc.partitions = cc.cardinality = None cc.last_updated = Date.now() self.todo.extend(cols) else: canonical = existing_columns[0] if canonical is not c: set_default(c.names, canonical.names) for key in Column.__slots__: canonical[key] = c[key] if DEBUG: Log.note("todo: {{table}}::{{column}}", table=canonical.es_index, column=canonical.es_column) self.todo.add(canonical) def _get_columns(self, table=None): # TODO: HANDLE MORE THEN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE table_path = split_field(table) es_index = table_path[0] query_path = join_field(table_path[1:]) meta = self.es_metadata.indices[es_index] if not meta or self.last_es_metadata < Date.now() - OLD_METADATA: self.es_metadata = self.default_es.get_metadata(force=True) meta = self.es_metadata.indices[es_index] for _, properties in meta.mappings.items(): properties.properties["_id"] = {"type": "string", "index": "not_analyzed"} self._parse_properties(meta.index, properties, meta) def _parse_properties(self, abs_index, properties, meta): # IT IS IMPORTANT THAT NESTED PROPERTIES NAME ALL COLUMNS, AND # ALL COLUMNS ARE GIVEN NAMES FOR ALL NESTED PROPERTIES abs_columns = _elasticsearch.parse_properties(abs_index, None, properties.properties) abs_columns = abs_columns.filter( # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED lambda r: not r.es_column.startswith("other.") and not r.es_column.startswith("previous_values.cf_") and not r.es_index.startswith("debug") and r.es_column.find("=") == -1 and r.es_column.find(" ") == -1 ) def add_column(c, query_path): c.last_updated = Date.now() if query_path[0] != ".": 
c.names[query_path[0]] = relative_field(c.names["."], query_path[0]) with self.meta.columns.locker: self._upsert_column(c) for alias in meta.aliases: c = copy(c) c.es_index = alias self._upsert_column(c) with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, debug=DEBUG): # LIST OF EVERY NESTED PATH query_paths = [[c.es_column] for c in abs_columns if c.type == "nested"] for a, b in itertools.product(query_paths, query_paths): aa = a[0] bb = b[0] if aa and bb.startswith(aa): for i, b_prefix in enumerate(b): if len(b_prefix) > len(aa): continue if aa == b_prefix: break # SPLIT ALREADY FOUND b.insert(i, aa) break for q in query_paths: q.append(".") query_paths.append(ROOT_PATH) # ADD RELATIVE COLUMNS for abs_column in abs_columns: for query_path in query_paths: add_column(abs_column, query_path) def query(self, _query): return self.meta.columns.query(QueryOp(set_default( { "from": self.meta.columns, "sort": ["table", "name"] }, _query.__data__() ))) def get_columns(self, table_name, column_name=None, force=False): """ RETURN METADATA COLUMNS """ table_path = split_field(table_name) es_index_name = table_path[0] query_path = join_field(table_path[1:]) table = self.get_table(es_index_name)[0] abs_column_name = None if column_name == None else concat_field(query_path, column_name) try: # LAST TIME WE GOT INFO FOR THIS TABLE if not table: table = Table( name=es_index_name, url=None, query_path=None, timestamp=Date.now() ) with self.meta.tables.locker: self.meta.tables.add(table) self._get_columns(table=es_index_name) elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE: table.timestamp = Date.now() self._get_columns(table=es_index_name) with self.meta.columns.locker: columns = self.meta.columns.find(es_index_name, column_name) if columns: columns = jx.sort(columns, "names.\.") # AT LEAST WAIT FOR THE COLUMNS TO UPDATE while len(self.todo) and not all(columns.get("last_updated")): if DEBUG: Log.note("waiting for columns to update {{columns|json}}", columns=[c.es_index+"."+c.es_column for c in columns if not c.last_updated]) Till(seconds=1).wait() return columns except Exception as e: Log.error("Not expected", cause=e) if abs_column_name: Log.error("no columns matching {{table}}.{{column}}", table=table_name, column=abs_column_name) else: self._get_columns(table=table_name) # TO TEST WHAT HAPPENED Log.error("no columns for {{table}}?!", table=table_name) def _update_cardinality(self, c): """ QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN """ if c.type in STRUCT: Log.error("not supported") try: if c.es_index == "meta.columns": with self.meta.columns.locker: partitions = jx.sort([g[c.es_column] for g, _ in jx.groupby(self.meta.columns, c.es_column) if g[c.es_column] != None]) self.meta.columns.update({ "set": { "partitions": partitions, "count": len(self.meta.columns), "cardinality": len(partitions), "last_updated": Date.now() }, "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}} }) return if c.es_index == "meta.tables": with self.meta.columns.locker: partitions = jx.sort([g[c.es_column] for g, _ in jx.groupby(self.meta.tables, c.es_column) if g[c.es_column] != None]) self.meta.columns.update({ "set": { "partitions": partitions, "count": len(self.meta.tables), "cardinality": len(partitions), "last_updated": Date.now() }, "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}} }) return es_index = c.es_index.split(".")[0] result = self.default_es.post("/" + es_index + "/_search", data={ "aggs": 
{c.names["."]: _counting_query(c)}, "size": 0 }) r = result.aggregations.values()[0] count = result.hits.total cardinality = coalesce(r.value, r._nested.value, 0 if r.doc_count==0 else None) if cardinality == None: Log.error("logic error") query = Data(size=0) if cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99): if DEBUG: Log.note("{{table}}.{{field}} has {{num}} parts", table=c.es_index, field=c.es_column, num=cardinality) with self.meta.columns.locker: self.meta.columns.update({ "set": { "count": count, "cardinality": cardinality, "last_updated": Date.now() }, "clear": ["partitions"], "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}} }) return elif c.type in _elasticsearch.ES_NUMERIC_TYPES and cardinality > 30: if DEBUG: Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality) with self.meta.columns.locker: self.meta.columns.update({ "set": { "count": count, "cardinality": cardinality, "last_updated": Date.now() }, "clear": ["partitions"], "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}} }) return elif len(c.nested_path) != 1: query.aggs[literal_field(c.names["."])] = { "nested": {"path": c.nested_path[0]}, "aggs": {"_nested": {"terms": {"field": c.es_column, "size": 0}}} } else: query.aggs[literal_field(c.names["."])] = {"terms": {"field": c.es_column, "size": 0}} result = self.default_es.post("/" + es_index + "/_search", data=query) aggs = result.aggregations.values()[0] if aggs._nested: parts = jx.sort(aggs._nested.buckets.key) else: parts = jx.sort(aggs.buckets.key) if DEBUG: Log.note("{{field}} has {{parts}}", field=c.name, parts=parts) with self.meta.columns.locker: self.meta.columns.update({ "set": { "count": count, "cardinality": cardinality, "partitions": parts, "last_updated": Date.now() }, "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}} }) except Exception as e: if "IndexMissingException" in e and c.es_index.startswith(TEST_TABLE_PREFIX): with self.meta.columns.locker: self.meta.columns.update({ "set": { "count": 0, "cardinality": 0, "last_updated": Date.now() }, "clear":[ "partitions" ], "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}} }) else: self.meta.columns.update({ "set": { "last_updated": Date.now() }, "clear": [ "count", "cardinality", "partitions", ], "where": {"eq": {"names.\\.": ".", "es_index": c.es_index, "es_column": c.es_column}} }) Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=c, cause=e) def monitor(self, please_stop): please_stop.on_go(lambda: self.todo.add(THREAD_STOP)) while not please_stop: try: if not self.todo: with self.meta.columns.locker: old_columns = [ c for c in self.meta.columns if (c.last_updated == None or c.last_updated < Date.now()-TOO_OLD) and c.type not in STRUCT ] if old_columns: if DEBUG: Log.note("Old columns wth dates {{dates|json}}", dates=wrap(old_columns).last_updated) self.todo.extend(old_columns) # TEST CONSISTENCY for c, d in product(list(self.todo.queue), list(self.todo.queue)): if c.es_column == d.es_column and c.es_index == d.es_index and c != d: Log.error("") else: if DEBUG: Log.note("no more metatdata to update") column = self.todo.pop(Till(seconds=(10*MINUTE).seconds)) if column: if DEBUG: Log.note("update {{table}}.{{column}}", table=column.es_index, column=column.es_column) if column.type in STRUCT: with self.meta.columns.locker: column.last_updated = Date.now() continue elif column.last_updated >= Date.now()-TOO_OLD: continue try: 
                        self._update_cardinality(column)
                        if DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX):
                            Log.note("updated {{column.name}}", column=column)
                    except Exception as e:
                        Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)

    def not_monitor(self, please_stop):
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            c = self.todo.pop()
            if c == THREAD_STOP:
                break
            if not c.last_updated or c.last_updated >= Date.now() - TOO_OLD:
                continue
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "partitions",
                    ],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
            if DEBUG:
                Log.note("Could not get {{col.es_index}}.{{col.es_column}} info", col=c)
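
# The following is an illustrative, standalone sketch (not part of the library) of the
# partition-skipping heuristic used by _update_cardinality above: the list of distinct
# values ("partitions") is only stored for low-cardinality columns; otherwise only count
# and cardinality are kept and "partitions" is cleared.  The thresholds mirror the
# conditions in the method; the function name is an assumption made for this example.
def should_skip_partitions(count, cardinality):
    """Return True when a column looks too high-cardinality to enumerate its values."""
    if cardinality > 1000:
        return True  # too many distinct values to store
    if count >= 30 and cardinality == count:
        return True  # every row is unique, e.g. an id column
    if count >= 1000 and cardinality / float(count) > 0.99:
        return True  # effectively unique
    return False


if __name__ == "__main__":
    assert should_skip_partitions(count=50, cardinality=50)      # unique key: skip
    assert not should_skip_partitions(count=50, cardinality=3)   # small enum: keep
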
class Extract(object): @override def __init__(self, kwargs=None): self.settings = kwargs self.schema = SnowflakeSchema(self.settings.snowflake) self._extract = extract = kwargs.extract # SOME PREP get_git_revision() # VERIFY WE DO NOT HAVE TOO MANY OTHER PROCESSES WORKING ON STUFF with MySQL(**kwargs.snowflake.database) as db: processes = None try: processes = jx.filter( db.query("show processlist"), { "and": [{ "neq": { "Command": "Sleep" } }, { "neq": { "Info": "show processlist" } }] }) except Exception as e: Log.warning("no database", cause=e) if processes: if DEBUG: Log.warning("Processes are running\n{{list|json}}", list=processes) else: Log.error("Processes are running\n{{list|json}}", list=processes) extract.type = listwrap(extract.type) extract.start = listwrap(extract.start) extract.batch = listwrap(extract.batch) extract.field = listwrap(extract.field) if any( len(extract.type) != len(other) for other in [extract.start, extract.batch, extract.field]): Log.error( "Expecting same number of dimensions for `type`, `start`, `batch`, and `field` in the `extract` inner object" ) for i, t in enumerate(extract.type): if t == "time": extract.start[i] = Date(extract.start[i]) extract.batch[i] = Duration(extract.batch[i]) elif t == "number": pass else: Log.error('Expecting `extract.type` to be "number" or "time"') extract.threads = coalesce(extract.threads, 1) self.done_pulling = Signal() self.queue = Queue("all batches", max=2 * coalesce(extract.threads, 1), silent=True) self.bucket = s3.Bucket(self.settings.destination) self.notify = aws.Queue(self.settings.notify) Thread.run("get records", self.pull_all_remaining) def pull_all_remaining(self, please_stop): try: try: content = File(self.settings.extract.last).read_json() if len(content) == 1: Log.note("Got a manually generated file {{filename}}", filename=self.settings.extract.last) start_point = tuple(content[0]) first_value = [ self._extract.start[0] + (start_point[0] * DAY), start_point[1] ] else: Log.note("Got a machine generated file {{filename}}", filename=self.settings.extract.last) start_point, first_value = content start_point = tuple(start_point) Log.note("First value is {{start1|date}}, {{start2}}", start1=first_value[0], start2=first_value[1]) except Exception as _: Log.error( "Expecting a file {{filename}} with the last good S3 bucket etl id in array form eg: [[954, 0]]", filename=self.settings.extract.last) start_point = tuple(self._extract.start) first_value = Null counter = Counter(start=0) for t, s, b, f, i in reversed( zip(self._extract.type, self._extract.start, self._extract.batch, listwrap(first_value) + DUMMY_LIST, range(len(self._extract.start)))): if t == "time": counter = DurationCounter(start=s, duration=b, child=counter) first_value[i] = Date(f) else: counter = BatchCounter(start=s, size=b, child=counter) batch_size = self._extract.batch.last( ) * 2 * self.settings.extract.threads with MySQL(**self.settings.snowflake.database) as db: while not please_stop: sql = self._build_list_sql(db, first_value, batch_size + 1) pending = [] counter.reset(start_point) with Timer("Grab a block of ids for processing"): with closing(db.db.cursor()) as cursor: acc = [] cursor.execute(sql) count = 0 for row in cursor: detail_key = counter.next(row) key = tuple(detail_key[:-1]) count += 1 if key != start_point: if first_value: if not acc: Log.error( "not expected, {{filename}} is probably set too far in the past", filename=self.settings.extract. 
last) pending.append({ "start_point": start_point, "first_value": first_value, "data": acc }) acc = [] start_point = key first_value = row acc.append( row[-1] ) # ASSUME LAST COLUMN IS THE FACT TABLE id Log.note("adding {{num}} for processing", num=len(pending)) self.queue.extend(pending) if count < batch_size: self.queue.add(THREAD_STOP) break except Exception as e: Log.warning("Problem pulling data", cause=e) finally: self.done_pulling.go() Log.note("pulling new data is done") def _build_list_sql(self, db, first, batch_size): # TODO: ENSURE THE LAST COLUMN IS THE id if first: dim = len(self._extract.field) where = SQL_OR.join( sql_iso( sql_and( quote_column(f) + ineq(i, e, dim) + db.quote_value(Date(v) if t == "time" else v) for e, (f, v, t) in enumerate( zip(self._extract.field[0:i + 1:], first, self._extract.type[0:i + 1:])))) for i in range(dim)) else: where = SQL_TRUE selects = [] for t, f in zip(self._extract.type, self._extract.field): if t == "time": selects.append( "CAST" + sql_iso(sql_alias(quote_column(f), SQL("DATETIME(6)")))) else: selects.append(quote_column(f)) sql = (SQL_SELECT + sql_list(selects) + SQL_FROM + self.settings.snowflake.fact_table + SQL_WHERE + where + SQL_ORDERBY + sql_list(quote_column(f) for f in self._extract.field) + SQL_LIMIT + db.quote_value(batch_size)) return sql def extract(self, db, start_point, first_value, data, please_stop): Log.note( "Starting scan of {{table}} at {{id}} and sending to batch {{start_point}}", table=self.settings.snowflake.fact_table, id=first_value, start_point=start_point) id = quote_column(self._extract.field.last()) ids = (SQL_SELECT + id + SQL_FROM + self.settings.snowflake.fact_table + SQL_WHERE + id + " in " + sql_iso(sql_list(map(db.quote_value, data)))) sql = self.schema.get_sql(ids) with Timer("Sending SQL"): cursor = db.query(sql, stream=True, row_tuples=True) extract = self.settings.extract fact_table = self.settings.snowflake.fact_table with TempFile() as temp_file: parent_etl = None for s in start_point: parent_etl = {"id": s, "source": parent_etl} parent_etl["revision"] = get_git_revision() parent_etl["machine"] = machine_metadata def append(value, i): """ :param value: THE DOCUMENT TO ADD :return: PleaseStop """ temp_file.append( convert.value2json({ fact_table: elasticsearch.scrub(value), "etl": { "id": i, "source": parent_etl, "timestamp": Date.now() } })) with Timer("assemble data"): self.construct_docs(cursor, append, please_stop) # WRITE TO S3 s3_file_name = ".".join(map(text_type, start_point)) with Timer("write to destination {{filename}}", param={"filename": s3_file_name}): if not isinstance(self.settings.destination, text_type): destination = self.bucket.get_key(s3_file_name, must_exist=False) destination.write_lines(temp_file) else: destination = File(self.settings.destination) destination.write( convert.value2json( [convert.json2value(o) for o in temp_file], pretty=True)) return False # NOTIFY SQS now = Date.now() self.notify.add({ "bucket": self.settings.destination.bucket, "key": s3_file_name, "timestamp": now.unix, "date/time": now.format() }) # SUCCESS!! 
        File(extract.last).write(convert.value2json([start_point, first_value]))

    def construct_docs(self, cursor, append, please_stop):
        """
        :param cursor: ITERATOR OF RECORDS
        :param append: METHOD TO CALL WITH CONSTRUCTED DOCUMENT
        :return: (count, first, next, next_key)
            number of documents added
            the first document in the batch
            the first document of the next batch
        """
        null_values = set(self.settings.snowflake.null_values) | {None}

        count = 0
        rownum = 0
        columns = tuple(wrap(c) for c in self.schema.columns)
        with Timer("Downloading from MySQL"):
            curr_record = Null
            for rownum, row in enumerate(cursor):
                if please_stop:
                    Log.error("Got `please_stop` signal")

                nested_path = []
                next_record = None
                for c, value in zip(columns, row):
                    if value in null_values:
                        continue
                    if len(nested_path) < len(c.nested_path):
                        nested_path = unwrap(c.nested_path)
                        next_record = Data()
                    next_record[c.put] = value

                if len(nested_path) > 1:
                    path = nested_path[-2]
                    children = curr_record[path]
                    if children == None:
                        children = curr_record[path] = wrap([])
                    if len(nested_path) > 2:
                        parent_path = path
                        for path in list(reversed(nested_path[0:-2:])):
                            parent = children.last()
                            relative_path = relative_field(path, parent_path)
                            children = parent[relative_path]
                            if children == None:
                                children = parent[relative_path] = wrap([])
                            parent_path = path
                    children.append(next_record)
                    continue

                if curr_record == next_record:
                    Log.error("not expected")
                if curr_record:
                    append(curr_record["id"], count)
                    count += 1
                curr_record = next_record

            # DEAL WITH LAST RECORD
            if curr_record:
                append(curr_record["id"], count)
                count += 1

        Log.note("{{num}} documents ({{rownum}} db records)", num=count, rownum=rownum)
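
# Illustrative, self-contained sketch of how the "etl" lineage block in Extract.extract()
# above is assembled from a batch's start_point: each coordinate becomes one level of
# {"id": ..., "source": <parent>}, with the earlier coordinates ending up deeper in the
# source chain, before the revision/machine fields are attached.  build_etl_lineage and
# the example revision string are assumptions for the demo, not names from the library.
def build_etl_lineage(start_point, revision=None):
    # assumes at least one coordinate in start_point
    parent_etl = None
    for s in start_point:
        parent_etl = {"id": s, "source": parent_etl}
    parent_etl["revision"] = revision  # Extract.extract() uses get_git_revision() here
    return parent_etl


if __name__ == "__main__":
    lineage = build_etl_lineage((954, 0), revision="abc123")
    assert lineage == {"id": 0, "source": {"id": 954, "source": None}, "revision": "abc123"}
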
class FromESMetadata(Schema): """ QUERY THE METADATA """ def __new__(cls, *args, **kwargs): global singlton if singlton: return singlton else: singlton = object.__new__(cls) return singlton @override def __init__(self, host, index, alias=None, name=None, port=9200, kwargs=None): global _elasticsearch if hasattr(self, "settings"): return from pyLibrary.queries.containers.list_usingPythonList import ListContainer from pyLibrary.env import elasticsearch as _elasticsearch self.settings = kwargs self.default_name = coalesce(name, alias, index) self.default_es = _elasticsearch.Cluster(kwargs=kwargs) self.todo = Queue("refresh metadata", max=100000, unique=True) self.es_metadata = Null self.last_es_metadata = Date.now()-OLD_METADATA self.meta=Data() table_columns = metadata_tables() column_columns = metadata_columns() self.meta.tables = ListContainer("meta.tables", [], wrap({c.name: c for c in table_columns})) self.meta.columns = ColumnList() self.meta.columns.insert(column_columns) self.meta.columns.insert(table_columns) # TODO: fix monitor so it does not bring down ES if ENABLE_META_SCAN: self.worker = Thread.run("refresh metadata", self.monitor) else: self.worker = Thread.run("refresh metadata", self.not_monitor) return @property def query_path(self): return None @property def url(self): return self.default_es.path + "/" + self.default_name.replace(".", "/") def get_table(self, table_name): with self.meta.tables.locker: return wrap([t for t in self.meta.tables.data if t.name == table_name]) def _upsert_column(self, c): # ASSUMING THE self.meta.columns.locker IS HAD existing_columns = self.meta.columns.find(c.table, c.name) if not existing_columns: self.meta.columns.add(c) self.todo.add(c) if ENABLE_META_SCAN: if DEBUG: Log.note("todo: {{table}}::{{column}}", table=c.table, column=c.es_column) # MARK meta.columns AS DIRTY TOO cols = self.meta.columns.find("meta.columns", None) for cc in cols: cc.partitions = cc.cardinality = None cc.last_updated = Date.now() self.todo.extend(cols) else: canonical = existing_columns[0] if canonical.relative and not c.relative: return # RELATIVE COLUMNS WILL SHADOW ABSOLUTE COLUMNS for key in Column.__slots__: canonical[key] = c[key] if DEBUG: Log.note("todo: {{table}}::{{column}}", table=canonical.table, column=canonical.es_column) self.todo.add(canonical) def _get_columns(self, table=None): # TODO: HANDLE MORE THEN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE meta = self.es_metadata.indices[table] if not meta or self.last_es_metadata < Date.now() - OLD_METADATA: self.es_metadata = self.default_es.get_metadata(force=True) meta = self.es_metadata.indices[table] self._parse_properties(meta.index, Data(properties={"_id": {"type": "string", "index": "not_analyzed"}}), meta) for _, properties in meta.mappings.items(): self._parse_properties(meta.index, properties, meta) def _parse_properties(self, abs_index, properties, meta): abs_columns = _elasticsearch.parse_properties(abs_index, None, properties.properties) abs_columns = abs_columns.filter( # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED lambda r: not r.es_column.startswith("other.") and not r.es_column.startswith("previous_values.cf_") and not r.es_index.startswith("debug") and r.es_column.find("=")==-1 and r.es_column.find(" ")==-1 ) with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, debug=DEBUG): def add_column(c, query_path): c.last_updated = Date.now() c.table = join_field([c.es_index]+split_field(query_path[0])) with self.meta.columns.locker: self._upsert_column(c) for alias in 
meta.aliases: c = copy(c) c.table = join_field([alias]+split_field(query_path[0])) self._upsert_column(c) # LIST OF EVERY NESTED PATH query_paths = [[c.es_column] for c in abs_columns if c.type == "nested"] for a, b in itertools.product(query_paths, query_paths): aa = a[0] bb = b[0] if aa and bb.startswith(aa): for i, b_prefix in enumerate(b): if len(b_prefix) > len(aa): continue if aa == b_prefix: break # SPLIT ALREADY FOUND b.insert(i, aa) break for q in query_paths: q.append(".") query_paths.append(ROOT_PATH) # ADD RELATIVE COLUMNS for abs_column in abs_columns: full_path = abs_column.nested_path abs_depth = len(full_path)-1 abs_parent = full_path[1] if abs_depth else "" for query_path in query_paths: rel_depth = len(query_path)-1 rel_parent = query_path[0] rel_column = copy(abs_column) rel_column.relative = True add_column(copy(abs_column), query_path) if rel_parent == ".": add_column(rel_column, query_path) elif abs_column.es_column.startswith(rel_parent+"."): rel_column.name = abs_column.es_column[len(rel_parent)+1:] add_column(rel_column, query_path) elif abs_column.es_column == rel_parent: rel_column.name = "." add_column(rel_column, query_path) elif not abs_parent: # THIS RELATIVE NAME (..o) ALSO NEEDS A RELATIVE NAME (o) # AND THEN REMOVE THE SHADOWED rel_column.name = "." + ("." * (rel_depth - abs_depth)) + abs_column.es_column add_column(rel_column, query_path) elif rel_parent.startswith(abs_parent+"."): rel_column.name = "." + ("." * (rel_depth - abs_depth)) + abs_column.es_column add_column(rel_column, query_path) elif rel_parent != abs_parent: # SIBLING NESTED PATHS ARE INVISIBLE pass else: Log.error("logic error") def query(self, _query): return self.meta.columns.query(QueryOp(set_default( { "from": self.meta.columns, "sort": ["table", "name"] }, _query.__data__() ))) def get_columns(self, table_name, column_name=None, force=False): """ RETURN METADATA COLUMNS """ try: # LAST TIME WE GOT INFO FOR THIS TABLE short_name = join_field(split_field(table_name)[0:1]) table = self.get_table(short_name)[0] if not table: table = Table( name=short_name, url=None, query_path=None, timestamp=Date.now() ) with self.meta.tables.locker: self.meta.tables.add(table) self._get_columns(table=short_name) elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE: table.timestamp = Date.now() self._get_columns(table=short_name) with self.meta.columns.locker: columns = self.meta.columns.find(table_name, column_name) if columns: columns = jx.sort(columns, "name") # AT LEAST WAIT FOR THE COLUMNS TO UPDATE while len(self.todo) and not all(columns.get("last_updated")): if DEBUG: Log.note("waiting for columns to update {{columns|json}}", columns=[c.table+"."+c.es_column for c in columns if not c.last_updated]) Till(seconds=1).wait() return columns except Exception, e: Log.error("Not expected", cause=e) if column_name: Log.error("no columns matching {{table}}.{{column}}", table=table_name, column=column_name) else: self._get_columns(table=table_name) Log.error("no columns for {{table}}?!", table=table_name)
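
# Illustrative, standalone sketch of the relative-naming rules applied in the
# "# ADD RELATIVE COLUMNS" block above: given an absolute es_column and the query path it
# is viewed from, produce the name a query rooted at that path would use.  Only the first
# three branches are covered (root view, column equal to the path, column under the path);
# the deeper dotted-prefix and sibling cases are omitted.  relative_name is a name invented
# for this example.
def relative_name(es_column, rel_parent):
    if rel_parent == ".":
        return es_column                         # the root view keeps the absolute name
    if es_column == rel_parent:
        return "."                               # the path refers to itself
    if es_column.startswith(rel_parent + "."):
        return es_column[len(rel_parent) + 1:]   # strip the query-path prefix
    return None                                  # parent/sibling cases not covered here


if __name__ == "__main__":
    assert relative_name("build.revision", ".") == "build.revision"
    assert relative_name("build.revision", "build") == "revision"
    assert relative_name("build", "build") == "."
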
class FromESMetadata(Schema): """ QUERY THE METADATA """ def __new__(cls, *args, **kwargs): if jx_base_meta.singlton: return jx_base_meta.singlton else: jx_base_meta.singlton = object.__new__(cls) return jx_base_meta.singlton @override def __init__(self, host, index, sql_file='metadata.sqlite', alias=None, name=None, port=9200, kwargs=None): if hasattr(self, "settings"): return self.too_old = TOO_OLD self.settings = kwargs self.default_name = coalesce(name, alias, index) self.default_es = elasticsearch.Cluster(kwargs=kwargs) self.index_does_not_exist = set() self.todo = Queue("refresh metadata", max=100000, unique=True) self.es_metadata = Null self.abs_columns = set() self.last_es_metadata = Date.now() - OLD_METADATA self.meta = Data() table_columns = metadata_tables() column_columns = metadata_columns() self.meta.tables = ListContainer( "meta.tables", [], wrap({c.names["."]: c for c in table_columns})) self.meta.columns = ColumnList() self.meta.columns.insert(column_columns) self.meta.columns.insert(table_columns) # TODO: fix monitor so it does not bring down ES if ENABLE_META_SCAN: self.worker = Thread.run("refresh metadata", self.monitor) else: self.worker = Thread.run("refresh metadata", self.not_monitor) return @property def query_path(self): return None @property def url(self): return self.default_es.path + "/" + self.default_name.replace(".", "/") def get_table(self, table_name): with self.meta.tables.locker: return wrap( [t for t in self.meta.tables.data if t.name == table_name]) def _upsert_column(self, c): # ASSUMING THE self.meta.columns.locker IS HAD existing_columns = self.meta.columns.find(c.es_index, c.names["."]) for canonical in existing_columns: if canonical.type == c.type and canonical is not c: set_default(c.names, canonical.names) for key in Column.__slots__: canonical[key] = c[key] if DEBUG: Log.note("todo: {{table}}::{{column}}", table=canonical.es_index, column=canonical.es_column) self.todo.add(canonical) break else: self.meta.columns.add(c) self.todo.add(c) if ENABLE_META_SCAN: if DEBUG: Log.note("todo: {{table}}::{{column}}", table=c.es_index, column=c.es_column) # MARK meta.columns AS DIRTY TOO cols = self.meta.columns.find("meta.columns", None) for cc in cols: cc.partitions = cc.cardinality = None cc.last_updated = Date.now() - TOO_OLD self.todo.extend(cols) def _get_columns(self, table=None): # TODO: HANDLE MORE THEN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE table_path = split_field(table) es_index = table_path[0] meta = self.es_metadata.indices[es_index] if not meta or self.last_es_metadata < Date.now() - OLD_METADATA: self.es_metadata = self.default_es.get_metadata(force=True) meta = self.es_metadata.indices[es_index] for data_type, properties in meta.mappings.items(): if data_type == "_default_": continue properties.properties["_id"] = { "type": "string", "index": "not_analyzed" } self._parse_properties(meta.index, properties, meta) def _parse_properties(self, abs_index, properties, meta): # IT IS IMPORTANT THAT NESTED PROPERTIES NAME ALL COLUMNS, AND # ALL COLUMNS ARE GIVEN NAMES FOR ALL NESTED PROPERTIES def add_column(c, query_path): c.last_updated = Date.now() - TOO_OLD if query_path[0] != ".": c.names[query_path[0]] = relative_field( c.names["."], query_path[0]) with self.meta.columns.locker: for alias in meta.aliases: c_ = copy(c) c_.es_index = alias self._upsert_column(c_) self._upsert_column(c) abs_columns = elasticsearch.parse_properties(abs_index, None, properties.properties) self.abs_columns.update(abs_columns) with Timer("upserting {{num}} 
columns", {"num": len(abs_columns)}, debug=DEBUG): # LIST OF EVERY NESTED PATH query_paths = [[c.es_column] for c in abs_columns if c.type == "nested"] for a, b in itertools.product(query_paths, query_paths): aa = a[0] bb = b[0] if aa and bb.startswith(aa): for i, b_prefix in enumerate(b): if len(b_prefix) > len(aa): continue if aa == b_prefix: break # SPLIT ALREADY FOUND b.insert(i, aa) break for q in query_paths: q.append(".") query_paths.append(SELF_PATH) # ADD RELATIVE COLUMNS for abs_column in abs_columns: abs_column = abs_column.__copy__() abs_column.type = es_type_to_json_type[abs_column.type] for query_path in query_paths: add_column(abs_column, query_path) pass def query(self, _query): return self.meta.columns.query( QueryOp( set_default( { "from": self.meta.columns, "sort": ["table", "name"] }, _query.__data__()))) def get_columns(self, table_name, column_name=None, force=False): """ RETURN METADATA COLUMNS """ table_path = split_field(table_name) es_index_name = table_path[0] query_path = join_field(table_path[1:]) table = self.get_table(es_index_name)[0] abs_column_name = None if column_name == None else concat_field( query_path, column_name) try: # LAST TIME WE GOT INFO FOR THIS TABLE if not table: table = Table(name=es_index_name, url=None, query_path=['.'], timestamp=Date.now()) with self.meta.tables.locker: self.meta.tables.add(table) self._get_columns(table=es_index_name) elif force or table.timestamp == None or table.timestamp < Date.now( ) - MAX_COLUMN_METADATA_AGE: table.timestamp = Date.now() self._get_columns(table=es_index_name) with self.meta.columns.locker: columns = self.meta.columns.find(es_index_name, column_name) if columns: columns = jx.sort(columns, "names.\.") # AT LEAST WAIT FOR THE COLUMNS TO UPDATE while len(self.todo) and not all(columns.get("last_updated")): if DEBUG: Log.note( "waiting for columns to update {{columns|json}}", columns=[ c.es_index + "." 
+ c.es_column for c in columns if not c.last_updated ]) Till(seconds=1).wait() return columns except Exception as e: Log.error("Not expected", cause=e) if abs_column_name: Log.error("no columns matching {{table}}.{{column}}", table=table_name, column=abs_column_name) else: self._get_columns(table=table_name) # TO TEST WHAT HAPPENED Log.error("no columns for {{table}}?!", table=table_name) def _update_cardinality(self, column): """ QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN """ if column.es_index in self.index_does_not_exist: return if column.type in STRUCT: Log.error("not supported") try: if column.es_index == "meta.columns": with self.meta.columns.locker: partitions = jx.sort([ g[column.es_column] for g, _ in jx.groupby( self.meta.columns, column.es_column) if g[column.es_column] != None ]) self.meta.columns.update({ "set": { "partitions": partitions, "count": len(self.meta.columns), "cardinality": len(partitions), "multi": 1, "last_updated": Date.now() }, "where": { "eq": { "es_index": column.es_index, "es_column": column.es_column } } }) return if column.es_index == "meta.tables": with self.meta.columns.locker: partitions = jx.sort([ g[column.es_column] for g, _ in jx.groupby( self.meta.tables, column.es_column) if g[column.es_column] != None ]) self.meta.columns.update({ "set": { "partitions": partitions, "count": len(self.meta.tables), "cardinality": len(partitions), "multi": 1, "last_updated": Date.now() }, "where": { "eq": { "es_index": column.es_index, "es_column": column.es_column } } }) return es_index = column.es_index.split(".")[0] is_text = [ cc for cc in self.abs_columns if cc.es_column == column.es_column and cc.type == "text" ] if is_text: # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED result = self.default_es.post("/" + es_index + "/_search", data={ "aggs": { "count": { "filter": { "match_all": {} } } }, "size": 0 }) count = result.hits.total cardinality = 1001 multi = 1001 elif column.es_column == "_id": result = self.default_es.post("/" + es_index + "/_search", data={ "query": { "match_all": {} }, "size": 0 }) count = cardinality = result.hits.total multi = 1 else: result = self.default_es.post( "/" + es_index + "/_search", data={ "aggs": { "count": _counting_query(column), "multi": { "max": { "script": "doc[" + quote(column.es_column) + "].values.size()" } } }, "size": 0 }) r = result.aggregations.count count = result.hits.total cardinality = coalesce(r.value, r._nested.value, r.doc_count) multi = coalesce(r.multi.value, 1) if cardinality == None: Log.error("logic error") query = Data(size=0) if column.es_column == "_id": with self.meta.columns.locker: self.meta.columns.update({ "set": { "count": cardinality, "cardinality": cardinality, "multi": 1, "last_updated": Date.now() }, "clear": ["partitions"], "where": { "eq": { "es_index": column.es_index, "es_column": column.es_column } } }) return elif cardinality > 1000 or (count >= 30 and cardinality == count ) or (count >= 1000 and cardinality / count > 0.99): if DEBUG: Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality) with self.meta.columns.locker: self.meta.columns.update({ "set": { "count": count, "cardinality": cardinality, "multi": multi, "last_updated": Date.now() }, "clear": ["partitions"], "where": { "eq": { "es_index": column.es_index, "es_column": column.es_column } } }) return elif column.type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30: if DEBUG: Log.note("{{field}} has {{num}} parts", 
field=column.es_index, num=cardinality) with self.meta.columns.locker: self.meta.columns.update({ "set": { "count": count, "cardinality": cardinality, "multi": multi, "last_updated": Date.now() }, "clear": ["partitions"], "where": { "eq": { "es_index": column.es_index, "es_column": column.es_column } } }) return elif len(column.nested_path) != 1: query.aggs["_"] = { "nested": { "path": column.nested_path[0] }, "aggs": { "_nested": { "terms": { "field": column.es_column } } } } elif cardinality == 0: query.aggs["_"] = {"terms": {"field": column.es_column}} else: query.aggs["_"] = { "terms": { "field": column.es_column, "size": cardinality } } result = self.default_es.post("/" + es_index + "/_search", data=query) aggs = result.aggregations._ if aggs._nested: parts = jx.sort(aggs._nested.buckets.key) else: parts = jx.sort(aggs.buckets.key) if DEBUG: Log.note("{{field}} has {{parts}}", field=column.names["."], parts=parts) with self.meta.columns.locker: self.meta.columns.update({ "set": { "count": count, "cardinality": cardinality, "multi": multi, "partitions": parts, "last_updated": Date.now() }, "where": { "eq": { "es_index": column.es_index, "es_column": column.es_column } } }) except Exception as e: # CAN NOT IMPORT: THE TEST MODULES SETS UP LOGGING # from tests.test_jx import TEST_TABLE TEST_TABLE = "testdata" is_missing_index = any( w in e for w in ["IndexMissingException", "index_not_found_exception"]) is_test_table = any( column.es_index.startswith(t) for t in [TEST_TABLE_PREFIX, TEST_TABLE]) if is_missing_index and is_test_table: # WE EXPECT TEST TABLES TO DISAPPEAR with self.meta.columns.locker: self.meta.columns.update({ "clear": ".", "where": { "eq": { "es_index": column.es_index } } }) self.index_does_not_exist.add(column.es_index) else: self.meta.columns.update({ "set": { "last_updated": Date.now() }, "clear": [ "count", "cardinality", "multi", "partitions", ], "where": { "eq": { "names.\\.": ".", "es_index": column.es_index, "es_column": column.es_column } } }) Log.warning( "Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e) def monitor(self, please_stop): please_stop.on_go(lambda: self.todo.add(THREAD_STOP)) while not please_stop: try: if not self.todo: with self.meta.columns.locker: old_columns = [ c for c in self.meta.columns if (c.last_updated == None or c.last_updated < Date.now() - TOO_OLD) and c.type not in STRUCT ] if old_columns: if DEBUG: Log.note( "Old columns {{names|json}} last updated {{dates|json}}", names=wrap(old_columns).es_column, dates=[ Date(t).format() for t in wrap(old_columns).last_updated ]) self.todo.extend(old_columns) # TEST CONSISTENCY for c, d in product(list(self.todo.queue), list(self.todo.queue)): if c.es_column == d.es_column and c.es_index == d.es_index and c != d: Log.error("") else: if DEBUG: Log.note("no more metatdata to update") column = self.todo.pop(Till(seconds=(10 * MINUTE).seconds)) if DEBUG: Log.note("update {{table}}.{{column}}", table=column.es_index, column=column.es_column) if column: if column.es_index in self.index_does_not_exist: with self.meta.columns.locker: self.meta.columns.update({ "clear": ".", "where": { "eq": { "es_index": column.es_index } } }) continue if column.type in STRUCT or column.es_column.endswith( "." 
+ EXISTS_TYPE): with self.meta.columns.locker: column.last_updated = Date.now() continue elif column.last_updated >= Date.now() - TOO_OLD: continue try: self._update_cardinality(column) if DEBUG and not column.es_index.startswith( TEST_TABLE_PREFIX): Log.note("updated {{column.name}}", column=column) except Exception as e: Log.warning( "problem getting cardinality for {{column.name}}", column=column, cause=e) except Exception as e: Log.warning("problem in cardinality monitor", cause=e) def not_monitor(self, please_stop): Log.alert("metadata scan has been disabled") please_stop.on_go(lambda: self.todo.add(THREAD_STOP)) while not please_stop: c = self.todo.pop() if c == THREAD_STOP: break if not c.last_updated or c.last_updated >= Date.now() - TOO_OLD: continue with self.meta.columns.locker: self.meta.columns.update({ "set": { "last_updated": Date.now() }, "clear": [ "count", "cardinality", "multi", "partitions", ], "where": { "eq": { "es_index": c.es_index, "es_column": c.es_column } } }) if DEBUG: Log.note( "Could not get {{col.es_index}}.{{col.es_column}} info", col=c)
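
# Hedged sketch of the request body _update_cardinality above sends to measure how
# multi-valued a column is: a max aggregation over a script that counts the values of the
# field in each document.  Only the "multi" part is shown (the accompanying
# _counting_query(...) aggregation is defined elsewhere in the module), json.dumps stands
# in for the library's quote(), and the field name below is a placeholder.
import json

def multi_value_agg(es_column):
    return {
        "aggs": {
            "multi": {"max": {"script": "doc[" + json.dumps(es_column) + "].values.size()"}}
        },
        "size": 0
    }


if __name__ == "__main__":
    print(json.dumps(multi_value_agg("build.type"), indent=2))
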
class ElasticsearchMetadata(Namespace): """ MANAGE SNOWFLAKE SCHEMAS FOR EACH OF THE ALIASES FOUND IN THE CLUSTER """ @override def __new__(cls, kwargs, *args, **_kwargs): es_cluster = elasticsearch.Cluster(kwargs) output = known_clusters.get(id(es_cluster)) if output is None: output = object.__new__(cls) known_clusters[id(es_cluster)] = output return output @override def __init__(self, host, index, sql_file='metadata.sqlite', alias=None, name=None, port=9200, kwargs=None): if hasattr(self, "settings"): return self.too_old = TOO_OLD self.settings = kwargs self.default_name = coalesce(name, alias, index) self.es_cluster = elasticsearch.Cluster(kwargs=kwargs) self.index_does_not_exist = set() self.todo = Queue("refresh metadata", max=100000, unique=True) self.index_to_alias = {} self.es_metadata = Null self.metadata_last_updated = Date.now() - OLD_METADATA self.meta = Data() self.meta.columns = ColumnList(URL(self.es_cluster.settings.host).host) self.alias_to_query_paths = { "meta.columns": [ROOT_PATH], "meta.tables": [ROOT_PATH] } self.alias_last_updated = { "meta.columns": Date.now(), "meta.tables": Date.now() } table_columns = metadata_tables() self.meta.tables = ListContainer("meta.tables", [], jx_base.Schema(".", table_columns)) self.meta.columns.extend(table_columns) # TODO: fix monitor so it does not bring down ES if ENABLE_META_SCAN: self.worker = Thread.run("refresh metadata", self.monitor) else: self.worker = Thread.run("not refresh metadata", self.not_monitor) return @property def namespace(self): return self.meta.columns.namespace @property def url(self): return self.es_cluster.url / self.default_name.replace(".", "/") def _reload_columns(self, table_desc): """ :param alias: A REAL ALIAS (OR NAME OF INDEX THAT HAS NO ALIAS) :return: """ # FIND ALL INDEXES OF ALIAS es_last_updated = self.es_cluster.metatdata_last_updated alias = table_desc.name canonical_index = self.es_cluster.get_best_matching_index(alias).index es_metadata_update_required = not (table_desc.timestamp < es_last_updated) metadata = self.es_cluster.get_metadata( force=es_metadata_update_required) props = [(self.es_cluster.get_index(index=i, type=t, debug=DEBUG), t, m.properties) for i, d in metadata.indices.items() if alias in d.aliases for t, m in [_get_best_type_from_mapping(d.mappings)]] # CONFIRM ALL COLUMNS ARE SAME, FIX IF NOT dirty = False all_comparisions = list(jx.pairwise(props)) + list( jx.pairwise(jx.reverse(props))) # NOTICE THE SAME (index, type, properties) TRIPLE FROM ABOVE for (i1, t1, p1), (i2, t2, p2) in all_comparisions: diff = elasticsearch.diff_schema(p2, p1) if not self.settings.read_only: for d in diff: dirty = True i1.add_property(*d) meta = self.es_cluster.get_metadata( force=dirty).indices[canonical_index] data_type, mapping = _get_best_type_from_mapping(meta.mappings) mapping.properties["_id"] = {"type": "string", "index": "not_analyzed"} columns = self._parse_properties(alias, mapping) table_desc.timestamp = es_last_updated return columns def _parse_properties(self, alias, mapping): abs_columns = elasticsearch.parse_properties(alias, ".", ROOT_PATH, mapping.properties) if DEBUG and any(c.cardinality == 0 and c.name != '_id' for c in abs_columns): Log.warning( "Some columns are not stored in {{url}} {{index|quote}} table:\n{{names}}", url=self.es_cluster.url, index=alias, names=[ ".".join((c.es_index, c.name)) for c in abs_columns if c.cardinality == 0 ]) with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, silent=not DEBUG): # LIST OF EVERY NESTED PATH query_paths = 
[[c.es_column] for c in abs_columns if c.es_type == "nested"] for a, b in itertools.product(query_paths, query_paths): aa = a[0] bb = b[0] if aa and bb.startswith(aa): for i, b_prefix in enumerate(b): if len(b_prefix) > len(aa): continue if aa == b_prefix: break # SPLIT ALREADY FOUND b.insert(i, aa) break for q in query_paths: q.append(".") query_paths.append(ROOT_PATH) # ENSURE ALL TABLES HAVE THE QUERY PATHS SET self.alias_to_query_paths[alias] = query_paths for i, a in self.index_to_alias.items(): if a == alias: self.alias_to_query_paths[i] = query_paths # ENSURE COLUMN HAS CORRECT jx_type # PICK DEEPEST NESTED PROPERTY AS REPRESENTATIVE output = [] best = {} for abs_column in abs_columns: abs_column.jx_type = jx_type(abs_column) if abs_column.jx_type not in STRUCT: clean_name = unnest_path(abs_column.name) other = best.get(clean_name) if other: if len(other.nested_path) < len( abs_column.nested_path): output.remove(other) self.meta.columns.update({ "clear": ".", "where": { "eq": { "es_column": other.es_column, "es_index": other.es_index } } }) else: continue best[clean_name] = abs_column output.append(abs_column) # REGISTER ALL COLUMNS canonicals = [] for abs_column in output: canonical = self.meta.columns.add(abs_column) canonicals.append(canonical) self.todo.extend(canonicals) return canonicals def query(self, _query): return self.meta.columns.query( QueryOp( set_default( { "from": self.meta.columns, "sort": ["table", "name"] }, _query.__data__()))) def _find_alias(self, name): if self.metadata_last_updated < self.es_cluster.metatdata_last_updated: for a in self.es_cluster.get_aliases(): self.index_to_alias[a.index] = coalesce(a.alias, a.index) self.alias_last_updated.setdefault(a.alias, Date.MIN) if name in self.alias_last_updated: return name else: return self.index_to_alias.get(name) def get_columns(self, table_name, column_name=None, after=None, timeout=None): """ RETURN METADATA COLUMNS :param table_name: TABLE WE WANT COLUMNS FOR :param column_name: OPTIONAL NAME, IF INTERESTED IN ONLY ONE COLUMN :param after: FORCE LOAD, WAITING FOR last_updated TO BE AFTER THIS TIME :param timeout: Signal; True when should give up :return: """ DEBUG and after and Log.note("getting columns for after {{time}}", time=after) table_path = split_field(table_name) root_table_name = table_path[0] alias = self._find_alias(root_table_name) if not alias: self.es_cluster.get_metadata(force=True) alias = self._find_alias(root_table_name) if not alias: Log.error("{{table|quote}} does not exist", table=table_name) try: table = self.get_table(alias)[0] # LAST TIME WE GOT INFO FOR THIS TABLE if not table: table = TableDesc(name=alias, url=None, query_path=["."], timestamp=Date.MIN) with self.meta.tables.locker: self.meta.tables.add(table) columns = self._reload_columns(table) DEBUG and Log.note("columns from reload") elif after or table.timestamp < self.es_cluster.metatdata_last_updated: columns = self._reload_columns(table) DEBUG and Log.note("columns from reload") else: columns = self.meta.columns.find(alias, column_name) DEBUG and Log.note("columns from find()") DEBUG and Log.note("columns are {{ids}}", ids=[id(c) for c in columns]) columns = jx.sort(columns, "name") if after is None: return columns # DO NOT WAIT FOR COMPLETE COLUMNS # WAIT FOR THE COLUMNS TO UPDATE while True: pending = [ c for c in columns if after >= c.last_updated or ( c.cardinality == None and c.jx_type not in STRUCT) ] if not pending: break if timeout: Log.error("trying to gets columns timed out") if DEBUG: if len(pending) > 10: 
Log.note( "waiting for {{num}} columns to update by {{timestamp}}", num=len(pending), timestamp=after) else: Log.note( "waiting for columns to update by {{timestamp}}; {{columns|json}}", timestamp=after, columns=[ c.es_index + "." + c.es_column + " id=" + text_type(id(c)) for c in pending ]) Till(seconds=1).wait() return columns except Exception as e: Log.error("Failure to get columns for {{table}}", table=table_name, cause=e) return [] def _update_cardinality(self, column): """ QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN """ now = Date.now() if column.es_index in self.index_does_not_exist: return if column.jx_type in STRUCT: Log.error("not supported") try: if column.es_index == "meta.columns": partitions = jx.sort([ g[column.es_column] for g, _ in jx.groupby(self.meta.columns, column.es_column) if g[column.es_column] != None ]) self.meta.columns.update({ "set": { "partitions": partitions, "count": len(self.meta.columns), "cardinality": len(partitions), "multi": 1, "last_updated": now }, "where": { "eq": { "es_index": column.es_index, "es_column": column.es_column } } }) return if column.es_index == "meta.tables": partitions = jx.sort([ g[column.es_column] for g, _ in jx.groupby(self.meta.tables, column.es_column) if g[column.es_column] != None ]) self.meta.columns.update({ "set": { "partitions": partitions, "count": len(self.meta.tables), "cardinality": len(partitions), "multi": 1, "last_updated": now }, "where": { "eq": { "es_index": column.es_index, "es_column": column.es_column } } }) return es_index = column.es_index.split(".")[0] is_text = [ cc for cc in self.meta.columns if cc.es_column == column.es_column and cc.es_type == "text" ] if is_text: # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED result = self.es_cluster.post("/" + es_index + "/_search", data={ "aggs": { "count": { "filter": { "match_all": {} } } }, "size": 0 }) count = result.hits.total cardinality = max(1001, count) multi = 1001 elif column.es_column == "_id": result = self.es_cluster.post("/" + es_index + "/_search", data={ "query": { "match_all": {} }, "size": 0 }) count = cardinality = result.hits.total multi = 1 elif column.es_type == BOOLEAN: result = self.es_cluster.post("/" + es_index + "/_search", data={ "aggs": { "count": _counting_query(column) }, "size": 0 }) count = result.hits.total cardinality = 2 DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality) self.meta.columns.update({ "set": { "count": count, "cardinality": cardinality, "partitions": [False, True], "multi": 1, "last_updated": now }, "clear": ["partitions"], "where": { "eq": { "es_index": column.es_index, "es_column": column.es_column } } }) return else: es_query = { "aggs": { "count": _counting_query(column), "_filter": { "aggs": { "multi": { "max": { "script": "doc[" + quote(column.es_column) + "].values.size()" } } }, "filter": { "bool": { "should": [{ "range": { "etl.timestamp.~n~": { "gte": (Date.today() - WEEK) } } }, { "bool": { "must_not": { "exists": { "field": "etl.timestamp.~n~" } } } }] } } } }, "size": 0 } result = self.es_cluster.post("/" + es_index + "/_search", data=es_query) agg_results = result.aggregations count = result.hits.total cardinality = coalesce(agg_results.count.value, agg_results.count._nested.value, agg_results.count.doc_count) multi = int(coalesce(agg_results._filter.multi.value, 1)) if cardinality == None: Log.error("logic error") query = Data(size=0) if column.es_column == "_id": self.meta.columns.update({ "set": { 
"count": cardinality, "cardinality": cardinality, "multi": 1, "last_updated": now }, "clear": ["partitions"], "where": { "eq": { "es_index": column.es_index, "es_column": column.es_column } } }) return elif cardinality > 1000 or (count >= 30 and cardinality == count ) or (count >= 1000 and cardinality / count > 0.99): DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality) self.meta.columns.update({ "set": { "count": count, "cardinality": cardinality, "multi": multi, "last_updated": now }, "clear": ["partitions"], "where": { "eq": { "es_index": column.es_index, "es_column": column.es_column } } }) return elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30: DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality) self.meta.columns.update({ "set": { "count": count, "cardinality": cardinality, "multi": multi, "last_updated": now }, "clear": ["partitions"], "where": { "eq": { "es_index": column.es_index, "es_column": column.es_column } } }) return elif len(column.nested_path) != 1: query.aggs["_"] = { "nested": { "path": column.nested_path[0] }, "aggs": { "_nested": { "terms": { "field": column.es_column } } } } elif cardinality == 0: # WHEN DOES THIS HAPPEN? query.aggs["_"] = {"terms": {"field": column.es_column}} else: query.aggs["_"] = { "terms": { "field": column.es_column, "size": cardinality } } result = self.es_cluster.post("/" + es_index + "/_search", data=query) aggs = result.aggregations._ if aggs._nested: parts = jx.sort(aggs._nested.buckets.key) else: parts = jx.sort(aggs.buckets.key) DEBUG and Log.note( "update metadata for {{column.es_index}}.{{column.es_column}} (id={{id}}) at {{time}}", id=id(column), column=column, time=now) self.meta.columns.update({ "set": { "count": count, "cardinality": cardinality, "multi": multi, "partitions": parts, "last_updated": now }, "where": { "eq": { "es_index": column.es_index, "es_column": column.es_column } } }) except Exception as e: # CAN NOT IMPORT: THE TEST MODULES SETS UP LOGGING # from tests.test_jx import TEST_TABLE e = Except.wrap(e) TEST_TABLE = "testdata" is_missing_index = any( w in e for w in ["IndexMissingException", "index_not_found_exception"]) is_test_table = column.es_index.startswith( (TEST_TABLE_PREFIX, TEST_TABLE)) if is_missing_index: # WE EXPECT TEST TABLES TO DISAPPEAR Log.warning("Missing index {{col.es_index}}", col=column, cause=e) self.meta.columns.update({ "clear": ".", "where": { "eq": { "es_index": column.es_index } } }) self.index_does_not_exist.add(column.es_index) elif "No field found for" in e: self.meta.columns.update({ "clear": ".", "where": { "eq": { "es_index": column.es_index, "es_column": column.es_column } } }) Log.warning( "Could not get column {{col.es_index}}.{{col.es_column}} info", col=column, cause=e) else: self.meta.columns.update({ "set": { "last_updated": now }, "clear": [ "count", "cardinality", "multi", "partitions", ], "where": { "eq": { "es_index": column.es_index, "es_column": column.es_column } } }) Log.warning( "Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e) def monitor(self, please_stop): please_stop.on_go(lambda: self.todo.add(THREAD_STOP)) while not please_stop: try: if not self.todo: old_columns = [ c for c in self.meta.columns if ((c.last_updated < Date.now() - MAX_COLUMN_METADATA_AGE) or c.cardinality == None) and c.jx_type not in STRUCT ] if old_columns: DEBUG and Log.note( "Old columns 
{{names|json}} last updated {{dates|json}}", names=wrap(old_columns).es_column, dates=[ Date(t).format() for t in wrap(old_columns).last_updated ]) self.todo.extend(old_columns) else: DEBUG and Log.note("no more metatdata to update") column = self.todo.pop(Till(seconds=(10 * MINUTE).seconds)) if column: if column is THREAD_STOP: continue with Timer("update {{table}}.{{column}}", param={ "table": column.es_index, "column": column.es_column }, silent=not DEBUG): if column.es_index in self.index_does_not_exist: DEBUG and Log.note( "{{column.es_column}} does not exist", column=column) self.meta.columns.update({ "clear": ".", "where": { "eq": { "es_index": column.es_index } } }) continue if column.jx_type in STRUCT or split_field( column.es_column)[-1] == EXISTS_TYPE: DEBUG and Log.note( "{{column.es_column}} is a struct", column=column) column.last_updated = Date.now() continue elif column.last_updated > Date.now( ) - TOO_OLD and column.cardinality is not None: # DO NOT UPDATE FRESH COLUMN METADATA DEBUG and Log.note( "{{column.es_column}} is still fresh ({{ago}} ago)", column=column, ago=(Date.now() - Date(column.last_updated)).seconds) continue try: self._update_cardinality(column) (DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX) ) and Log.note("updated {{column.name}}", column=column) except Exception as e: if '"status":404' in e: self.meta.columns.update({ "clear": ".", "where": { "eq": { "es_index": column.es_index, "es_column": column.es_column } } }) else: Log.warning( "problem getting cardinality for {{column.name}}", column=column, cause=e) except Exception as e: Log.warning("problem in cardinality monitor", cause=e) def not_monitor(self, please_stop): Log.alert("metadata scan has been disabled") please_stop.on_go(lambda: self.todo.add(THREAD_STOP)) while not please_stop: column = self.todo.pop() if column == THREAD_STOP: break if column.jx_type in STRUCT or split_field( column.es_column)[-1] == EXISTS_TYPE: DEBUG and Log.note("{{column.es_column}} is a struct", column=column) column.last_updated = Date.now() continue elif column.last_updated > Date.now( ) - TOO_OLD and column.cardinality is not None: # DO NOT UPDATE FRESH COLUMN METADATA DEBUG and Log.note( "{{column.es_column}} is still fresh ({{ago}} ago)", column=column, ago=(Date.now() - Date(column.last_updated)).seconds) continue with Timer("Update {{col.es_index}}.{{col.es_column}}", param={"col": column}, silent=not DEBUG, too_long=0.05): if untype_path(column.name) in ["build.type", "run.type"]: try: self._update_cardinality(column) except Exception as e: Log.warning( "problem getting cardinality for {{column.name}}", column=column, cause=e) else: column.last_updated = Date.now() def get_table(self, name): if name == "meta.columns": return self.meta.columns with self.meta.tables.locker: return wrap([t for t in self.meta.tables.data if t.name == name]) def get_snowflake(self, fact_table_name): return Snowflake(fact_table_name, self) def get_schema(self, name): if name == "meta.columns": return self.meta.columns.schema if name == "meta.tables": return self.meta.tables root, rest = tail_field(name) return self.get_snowflake(root).get_schema(rest)
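
# Standalone sketch of the freshness test that monitor() and not_monitor() in
# ElasticsearchMetadata above apply before re-scanning a column: metadata is refreshed
# only when it is older than TOO_OLD or when cardinality has never been measured.  Plain
# unix seconds stand in for the library's Date/Duration objects, and the ten-minute
# TOO_OLD_SECONDS below is an assumed value for the demo, not the library's constant.
import time

TOO_OLD_SECONDS = 10 * 60  # stand-in for TOO_OLD

def needs_refresh(last_updated, cardinality, now=None):
    now = time.time() if now is None else now
    if cardinality is None:
        return True                                  # never measured
    return last_updated <= now - TOO_OLD_SECONDS     # stale metadata


if __name__ == "__main__":
    now = time.time()
    assert needs_refresh(now - 3600, cardinality=5, now=now)      # stale
    assert not needs_refresh(now - 60, cardinality=5, now=now)    # fresh
    assert needs_refresh(now - 60, cardinality=None, now=now)     # unmeasured
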