class StructuredLogger_usingQueue(StructuredLogger):
    """
    LOGGER THAT PUSHES EACH EXPANDED MESSAGE ONTO A QUEUE, SO THE
    MESSAGES CAN BE PULLED BACK OFF (WITHOUT TIMESTAMPS) VIA pop()
    """

    def __init__(self, name=None):
        """
        :param name: OPTIONAL SUFFIX APPENDED TO THE NAME OF THE QUEUE
        """
        label = "log messages to queue"
        if name:
            label = label + " " + name
        self.queue = Queue(label)

    def write(self, template, params):
        """
        EXPAND template WITH params AND ADD THE RESULT TO THE QUEUE
        """
        message = expand_template(template, params)
        self.queue.add(message)

    def stop(self):
        """
        CLOSE THE QUEUE
        """
        self.queue.close()

    def pop(self):
        """
        POP ONE ENTRY FROM THE QUEUE AND CLEAN IT UP

        :return: THE ENTRY WITH LEADING TIMESTAMPS (DATE_PATTERN MATCHES)
                 REMOVED, AND WITH BLANK LINES AND LINES STARTING WITH
                 "File" DROPPED
        """
        raw = self.queue.pop()
        kept = []
        for line in raw.split(CR):
            # REMOVE FIRST PART, THE TIMESTAMP
            # 0123456789012345678901234567890
            # 2019-01-06 19:13:49.937542 -
            stamp = re.match(DATE_PATTERN, line)
            if stamp:
                # re.match ANCHORS AT 0, SO end() IS THE LENGTH OF THE PREFIX
                line = line[stamp.end():]
            body = line.strip()
            if not body or body.startswith("File"):
                continue
            kept.append(line)
        return CR.join(kept).strip()
class StructuredLogger_usingQueue(StructuredLogger):
    """
    LOGGER THAT PUSHES EACH EXPANDED MESSAGE ONTO A QUEUE, SO THE
    MESSAGES CAN BE PULLED BACK OFF (WITHOUT TIMESTAMPS) VIA pop()
    """

    def __init__(self, name=None):
        """
        :param name: OPTIONAL SUFFIX APPENDED TO THE NAME OF THE QUEUE
        """
        label = "log messages to queue"
        if name:
            label = label + " " + name
        self.queue = Queue(label)

    def write(self, template, params):
        """
        EXPAND template WITH params AND ADD THE RESULT TO THE QUEUE
        """
        message = expand_template(template, params)
        self.queue.add(message)

    def stop(self):
        """
        CLOSE THE QUEUE
        """
        self.queue.close()

    def pop(self):
        """
        POP ONE ENTRY FROM THE QUEUE AND CLEAN IT UP

        :return: THE ENTRY WITH THE 22-CHARACTER PREFIX REMOVED FROM EVERY
                 LINE THAT HAS " - " AT OFFSET 19, AND WITH LINES STARTING
                 WITH "File" DROPPED
        """
        raw = self.queue.pop()
        kept = []
        for line in raw.split("\n"):
            has_prefix = line[19:22] == " - "
            if has_prefix:
                line = line[22:]
            if not line.strip().startswith("File"):
                kept.append(line)
        return "\n".join(kept).strip()
class StructuredLogger_usingQueue(StructuredLogger):
    """
    QUEUE-BACKED LOGGER: write() ADDS EXPANDED MESSAGES TO A QUEUE,
    pop() RETURNS ONE ENTRY WITH THE LINE PREFIXES STRIPPED
    """

    def __init__(self, name=None):
        """
        :param name: OPTIONAL SUFFIX APPENDED TO THE NAME OF THE QUEUE
        """
        suffix = (" " + name) if name else ""
        self.queue = Queue("log messages to queue" + suffix)

    def write(self, template, params):
        """
        EXPAND template WITH params AND ADD THE RESULT TO THE QUEUE
        """
        self.queue.add(expand_template(template, params))

    def stop(self):
        """
        CLOSE THE QUEUE
        """
        self.queue.close()

    def pop(self):
        """
        POP ONE ENTRY FROM THE QUEUE AND CLEAN IT UP

        :return: THE ENTRY WITH THE 22-CHARACTER PREFIX REMOVED FROM EVERY
                 LINE THAT HAS " - " AT OFFSET 19, AND WITH LINES STARTING
                 WITH "File" DROPPED
        """

        def without_prefix(text):
            # " - " AT OFFSET 19 MARKS THE END OF THE PREFIX TO DISCARD
            return text[22:] if text[19:22] == " - " else text

        stripped = [without_prefix(row) for row in self.queue.pop().split("\n")]
        wanted = [row for row in stripped if not row.strip().startswith("File")]
        return "\n".join(wanted).strip()
class ColumnList(Table, Container):
    """
    CENTRAL CONTAINER FOR ALL COLUMNS
    SYNCHRONIZED WITH ELASTICSEARCH
    OPTIMIZED FOR THE PARTICULAR ACCESS PATTERNS USED
    """

    def __init__(self, es_cluster):
        """
        LOAD THE COLUMN METADATA FROM THE CLUSTER (CREATING THE METADATA
        INDEX IF REQUIRED) AND START THE TWO BACKGROUND THREADS

        :param es_cluster: CLUSTER THAT HOLDS THE COLUMN METADATA INDEX
        """
        Table.__init__(self, META_COLUMNS_NAME)
        self.data = {}  # MAP FROM ES_INDEX TO (abs_column_name to COLUMNS)
        self.locker = Lock()
        self._schema = None
        self.dirty = False
        self.es_cluster = es_cluster
        self.es_index = None
        self.last_load = Null
        self.for_es_update = Queue(
            "update columns to es"
        )  # HOLD (action, column) PAIR, WHERE action in ['insert', 'update']
        self._db_load()
        self.delete_queue = Queue(
            "delete columns from es"
        )  # CONTAINS (es_index, after) PAIRS
        Thread.run(
            "update " + META_COLUMNS_NAME,
            self._update_from_es,
            parent_thread=MAIN_THREAD,
        ).release()
        Thread.run(
            "delete columns", self._delete_columns, parent_thread=MAIN_THREAD
        ).release()

    def _query(self, query):
        """
        EXECUTE query ON THE CLUSTER AND RETURN THE CURSOR CONTENT AS A TABLE

        :return: Data WITH meta.format, header AND data
        """
        result = Data()
        curr = self.es_cluster.execute(query)
        result.meta.format = "table"
        result.header = (
            [d[0] for d in curr.description] if curr.description else None
        )
        result.data = curr.fetchall()
        return result

    def _db_create(self):
        """
        CREATE THE METADATA INDEX AND FILL IT WITH THE META COLUMNS
        """
        schema = {
            "settings": {
                "index.number_of_shards": 1,
                "index.number_of_replicas": REPLICAS,
            },
            "mappings": {META_COLUMNS_TYPE_NAME: {}},
        }
        self.es_index = self.es_cluster.create_index(
            id=ID, index=META_COLUMNS_NAME, schema=schema
        )
        self.es_index.add_alias(META_COLUMNS_NAME)
        for c in META_COLUMNS_DESC.columns:
            self._add(c)
            self.es_index.add({"value": c.__dict__()})

    def _db_load(self):
        """
        LOAD ALL COLUMNS FROM THE METADATA INDEX; WHEN THAT FAILS AND NO
        INDEX WITH THE METADATA NAME EXISTS, CREATE A NEW ONE
        """
        self.last_load = Date.now()
        try:
            self.es_index = self.es_cluster.get_index(
                id=ID,
                index=META_COLUMNS_NAME,
                type=META_COLUMNS_TYPE_NAME,
                read_only=False,
            )
            result = self.es_index.search({
                "query": {
                    "bool": {
                        "should": [
                            {
                                "bool": {
                                    "must_not": {
                                        "exists": {"field": "cardinality.~n~"}
                                    }
                                }
                            },
                            {  # ASSUME UNUSED COLUMNS DO NOT EXIST
                                "range": {"cardinality.~n~": {"gte": 0}}
                            },
                        ]
                    }
                },
                "sort": ["es_index.~s~", "name.~s~", "es_column.~s~"],
                "size": 10000,
            })
            with Timer("adding columns to structure"):
                for r in result.hits.hits._source:
                    col = doc_to_column(r)
                    if col:
                        self._add(col)
            Log.note("{{num}} columns loaded", num=result.hits.total)
            if not self.data.get(META_COLUMNS_NAME):
                Log.error("metadata missing from index!")
        except Exception as e:
            metadata = self.es_cluster.get_metadata(after=Date.now())
            if any(
                index.startswith(META_COLUMNS_NAME)
                for index in metadata.indices.keys()
            ):
                # AN INDEX EXISTS, BUT COULD NOT BE LOADED: DO NOT OVERWRITE IT
                Log.error("metadata already exists!", cause=e)
            Log.warning(
                "no {{index}} exists, making one", index=META_COLUMNS_NAME, cause=e
            )
            self._db_create()

    def delete_from_es(self, es_index, after):
        """
        DELETE COLUMNS STORED IN THE ES INDEX
        (ONLY QUEUES THE REQUEST; _delete_columns DOES THE WORK)
        :param es_index:
        :param after: ONLY DELETE RECORDS BEFORE THIS TIME
        :return:
        """
        self.delete_queue.add((es_index, after))

    def _delete_columns(self, please_stop):
        """
        BACKGROUND LOOP: DRAIN delete_queue AND DELETE THE MATCHING RECORDS
        FROM THE METADATA INDEX

        :param please_stop: SIGNAL TO END THE LOOP
        """
        while not please_stop:
            result = self.delete_queue.pop(till=please_stop)
            if result == THREAD_STOP:
                break
            more_result = self.delete_queue.pop_all()
            results = [result] + more_result
            try:
                delete_result = self.es_index.delete_record({
                    "bool": {
                        "should": [
                            {
                                "bool": {
                                    "must": [
                                        {"term": {"es_index.~s~": es_index}},
                                        {
                                            "range": {
                                                "last_updated.~n~": {
                                                    "lte": after.unix
                                                }
                                            }
                                        },
                                    ]
                                }
                            }
                            for es_index, after in results
                        ]
                    }
                })

                if DEBUG:
                    query = {
                        "query": {
                            "terms": {
                                "es_index.~s~": [
                                    es_index for es_index, after in results
                                ]
                            }
                        }
                    }
                    verify = self.es_index.search(query)
                    while verify.hits.total:
                        Log.note("wait for columns to be gone")
                        verify = self.es_index.search(query)

                Log.note(
                    "Deleted {{delete_result}} columns from {{table}}",
                    table=[es_index for es_index, after in results],
                    delete_result=delete_result.deleted,
                )
            except Exception as cause:
                Log.warning("Problem with delete of table", cause=cause)
            Till(seconds=1).wait()

    def _update_from_es(self, please_stop):
        """
        BACKGROUND LOOP: PERIODICALLY PULL RECENTLY UPDATED COLUMNS FROM THE
        METADATA INDEX, AND PUSH THE COLUMNS QUEUED IN for_es_update TO IT

        :param please_stop: SIGNAL TO END THE LOOP
        """
        try:
            last_extract = Date.now()
            while not please_stop:
                now = Date.now()
                try:
                    if (now - last_extract).seconds > COLUMN_EXTRACT_PERIOD:
                        result = self.es_index.search({
                            "query": {
                                "range": {
                                    "last_updated.~n~": {"gte": self.last_load}
                                }
                            },
                            "sort": ["es_index.~s~", "name.~s~", "es_column.~s~"],
                            "from": 0,
                            "size": 10000,
                        })
                        last_extract = now

                        with self.locker:
                            for r in result.hits.hits._source:
                                c = doc_to_column(r)
                                if c:
                                    self._add(c)
                                    self.last_load = MAX(
                                        (self.last_load, c.last_updated)
                                    )

                    while not please_stop:
                        updates = self.for_es_update.pop_all()
                        if not updates:
                            break

                        DEBUG and updates and Log.note(
                            "{{num}} columns to push to db", num=len(updates)
                        )
                        self.es_index.extend(
                            [{"value": column.__dict__()} for column in updates]
                        )
                except Exception as e:
                    Log.warning("problem updating database", cause=e)

                (Till(seconds=COLUMN_LOAD_PERIOD) | please_stop).wait()
        finally:
            Log.note("done")

    def find(self, es_index, abs_column_name=None):
        """
        :param es_index: NAME OF THE INDEX
        :param abs_column_name: OPTIONAL COLUMN NAME TO RESTRICT THE RESULT
        :return: LIST OF COLUMNS FOR es_index (ALL OF THEM WHEN NO
                 abs_column_name IS GIVEN)
        """
        with self.locker:
            if es_index.startswith("meta."):
                self._update_meta()

            if not abs_column_name:
                return [
                    c for cs in self.data.get(es_index, {}).values() for c in cs
                ]
            else:
                return self.data.get(es_index, {}).get(abs_column_name, [])

    def extend(self, columns):
        """
        ADD ALL columns (NOTHING IS QUEUED FOR ES)
        """
        self.dirty = True
        with self.locker:
            for column in columns:
                self._add(column)

    def add(self, column):
        """
        ADD column AND QUEUE THE CANONICAL VERSION FOR ES

        :return: THE CANONICAL COLUMN (column ITSELF WHEN ALREADY ADDED)
        """
        self.dirty = True
        with self.locker:
            canonical = self._add(column)
        if canonical == None:
            return column  # ALREADY ADDED
        self.for_es_update.add(canonical)
        return canonical

    def remove(self, column, after):
        """
        MARK column AS DELETED AND QUEUE IT FOR ES

        :param column: A COLUMN THAT IS EXPECTED TO BE CANONICAL ALREADY
        :param after: DO NOTHING IF column WAS UPDATED AFTER THIS TIME
        """
        if column.last_updated > after:
            return
        mark_as_deleted(column, after)
        with self.locker:
            canonical = self._add(column)
        if canonical:
            # _add() RETURNS None ONLY WHEN column IS THE STORED INSTANCE
            Log.error("Expecting canonical column to be removed")
        DEBUG and Log.note(
            "delete {{col|quote}}, at {{timestamp}}",
            col=column.es_column,
            timestamp=column.last_updated,
        )
        self.for_es_update.add(column)

    def remove_table(self, table_name):
        """
        FORGET ALL COLUMNS OF table_name (RAISES KeyError WHEN NOT KNOWN)
        """
        del self.data[table_name]

    def _add(self, column):
        """
        :param column: ANY COLUMN OBJECT
        :return:  None IF column IS canonical ALREADY (NET-ZERO EFFECT)
        """
        if not isinstance(column, Column):
            Log.warning("expecting a column not {{column|json}}", column=column)
            return
        columns_for_table = self.data.setdefault(column.es_index, {})
        existing_columns = columns_for_table.setdefault(column.name, [])

        for canonical in existing_columns:
            if canonical is column:
                return None
            if canonical.es_type == column.es_type:
                if column.last_updated > canonical.last_updated:
                    # COPY THE NEWER PROPERTIES ONTO THE STORED INSTANCE
                    for key in Column.__slots__:
                        old_value = canonical[key]
                        new_value = column[key]
                        if new_value == old_value:
                            pass  # NO NEED TO UPDATE WHEN NO CHANGE MADE (COMMON CASE)
                        else:
                            canonical[key] = new_value
                return canonical
        existing_columns.append(column)
        return column

    def _update_meta(self):
        """
        RECOMPUTE count, cardinality, partitions AND multi FOR EACH OF THE
        META COLUMNS; DOES NOTHING UNLESS self.dirty
        """
        if not self.dirty:
            return

        now = Date.now()
        for mc in META_COLUMNS_DESC.columns:
            count = 0
            values = set()
            objects = 0
            multi = 1
            for column in self._all_columns():
                value = column[mc.name]
                if value == None:
                    pass
                else:
                    count += 1
                    if is_list(value):
                        multi = max(multi, len(value))
                        try:
                            values |= set(value)
                        except Exception:
                            # UNHASHABLE MEMBERS ARE COUNTED, NOT COLLECTED
                            objects += len(value)
                    elif is_data(value):
                        objects += 1
                    else:
                        values.add(value)
            mc.count = count
            mc.cardinality = len(values) + objects
            mc.partitions = jx.sort(values)
            mc.multi = multi
            mc.last_updated = now

        META_COLUMNS_DESC.last_updated = now
        self.dirty = False

    def _all_columns(self):
        """
        :return: FLAT LIST OF EVERY COLUMN, OVER ALL INDEXES
        """
        return [
            column
            for t, cs in self.data.items()
            for _, css in cs.items()
            for column in css
        ]

    def __iter__(self):
        with self.locker:
            self._update_meta()
            return iter(self._all_columns())

    def __len__(self):
        # NOTE(review): _add() STORES PLAIN LISTS, SO .count HERE LOOKS LIKE THE
        # BOUND list.count METHOD RATHER THAN A NUMBER - CONFIRM INTENT
        return self.data[META_COLUMNS_NAME]["es_index"].count

    def clear(self, es_index, es_column=None, after=None):
        """
        MARK COLUMNS AS DELETED

        :param es_index: INDEX OWNING THE COLUMNS
        :param es_column: WHEN GIVEN, ONLY REMOVE THE COLUMNS STORED UNDER THIS KEY
        :param after: TIMESTAMP PASSED TO remove() / mark_as_deleted()
        """
        if es_column:
            for c in self.data.get(es_index, {}).get(es_column, []):
                self.remove(c, after=after)
            return
        data = self.data
        with self.locker:
            cols = data.get(es_index)
            if not cols:
                return
            del data[es_index]
            for c in cols.values():
                for cc in c:
                    mark_as_deleted(cc, after=after)

    def update(self, command):
        """
        APPLY AN UPDATE COMMAND TO THE MATCHING COLUMNS

        :param command: HAS where (FILTER, eq IS FAST-TRACKED), clear (LIST OF
                        PROPERTIES TO NULL, "." MEANS DELETE THE COLUMN) AND
                        set (PROPERTIES TO ASSIGN)
        ANY FAILURE IS REPORTED THROUGH Log.error
        """
        self.dirty = True
        try:
            command = to_data(command)
            DEBUG and Log.note(
                "Update {{timestamp}}: {{command|json}}",
                command=command,
                timestamp=Date(command["set"].last_updated),
            )
            eq = command.where.eq
            if eq.es_index:
                if len(eq) == 1:
                    if unwraplist(command.clear) == ".":
                        d = self.data
                        i = eq.es_index
                        with self.locker:
                            cols = d[i]
                            del d[i]

                        # cols MAPS NAME TO LIST OF COLUMNS. THE COLUMNS ARE
                        # ALREADY DETACHED FROM self.data, SO MARK AND QUEUE THEM
                        # THE SAME WAY THE GENERAL "." BRANCH BELOW DOES
                        # (remove() WOULD RE-ADD THEM AND REQUIRES `after`)
                        now = Date.now()
                        for cs in cols.values():
                            for c in cs:
                                mark_as_deleted(c, now)
                                self.for_es_update.add(c)
                        return

                    # FASTEST
                    all_columns = self.data.get(eq.es_index, {}).values()
                    with self.locker:
                        columns = [c for cs in all_columns for c in cs]
                elif eq.es_column and len(eq) == 2:
                    # FASTER
                    all_columns = self.data.get(eq.es_index, {}).values()
                    with self.locker:
                        columns = [
                            c
                            for cs in all_columns
                            for c in cs
                            if c.es_column == eq.es_column
                        ]
                else:
                    # SLOWER
                    all_columns = self.data.get(eq.es_index, {}).values()
                    with self.locker:
                        columns = [
                            c
                            for cs in all_columns
                            for c in cs
                            if all(
                                c[k] == v for k, v in eq.items()
                            )  # THIS LINE IS VERY SLOW
                        ]
            else:
                columns = list(self)
                columns = jx.filter(columns, command.where)

            with self.locker:
                for col in columns:
                    DEBUG and Log.note(
                        "update column {{table}}.{{column}}",
                        table=col.es_index,
                        column=col.es_column,
                    )
                    for k in command["clear"]:
                        if k == ".":
                            mark_as_deleted(col, Date.now())
                            self.for_es_update.add(col)
                            lst = self.data[col.es_index]
                            cols = lst[col.name]
                            cols.remove(col)
                            if len(cols) == 0:
                                del lst[col.name]
                                if len(lst) == 0:
                                    del self.data[col.es_index]
                            break
                        else:
                            col[k] = None
                    else:
                        # DID NOT DELETE COLUMNM ("."), CONTINUE TO SET PROPERTIES
                        for k, v in command.set.items():
                            col[k] = v
                        self.for_es_update.add(col)

        except Exception as e:
            Log.error("should not happen", cause=e)

    def query(self, query):
        # NOT EXPECTED TO BE RUN
        Log.error("not")
        with self.locker:
            self._update_meta()
            if not self._schema:
                self._schema = Schema(
                    ".",
                    [c for cs in self.data[META_COLUMNS_NAME].values() for c in cs],
                )
            snapshot = self._all_columns()

        from jx_python.containers.list import ListContainer

        query.frum = ListContainer(META_COLUMNS_NAME, snapshot, self._schema)
        return jx.run(query)

    def groupby(self, keys):
        # NOTE(review): __iter__() TAKES self.locker AGAIN WHILE IT IS HELD HERE;
        # THIS ONLY WORKS IF Lock IS RE-ENTRANT - CONFIRM
        with self.locker:
            self._update_meta()
            return jx.groupby(self.__iter__(), keys)

    def window(self, window):
        """
        NOT SUPPORTED

        :raises NotImplementedError: ALWAYS
        """
        # NotImplemented IS A COMPARISON SENTINEL, NOT AN EXCEPTION; CALLING IT
        # RAISED A CONFUSING TypeError
        raise NotImplementedError()

    @property
    def schema(self):
        """
        :return: Schema OF THE META COLUMNS (BUILT ON FIRST USE)
        """
        if not self._schema:
            with self.locker:
                self._update_meta()
                self._schema = Schema(
                    ".",
                    [c for cs in self.data[META_COLUMNS_NAME].values() for c in cs],
                )
        return self._schema

    @property
    def namespace(self):
        return self

    def get_table(self, table_name):
        """
        :return: self; ANY OTHER NAME THAN THE META COLUMNS TABLE IS AN ERROR
        """
        if table_name != META_COLUMNS_NAME:
            Log.error("this container has only the " + META_COLUMNS_NAME)
        return self

    def get_columns(self, table_name):
        """
        :return: ALL COLUMNS; ANY OTHER NAME THAN THE META COLUMNS TABLE IS AN ERROR
        """
        if table_name != META_COLUMNS_NAME:
            Log.error("this container has only the " + META_COLUMNS_NAME)
        return self._all_columns()

    def denormalized(self):
        """
        THE INTERNAL STRUCTURE FOR THE COLUMN METADATA IS VERY DIFFERENT FROM
        THE DENORMALIZED PERSPECITVE. THIS PROVIDES THAT PERSPECTIVE FOR QUERIES
        """
        with self.locker:
            self._update_meta()
            output = [
                {
                    "table": c.es_index,
                    "name": untype_path(c.name),
                    "cardinality": c.cardinality,
                    "es_column": c.es_column,
                    "es_index": c.es_index,
                    "last_updated": c.last_updated,
                    "count": c.count,
                    "nested_path": [unnest_path(n) for n in c.nested_path],
                    "es_type": c.es_type,
                    "type": c.jx_type,
                }
                for tname, css in self.data.items()
                for cname, cs in css.items()
                for c in cs
                if c.jx_type not in INTERNAL  # and c.es_column != "_id"
            ]

        from jx_python.containers.list import ListContainer

        return ListContainer(
            self.name,
            data=output,
            schema=BaseSchema(META_COLUMNS_NAME, SIMPLE_METADATA_COLUMNS),
        )
class Sqlite(DB):
    """
    Allows multi-threaded access
    Loads extension functions (like SQRT)

    All commands are placed on a queue and run, in order, by a single
    worker thread that owns the connection.
    """

    canonical = None

    def __init__(self, filename=None, db=None, upgrade=True):
        """
        :param filename: Optional, file to open (in-memory database when None)
        :param db:  Optional, wrap a sqlite db in a thread
        :param upgrade: Run the module-level upgrade (once) and try to load
                        the extension library
        :return: Multithread-safe database
        """
        if upgrade and not _upgraded:
            _upgrade()

        self.filename = filename
        self.db = db
        self.queue = Queue("sql commands")  # HOLD (command, result, signal) PAIRS
        self.get_trace = DEBUG
        self.upgrade = upgrade
        # START THE WORKER LAST: IT READS self.upgrade
        self.worker = Thread.run("sqlite db thread", self._worker)

    def _enhancements(self):
        """
        REGISTER THE regex() FUNCTION AND THE percentile() AGGREGATE ON THE
        CONNECTION
        """

        def regex(pattern, value):
            return 1 if re.match(pattern + "$", value) else 0

        # create_function() RETURNS None, SO REGISTER BOTH ON THE CONNECTION
        self.db.create_function("regex", 2, regex)

        class Percentile(object):
            # sqlite3 INSTANTIATES THE AGGREGATE WITH NO ARGUMENTS AND CALLS
            # step() WITH THE DECLARED NUMBER OF SQL ARGUMENTS (2)
            def __init__(self):
                self.percentile = None
                self.acc = []

            def step(self, value, rank):
                self.acc.append(value)
                self.percentile = rank

            def finalize(self):
                return percentile(self.acc, self.percentile)

        self.db.create_aggregate("percentile", 2, Percentile)

    def execute(self, command):
        """
        COMMANDS WILL BE EXECUTED IN THE ORDER THEY ARE GIVEN
        BUT CAN INTERLEAVE WITH OTHER TREAD COMMANDS
        :param command: COMMAND FOR SQLITE
        :return: None (THE QUERY RESULT WHEN DEBUG IS ON)
        """
        if DEBUG:  # EXECUTE IMMEDIATELY FOR BETTER STACK TRACE
            return self.query(command)

        if self.get_trace:
            trace = extract_stack(1)
        else:
            trace = None
        self.queue.add((command, None, None, trace))

    def query(self, command):
        """
        WILL BLOCK CALLING THREAD UNTIL THE command IS COMPLETED
        :param command: COMMAND FOR SQLITE
        :return: list OF RESULTS
        """
        if not self.worker:
            self.worker = Thread.run("sqlite db thread", self._worker)

        signal = Signal()
        result = Data()
        self.queue.add((command, result, signal, None))
        signal.wait()
        if result.exception:
            Log.error("Problem with Sqlite call", cause=result.exception)
        return result

    def _worker(self, please_stop):
        """
        THREAD BODY: OPEN THE CONNECTION, TRY TO LOAD THE EXTENSION LIBRARY,
        THEN RUN QUEUED COMMANDS UNTIL please_stop; COMMIT AND CLOSE ON EXIT

        :param please_stop: SIGNAL TO END THE LOOP
        """
        global _load_extension_warning_sent

        if DEBUG:
            Log.note("Sqlite version {{version}}", version=sqlite3.sqlite_version)
        if Sqlite.canonical:
            self.db = Sqlite.canonical
        else:
            self.db = sqlite3.connect(coalesce(self.filename, ':memory:'))

            library_loc = File.new_instance(sys.modules[__name__].__file__, "../..")
            full_path = File.new_instance(
                library_loc, "vendor/sqlite/libsqlitefunctions.so"
            ).abspath
            try:
                trace = extract_stack(0)[0]
                if self.upgrade:
                    if os.name == 'nt':
                        file = File.new_instance(
                            trace["file"], "../../vendor/sqlite/libsqlitefunctions.so"
                        )
                    else:
                        file = File.new_instance(
                            trace["file"], "../../vendor/sqlite/libsqlitefunctions"
                        )

                    full_path = file.abspath
                    self.db.enable_load_extension(True)
                    self.db.execute(
                        "SELECT load_extension(" + self.quote_value(full_path) + ")"
                    )
            except Exception as e:
                # BEST EFFORT: CONTINUE WITHOUT THE EXTENSION, WARN ONLY ONCE
                if not _load_extension_warning_sent:
                    _load_extension_warning_sent = True
                    Log.warning(
                        "Could not load {{file}}, doing without. (no SQRT for you!)",
                        file=full_path,
                        cause=e,
                    )

        try:
            while not please_stop:
                quad = self.queue.pop(till=please_stop)
                if quad is None:
                    # NOTHING TO UNPACK (presumably pop() GAVE UP BECAUSE OF till)
                    break
                command, result, signal, trace = quad

                if DEBUG_INSERT and command.strip().lower().startswith("insert"):
                    Log.note("Running command\n{{command|indent}}", command=command)
                if DEBUG and not command.strip().lower().startswith("insert"):
                    Log.note("Running command\n{{command|indent}}", command=command)
                with Timer("Run command", debug=DEBUG):
                    if signal is not None:
                        # query(): FILL result AND WAKE THE CALLER, EVEN ON FAILURE
                        try:
                            curr = self.db.execute(command)
                            self.db.commit()
                            result.meta.format = "table"
                            result.header = (
                                [d[0] for d in curr.description]
                                if curr.description
                                else None
                            )
                            result.data = curr.fetchall()
                            if DEBUG and result.data:
                                text = convert.table2csv(list(result.data))
                                Log.note("Result:\n{{data}}", data=text)
                        except Exception as e:
                            e = Except.wrap(e)
                            result.exception = Except(
                                ERROR,
                                "Problem with\n{{command|indent}}",
                                command=command,
                                cause=e,
                            )
                        finally:
                            signal.go()
                    else:
                        # execute(): NOBODY IS WAITING, SO ONLY WARN ON FAILURE
                        try:
                            self.db.execute(command)
                            self.db.commit()
                        except Exception as e:
                            e = Except.wrap(e)
                            e.cause = Except(
                                type=ERROR,
                                template="Bad call to Sqlite",
                                trace=trace,
                            )
                            Log.warning("Failure to execute", cause=e)

        except Exception as e:
            if not please_stop:
                Log.error("Problem with sql thread", cause=e)
        finally:
            if DEBUG:
                Log.note("Database is closed")
            self.db.commit()
            self.db.close()

    def quote_column(self, column_name, table=None):
        """
        DELEGATE TO THE MODULE-LEVEL quote_column()
        """
        return quote_column(column_name, table)

    def quote_value(self, value):
        """
        DELEGATE TO THE MODULE-LEVEL quote_value()
        """
        return quote_value(value)
class Sqlite(DB):
    """
    Allows multi-threaded access
    Loads extension functions (like SQRT)

    All commands are placed on a queue and run, in order, by a single
    worker thread that owns the connection.
    """

    canonical = None

    @override
    def __init__(self, filename=None, db=None, upgrade=True, load_functions=False, kwargs=None):
        """
        :param filename: Optional, file to open (in-memory database when missing)
        :param db:  Optional, wrap a sqlite db in a thread
        :param upgrade: Run the module-level upgrade (once)
        :param load_functions: Read by the worker (through the settings) to
                               decide if the extension library is loaded
        :param kwargs: All the settings, kept as self.settings
        :return: Multithread-safe database
        """
        if upgrade and not _upgraded:
            _upgrade()

        self.settings = kwargs
        # THE WORKER TREATS A None FILENAME AS ':memory:', SO DO NOT ASK File
        # TO RESOLVE A MISSING NAME
        self.filename = File(filename).abspath if filename else None
        self.db = db
        self.queue = Queue("sql commands")  # HOLD (command, result, signal) PAIRS
        self.get_trace = TRACE
        self.upgrade = upgrade
        self.closed = False
        # START THE WORKER LAST: IT READS self.upgrade AND WRITES self.closed
        self.worker = Thread.run("sqlite db thread", self._worker)
        if DEBUG:
            Log.note("Sqlite version {{version}}", version=self.query("select sqlite_version()").data[0][0])

    def _enhancements(self):
        """
        REGISTER THE regex() FUNCTION AND THE percentile() AGGREGATE ON THE
        CONNECTION
        """

        def regex(pattern, value):
            return 1 if re.match(pattern + "$", value) else 0

        # create_function() RETURNS None, SO REGISTER BOTH ON THE CONNECTION
        self.db.create_function("regex", 2, regex)

        class Percentile(object):
            # sqlite3 INSTANTIATES THE AGGREGATE WITH NO ARGUMENTS AND CALLS
            # step() WITH THE DECLARED NUMBER OF SQL ARGUMENTS (2)
            def __init__(self):
                self.percentile = None
                self.acc = []

            def step(self, value, rank):
                self.acc.append(value)
                self.percentile = rank

            def finalize(self):
                return percentile(self.acc, self.percentile)

        self.db.create_aggregate("percentile", 2, Percentile)

    def execute(self, command):
        """
        COMMANDS WILL BE EXECUTED IN THE ORDER THEY ARE GIVEN
        BUT CAN INTERLEAVE WITH OTHER TREAD COMMANDS
        :param command: COMMAND FOR SQLITE
        :return: Signal FOR IF YOU WANT TO BE NOTIFIED WHEN DONE
        """
        if self.closed:
            Log.error("database is closed")
        if DEBUG_EXECUTE:  # EXECUTE IMMEDIATELY FOR BETTER STACK TRACE
            self.query(command)
            return DONE

        if self.get_trace:
            trace = extract_stack(1)
        else:
            trace = None

        is_done = Signal()
        self.queue.add((command, None, is_done, trace))
        return is_done

    def commit(self):
        """
        WILL BLOCK CALLING THREAD UNTIL ALL PREVIOUS execute() CALLS ARE COMPLETED
        :return:
        """
        if self.closed:
            Log.error("database is closed")
        # A LOCK ACQUIRED TWICE: THE SECOND acquire() BLOCKS UNTIL THE WORKER
        # RELEASES IT
        signal = _allocate_lock()
        signal.acquire()
        self.queue.add((COMMIT, None, signal, None))
        signal.acquire()
        return

    def query(self, command):
        """
        WILL BLOCK CALLING THREAD UNTIL THE command IS COMPLETED
        :param command: COMMAND FOR SQLITE
        :return: list OF RESULTS
        """
        if self.closed:
            Log.error("database is closed")
        if not self.worker:
            self.worker = Thread.run("sqlite db thread", self._worker)

        signal = _allocate_lock()
        signal.acquire()
        result = Data()
        self.queue.add((command, result, signal, None))
        signal.acquire()
        if result.exception:
            Log.error("Problem with Sqlite call", cause=result.exception)
        return result

    def close(self):
        """
        OPTIONAL COMMIT-AND-CLOSE
        IF THIS IS NOT DONE, THEN THE THREAD THAT SPAWNED THIS INSTANCE
        :return:
        """
        self.closed = True
        signal = _allocate_lock()
        signal.acquire()
        self.queue.add((COMMIT, None, signal, None))
        signal.acquire()
        self.worker.please_stop.go()
        return

    def __enter__(self):
        # RETURN THE INSTANCE SO `with Sqlite(...) as db:` BINDS SOMETHING USEFUL
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def _worker(self, please_stop):
        """
        THREAD BODY: OPEN THE CONNECTION, OPTIONALLY LOAD THE EXTENSION
        LIBRARY, THEN RUN QUEUED COMMANDS UNTIL please_stop; CLOSE ON EXIT

        :param please_stop: SIGNAL TO END THE LOOP
        """
        global _load_extension_warning_sent

        try:
            if DEBUG:
                Log.note("Sqlite version {{version}}", version=sqlite3.sqlite_version)
            try:
                if Sqlite.canonical:
                    self.db = Sqlite.canonical
                else:
                    self.db = sqlite3.connect(coalesce(self.filename, ':memory:'), check_same_thread=False)
            except Exception as e:
                Log.error("could not open file {{filename}}", filename=self.filename, cause=e)

            if self.settings.load_functions:
                self._load_functions()

            while not please_stop:
                quad = self.queue.pop(till=please_stop)
                if quad is None:
                    break
                command, result, signal, trace = quad

                show_timing = False
                if DEBUG_INSERT and command.strip().lower().startswith("insert"):
                    Log.note("Running command\n{{command|limit(100)|indent}}", command=command)
                    show_timing = True
                if DEBUG and not command.strip().lower().startswith("insert"):
                    Log.note("Running command\n{{command|limit(100)|indent}}", command=command)
                    show_timing = True
                with Timer("SQL Timing", silent=not show_timing):
                    if command is COMMIT:
                        self.db.commit()
                        signal.release()
                    elif signal is not None:
                        try:
                            curr = self.db.execute(command)
                            if result is not None:
                                result.meta.format = "table"
                                result.header = [d[0] for d in curr.description] if curr.description else None
                                result.data = curr.fetchall()
                                if DEBUG and result.data:
                                    text = convert.table2csv(list(result.data))
                                    Log.note("Result:\n{{data}}", data=text)
                        except Exception as e:
                            e = Except.wrap(e)
                            e.cause = Except(
                                type=ERROR,
                                template="Bad call to Sqlite",
                                trace=trace
                            )
                            if result is None:
                                # execute(): NOBODY READS A RESULT, SO REPORT HERE
                                Log.error("Problem with\n{{command|indent}}", command=command, cause=e)
                            else:
                                result.exception = Except(ERROR, "Problem with\n{{command|indent}}", command=command, cause=e)
                        finally:
                            # execute() WAITS ON A Signal, query() ON A RAW LOCK
                            if isinstance(signal, Signal):
                                signal.go()
                            else:
                                signal.release()
                    else:
                        try:
                            self.db.execute(command)
                        except Exception as e:
                            e = Except.wrap(e)
                            e.cause = Except(
                                type=ERROR,
                                template="Bad call to Sqlite",
                                trace=trace
                            )
                            Log.warning("Failure to execute", cause=e)

        except Exception as e:
            if not please_stop:
                Log.warning("Problem with sql thread", cause=e)
        finally:
            self.closed = True
            if DEBUG:
                Log.note("Database is closed")
            # self.db IS STILL None WHEN THE CONNECTION COULD NOT BE OPENED
            if self.db is not None:
                self.db.close()

    def _load_functions(self):
        """
        TRY TO LOAD THE libsqlitefunctions EXTENSION; ON FAILURE WARN (ONCE)
        AND CONTINUE WITHOUT IT
        """
        global _load_extension_warning_sent
        library_loc = File.new_instance(sys.modules[__name__].__file__, "../..")
        full_path = File.new_instance(library_loc, "vendor/sqlite/libsqlitefunctions.so").abspath
        try:
            trace = extract_stack(0)[0]
            if self.upgrade:
                if os.name == 'nt':
                    file = File.new_instance(trace["file"], "../../vendor/sqlite/libsqlitefunctions.so")
                else:
                    file = File.new_instance(trace["file"], "../../vendor/sqlite/libsqlitefunctions")

                full_path = file.abspath
                self.db.enable_load_extension(True)
                self.db.execute(SQL_SELECT + "load_extension" + sql_iso(quote_value(full_path)))
        except Exception as e:
            if not _load_extension_warning_sent:
                _load_extension_warning_sent = True
                Log.warning("Could not load {{file}}, doing without. (no SQRT for you!)", file=full_path, cause=e)

    def create_new_functions(self):
        """
        REGISTER THE REGEXP(pattern, item) SQL FUNCTION, TRUE WHEN pattern IS
        FOUND ANYWHERE IN item
        """

        def regexp(pattern, item):
            reg = re.compile(pattern)
            return reg.search(item) is not None

        self.db.create_function("REGEXP", 2, regexp)
class Sqlite(DB):
    """
    Allows multi-threaded access
    Loads extension functions (like SQRT)

    All commands are placed on a queue and run, in order, by a single
    worker thread that owns the connection.
    """

    canonical = None

    def __init__(self, filename=None, db=None, upgrade=True):
        """
        :param filename: Optional, file to open (in-memory database when None)
        :param db:  Optional, wrap a sqlite db in a thread
        :param upgrade: Run the module-level upgrade (once) and try to load
                        the extension library
        :return: Multithread-safe database
        """
        if upgrade and not _upgraded:
            _upgrade()

        self.filename = filename
        self.db = db
        self.queue = Queue("sql commands")  # HOLD (command, result, signal) PAIRS
        self.get_trace = DEBUG
        self.upgrade = upgrade
        # START THE WORKER LAST: IT READS self.upgrade
        self.worker = Thread.run("sqlite db thread", self._worker)

    def _enhancements(self):
        """
        REGISTER THE regex() FUNCTION AND THE percentile() AGGREGATE ON THE
        CONNECTION
        """

        def regex(pattern, value):
            return 1 if re.match(pattern + "$", value) else 0

        # create_function() RETURNS None, SO REGISTER BOTH ON THE CONNECTION
        self.db.create_function("regex", 2, regex)

        class Percentile(object):
            # sqlite3 INSTANTIATES THE AGGREGATE WITH NO ARGUMENTS AND CALLS
            # step() WITH THE DECLARED NUMBER OF SQL ARGUMENTS (2)
            def __init__(self):
                self.percentile = None
                self.acc = []

            def step(self, value, rank):
                self.acc.append(value)
                self.percentile = rank

            def finalize(self):
                return percentile(self.acc, self.percentile)

        self.db.create_aggregate("percentile", 2, Percentile)

    def execute(self, command):
        """
        COMMANDS WILL BE EXECUTED IN THE ORDER THEY ARE GIVEN
        BUT CAN INTERLEAVE WITH OTHER TREAD COMMANDS
        :param command: COMMAND FOR SQLITE
        :return: None (THE QUERY RESULT WHEN DEBUG IS ON)
        """
        if DEBUG:  # EXECUTE IMMEDIATELY FOR BETTER STACK TRACE
            return self.query(command)

        if self.get_trace:
            trace = extract_stack(1)
        else:
            trace = None
        self.queue.add((command, None, None, trace))

    def query(self, command):
        """
        WILL BLOCK CALLING THREAD UNTIL THE command IS COMPLETED
        :param command: COMMAND FOR SQLITE
        :return: list OF RESULTS
        """
        if not self.worker:
            self.worker = Thread.run("sqlite db thread", self._worker)

        signal = Signal()
        result = Data()
        self.queue.add((command, result, signal, None))
        signal.wait()
        if result.exception:
            Log.error("Problem with Sqlite call", cause=result.exception)
        return result

    def _worker(self, please_stop):
        """
        THREAD BODY: OPEN THE CONNECTION, TRY TO LOAD THE EXTENSION LIBRARY,
        THEN RUN QUEUED COMMANDS UNTIL please_stop; COMMIT AND CLOSE ON EXIT

        :param please_stop: SIGNAL TO END THE LOOP
        """
        global _load_extension_warning_sent

        if DEBUG:
            Log.note("Sqlite version {{version}}", version=sqlite3.sqlite_version)
        if Sqlite.canonical:
            self.db = Sqlite.canonical
        else:
            self.db = sqlite3.connect(coalesce(self.filename, ':memory:'))

            library_loc = File.new_instance(sys.modules[__name__].__file__, "../..")
            full_path = File.new_instance(library_loc, "vendor/sqlite/libsqlitefunctions.so").abspath
            try:
                trace = extract_stack(0)[0]
                if self.upgrade:
                    if os.name == 'nt':
                        file = File.new_instance(trace["file"], "../../vendor/sqlite/libsqlitefunctions.so")
                    else:
                        file = File.new_instance(trace["file"], "../../vendor/sqlite/libsqlitefunctions")

                    full_path = file.abspath
                    self.db.enable_load_extension(True)
                    self.db.execute("SELECT load_extension(" + self.quote_value(full_path) + ")")
            except Exception as e:
                # BEST EFFORT: CONTINUE WITHOUT THE EXTENSION, WARN ONLY ONCE
                if not _load_extension_warning_sent:
                    _load_extension_warning_sent = True
                    Log.warning("Could not load {{file}}, doing without. (no SQRT for you!)", file=full_path, cause=e)

        try:
            while not please_stop:
                quad = self.queue.pop(till=please_stop)
                if quad is None:
                    # NOTHING TO UNPACK (presumably pop() GAVE UP BECAUSE OF till)
                    break
                command, result, signal, trace = quad

                if DEBUG_INSERT and command.strip().lower().startswith("insert"):
                    Log.note("Running command\n{{command|indent}}", command=command)
                if DEBUG and not command.strip().lower().startswith("insert"):
                    Log.note("Running command\n{{command|indent}}", command=command)
                with Timer("Run command", debug=DEBUG):
                    if signal is not None:
                        # query(): FILL result AND WAKE THE CALLER, EVEN ON FAILURE
                        try:
                            curr = self.db.execute(command)
                            self.db.commit()
                            result.meta.format = "table"
                            result.header = [d[0] for d in curr.description] if curr.description else None
                            result.data = curr.fetchall()
                            if DEBUG and result.data:
                                text = convert.table2csv(list(result.data))
                                Log.note("Result:\n{{data}}", data=text)
                        except Exception as e:
                            e = Except.wrap(e)
                            result.exception = Except(ERROR, "Problem with\n{{command|indent}}", command=command, cause=e)
                        finally:
                            signal.go()
                    else:
                        # execute(): NOBODY IS WAITING, SO ONLY WARN ON FAILURE
                        try:
                            self.db.execute(command)
                            self.db.commit()
                        except Exception as e:
                            e = Except.wrap(e)
                            e.cause = Except(
                                type=ERROR,
                                template="Bad call to Sqlite",
                                trace=trace
                            )
                            Log.warning("Failure to execute", cause=e)

        except Exception as e:
            if not please_stop:
                Log.error("Problem with sql thread", cause=e)
        finally:
            if DEBUG:
                Log.note("Database is closed")
            self.db.commit()
            self.db.close()

    def quote_column(self, column_name, table=None):
        """
        DELEGATE TO THE MODULE-LEVEL quote_column()
        """
        return quote_column(column_name, table)

    def quote_value(self, value):
        """
        DELEGATE TO THE MODULE-LEVEL quote_value()
        """
        return quote_value(value)
class ElasticsearchMetadata(Namespace):
    """
    MANAGE SNOWFLAKE SCHEMAS FOR EACH OF THE ALIASES FOUND IN THE CLUSTER

    KEEPS AN IN-MEMORY COLUMN LIST (self.meta.columns) AND TABLE LIST
    (self.meta.tables), AND RUNS A BACKGROUND THREAD (monitor OR not_monitor)
    THAT CONSUMES self.todo TO REFRESH PER-COLUMN STATISTICS
    (count, cardinality, multi, partitions)
    """

    @override
    def __new__(cls, kwargs, *args, **_kwargs):
        """
        RETURN THE INSTANCE ALREADY REGISTERED IN known_clusters FOR THIS CLUSTER
        OBJECT, OR MAKE (AND REGISTER) A NEW ONE

        THE REGISTRY KEY IS id(es_cluster), SO REUSE ONLY HAPPENS WHEN
        elasticsearch.Cluster(kwargs) HANDS BACK THE SAME OBJECT
        (NOTE(review): PRESUMABLY Cluster IS ITSELF CACHED PER SETTINGS - CONFIRM)
        """
        es_cluster = elasticsearch.Cluster(kwargs)
        output = known_clusters.get(id(es_cluster))
        if output is None:
            output = object.__new__(cls)
            known_clusters[id(es_cluster)] = output
        return output

    @override
    def __init__(self, host, index, sql_file='metadata.sqlite', alias=None, name=None, port=9200, kwargs=None):
        """
        :param host: NOT READ DIRECTLY IN THIS METHOD (PRESUMABLY USED THROUGH kwargs - CONFIRM)
        :param index: USED AS THE DEFAULT NAME WHEN name AND alias ARE MISSING
        :param sql_file: NOT READ IN THIS METHOD
        :param alias: USED AS THE DEFAULT NAME WHEN name IS MISSING
        :param name: PREFERRED DEFAULT NAME
        :param port: NOT READ DIRECTLY IN THIS METHOD
        :param kwargs: ALL SETTINGS; KEPT AS self.settings AND PASSED TO elasticsearch.Cluster
        """
        # __new__ MAY HAVE RETURNED AN ALREADY-INITIALIZED INSTANCE; DO NOT INITIALIZE TWICE
        if hasattr(self, "settings"):
            return

        self.too_old = TOO_OLD
        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)
        self.index_does_not_exist = set()  # INDEXES FOUND MISSING BY _update_cardinality
        self.todo = Queue("refresh metadata", max=100000, unique=True)  # COLUMNS WAITING FOR A STATISTICS REFRESH

        self.index_to_alias = Relation_usingList()

        self.es_metadata = Null
        self.metadata_last_updated = Date.now() - OLD_METADATA

        self.meta = Data()
        self.meta.columns = ColumnList()

        # THE TWO META TABLES ARE NOT NESTED: ONLY THE ROOT QUERY PATH
        self.alias_to_query_paths = {
            "meta.columns": [['.']],
            "meta.tables": [['.']]
        }
        self.alias_last_updated = {
            "meta.columns": Date.now(),
            "meta.tables": Date.now()
        }
        table_columns = metadata_tables()
        self.meta.tables = ListContainer(
            "meta.tables",
            [
                # TableDesc("meta.columns", None, ".", Date.now()),
                # TableDesc("meta.tables", None, ".", Date.now())
            ],
            jx_base.Schema(".", table_columns)
        )
        self.meta.columns.extend(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return

    @property
    def namespace(self):
        """
        :return: THE namespace OF THE COLUMN LIST
        """
        return self.meta.columns.namespace

    @property
    def url(self):
        """
        :return: CLUSTER URL EXTENDED WITH THE DEFAULT NAME, WHERE DOTS IN THE NAME BECOME PATH SEPARATORS
        """
        return self.es_cluster.url / self.default_name.replace(".", "/")

    def _reload_columns(self, table_desc):
        """
        RE-READ THE MAPPING FOR ONE TABLE, RECONCILE THE SCHEMAS OF ALL INDEXES
        BEHIND THE ALIAS, THEN PARSE THE PROPERTIES INTO COLUMNS

        :param table_desc: TABLE DESCRIPTOR; ITS name IS A REAL ALIAS (OR NAME OF INDEX THAT HAS NO ALIAS)
                           AND ITS timestamp IS SET TO THE CLUSTER'S METADATA TIME ON COMPLETION
        :return: None
        """
        # FIND ALL INDEXES OF ALIAS
        es_last_updated = self.es_cluster.metatdata_last_updated

        alias = table_desc.name
        canonical_index = self.es_cluster.get_best_matching_index(alias).index
        # FORCE A METADATA REFRESH UNLESS THE TABLE IS OLDER THAN THE CLUSTER METADATA WE ALREADY HAVE
        update_required = not (table_desc.timestamp < es_last_updated)
        metadata = self.es_cluster.get_metadata(force=update_required)

        indexes = self.index_to_alias.get_domain(alias)
        # ONE (index object, type name, properties) TRIPLE FOR EACH INDEX BEHIND THE ALIAS
        props = [
            (self.es_cluster.get_index(index=i, type=t, debug=DEBUG), t, m.properties)
            for i, d in metadata.indices.items()
            if i in indexes
            for t, m in [_get_best_type_from_mapping(d.mappings)]
        ]

        # CONFIRM ALL COLUMNS ARE SAME, FIX IF NOT
        dirty = False
        # COMPARE NEIGHBOURS IN BOTH DIRECTIONS SO PROPERTIES CAN BE COPIED EITHER WAY
        all_comparisions = list(jx.pairwise(props)) + list(jx.pairwise(jx.reverse(props)))
        # NOTICE THE SAME (index, type, properties) TRIPLE FROM ABOVE
        for (i1, t1, p1), (i2, t2, p2) in all_comparisions:
            diff = elasticsearch.diff_schema(p2, p1)
            if not self.settings.read_only:
                for d in diff:
                    dirty = True
                    i1.add_property(*d)
        # IF ANY INDEX WAS CHANGED, THE METADATA MUST BE FETCHED AGAIN
        meta = self.es_cluster.get_metadata(force=dirty).indices[canonical_index]

        data_type, mapping = _get_best_type_from_mapping(meta.mappings)
        # _id IS NOT IN THE MAPPING; ADD IT SO IT IS PARSED AS A COLUMN
        mapping.properties["_id"] = {"type": "string", "index": "not_analyzed"}
        self._parse_properties(alias, mapping, meta)
        table_desc.timestamp = es_last_updated

    def _parse_properties(self, alias, mapping, meta):
        """
        TURN A MAPPING INTO COLUMNS, RECORD THE NESTED QUERY PATHS FOR THE ALIAS
        (AND EACH OF ITS INDEXES), AND QUEUE EVERY COLUMN FOR A STATISTICS REFRESH

        :param alias: NAME THE COLUMNS ARE REGISTERED UNDER
        :param mapping: THE ES MAPPING; ONLY mapping.properties IS READ
        :param meta: NOT READ IN THIS METHOD
        """
        abs_columns = elasticsearch.parse_properties(alias, None, mapping.properties)
        if any(c.cardinality == 0 and c.names['.'] != '_id' for c in abs_columns):
            Log.warning(
                "Some columns are not stored {{names}}",
                names=[
                    ".".join((c.es_index, c.names['.']))
                    for c in abs_columns
                    if c.cardinality == 0
                ]
            )

        with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, silent=not DEBUG):
            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns if c.es_type == "nested"]
            # FOR EVERY PATH b THAT STARTS WITH PATH a, INSERT a INTO b'S LIST (ONCE)
            # AT THE FIRST POSITION HOLDING AN ENTRY NO LONGER THAN a
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(SELF_PATH)
            query_paths.append(ROOT_PATH)
            self.alias_to_query_paths[alias] = query_paths
            # THE INDEXES BEHIND THE ALIAS SHARE THE SAME LIST OBJECT
            for i in self.index_to_alias.get_domain(alias):
                self.alias_to_query_paths[i] = query_paths

            # ADD RELATIVE NAMES
            for abs_column in abs_columns:
                # NO last_updated MARKS THE COLUMN AS NEEDING A STATISTICS REFRESH
                abs_column.last_updated = None
                abs_column.jx_type = jx_type(abs_column)
                for query_path in query_paths:
                    abs_column.names[query_path[0]] = relative_field(abs_column.names["."], query_path[0])
                self.todo.add(self.meta.columns.add(abs_column))
        pass

    def query(self, _query):
        """
        QUERY THE COLUMN LIST

        :param _query: THE QUERY; ITS __data__() IS COMBINED (VIA set_default) WITH
                       A DEFAULT "from" OF meta.columns AND A DEFAULT "sort" OF table, name
        :return: RESULT OF self.meta.columns.query()
        """
        return self.meta.columns.query(QueryOp(set_default(
            {
                "from": self.meta.columns,
                "sort": ["table", "name"]
            },
            _query.__data__()
        )))

    def _find_alias(self, name):
        """
        :param name: AN ALIAS OR AN INDEX NAME
        :return: name IF IT IS A KNOWN ALIAS, OTHERWISE WHATEVER index_to_alias HOLDS FOR name
        """
        # NOTE: metadata_last_updated IS NOT ADVANCED HERE, SO THE ALIAS LIST IS
        # RE-READ FOR AS LONG AS THIS CONDITION HOLDS
        if self.metadata_last_updated < self.es_cluster.metatdata_last_updated:
            for a in self.es_cluster.get_aliases():
                self.index_to_alias[a.index] = coalesce(a.alias, a.index)
                self.alias_last_updated.setdefault(a.alias, Date.MIN)
        if name in self.alias_last_updated:
            return name
        else:
            return self.index_to_alias[name]

    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS

        :param table_name: TABLE NAME; ONLY THE FIRST STEP OF THE PATH IS USED TO FIND THE ALIAS
        :param column_name: OPTIONAL, PASSED TO self.meta.columns.find() TO LIMIT THE RESULT
        :param force: RELOAD THE COLUMNS EVEN IF THE TABLE LOOKS CURRENT
        :return: COLUMNS SORTED BY names.\\. ; BLOCKS (POLLING EVERY SECOND) WHILE THE
                 todo QUEUE IS NOT EMPTY AND SOME RETURNED COLUMN HAS NO last_updated
        """
        table_path = split_field(table_name)
        root_table_name = table_path[0]

        alias = self._find_alias(root_table_name)
        if not alias:
            # MAYBE THE INDEX IS NEW: REFRESH THE CLUSTER METADATA AND TRY ONCE MORE
            self.es_cluster.get_metadata(force=True)
            alias = self._find_alias(root_table_name)
            if not alias:
                Log.error("{{table|quote}} does not exist", table=table_name)

        try:
            last_update = MAX([
                self.es_cluster.index_last_updated[i]
                for i in self.index_to_alias.get_domain(alias)
            ])

            table = self.get_table(alias)[0]
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = TableDesc(
                    name=alias,
                    url=None,
                    query_path=['.'],
                    timestamp=Date.MIN
                )
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._reload_columns(table)
            elif force or table.timestamp < last_update:
                self._reload_columns(table)

            columns = self.meta.columns.find(alias, column_name)
            columns = jx.sort(columns, "names.\\.")
            # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
            while len(self.todo) and not all(columns.get("last_updated")):
                if DEBUG:
                    if len(columns) > 10:
                        Log.note("waiting for {{num}} columns to update", num=len([c for c in columns if not c.last_updated]))
                    else:
                        Log.note("waiting for columns to update {{columns|json}}", columns=[c.es_index+"."+c.es_column for c in columns if not c.last_updated])
                Till(seconds=1).wait()
            return columns
        except Exception as e:
            Log.error("Not expected", cause=e)

        # NOTE(review): ONLY REACHED IF Log.error() RETURNS; PRESUMABLY IT RAISES - CONFIRM
        return []

    def _update_cardinality(self, column):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN

        THE RESULT IS WRITTEN TO self.meta.columns; NOTHING IS RETURNED.
        PARTITIONS ARE ONLY RECORDED FOR COLUMNS WITH FEW DISTINCT VALUES,
        OTHERWISE THEY ARE CLEARED.

        :param column: THE COLUMN TO UPDATE (STRUCT-TYPED COLUMNS ARE REJECTED)
        """
        if column.es_index in self.index_does_not_exist:
            return

        if column.jx_type in STRUCT:
            Log.error("not supported")
        try:
            # THE TWO META TABLES LIVE IN MEMORY: COMPUTE THEIR STATISTICS LOCALLY
            if column.es_index == "meta.columns":
                partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.columns, column.es_column) if g[column.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.columns),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            if column.es_index == "meta.tables":
                partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.tables, column.es_column) if g[column.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.tables),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return

            # ONLY THE PART BEFORE THE FIRST DOT IS USED IN THE ES REQUEST PATH
            es_index = column.es_index.split(".")[0]

            # ANY COLUMN (IN ANY INDEX) WITH THE SAME es_column AND es_type "text" COUNTS
            is_text = [cc for cc in self.meta.columns if cc.es_column == column.es_column and cc.es_type == "text"]
            if is_text:
                # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": {"filter": {"match_all": {}}}
                    },
                    "size": 0
                })
                count = result.hits.total
                # 1001 IS JUST OVER THE 1000 LIMIT TESTED BELOW, SO NO PARTITIONS ARE COLLECTED
                cardinality = max(1001, count)
                multi = 1001
            elif column.es_column == "_id":
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "query": {"match_all": {}},
                    "size": 0
                })
                count = cardinality = result.hits.total
                multi = 1
            elif column.es_type == BOOLEAN:
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": _counting_query(column)
                    },
                    "size": 0
                })
                count = result.hits.total
                cardinality = 2
                multi = 1
            else:
                # "multi" IS THE LARGEST NUMBER OF VALUES ANY ONE DOCUMENT HOLDS FOR THIS COLUMN
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": _counting_query(column),
                        "multi": {"max": {"script": "doc[" + quote(column.es_column) + "].values.size()"}}
                    },
                    "size": 0
                })
                agg_results = result.aggregations
                count = result.hits.total
                cardinality = coalesce(agg_results.count.value, agg_results.count._nested.value, agg_results.count.doc_count)
                multi = int(coalesce(agg_results.multi.value, 1))
                if cardinality == None:
                    Log.error("logic error")

            query = Data(size=0)

            if column.es_column == "_id":
                self.meta.columns.update({
                    "set": {
                        "count": cardinality,
                        "cardinality": cardinality,
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
                # TOO MANY DISTINCT VALUES (OR VALUES ARE NEARLY UNIQUE): DO NOT COLLECT PARTITIONS
                # NOTE(review): cardinality / count IS INTEGER DIVISION ON PYTHON 2 UNLESS
                # THE FILE IMPORTS division FROM __future__ - CONFIRM
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                # NUMERIC COLUMNS GET A LOWER LIMIT BEFORE PARTITIONS ARE DROPPED
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif len(column.nested_path) != 1:
                # COLUMN IS INSIDE A NESTED DOCUMENT: THE TERMS AGGREGATION MUST BE WRAPPED
                query.aggs["_"] = {
                    "nested": {"path": column.nested_path[0]},
                    "aggs": {"_nested": {"terms": {"field": column.es_column}}}
                }
            elif cardinality == 0:
                # NO "size" IS SENT WHEN THE CARDINALITY IS ZERO
                query.aggs["_"] = {"terms": {"field": column.es_column}}
            else:
                query.aggs["_"] = {"terms": {"field": column.es_column, "size": cardinality}}

            # FEW ENOUGH DISTINCT VALUES: FETCH THEM ALL AS THE PARTITIONS
            result = self.es_cluster.post("/" + es_index + "/_search", data=query)

            aggs = result.aggregations._
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "partitions": parts,
                    "last_updated": Date.now()
                },
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
        except Exception as e:
            # CAN NOT IMPORT: THE TEST MODULES SETS UP LOGGING
            # from tests.test_jx import TEST_TABLE
            e = Except.wrap(e)
            TEST_TABLE = "testdata"
            is_missing_index = any(w in e for w in ["IndexMissingException", "index_not_found_exception"])
            is_test_table = column.es_index.startswith((TEST_TABLE_PREFIX, TEST_TABLE))
            if is_missing_index and is_test_table:
                # WE EXPECT TEST TABLES TO DISAPPEAR
                self.meta.columns.update({
                    "clear": ".",
                    "where": {"eq": {"es_index": column.es_index}}
                })
                self.index_does_not_exist.add(column.es_index)
            else:
                # STATISTICS ARE UNKNOWN: CLEAR THEM, BUT STAMP THE COLUMN SO IT IS NOT RETRIED IMMEDIATELY
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)

    def monitor(self, please_stop):
        """
        BACKGROUND LOOP (USED WHEN ENABLE_META_SCAN): REFRESH COLUMN STATISTICS

        WHEN THE todo QUEUE IS EMPTY, IT IS REFILLED WITH THE NON-STRUCT COLUMNS THAT
        HAVE NO last_updated, OR ONE OLDER THAN TOO_OLD; THEN ONE COLUMN IS POPPED
        AND UPDATED PER ITERATION

        :param please_stop: SIGNAL TO END THE LOOP
        """
        # WAKE UP THE BLOCKING pop() BELOW WHEN ASKED TO STOP
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    old_columns = [
                        c
                        for c in self.meta.columns
                        if (c.last_updated == None or c.last_updated < Date.now()-TOO_OLD) and c.jx_type not in STRUCT
                    ]
                    if old_columns:
                        DEBUG and Log.note(
                            "Old columns {{names|json}} last updated {{dates|json}}",
                            names=wrap(old_columns).es_column,
                            dates=[Date(t).format() for t in wrap(old_columns).last_updated]
                        )
                        self.todo.extend(old_columns)
                        # TEST CONSISTENCY
                        for c, d in product(list(self.todo.queue), list(self.todo.queue)):
                            if c.es_column == d.es_column and c.es_index == d.es_index and c != d:
                                Log.error("")
                    else:
                        DEBUG and Log.note("no more metatdata to update")

                column = self.todo.pop(Till(seconds=(10*MINUTE).seconds))
                if column:
                    if column is THREAD_STOP:
                        # THE while CONDITION WILL END THE LOOP
                        continue

                    with Timer("update {{table}}.{{column}}", param={"table": column.es_index, "column": column.es_column}, silent=not DEBUG):
                        if column.es_index in self.index_does_not_exist:
                            self.meta.columns.update({
                                "clear": ".",
                                "where": {"eq": {"es_index": column.es_index}}
                            })
                            continue
                        if column.jx_type in STRUCT or column.es_column.endswith("." + EXISTS_TYPE):
                            # NO STATISTICS FOR THESE; JUST MARK AS CURRENT
                            column.last_updated = Date.now()
                            continue
                        elif column.last_updated >= Date.now()-TOO_OLD:
                            # STILL FRESH
                            continue
                        try:
                            self._update_cardinality(column)
                            (DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX)) and Log.note("updated {{column.name}}", column=column)
                        except Exception as e:
                            # NOTE(review): "in e" ASSUMES e SUPPORTS CONTAINMENT (e.g. AN Except) - CONFIRM
                            if '"status":404' in e:
                                self.meta.columns.update({
                                    "clear": ".",
                                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                                })
                            else:
                                Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)

    def not_monitor(self, please_stop):
        """
        BACKGROUND LOOP (USED WHEN ENABLE_META_SCAN IS OFF): DRAIN THE todo QUEUE
        WITHOUT QUERYING ES; EACH STALE COLUMN HAS ITS STATISTICS CLEARED AND ITS
        last_updated SET TO NOW

        :param please_stop: SIGNAL TO END THE LOOP
        """
        Log.alert("metadata scan has been disabled")
        # WAKE UP THE BLOCKING pop() BELOW WHEN ASKED TO STOP
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            c = self.todo.pop()
            if c == THREAD_STOP:
                break

            if c.last_updated >= Date.now()-TOO_OLD:
                continue

            with Timer("Update {{col.es_index}}.{{col.es_column}}", param={"col": c}, silent=not DEBUG, too_long=0.05):
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })

    def get_table(self, name):
        """
        :param name: TABLE NAME
        :return: THE COLUMN LIST ITSELF FOR "meta.columns", OTHERWISE A (POSSIBLY EMPTY)
                 WRAPPED LIST OF THE TABLE DESCRIPTORS WITH THAT name
        """
        if name == "meta.columns":
            return self.meta.columns

            # return self.meta.columns
        with self.meta.tables.locker:
            return wrap([t for t in self.meta.tables.data if t.name == name])

    def get_snowflake(self, fact_table_name):
        """
        :param fact_table_name: NAME OF THE FACT TABLE
        :return: A Snowflake FOR THE FACT TABLE, BACKED BY THIS NAMESPACE
        """
        return Snowflake(fact_table_name, self)

    def get_schema(self, name):
        """
        :param name: TABLE NAME, OPTIONALLY FOLLOWED BY A NESTED PATH
        :return: THE SCHEMA; THE FIRST STEP OF name PICKS THE SNOWFLAKE, THE REST PICKS THE SCHEMA WITHIN IT
        """
        if name == "meta.columns":
            return self.meta.columns.schema
        query_path = split_field(name)
        root, rest = query_path[0], join_field(query_path[1:])
        return self.get_snowflake(root).get_schema(rest)
class Celery(object):
    """
    IN-PROCESS STAND-IN FOR A CELERY APPLICATION

    REQUESTS ARE SERIALIZED TO JSON AND PLACED ON request_queue, A Worker
    CONSUMES THEM, AND REPLIES COME BACK ON response_queue WHERE THE
    "response worker" THREAD MATCHES THEM (BY request.id) TO THE AsyncResult
    HELD IN self.responses
    """

    _fixups = None
    _pool = None

    def __init__(
        self,
        name,
        broker=None,
        include=None,
        **kwargs
    ):
        """
        :param name: APPLICATION NAME, ALSO USED TO NAME THE TWO QUEUES
        :param broker: STORED, NOT OTHERWISE USED HERE
        :param include: STORED, NOT OTHERWISE USED HERE
        :param kwargs: STORED, NOT OTHERWISE USED HERE
        """
        self.Task = MethodCaller
        self.name = name
        self.request_queue = Queue(name=name+" requests")
        self.response_queue = Queue(name=name+" responses")
        self.kwargs = kwargs
        self.include = include
        self.broker = broker
        self._config = {}
        self._tasks = {}

        # ALL STATE READ BY THE BACKGROUND THREAD MUST EXIST BEFORE THE THREAD STARTS
        self.responses = {}  # MAP FROM REQUEST id TO AsyncResult
        self.responses_lock = Lock()
        self.id_lock = Lock()
        self.next_id = 1

        self.on_init()
        self.response_worker = Thread.run("response worker", self._response_worker)
        self.worker = Worker(self.request_queue, self.response_queue, celery=self)

    def _response_worker(self, please_stop):
        """
        THREAD BODY: POP REPLIES OFF response_queue AND ATTACH THEM TO THE WAITING AsyncResult

        :param please_stop: SIGNAL TO END THE LOOP
        """
        while not please_stop:
            try:
                encoded_mail = self.response_queue.pop(till=please_stop)
                if encoded_mail is None:
                    # NOTHING WAS POPPED (e.g. please_stop FIRED); LET THE LOOP CONDITION DECIDE
                    continue
                mail = json2value(encoded_mail)
                Log.note("got response for {{id}}", id=mail.request.id)
                with self.responses_lock:
                    try:
                        async_response = self.responses[mail.request.id]
                        async_response.mail = set_default(mail, async_response.mail)
                        if mail.status in states.READY_STATES:
                            async_response._ready.go()
                    except Exception as e:
                        Log.warning("not expected", cause=e)
            except Exception as e:
                Log.warning("not expected", cause=e)

    def __enter__(self):
        return self

    def __exit__(self, *exc_info):
        self.close()

    def close(self):
        """
        STOP THE RESPONSE THREAD
        """
        self.response_worker.stop()

    def on_init(self):
        """Optional callback called at init."""
        pass

    def start(self, argv=None):
        pass

    @property
    def conf(self):
        """
        :return: THE (MUTABLE) CONFIGURATION DICT
        """
        return self._config

    def task(self, **opts):
        """
        DECORATOR FACTORY: REGISTER A FUNCTION AS A TASK

        THE DECORATED NAME BECOMES A SENDER: CALLING IT (OR ITS .delay) QUEUES A
        REQUEST AND RETURNS AN AsyncResult; .apply_async(args, kwargs) DOES THE
        SAME WITH EXPLICIT ARGUMENTS; .revoke DOES NOTHING

        :param opts: TASK OPTIONS; opts.name IS THE KEY THE FUNCTION IS REGISTERED UNDER
        :return: THE DECORATOR
        """
        opts = wrap(opts)

        def dec(fun):
            # GET THE PARAMETER NAMES FOR args
            # (__code__ EXISTS ON PYTHON 2.6+ AND PYTHON 3; func_code IS PYTHON 2 ONLY)
            code = fun.__code__
            arg_names = code.co_varnames[:code.co_argcount]
            if arg_names and arg_names[0] == 'self':
                arg_names = arg_names[1:]

            self._tasks[opts.name] = fun

            # NOT NAMED async: THAT IS A RESERVED WORD FROM PYTHON 3.7 ON
            def apply_async(args, kwargs=None, *_args, **_kwargs):
                # POSITIONAL ARGUMENTS ARE SENT BY NAME; EXPLICIT kwargs TAKE PRECEDENCE
                kwargs = set_default(kwargs, dict(zip(arg_names, args)))
                with self.id_lock:
                    request_id = self.next_id
                    self.next_id += 1

                mail = deepcopy(Data(
                    status=states.PENDING,
                    caller={
                        # "stack": extract_stack(1)
                    },
                    sender=set_default(_kwargs, opts),
                    message=kwargs,
                    request=set_default({"id": request_id})
                ))

                output = AsyncResult(request_id, mail=mail, app=self)
                # REGISTER BEFORE QUEUING, SO THE REPLY CAN NOT ARRIVE BEFORE THE AsyncResult IS KNOWN
                with self.responses_lock:
                    self.responses[request_id] = output
                self.request_queue.add(value2json(mail))
                Log.note("Added {{id}} ({{name}}) to request queue\n{{request}}", id=request_id, name=opts.name, request=mail)
                return output

            def send_message(*args, **kwargs):
                return apply_async(args, kwargs)

            def revoke(terminate=True, signal='SIGINT'):
                pass

            setattr(send_message, "delay", send_message)
            setattr(send_message, "revoke", revoke)
            setattr(send_message, "apply_async", apply_async)
            return send_message

        return dec

    def get_result(self, id):
        """
        BLOCK UNTIL THE REQUEST IS READY, THEN RETURN ITS RESULT

        :param id: THE REQUEST id GIVEN TO THE AsyncResult
        :return: response.mail.result
        :raises: VIA Log.error WHEN THE RESPONSE STATUS IS ONE OF states.EXCEPTION_STATES
        """
        with self.responses_lock:
            response = self.responses[id]
        # WAIT OUTSIDE THE LOCK: _response_worker NEEDS THE LOCK TO DELIVER THE REPLY
        response._ready.wait()
        # FORGET THE RESPONSE ONLY AFTER IT HAS ARRIVED; CLEARING THE ENTRY EARLIER
        # LEAVES _response_worker WITH NOTHING TO SIGNAL, AND THE WAIT ABOVE NEVER ENDS
        with self.responses_lock:
            self.responses[id] = None
        if response.status in states.EXCEPTION_STATES:
            Log.error("bad response", cause=response.mail.result)
        else:
            return response.mail.result
class Sqlite(DB):
    """
    Allows multi-threaded access
    Loads extension functions (like SQRT)

    ALL COMMANDS ARE PLACED ON self.queue AND EXECUTED, ONE AT A TIME, BY A
    SINGLE WORKER THREAD (_worker) THAT OWNS THE sqlite3 CONNECTION
    """

    @override
    def __init__(
        self,
        filename=None,
        db=None,
        get_trace=None,
        upgrade=False,
        load_functions=False,
        debug=False,
        kwargs=None,
    ):
        """
        :param filename: FILE TO USE FOR DATABASE
        :param db: AN EXISTING sqlite3 DB YOU WOULD LIKE TO USE (INSTEAD OF USING filename)
        :param get_trace: GET THE STACK TRACE AND THREAD FOR EVERY DB COMMAND (GOOD FOR DEBUGGING)
        :param upgrade: REPLACE PYTHON sqlite3 DLL WITH MORE RECENT ONE, WITH MORE FUNCTIONS (NOT WORKING)
        :param load_functions: LOAD EXTENDED MATH FUNCTIONS (MAY REQUIRE upgrade)
        :param debug: LOG COMMANDS AND RESULTS (COMBINED WITH THE MODULE-LEVEL DEBUG FLAG)
        :param kwargs: ALL THE SETTINGS, KEPT AS self.settings
        """
        global _upgraded
        global _sqlite3

        self.settings = kwargs
        if not _upgraded:
            # ONE-TIME, MODULE-WIDE SETUP: sqlite3 IS IMPORTED ONLY AFTER THE OPTIONAL UPGRADE
            if upgrade:
                _upgrade()
            _upgraded = True
            import sqlite3 as _sqlite3

            _ = _sqlite3

        self.filename = File(filename).abspath if filename else None
        if known_databases.get(self.filename):
            Log.error(
                "Not allowed to create more than one Sqlite instance for {{file}}",
                file=self.filename,
            )
        self.debug = debug | DEBUG

        # SETUP DATABASE
        self.debug and Log.note("Sqlite version {{version}}", version=_sqlite3.sqlite_version)
        try:
            if db == None:
                self.db = _sqlite3.connect(
                    database=coalesce(self.filename, ":memory:"),
                    check_same_thread=False,
                    isolation_level=None,
                )
            else:
                self.db = db
        except Exception as e:
            Log.error("could not open file {{filename}}", filename=self.filename, cause=e)
        self.upgrade = upgrade
        load_functions and self._load_functions()

        self.locker = Lock()
        self.available_transactions = []  # LIST OF ALL THE TRANSACTIONS BEING MANAGED
        self.queue = Queue("sql commands")  # HOLD (command, result, signal, stacktrace) TUPLES
        self.get_trace = coalesce(get_trace, TRACE)
        self.closed = False

        # WORKER VARIABLES
        self.transaction_stack = []  # THE TRANSACTION OBJECT WE HAVE PARTIALLY RUN
        self.last_command_item = None  # USE THIS TO HELP BLAME current_transaction FOR HANGING ON TOO LONG
        self.too_long = None
        self.delayed_queries = []
        self.delayed_transactions = []
        self.worker = Thread.run("sqlite db thread", self._worker)

        self.debug and Log.note(
            "Sqlite version {{version}}",
            version=self.query("select sqlite_version()").data[0][0],
        )

    def _enhancements(self):
        """
        REGISTER THE regex(pattern, value) FUNCTION AND THE
        percentile(value, percentile) AGGREGATE ON THE CONNECTION
        """
        def regex(pattern, value):
            return 1 if re.match(pattern + "$", value) else 0

        # create_function() RETURNS None, SO KEEP A REFERENCE TO THE CONNECTION ITSELF
        con = self.db
        con.create_function("regex", 2, regex)

        class Percentile(object):
            # sqlite3 INSTANTIATES THE AGGREGATE CLASS WITHOUT ARGUMENTS AND CALLS
            # step() WITH ONE PYTHON ARGUMENT PER SQL ARGUMENT (TWO, AS REGISTERED BELOW)
            def __init__(self):
                self.percentile = None
                self.acc = []

            def step(self, value, percentile):
                self.percentile = percentile
                self.acc.append(value)

            def finalize(self):
                # percentile HERE IS THE FUNCTION IN SCOPE FOR THIS FILE, NOT THE step() ARGUMENT
                return percentile(self.acc, self.percentile)

        con.create_aggregate("percentile", 2, Percentile)

    def transaction(self):
        """
        :return: A NEW Transaction FOR THE CURRENT THREAD; IF THE THREAD ALREADY HAS
                 TRANSACTIONS, THE MOST RECENTLY ADDED ONE BECOMES THE PARENT
        """
        thread = Thread.current()
        parent = None
        with self.locker:
            for t in self.available_transactions:
                if t.thread is thread:
                    parent = t

        output = Transaction(self, parent=parent, thread=thread)
        # EVERY OTHER ACCESS TO available_transactions HOLDS THE LOCK; DO THE SAME HERE
        with self.locker:
            self.available_transactions.append(output)
        return output

    def about(self, table_name):
        """
        :param table_name: TABLE OF INTEREST
        :return: SOME INFORMATION ABOUT THE TABLE
            (cid, name, dtype, notnull, dfft_value, pk) tuples
        """
        details = self.query("PRAGMA table_info" + sql_iso(quote_column(table_name)))
        return details.data

    def query(self, command):
        """
        WILL BLOCK CALLING THREAD UNTIL THE command IS COMPLETED
        :param command: COMMAND FOR SQLITE
        :return: list OF RESULTS
        """
        if self.closed:
            Log.error("database is closed")

        # THE LOCK IS USED AS A ONE-SHOT SIGNAL: ACQUIRED HERE, RELEASED BY THE WORKER WHEN DONE
        signal = _allocate_lock()
        signal.acquire()
        result = Data()
        trace = get_stacktrace(1) if self.get_trace else None

        if self.get_trace:
            # A TRANSACTIONLESS QUERY FROM A THREAD THAT HOLDS A TRANSACTION WOULD BE
            # DELAYED UNTIL THAT TRANSACTION ENDS, SO IT IS REPORTED AS AN ERROR
            current_thread = Thread.current()
            with self.locker:
                for t in self.available_transactions:
                    if t.thread is current_thread:
                        Log.error(DOUBLE_TRANSACTION_ERROR)

        self.queue.add(CommandItem(command, result, signal, trace, None))
        signal.acquire()  # BLOCKS UNTIL THE WORKER RELEASES

        if result.exception:
            Log.error("Problem with Sqlite call", cause=result.exception)
        return result

    def close(self):
        """
        OPTIONAL COMMIT-AND-CLOSE
        QUEUES A FINAL COMMIT, WAITS FOR THE WORKER TO HANDLE IT, THEN ASKS THE
        WORKER THREAD TO STOP (THE WORKER CLOSES THE CONNECTION ON ITS WAY OUT)
        :return:
        """
        self.closed = True
        signal = _allocate_lock()
        signal.acquire()
        # NOTE(review): THIS COMMIT CARRIES NO result AND NO transaction, WHILE
        # _close_transaction() DEREFERENCES BOTH; IT LOOKS LIKE THE WORKER ONLY LOGS
        # A WARNING AND RELEASES THE SIGNAL - CONFIRM THIS IS INTENDED
        self.queue.add(CommandItem(COMMIT, None, signal, None, None))
        signal.acquire()
        self.worker.please_stop.go()
        return

    def __enter__(self):
        # RETURN THE INSTANCE SO `with Sqlite(...) as db:` BINDS SOMETHING USEFUL
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def _load_functions(self):
        """
        TRY TO LOAD THE libsqlitefunctions EXTENSION (ONLY ATTEMPTED WHEN self.upgrade);
        FAILURE IS REPORTED ONCE PER PROCESS AS A WARNING, NEVER RAISED
        """
        global _load_extension_warning_sent
        library_loc = File.new_instance(sys.modules[__name__].__file__, "../..")
        full_path = File.new_instance(
            library_loc, "vendor/sqlite/libsqlitefunctions.so"
        ).abspath
        try:
            trace = get_stacktrace(0)[0]
            if self.upgrade:
                if os.name == "nt":
                    file = File.new_instance(
                        trace["file"], "../../vendor/sqlite/libsqlitefunctions.so"
                    )
                else:
                    file = File.new_instance(
                        trace["file"], "../../vendor/sqlite/libsqlitefunctions"
                    )

                full_path = file.abspath
                self.db.enable_load_extension(True)
                self.db.execute(
                    text(SQL_SELECT + "load_extension" + sql_iso(quote_value(full_path)))
                )
        except Exception as e:
            if not _load_extension_warning_sent:
                _load_extension_warning_sent = True
                Log.warning(
                    "Could not load {{file}}, doing without. (no SQRT for you!)",
                    file=full_path,
                    cause=e,
                )

    def create_new_functions(self):
        """
        REGISTER REGEXP(pattern, item), WHICH IS TRUE WHEN pattern IS FOUND ANYWHERE IN item
        """
        def regexp(pattern, item):
            reg = re.compile(pattern)
            return reg.search(item) is not None

        self.db.create_function("REGEXP", 2, regexp)

    def show_transactions_blocked_warning(self):
        """
        LOG WHO IS BLOCKING WHOM; REGISTERED ON THE too_long TIMER
        """
        blocker = self.last_command_item
        waiting = self.delayed_queries + self.delayed_transactions
        if blocker is None or not waiting:
            # THE BLOCKAGE CLEARED (OR NOTHING HAS RUN YET); NOTHING TO REPORT
            return
        blocked = waiting[0]

        Log.warning(
            "Query on thread {{blocked_thread|json}} at\n"
            "{{blocked_trace|indent}}"
            "is blocked by {{blocker_thread|json}} at\n"
            "{{blocker_trace|indent}}"
            "this message brought to you by....",
            blocker_trace=format_trace(blocker.trace),
            blocked_trace=format_trace(blocked.trace),
            blocker_thread=blocker.transaction.thread.name
            if blocker.transaction is not None
            else None,
            blocked_thread=blocked.transaction.thread.name
            if blocked.transaction is not None
            else None,
        )

    def _close_transaction(self, command_item):
        """
        END THE TRANSACTION ON TOP OF THE STACK; THE COMMIT/ROLLBACK IS ONLY SENT TO
        SQLITE WHEN THE STACK BECOMES EMPTY; THEN REQUEUE EVERYTHING THAT WAS DELAYED

        :param command_item: (query, result, signal, trace, transaction) WHERE query IS COMMIT OR ROLLBACK
        """
        query, result, signal, trace, transaction = command_item

        transaction.end_of_life = True
        with self.locker:
            self.available_transactions.remove(transaction)
            assert transaction not in self.available_transactions

            old_length = len(self.transaction_stack)
            old_trans = self.transaction_stack[-1]
            del self.transaction_stack[-1]

            assert old_length - 1 == len(self.transaction_stack)
            assert old_trans
            assert old_trans not in self.transaction_stack
        if not self.transaction_stack:
            # NESTED TRANSACTIONS NOT ALLOWED IN sqlite3
            self.debug and Log.note(FORMAT_COMMAND, command=query)
            self.db.execute(query)

        has_been_too_long = False
        with self.locker:
            if self.too_long is not None:
                self.too_long, too_long = None, self.too_long
                # WE ARE CHEATING HERE: WE REACH INTO THE Signal MEMBERS AND REMOVE WHAT WE ADDED TO THE INTERNAL job_queue
                with too_long.lock:
                    has_been_too_long = bool(too_long)
                    too_long.job_queue = None

            # PUT delayed BACK ON THE QUEUE, IN THE ORDER FOUND, BUT WITH QUERIES FIRST
            if self.delayed_transactions:
                for c in reversed(self.delayed_transactions):
                    self.queue.push(c)
                del self.delayed_transactions[:]
            if self.delayed_queries:
                for c in reversed(self.delayed_queries):
                    self.queue.push(c)
                del self.delayed_queries[:]
        if has_been_too_long:
            Log.note("Transaction blockage cleared")

    def _worker(self, please_stop):
        """
        THREAD BODY: RUN QUEUED COMMANDS ONE AT A TIME; CLOSE THE CONNECTION ON EXIT

        :param please_stop: SIGNAL TO END THE LOOP
        """
        try:
            # MAIN EXECUTION LOOP
            while not please_stop:
                command_item = self.queue.pop(till=please_stop)
                if command_item is None:
                    break
                try:
                    self._process_command_item(command_item)
                except Exception as e:
                    Log.warning("worker can not execute command", cause=e)
        except Exception as e:
            e = Except.wrap(e)
            if not please_stop:
                Log.warning("Problem with sql", cause=e)
        finally:
            self.closed = True
            self.debug and Log.note("Database is closed")
            self.db.close()

    def _process_command_item(self, command_item):
        """
        EXECUTE ONE COMMAND, OR DELAY IT WHEN ANOTHER TRANSACTION IS IN PROGRESS

        ERRORS ARE RECORDED ON result.exception (AND ON THE TRANSACTION); THE
        signal IS RELEASED WHEN THE COMMAND HAS BEEN HANDLED, BUT NOT WHEN IT IS
        DELAYED (IT WILL COME THROUGH HERE AGAIN)

        :param command_item: (query, result, signal, trace, transaction)
        """
        query, result, signal, trace, transaction = command_item

        with Timer("SQL Timing", verbose=self.debug):
            if transaction is None:
                # THIS IS A TRANSACTIONLESS QUERY, DELAY IT IF THERE IS A CURRENT TRANSACTION
                if self.transaction_stack:
                    with self.locker:
                        if self.too_long is None:
                            self.too_long = Till(seconds=TOO_LONG_TO_HOLD_TRANSACTION)
                            self.too_long.then(self.show_transactions_blocked_warning)
                        self.delayed_queries.append(command_item)
                    return
            elif self.transaction_stack and self.transaction_stack[-1] not in [
                transaction,
                transaction.parent,
            ]:
                # THIS TRANSACTION IS NOT THE CURRENT TRANSACTION, DELAY IT
                with self.locker:
                    if self.too_long is None:
                        self.too_long = Till(seconds=TOO_LONG_TO_HOLD_TRANSACTION)
                        self.too_long.then(self.show_transactions_blocked_warning)
                    self.delayed_transactions.append(command_item)
                return
            else:
                # ENSURE THE CURRENT TRANSACTION IS UP TO DATE FOR THIS query
                if not self.transaction_stack:
                    # sqlite3 ALLOWS ONLY ONE TRANSACTION AT A TIME
                    self.debug and Log.note(FORMAT_COMMAND, command=BEGIN)
                    self.db.execute(BEGIN)
                    self.transaction_stack.append(transaction)
                elif transaction is not self.transaction_stack[-1]:
                    # A CHILD OF THE CURRENT TRANSACTION
                    self.transaction_stack.append(transaction)
                elif transaction.exception and query is not ROLLBACK:
                    result.exception = Except(
                        context=ERROR,
                        template="Not allowed to continue using a transaction that failed",
                        cause=transaction.exception,
                        trace=trace,
                    )
                    signal.release()
                    return

                try:
                    transaction.do_all()
                except Exception as e:
                    # DEAL WITH ERRORS IN QUEUED COMMANDS
                    # WE WILL UNWRAP THE OUTER EXCEPTION TO GET THE CAUSE
                    err = Except(
                        context=ERROR,
                        template="Bad call to Sqlite3 while " + FORMAT_COMMAND,
                        params={"command": e.params.current.command},
                        cause=e.cause,
                        trace=e.params.current.trace,
                    )
                    transaction.exception = result.exception = err

                    if query in [COMMIT, ROLLBACK]:
                        # THE TRANSACTION FAILED: A REQUESTED COMMIT BECOMES A ROLLBACK
                        self._close_transaction(
                            CommandItem(ROLLBACK, result, signal, trace, transaction)
                        )

                    signal.release()
                    return

            try:
                # DEAL WITH END-OF-TRANSACTION MESSAGES
                if query in [COMMIT, ROLLBACK]:
                    self._close_transaction(command_item)
                    return

                # EXECUTE QUERY
                self.last_command_item = command_item
                self.debug and Log.note(FORMAT_COMMAND, command=query)
                curr = self.db.execute(text(query))
                result.meta.format = "table"
                result.header = (
                    [d[0] for d in curr.description] if curr.description else None
                )
                result.data = curr.fetchall()
                if self.debug and result.data:
                    csv = convert.table2csv(list(result.data))
                    Log.note("Result:\n{{data|limit(100)|indent}}", data=csv)
            except Exception as e:
                e = Except.wrap(e)
                err = Except(
                    context=ERROR,
                    template="Bad call to Sqlite while " + FORMAT_COMMAND,
                    params={"command": query},
                    trace=trace,
                    cause=e,
                )
                result.exception = err
                if transaction:
                    transaction.exception = err
            finally:
                signal.release()
class Clogger:
    """
    Keeps the `csetLog` table (revnum, revision, timestamp) filled with
    changesets read from the hg json-log, and runs background workers for
    tip filling, backfilling, maintenance and deletion.
    """

    # NOTE(review): this block was reconstructed from a whitespace-flattened
    # copy of the source.  Places where the nesting could not be determined
    # from the tokens alone carry a NOTE(review) comment - confirm them
    # against the original file.

    # Singleton of the look-ahead scanner Clogger
    SINGLE_CLOGGER = None

    def __new__(cls, *args, **kwargs):
        """Create the shared instance on first use; return it ever after."""
        if cls.SINGLE_CLOGGER is None:
            cls.SINGLE_CLOGGER = object.__new__(cls)
        return cls.SINGLE_CLOGGER

    def __init__(self, conn=None, tuid_service=None, start_workers=True, new_table=False, kwargs=None):
        """
        :param conn: existing database connection; when falsy a new `sql.Sql`
                     is opened on `kwargs.database.name`
        :param tuid_service: existing TUID service; when falsy one is created
        :param start_workers: start the four worker threads when done
        :param new_table: drop any existing csetLog table first
        :param kwargs: configuration object

        Any exception raised during setup is logged as a warning, not raised.
        """
        try:
            self.config = kwargs
            self.conn = conn if conn else sql.Sql(self.config.database.name)
            self.hg_cache = HgMozillaOrg(kwargs=self.config.hg_cache, use_cache=True) if self.config.hg_cache else Null

            self.tuid_service = tuid_service if tuid_service else tuid.service.TUIDService(
                kwargs=self.config.tuid, conn=self.conn, clogger=self
            )
            self.rev_locker = Lock()
            self.working_locker = Lock()

            if new_table:
                with self.conn.transaction() as t:
                    t.execute("DROP TABLE IF EXISTS csetLog")

            self.init_db()
            self.next_revnum = coalesce(self.conn.get_one("SELECT max(revnum)+1 FROM csetLog")[0], 1)
            self.csets_todo_backwards = Queue(name="Clogger.csets_todo_backwards")
            self.deletions_todo = Queue(name="Clogger.deletions_todo")
            self.maintenance_signal = Signal(name="Clogger.maintenance_signal")

            # From here on, only the `tuid` sub-section of the config is used
            if 'tuid' in self.config:
                self.config = self.config.tuid

            self.disable_backfilling = False
            self.disable_tipfilling = False
            self.disable_deletion = False
            self.disable_maintenance = False

            self.backfill_thread = None
            self.tipfill_thread = None
            self.deletion_thread = None
            self.maintenance_thread = None

            # Make sure we are filled before allowing queries
            numrevs = self.conn.get_one("SELECT count(revnum) FROM csetLog")[0]
            if numrevs < MINIMUM_PERMANENT_CSETS:
                Log.note("Filling in csets to hold {{minim}} csets.", minim=MINIMUM_PERMANENT_CSETS)
                oldest_rev = 'tip'
                with self.conn.transaction() as t:
                    tmp = t.query("SELECT min(revnum), revision FROM csetLog").data[0][1]
                    if tmp:
                        oldest_rev = tmp
                self._fill_in_range(
                    MINIMUM_PERMANENT_CSETS - numrevs,
                    oldest_rev,
                    timestamp=False
                )

            Log.note(
                "Table is filled with atleast {{minim}} entries.",
                minim=MINIMUM_PERMANENT_CSETS
            )

            if start_workers:
                self.start_workers()
        except Exception as e:
            Log.warning("Cannot setup clogger: {{cause}}", cause=str(e))

    def start_backfilling(self):
        """Start the backfill worker thread, unless one was already started."""
        if not self.backfill_thread:
            self.backfill_thread = Thread.run('clogger-backfill', self.fill_backward_with_list)

    def start_tipfillling(self):
        """Start the tip-filling worker thread, unless one was already started."""
        if not self.tipfill_thread:
            self.tipfill_thread = Thread.run('clogger-tip', self.fill_forward_continuous)

    def start_maintenance(self):
        """Start the maintenance worker thread, unless one was already started."""
        if not self.maintenance_thread:
            self.maintenance_thread = Thread.run('clogger-maintenance', self.csetLog_maintenance)

    def start_deleter(self):
        """Start the deletion worker thread, unless one was already started."""
        if not self.deletion_thread:
            self.deletion_thread = Thread.run('clogger-deleter', self.csetLog_deleter)

    def start_workers(self):
        """Start all four worker threads."""
        self.start_tipfillling()
        self.start_backfilling()
        self.start_maintenance()
        self.start_deleter()
        Log.note("Started clogger workers.")

    def init_db(self):
        """Create the csetLog table if it does not exist yet."""
        with self.conn.transaction() as t:
            t.execute('''
            CREATE TABLE IF NOT EXISTS csetLog (
                revnum         INTEGER PRIMARY KEY,
                revision       CHAR(12) NOT NULL,
                timestamp      INTEGER
            );''')

    def disable_all(self):
        """Set every `disable_*` flag, so each worker skips its work."""
        self.disable_tipfilling = True
        self.disable_backfilling = True
        self.disable_maintenance = True
        self.disable_deletion = True

    def revnum(self):
        """
        :return: max revnum that was added (0 when the table is empty)
        """
        return coalesce(self.conn.get_one("SELECT max(revnum) as revnum FROM csetLog")[0], 0)

    def get_tip(self, transaction):
        """
        :return: (revnum, revision) row selected with max(revnum)
        """
        # NOTE(review): `revision` is a bare column next to an aggregate;
        # this relies on SQLite returning it from the max() row - confirm.
        return transaction.get_one(
            "SELECT max(revnum) as revnum, revision FROM csetLog"
        )

    def get_tail(self, transaction):
        """
        :return: (revnum, revision) row selected with min(revnum)
        """
        return transaction.get_one(
            "SELECT min(revnum) as revnum, revision FROM csetLog"
        )

    def _get_clog(self, clog_url):
        """
        Fetch one page of the changeset log as JSON.

        :param clog_url: full json-log url
        :return: the decoded json-log object
        """
        try:
            Log.note("Searching through changelog {{url}}", url=clog_url)
            clog_obj = http.get_json(clog_url, retry=RETRY)
            return clog_obj
        except Exception as e:
            Log.error(
                "Unexpected error getting changset-log for {{url}}: {{error}}",
                url=clog_url,
                error=e
            )

    def _get_one_revision(self, transaction, cset_entry):
        """
        :param cset_entry: (revnum, revision, timestamp) triple
        :return: the csetLog row for that revision, if it exists
        """
        _, rev, _ = cset_entry
        return transaction.get_one("SELECT revision FROM csetLog WHERE revision=?", (rev,))

    def _get_one_revnum(self, transaction, rev):
        """
        :return: the row holding the revnum of `rev`, if it exists
        """
        return transaction.get_one("SELECT revnum FROM csetLog WHERE revision=?", (rev,))

    def _get_revnum_range(self, transaction, revnum1, revnum2):
        """
        :return: (revnum, revision) rows between the two revnums, inclusive;
                 the arguments may be given in either order
        """
        high_num = max(revnum1, revnum2)
        low_num = min(revnum1, revnum2)

        return transaction.query(
            "SELECT revnum, revision FROM csetLog WHERE "
            "revnum >= " + str(low_num) + " AND revnum <= " + str(high_num)
        ).data

    def recompute_table_revnums(self):
        '''
        Recomputes the revnums for the csetLog table
        by creating a new table, and copying csetLog to
        it. The INTEGER PRIMARY KEY in the temp table auto increments
        as rows are added.

        IMPORTANT: Only call this after acquiring the
        lock `self.working_locker`.
        :return:
        '''
        with self.conn.transaction() as t:
            t.execute('''
            CREATE TABLE temp (
                revnum         INTEGER PRIMARY KEY,
                revision       CHAR(12) NOT NULL,
                timestamp      INTEGER
            );''')

            t.execute(
                "INSERT INTO temp (revision, timestamp) "
                "SELECT revision, timestamp FROM csetlog ORDER BY revnum ASC"
            )

            t.execute("DROP TABLE csetLog;")
            t.execute("ALTER TABLE temp RENAME TO csetLog;")

    def check_for_maintenance(self):
        '''
        Returns True if the maintenance worker should be run now,
        and False otherwise.
        :return:
        '''
        numrevs = self.conn.get_one("SELECT count(revnum) FROM csetLog")[0]
        Log.note("Number of csets in csetLog table: {{num}}", num=numrevs)
        if numrevs >= SIGNAL_MAINTENANCE_CSETS:
            return True
        return False

    def add_cset_entries(self, ordered_rev_list, timestamp=False, number_forward=True):
        '''
        Adds a list of revisions to the table. Assumes ordered_rev_list is an
        ordered based on how changesets are found in the changelog. Going
        forwards or backwards is dealt with by flipping the list.

        :param ordered_rev_list: Order given from changeset log searching.
        :param timestamp: If True, entries are stamped with the current time;
                          otherwise they get -1 (no timestamp).
        :param number_forward: If True, revisions are numbered upward from
                               max(revnum) after reversing the list; otherwise
                               they are numbered downward from min(revnum).
                               The table is renumbered afterwards either way.
        :return:
        '''
        with self.conn.transaction() as t:
            current_min = t.get_one("SELECT min(revnum) FROM csetlog")[0]
            current_max = t.get_one("SELECT max(revnum) FROM csetlog")[0]
            if not current_min or not current_max:
                current_min = 0
                current_max = 0

            direction = -1
            start = current_min - 1
            if number_forward:
                direction = 1
                start = current_max + 1
                ordered_rev_list = ordered_rev_list[::-1]

            insert_list = [
                (
                    start + direction * count,
                    rev,
                    int(time.time()) if timestamp else -1
                )
                for count, rev in enumerate(ordered_rev_list)
            ]

            # In case of overlapping requests, skip revisions already stored
            fmt_insert_list = [
                cset_entry
                for cset_entry in insert_list
                if not self._get_one_revision(t, cset_entry)
            ]

            for _, tmp_insert_list in jx.groupby(fmt_insert_list, size=SQL_CSET_BATCH_SIZE):
                t.execute(
                    "INSERT INTO csetLog (revnum, revision, timestamp)" +
                    " VALUES " +
                    sql_list(
                        quote_set((revnum, revision, timestamp))
                        for revnum, revision, timestamp in tmp_insert_list
                    )
                )

            # Move the revision numbers forward if needed
            # NOTE(review): nesting inside the insert transaction was
            # reconstructed from flattened source - confirm.
            self.recompute_table_revnums()

        # Start a maintenance run if needed
        if self.check_for_maintenance():
            Log.note("Scheduling maintenance run on clogger.")
            self.maintenance_signal.go()

    def _fill_in_range(self, parent_cset, child_cset, timestamp=False, number_forward=True):
        '''
        Fills cset logs in a certain range, walking the json-log pages
        starting at `child_cset`.

        'parent_cset' can be an int, and in that case we collect that many
        changesets. Otherwise, we continue until we find the given
        'parent_cset'. When `number_forward` is False the first changeset of
        the first log page is skipped (it is assumed to exist already) and
        the entry that ends the search is included.

        :param parent_cset: revision to stop at, or number of csets to get
        :param child_cset: revision to start searching from
        :param timestamp: passed through to `add_cset_entries`
        :param number_forward: passed through to `add_cset_entries`
        :return: list of revisions added, or None if the request was too
                 large or its end was not found
        '''
        csets_to_add = []
        found_parent = False
        find_parent = False
        if type(parent_cset) != int:
            find_parent = True
        elif parent_cset >= MAX_BACKFILL_CLOGS * CHANGESETS_PER_CLOG:
            Log.warning(
                "Requested number of new changesets {{num}} is too high. "
                "Max number that can be requested is {{maxnum}}.",
                num=parent_cset,
                maxnum=MAX_BACKFILL_CLOGS * CHANGESETS_PER_CLOG
            )
            return None

        csets_found = 0
        clogs_seen = 0
        final_rev = child_cset
        while not found_parent and clogs_seen < MAX_BACKFILL_CLOGS:
            clog_url = str(HG_URL) + "/" + self.config.hg.branch + "/json-log/" + final_rev
            clog_obj = self._get_clog(clog_url)
            clog_csets_list = list(clog_obj['changesets'])
            # The last entry of a page is used only as the next page's start
            for clog_cset in clog_csets_list[:-1]:
                if not number_forward and csets_found <= 0:
                    # Skip this entry it already exists
                    csets_found += 1
                    continue

                nodes_cset = clog_cset['node'][:12]
                if find_parent:
                    if nodes_cset == parent_cset:
                        found_parent = True
                        if not number_forward:
                            # When going forward this entry is
                            # the given parent
                            csets_to_add.append(nodes_cset)
                        break
                else:
                    if csets_found + 1 > parent_cset:
                        found_parent = True
                        if not number_forward:
                            # When going forward this entry is
                            # the given parent (which is supposed
                            # to already exist)
                            csets_to_add.append(nodes_cset)
                        break
                    csets_found += 1
                csets_to_add.append(nodes_cset)
            if found_parent:
                break

            clogs_seen += 1
            final_rev = clog_csets_list[-1]['node'][:12]

        if found_parent:
            self.add_cset_entries(csets_to_add, timestamp=timestamp, number_forward=number_forward)
        else:
            Log.warning(
                "Couldn't find the end of the request for {{request}}. "
                "Max number that can be requested through _fill_in_range is {{maxnum}}.",
                request={
                    'parent_cset': parent_cset,
                    'child_cset':child_cset,
                    'number_forward': number_forward
                },
                maxnum=MAX_BACKFILL_CLOGS * CHANGESETS_PER_CLOG
            )
            return None
        return csets_to_add

    def initialize_to_range(self, old_rev, new_rev, delete_old=True):
        '''
        Used in service testing to get to very old
        changesets quickly.

        :param old_rev: The oldest revision to keep
        :param new_rev: The revision to start searching from
        :param delete_old: If True, empty the csetLog table first
        :return:
        '''
        old_settings = [
            self.disable_tipfilling,
            self.disable_backfilling,
            self.disable_maintenance,
            self.disable_deletion
        ]
        self.disable_tipfilling = True
        self.disable_backfilling = True
        self.disable_maintenance = True
        self.disable_deletion = True

        try:
            old_rev = old_rev[:12]
            new_rev = new_rev[:12]

            with self.working_locker:
                if delete_old:
                    with self.conn.transaction() as t:
                        t.execute("DELETE FROM csetLog")
                # NOTE(review): the insert is assumed to run whether or not
                # `delete_old` is set - confirm against the original.
                with self.conn.transaction() as t:
                    t.execute(
                        "INSERT INTO csetLog (revision, timestamp) VALUES " +
                        quote_set((new_rev, -1))
                    )
                self._fill_in_range(old_rev, new_rev, timestamp=True, number_forward=False)
        finally:
            # Restore the worker flags even if the fill failed, so the
            # workers are not left disabled by an exception
            self.disable_tipfilling = old_settings[0]
            self.disable_backfilling = old_settings[1]
            self.disable_maintenance = old_settings[2]
            self.disable_deletion = old_settings[3]

    def fill_backward_with_list(self, please_stop=None):
        '''
        Expects requests of the tuple form: (parent_cset, timestamp)
        parent_cset can be an int X to go back by X changesets, or
        a string to search for going backwards in time. If timestamp
        is false, no timestamps will be added to the entries.
        :param please_stop:
        :return:
        '''
        while not please_stop:
            try:
                request = self.csets_todo_backwards.pop(till=please_stop)
                if please_stop:
                    break

                # If backfilling is disabled, all requests
                # are ignored.
                if self.disable_backfilling:
                    # `seconds=` waits for a duration; the previous `till=`
                    # treated the duration as an absolute timestamp, which
                    # is already in the past, so the wait never happened.
                    Till(seconds=CSET_BACKFILL_WAIT_TIME).wait()
                    continue

                if request:
                    parent_cset, timestamp = request
                else:
                    continue

                with self.working_locker:
                    with self.conn.transaction() as t:
                        parent_revnum = self._get_one_revnum(t, parent_cset)
                    if parent_revnum:
                        # Already in the table, nothing to backfill
                        continue

                    with self.conn.transaction() as t:
                        _, oldest_revision = self.get_tail(t)

                    self._fill_in_range(
                        parent_cset,
                        oldest_revision,
                        timestamp=timestamp,
                        number_forward=False
                    )
                Log.note("Finished {{cset}}", cset=parent_cset)
            except Exception as e:
                Log.warning("Unknown error occurred during backfill: ", cause=e)

    def update_tip(self):
        '''
        Returns False if the tip is already at the newest, or True
        if an update has taken place.
        :return:
        '''
        clog_obj = self._get_clog(
            str(HG_URL) + "/" + self.config.hg.branch + "/json-log/tip"
        )

        # Get current tip in DB
        with self.conn.transaction() as t:
            _, newest_known_rev = self.get_tip(t)

        # If we are still at the newest, wait for CSET_TIP_WAIT_TIME seconds
        # before checking again.
        first_clog_entry = clog_obj['changesets'][0]['node'][:12]
        if newest_known_rev == first_clog_entry:
            return False

        csets_to_gather = None
        if not newest_known_rev:
            Log.note(
                "No revisions found in table, adding {{minim}} entries...",
                minim=MINIMUM_PERMANENT_CSETS
            )
            csets_to_gather = MINIMUM_PERMANENT_CSETS

        found_newest_known = False
        csets_to_add = []
        csets_found = 0
        clogs_seen = 0
        Log.note("Found new revisions. Updating csetLog tip to {{rev}}...", rev=first_clog_entry)
        while not found_newest_known and clogs_seen < MAX_TIPFILL_CLOGS:
            clog_csets_list = list(clog_obj['changesets'])
            # The last entry of a page is used only as the next page's start
            for clog_cset in clog_csets_list[:-1]:
                nodes_cset = clog_cset['node'][:12]
                if not csets_to_gather:
                    if nodes_cset == newest_known_rev:
                        found_newest_known = True
                        break
                else:
                    if csets_found >= csets_to_gather:
                        found_newest_known = True
                        break
                csets_found += 1
                csets_to_add.append(nodes_cset)
            if not found_newest_known:
                # Get the next page
                clogs_seen += 1
                final_rev = clog_csets_list[-1]['node'][:12]
                clog_url = str(HG_URL) + "/" + self.config.hg.branch + "/json-log/" + final_rev
                clog_obj = self._get_clog(clog_url)

        if clogs_seen >= MAX_TIPFILL_CLOGS:
            Log.error(
                "Too many changesets, can't find last tip or the number is too high: {{rev}}. "
                "Maximum possible to request is {{maxnum}}",
                rev=coalesce(newest_known_rev, csets_to_gather),
                maxnum=MAX_TIPFILL_CLOGS * CHANGESETS_PER_CLOG
            )
            return False

        with self.working_locker:
            Log.note("Adding {{csets}}", csets=csets_to_add)
            self.add_cset_entries(csets_to_add, timestamp=False)
        return True

    def fill_forward_continuous(self, please_stop=None):
        """
        Worker loop: keep calling `update_tip` until it reports no change,
        then wait CSET_TIP_WAIT_TIME seconds (or until stopped) and repeat.
        """
        while not please_stop:
            try:
                while not please_stop and not self.disable_tipfilling and self.update_tip():
                    pass
                (please_stop | Till(seconds=CSET_TIP_WAIT_TIME)).wait()
            except Exception as e:
                Log.warning("Unknown error occurred during tip filling:", cause=e)

    def csetLog_maintenance(self, please_stop=None):
        '''
        Handles deleting old csetLog entries and timestamping
        revisions once they pass the length for permanent
        storage for deletion later.
        :param please_stop:
        :return:
        '''
        while not please_stop:
            try:
                # Wait until something signals the maintenance cycle
                # to begin (or end).
                (self.maintenance_signal | please_stop).wait()

                if please_stop:
                    break
                if self.disable_maintenance:
                    continue

                Log.warning(
                    "Starting clog maintenance. Since this doesn't start often, "
                    "we need to explicitly see when it's started with this warning."
                )

                # Reset signal so we don't request
                # maintenance infinitely.
                with self.maintenance_signal.lock:
                    self.maintenance_signal._go = False

                with self.working_locker:
                    all_data = None
                    with self.conn.transaction() as t:
                        all_data = sorted(
                            t.get("SELECT revnum, revision, timestamp FROM csetLog"),
                            key=lambda x: int(x[0])
                        )

                    # Restore maximum permanents (if overflowing)
                    new_data = []
                    modified = False
                    for count, (revnum, revision, timestamp) in enumerate(all_data[::-1]):
                        if count < MINIMUM_PERMANENT_CSETS:
                            # Newest entries are permanent: timestamp is -1
                            if timestamp != -1:
                                modified = True
                                new_data.append((revnum, revision, -1))
                            else:
                                new_data.append((revnum, revision, timestamp))
                        elif type(timestamp) != int or timestamp == -1:
                            modified = True
                            new_data.append((revnum, revision, int(time.time())))
                        else:
                            new_data.append((revnum, revision, timestamp))

                    # Delete annotations at revisions with timestamps
                    # that are too old. The csetLog entries will have
                    # their timestamps reset here.
                    new_data1 = []
                    annrevs_to_del = []
                    current_time = time.time()
                    for revnum, revision, timestamp in new_data[::-1]:
                        new_timestamp = timestamp
                        if timestamp != -1:
                            if current_time >= timestamp + TIME_TO_KEEP_ANNOTATIONS.seconds:
                                modified = True
                                new_timestamp = current_time
                                annrevs_to_del.append(revision)
                        new_data1.append((revnum, revision, new_timestamp))

                    if len(annrevs_to_del) > 0:
                        # Delete any latestFileMod and annotation entries
                        # that are too old.
                        Log.note(
                            "Deleting annotations and latestFileMod for revisions for being "
                            "older than {{oldest}}: {{revisions}}",
                            oldest=TIME_TO_KEEP_ANNOTATIONS,
                            revisions=annrevs_to_del
                        )
                        with self.conn.transaction() as t:
                            t.execute(
                                "DELETE FROM latestFileMod WHERE revision IN " +
                                quote_set(annrevs_to_del)
                            )
                            t.execute(
                                "DELETE FROM annotations WHERE revision IN " +
                                quote_set(annrevs_to_del)
                            )

                    # Delete any overflowing entries
                    new_data2 = new_data1
                    reved_all_data = all_data[::-1]
                    deleted_data = reved_all_data[MAXIMUM_NONPERMANENT_CSETS:]
                    delete_overflowing_revstart = None
                    if len(deleted_data) > 0:
                        _, delete_overflowing_revstart, _ = deleted_data[0]
                        new_data2 = set(all_data) - set(deleted_data)

                        # Update old frontiers if requested, otherwise
                        # they will all get deleted by the csetLog_deleter
                        # worker
                        if UPDATE_VERY_OLD_FRONTIERS:
                            _, max_revision, _ = all_data[-1]
                            for _, revision, _ in deleted_data:
                                with self.conn.transaction() as t:
                                    old_files = t.get(
                                        "SELECT file FROM latestFileMod WHERE revision=?",
                                        (revision,)
                                    )
                                if old_files is None or len(old_files) <= 0:
                                    continue

                                self.tuid_service.get_tuids_from_files(
                                    old_files,
                                    max_revision,
                                    going_forward=True,
                                )

                                # Poll until no frontier is left at this revision
                                still_exist = True
                                while still_exist and not please_stop:
                                    Till(seconds=TUID_EXISTENCE_WAIT_TIME).wait()
                                    with self.conn.transaction() as t:
                                        old_files = t.get(
                                            "SELECT file FROM latestFileMod WHERE revision=?",
                                            (revision,)
                                        )
                                    if old_files is None or len(old_files) <= 0:
                                        still_exist = False

                    # Update table and schedule a deletion
                    if modified:
                        with self.conn.transaction() as t:
                            insert_into_db_chunked(
                                t,
                                new_data2,
                                "INSERT OR REPLACE INTO csetLog (revnum, revision, timestamp) VALUES "
                            )
                    if not deleted_data:
                        continue

                    Log.note("Scheduling {{num_csets}} for deletion", num_csets=len(deleted_data))
                    self.deletions_todo.add(delete_overflowing_revstart)
            except Exception as e:
                Log.warning("Unexpected error occured while maintaining csetLog, continuing to try: ", cause=e)
        return

    def csetLog_deleter(self, please_stop=None):
        '''
        Deletes changesets from the csetLog table
        and also changesets from the annotation table
        that have revisions matching the given changesets.
        Each request taken from self.deletions_todo is a revision; every
        csetLog entry with a revnum at or below it is deleted.
        :param please_stop:
        :return:
        '''
        while not please_stop:
            try:
                request = self.deletions_todo.pop(till=please_stop)
                if please_stop:
                    break

                # If deletion is disabled, ignore the current
                # request - it will need to be re-requested.
                if self.disable_deletion:
                    # `seconds=` waits for a duration; the previous `till=`
                    # treated the duration as an absolute timestamp, which
                    # is already in the past, so the wait never happened.
                    Till(seconds=CSET_DELETION_WAIT_TIME).wait()
                    continue

                with self.working_locker:
                    first_cset = request

                    # Since we are deleting and moving stuff around in the
                    # TUID tables, we need everything to be contained in
                    # one transaction with no interruptions.
                    with self.conn.transaction() as t:
                        revnum = self._get_one_revnum(t, first_cset)[0]
                        csets_to_del = t.get(
                            "SELECT revnum, revision FROM csetLog WHERE revnum <= ?", (revnum,)
                        )
                        csets_to_del = [cset for _, cset in csets_to_del]
                        existing_frontiers = t.query(
                            "SELECT revision FROM latestFileMod WHERE revision IN " +
                            quote_set(csets_to_del)
                        ).data

                        existing_frontiers = [row[0] for row in existing_frontiers]
                        Log.note(
                            "Deleting all annotations and changeset log entries with revisions in the list: {{csets}}",
                            csets=csets_to_del
                        )

                        if len(existing_frontiers) > 0:
                            # This handles files which no longer exist anymore in
                            # the main branch.
                            Log.note(
                                "Deleting existing frontiers for revisions: {{revisions}}",
                                revisions=existing_frontiers
                            )
                            t.execute(
                                "DELETE FROM latestFileMod WHERE revision IN " +
                                quote_set(existing_frontiers)
                            )

                        Log.note("Deleting annotations...")
                        t.execute(
                            "DELETE FROM annotations WHERE revision IN " +
                            quote_set(csets_to_del)
                        )

                        Log.note(
                            "Deleting {{num_entries}} csetLog entries...",
                            num_entries=len(csets_to_del)
                        )
                        t.execute(
                            "DELETE FROM csetLog WHERE revision IN " +
                            quote_set(csets_to_del)
                        )

                        # Recalculate the revnums
                        # NOTE(review): nesting inside the delete transaction
                        # was reconstructed from flattened source - confirm.
                        self.recompute_table_revnums()
            except Exception as e:
                Log.warning("Unexpected error occured while deleting from csetLog:", cause=e)
                Till(seconds=CSET_DELETION_WAIT_TIME).wait()
        return

    def get_old_cset_revnum(self, revision):
        """
        Request a backfill to `revision` and poll the table until it shows
        up with a non-negative revnum.

        :param revision: revision to look for
        :return: the row holding the revnum of `revision`
        Log.error is called if BACKFILL_REVNUM_TIMEOUT seconds have elapsed.
        """
        self.csets_todo_backwards.add((revision, True))

        revnum = None
        timeout = Till(seconds=BACKFILL_REVNUM_TIMEOUT)
        while not timeout:
            with self.conn.transaction() as t:
                revnum = self._get_one_revnum(t, revision)

            if revnum and revnum[0] >= 0:
                break
            elif revnum and revnum[0] < 0:
                # Guarded with `revnum and`: a missing row must reach the
                # branch below rather than fail on `revnum[0]`
                Log.note("Waiting for table to recompute...")
            else:
                Log.note("Waiting for backfill to complete...")
            Till(seconds=CSET_BACKFILL_WAIT_TIME).wait()

        if timeout:
            Log.error(
                "Cannot find revision {{rev}} after waiting {{timeout}} seconds",
                rev=revision,
                timeout=BACKFILL_REVNUM_TIMEOUT
            )
        return revnum

    def get_revnnums_from_range(self, revision1, revision2):
        """
        :return: (revnum, revision) rows between the two revisions
                 (inclusive), sorted by revnum.  Revisions missing from the
                 table are looked for by updating the tip, then by
                 backfilling.
        """
        with self.conn.transaction() as t:
            revnum1 = self._get_one_revnum(t, revision1)
            revnum2 = self._get_one_revnum(t, revision2)
        if not revnum1 or not revnum2:
            did_an_update = self.update_tip()
            if did_an_update:
                with self.conn.transaction() as t:
                    revnum1 = self._get_one_revnum(t, revision1)
                    revnum2 = self._get_one_revnum(t, revision2)

            if not revnum1:
                revnum1 = self.get_old_cset_revnum(revision1)
                # Refresh the second entry
                with self.conn.transaction() as t:
                    revnum2 = self._get_one_revnum(t, revision2)

            if not revnum2:
                revnum2 = self.get_old_cset_revnum(revision2)

                # The first revnum might change also
                with self.conn.transaction() as t:
                    revnum1 = self._get_one_revnum(t, revision1)

        with self.conn.transaction() as t:
            result = self._get_revnum_range(t, revnum1[0], revnum2[0])
        return sorted(
            result,
            key=lambda x: int(x[0])
        )
class Cache(object):
    """
    For Caching hg.mo requests

    Responses are kept in an in-memory map and, for paths accepted by
    `please_cache`, in a `cache` database table.  Network requests are
    queued on `todo`, released to `requests` by the rate limiter, and
    performed by the worker threads.
    """

    # NOTE(review): this block was reconstructed from a whitespace-flattened
    # copy of the source - confirm nesting against the original file.

    @override
    def __init__(self, rate=None, amortization_period=None, source=None, database=None, kwargs=None):
        """
        :param rate: requests per second (default HG_REQUEST_PER_SECOND)
        :param amortization_period: window over which the rate is enforced
                                    (default AMORTIZATION_PERIOD)
        :param source: object with a `url` attribute, the upstream server
        :param database: passed to `Sqlite`
        :param kwargs: not read in this method
        """
        self.amortization_period = coalesce(amortization_period, AMORTIZATION_PERIOD)
        self.rate = coalesce(rate, HG_REQUEST_PER_SECOND)
        self.cache_locker = Lock()
        self.cache = {}  # MAP FROM url TO (ready, headers, response, timestamp) PAIR
        self.no_cache = {}  # VERY SHORT TERM CACHE
        self.workers = []
        self.todo = Queue(APP_NAME+" todo")
        self.requests = Queue(APP_NAME + " requests", max=int(self.rate * self.amortization_period.seconds))
        self.url = URL(source.url)
        self.db = Sqlite(database)
        self.inbound_rate = RateLogger("Inbound")
        self.outbound_rate = RateLogger("hg.mo")

        # Create the table only when the database has no tables at all
        if not self.db.query("SELECT name FROM sqlite_master WHERE type='table'").data:
            with self.db.transaction() as t:
                t.execute(
                    "CREATE TABLE cache ("
                    " path TEXT PRIMARY KEY, "
                    " headers TEXT, "
                    " response TEXT, "
                    " timestamp REAL "
                    ")"
                )

        self.threads = [
            Thread.run(APP_NAME+" worker" + text_type(i), self._worker)
            for i in range(CONCURRENCY)
        ]
        self.limiter = Thread.run(APP_NAME+" limiter", self._rate_limiter)
        self.cleaner = Thread.run(APP_NAME+" cleaner", self._cache_cleaner)

    def _rate_limiter(self, please_stop):
        """
        Move requests from `todo` to `requests`, allowing at most
        `self.requests.max` of them per amortization period.
        """
        try:
            max_requests = self.requests.max
            recent_requests = []

            while not please_stop:
                now = Date.now()
                too_old = now - self.amortization_period

                # Forget requests that have left the window
                recent_requests = [t for t in recent_requests if t > too_old]

                num_recent = len(recent_requests)
                if num_recent >= max_requests:
                    # Window is full: sleep until the oldest request expires
                    space_free_at = recent_requests[0] + self.amortization_period
                    (please_stop | Till(till=space_free_at.unix)).wait()
                    continue
                for _ in xrange(num_recent, max_requests):
                    request = self.todo.pop()
                    now = Date.now()
                    recent_requests.append(now)
                    self.requests.add(request)
        except Exception as e:
            Log.warning("failure", cause=e)

    def _cache_cleaner(self, please_stop):
        """
        Periodically drop in-memory entries older than CACHE_RETENTION.
        """
        while not please_stop:
            now = Date.now()
            too_old = now-CACHE_RETENTION

            with self.cache_locker:
                # `.items()` is required: iterating the dict itself yields
                # only the paths, which cannot be unpacked into entries
                remove = set(
                    path
                    for path, (ready, headers, response, timestamp) in self.cache.items()
                    if timestamp < too_old
                )
                for r in remove:
                    del self.cache[r]
            (please_stop | Till(seconds=CACHE_RETENTION.seconds / 2)).wait()

    def please_cache(self, path):
        """
        :return: False if `path` is not to be cached
        """
        if path.endswith("/tip"):
            return False
        if any(k in path for k in ["/json-annotate/", "/json-info/", "/json-log/", "/json-rev/", "/rev/", "/raw-rev/", "/raw-file/", "/json-pushes", "/pushloghtml", "/file/"]):
            return True

        return False

    def request(self, method, path, headers):
        """
        Answer a request from the in-memory cache, then the database, and
        finally by queueing a network request and waiting for a worker.

        :param method: http method, passed to the worker
        :param path: request path, also the cache key
        :param headers: request headers, passed to the worker
        :return: a `Response` with status 200
        """
        now = Date.now()
        self.inbound_rate.add(now)
        ready = Signal(path)

        # TEST CACHE
        with self.cache_locker:
            pair = self.cache.get(path)
            if pair is None:
                # Claim the path, so concurrent callers wait on `ready`
                self.cache[path] = (ready, None, None, now)

        if pair is not None:
            # REQUEST IS IN THE QUEUE ALREADY, WAIT
            ready, headers, response, then = pair
            if response is None:
                ready.wait()
                with self.cache_locker:
                    ready, headers, response, timestamp = self.cache.get(path)
            with self.db.transaction() as t:
                t.execute("UPDATE cache SET timestamp=" + quote_value(now) + " WHERE path=" + quote_value(path) + " AND timestamp<" + quote_value(now))
            return Response(
                response,
                status=200,
                headers=json.loads(headers)
            )

        # TEST DB
        db_response = self.db.query("SELECT headers, response FROM cache WHERE path=" + quote_value(path)).data
        if db_response:
            headers, response = db_response[0]
            with self.db.transaction() as t:
                t.execute("UPDATE cache SET timestamp=" + quote_value(now) + " WHERE path=" + quote_value(path) + " AND timestamp<" + quote_value(now))
            with self.cache_locker:
                self.cache[path] = (ready, headers, response.encode('latin1'), now)
            ready.go()

            return Response(
                response,
                status=200,
                headers=json.loads(headers)
            )

        # MAKE A NETWORK REQUEST
        self.todo.add((ready, method, path, headers, now))
        ready.wait()
        with self.cache_locker:
            ready, headers, response, timestamp = self.cache[path]
        return Response(
            response,
            status=200,
            headers=json.loads(headers)
        )

    def _worker(self, please_stop):
        """
        Perform queued network requests, store each result in the in-memory
        cache (and in the database when `please_cache` allows), and signal
        `ready` whether the request succeeded or failed.
        """
        while not please_stop:
            pair = self.requests.pop(till=please_stop)
            if please_stop:
                break
            ready, method, path, req_headers, timestamp = pair

            try:
                url = self.url / path
                self.outbound_rate.add(Date.now())
                response = http.request(method, url, req_headers)

                # The header is not always present; its absence is not an error
                try:
                    del response.headers['transfer-encoding']
                except KeyError:
                    pass
                resp_headers = value2json(response.headers)
                resp_content = response.raw.read()

                please_cache = self.please_cache(path)
                if please_cache:
                    with self.db.transaction() as t:
                        t.execute("INSERT INTO cache (path, headers, response, timestamp) VALUES" + quote_list((path, resp_headers, resp_content.decode('latin1'), timestamp)))
                with self.cache_locker:
                    self.cache[path] = (ready, resp_headers, resp_content, timestamp)
            except Exception as e:
                Log.warning("problem with request to {{path}}", path=path, cause=e)
                with self.cache_locker:
                    # Drop the placeholder entry.  Entries are 4-tuples, so
                    # the previous 3-name unpacking raised inside this
                    # handler and the entry was never removed.
                    self.cache.pop(path, None)
            finally:
                ready.go()
class Clogger:
    """
    Look-ahead scanner that mirrors the mercurial changelog into the
    `csetLog` table (revnum, revision, timestamp).

    The class is a process-wide singleton: `__new__` always hands back the
    same instance. Note that `__init__` still runs on every `Clogger(...)`
    call.

    Four background workers may be started by `start_workers()`:
      * tip filling   - `fill_forward_continuous`
      * back filling  - `fill_backward_with_list`
      * maintenance   - `csetLog_maintenance`
      * deletion      - `csetLog_deleter`
    Each one can be paused through its `disable_*` flag.
    """

    # Singleton of the look-ahead scanner Clogger
    SINGLE_CLOGGER = None

    def __new__(cls, *args, **kwargs):
        if cls.SINGLE_CLOGGER is None:
            cls.SINGLE_CLOGGER = object.__new__(cls)
        return cls.SINGLE_CLOGGER

    def __init__(self, conn=None, tuid_service=None, start_workers=True, new_table=False, kwargs=None):
        """
        :param conn: existing database connection; when falsy a new one is
                     opened from `kwargs.database.name`
        :param tuid_service: existing TUID service; when falsy a new one is built
        :param start_workers: start the four background workers when True
        :param new_table: drop any existing csetLog table first
        :param kwargs: configuration object

        Any failure during setup is reported as a warning, not raised.
        """
        try:
            self.config = kwargs
            self.conn = conn if conn else sql.Sql(self.config.database.name)
            self.hg_cache = HgMozillaOrg(
                kwargs=self.config.hg_cache, use_cache=True
            ) if self.config.hg_cache else Null

            self.tuid_service = tuid_service if tuid_service else tuid.service.TUIDService(
                kwargs=self.config.tuid, conn=self.conn, clogger=self
            )
            self.rev_locker = Lock()
            self.working_locker = Lock()

            if new_table:
                with self.conn.transaction() as t:
                    t.execute("DROP TABLE IF EXISTS csetLog")

            self.init_db()
            self.next_revnum = coalesce(
                self.conn.get_one("SELECT max(revnum)+1 FROM csetLog")[0], 1
            )
            self.csets_todo_backwards = Queue(name="Clogger.csets_todo_backwards")
            self.deletions_todo = Queue(name="Clogger.deletions_todo")
            self.maintenance_signal = Signal(name="Clogger.maintenance_signal")

            # From here on only the `tuid` section of the config is used
            if 'tuid' in self.config:
                self.config = self.config.tuid

            self.disable_backfilling = False
            self.disable_tipfilling = False
            self.disable_deletion = False
            self.disable_maintenance = False

            self.backfill_thread = None
            self.tipfill_thread = None
            self.deletion_thread = None
            self.maintenance_thread = None

            # Make sure we are filled before allowing queries
            numrevs = self.conn.get_one("SELECT count(revnum) FROM csetLog")[0]
            if numrevs < MINIMUM_PERMANENT_CSETS:
                Log.note("Filling in csets to hold {{minim}} csets.", minim=MINIMUM_PERMANENT_CSETS)
                oldest_rev = 'tip'
                with self.conn.transaction() as t:
                    tmp = t.query("SELECT min(revnum), revision FROM csetLog").data[0][1]
                    if tmp:
                        oldest_rev = tmp
                self._fill_in_range(
                    MINIMUM_PERMANENT_CSETS - numrevs,
                    oldest_rev,
                    timestamp=False
                )

            Log.note(
                "Table is filled with atleast {{minim}} entries.",
                minim=MINIMUM_PERMANENT_CSETS
            )

            if start_workers:
                self.start_workers()
        except Exception as e:
            Log.warning("Cannot setup clogger: {{cause}}", cause=str(e))

    def start_backfilling(self):
        """Start the backfill worker, unless one was already started."""
        if not self.backfill_thread:
            self.backfill_thread = Thread.run('clogger-backfill', self.fill_backward_with_list)

    def start_tipfillling(self):
        """Start the tip-filling worker, unless one was already started."""
        if not self.tipfill_thread:
            self.tipfill_thread = Thread.run('clogger-tip', self.fill_forward_continuous)

    def start_maintenance(self):
        """Start the maintenance worker, unless one was already started."""
        if not self.maintenance_thread:
            self.maintenance_thread = Thread.run('clogger-maintenance', self.csetLog_maintenance)

    def start_deleter(self):
        """Start the deletion worker, unless one was already started."""
        if not self.deletion_thread:
            self.deletion_thread = Thread.run('clogger-deleter', self.csetLog_deleter)

    def start_workers(self):
        """Start all four background workers."""
        self.start_tipfillling()
        self.start_backfilling()
        self.start_maintenance()
        self.start_deleter()
        Log.note("Started clogger workers.")

    def init_db(self):
        """Create the csetLog table when it does not exist yet."""
        with self.conn.transaction() as t:
            t.execute('''
            CREATE TABLE IF NOT EXISTS csetLog (
                revnum         INTEGER PRIMARY KEY,
                revision       CHAR(12) NOT NULL,
                timestamp      INTEGER
            );''')

    def disable_all(self):
        """Pause every background worker by raising all `disable_*` flags."""
        self.disable_tipfilling = True
        self.disable_backfilling = True
        self.disable_maintenance = True
        self.disable_deletion = True

    def revnum(self):
        """
        :return: max revnum that was added
        """
        return coalesce(
            self.conn.get_one("SELECT max(revnum) as revnum, revision FROM csetLog".replace(", revision", ""))[0],
            0
        ) if False else coalesce(
            self.conn.get_one("SELECT max(revnum) as revnum FROM csetLog")[0], 0
        )

    def get_tip(self, transaction):
        """:return: (revnum, revision) row for the highest revnum"""
        return transaction.get_one(
            "SELECT max(revnum) as revnum, revision FROM csetLog"
        )

    def get_tail(self, transaction):
        """:return: (revnum, revision) row for the lowest revnum"""
        return transaction.get_one(
            "SELECT min(revnum) as revnum, revision FROM csetLog"
        )

    def _get_clog(self, clog_url):
        """Fetch one page of the json changelog; failures go to Log.error."""
        try:
            Log.note("Searching through changelog {{url}}", url=clog_url)
            clog_obj = http.get_json(clog_url, retry=RETRY)
            return clog_obj
        except Exception as e:
            Log.error(
                "Unexpected error getting changset-log for {{url}}: {{error}}",
                url=clog_url,
                error=e
            )

    def _get_one_revision(self, transaction, cset_entry):
        # Returns a single revision if it exists
        _, rev, _ = cset_entry
        return transaction.get_one(
            "SELECT revision FROM csetLog WHERE revision=?", (rev, )
        )

    def _get_one_revnum(self, transaction, rev):
        # Returns a single revnum if it exists
        return transaction.get_one(
            "SELECT revnum FROM csetLog WHERE revision=?", (rev, )
        )

    def _get_revnum_range(self, transaction, revnum1, revnum2):
        # Returns a range of revision numbers (that is inclusive)
        high_num = max(revnum1, revnum2)
        low_num = min(revnum1, revnum2)

        return transaction.query(
            "SELECT revnum, revision FROM csetLog WHERE "
            "revnum >= " + str(low_num) + " AND revnum <= " + str(high_num)
        ).data

    def recompute_table_revnums(self):
        """
        Recomputes the revnums for the csetLog table
        by creating a new table, and copying csetLog to
        it. The INTEGER PRIMARY KEY in the temp table auto increments
        as rows are added.

        IMPORTANT: Only call this after acquiring the
        lock `self.working_locker`.
        :return:
        """
        with self.conn.transaction() as t:
            t.execute('''
            CREATE TABLE temp (
                revnum         INTEGER PRIMARY KEY,
                revision       CHAR(12) NOT NULL,
                timestamp      INTEGER
            );''')

            t.execute(
                "INSERT INTO temp (revision, timestamp) "
                "SELECT revision, timestamp FROM csetlog ORDER BY revnum ASC"
            )

            t.execute("DROP TABLE csetLog;")
            t.execute("ALTER TABLE temp RENAME TO csetLog;")

    def check_for_maintenance(self):
        """
        Returns True if the maintenance worker should be run now,
        and False otherwise.
        :return:
        """
        numrevs = self.conn.get_one("SELECT count(revnum) FROM csetLog")[0]
        Log.note("Number of csets in csetLog table: {{num}}", num=numrevs)
        if numrevs >= SIGNAL_MAINTENANCE_CSETS:
            return True
        return False

    def add_cset_entries(self, ordered_rev_list, timestamp=False, number_forward=True):
        """
        Adds a list of revisions to the table. Assumes ordered_rev_list is an ordered
        based on how changesets are found in the changelog. Going forwards or backwards is dealt
        with by flipping the list

        :param ordered_rev_list: Order given from changeset log searching.
        :param timestamp: If false, records are kept indefinitely
                          but if holes exist: (delete, None, delete, None)
                          those delete's with None's around them
                          will not be deleted.
        :param number_forward: If True, this function will number the revision list
                         by going forward from max(revNum), else it'll go backwards
                         from revNum, then add X to all revnums and self.next_revnum
                         where X is the length of ordered_rev_list
        :return:
        """
        with self.conn.transaction() as t:
            current_min = t.get_one("SELECT min(revnum) FROM csetlog")[0]
            current_max = t.get_one("SELECT max(revnum) FROM csetlog")[0]
            if not current_min or not current_max:
                current_min = 0
                current_max = 0

            direction = -1
            start = current_min - 1
            if number_forward:
                direction = 1
                start = current_max + 1
                ordered_rev_list = ordered_rev_list[::-1]

            insert_list = [
                (
                    start + direction * count,
                    rev,
                    int(time.time()) if timestamp else -1
                )
                for count, rev in enumerate(ordered_rev_list)
            ]

            # In case of overlapping requests
            fmt_insert_list = []
            for cset_entry in insert_list:
                tmp = self._get_one_revision(t, cset_entry)
                if not tmp:
                    fmt_insert_list.append(cset_entry)

            for _, tmp_insert_list in jx.groupby(fmt_insert_list, size=SQL_CSET_BATCH_SIZE):
                t.execute(
                    "INSERT INTO csetLog (revnum, revision, timestamp)" +
                    " VALUES " +
                    sql_list(
                        quote_set((revnum, revision, timestamp))
                        for revnum, revision, timestamp in tmp_insert_list
                    )
                )

        # Move the revision numbers forward if needed
        # (backward numbering can produce revnums below 1)
        self.recompute_table_revnums()

        # Start a maintenance run if needed
        if self.check_for_maintenance():
            Log.note("Scheduling maintenance run on clogger.")
            self.maintenance_signal.go()

    def _fill_in_range(self, parent_cset, child_cset, timestamp=False, number_forward=True):
        """
        Fills cset logs in a certain range. 'parent_cset' can be an int and in that case,
        we get that many changesets instead. If parent_cset is an int, then we consider
        that we are going backwards (number_forward is False) and we ignore the first
        changeset of the first log, and we ignore the setting for number_forward.
        Otherwise, we continue until we find the given 'parent_cset'.

        :param parent_cset: revision to search for, or a number of changesets to fetch
        :param child_cset: revision the changelog walk starts from
        :param timestamp: passed through to `add_cset_entries`
        :param number_forward: passed through to `add_cset_entries`
        :return: list of revisions added, or None when the request could not be met
        """
        csets_to_add = []
        found_parent = False
        find_parent = False
        if type(parent_cset) != int:
            find_parent = True
        elif parent_cset >= MAX_BACKFILL_CLOGS * CHANGESETS_PER_CLOG:
            Log.warning(
                "Requested number of new changesets {{num}} is too high. "
                "Max number that can be requested is {{maxnum}}.",
                num=parent_cset,
                maxnum=MAX_BACKFILL_CLOGS * CHANGESETS_PER_CLOG
            )
            return None

        csets_found = 0
        clogs_seen = 0
        final_rev = child_cset
        while not found_parent and clogs_seen < MAX_BACKFILL_CLOGS:
            clog_url = str(HG_URL) + "/" + self.config.hg.branch + "/json-log/" + final_rev
            clog_obj = self._get_clog(clog_url)
            clog_csets_list = list(clog_obj['changesets'])

            # The last entry of a page is used as the start of the next page
            for clog_cset in clog_csets_list[:-1]:
                if not number_forward and csets_found <= 0:
                    # Skip this entry it already exists
                    csets_found += 1
                    continue

                nodes_cset = clog_cset['node'][:12]
                if find_parent:
                    if nodes_cset == parent_cset:
                        found_parent = True
                        if not number_forward:
                            # When going forward this entry is
                            # the given parent
                            csets_to_add.append(nodes_cset)
                        break
                else:
                    if csets_found + 1 > parent_cset:
                        found_parent = True
                        if not number_forward:
                            # When going forward this entry is
                            # the given parent (which is supposed
                            # to already exist)
                            csets_to_add.append(nodes_cset)
                        break
                csets_found += 1
                csets_to_add.append(nodes_cset)

            if found_parent:
                break

            clogs_seen += 1
            final_rev = clog_csets_list[-1]['node'][:12]

        if found_parent:
            self.add_cset_entries(csets_to_add, timestamp=timestamp, number_forward=number_forward)
        else:
            Log.warning(
                "Couldn't find the end of the request for {{request}}. "
                "Max number that can be requested through _fill_in_range is {{maxnum}}.",
                request={
                    'parent_cset': parent_cset,
                    'child_cset': child_cset,
                    'number_forward': number_forward
                },
                maxnum=MAX_BACKFILL_CLOGS * CHANGESETS_PER_CLOG
            )
            return None
        return csets_to_add

    def initialize_to_range(self, old_rev, new_rev, delete_old=True):
        """
        Used in service testing to get to very old
        changesets quickly.

        :param old_rev: The oldest revision to keep
        :param new_rev: The revision to start searching from
        :param delete_old: when True the csetLog table is emptied first
        :return:
        """
        old_settings = [
            self.disable_tipfilling,
            self.disable_backfilling,
            self.disable_maintenance,
            self.disable_deletion
        ]
        self.disable_tipfilling = True
        self.disable_backfilling = True
        self.disable_maintenance = True
        self.disable_deletion = True

        try:
            old_rev = old_rev[:12]
            new_rev = new_rev[:12]

            with self.working_locker:
                if delete_old:
                    with self.conn.transaction() as t:
                        t.execute("DELETE FROM csetLog")
                with self.conn.transaction() as t:
                    t.execute(
                        "INSERT INTO csetLog (revision, timestamp) VALUES " +
                        quote_set((new_rev, -1))
                    )
                self._fill_in_range(old_rev, new_rev, timestamp=True, number_forward=False)
        finally:
            # Restore the worker flags even when the fill fails, otherwise
            # every worker would stay disabled.
            self.disable_tipfilling = old_settings[0]
            self.disable_backfilling = old_settings[1]
            self.disable_maintenance = old_settings[2]
            self.disable_deletion = old_settings[3]

    def fill_backward_with_list(self, please_stop=None):
        """
        Expects requests of the tuple form: (parent_cset, timestamp)
        parent_cset can be an int X to go back by X changesets, or
        a string to search for going backwards in time. If timestamp
        is false, no timestamps will be added to the entries.

        :param please_stop:
        :return:
        """
        while not please_stop:
            try:
                request = self.csets_todo_backwards.pop(till=please_stop)
                if please_stop:
                    break

                # If backfilling is disabled, all requests
                # are ignored.
                if self.disable_backfilling:
                    # `seconds=` (a duration), matching the other waits in
                    # this class; `till=` would be read as an absolute time.
                    Till(seconds=CSET_BACKFILL_WAIT_TIME).wait()
                    continue

                if request:
                    parent_cset, timestamp = request
                else:
                    continue

                with self.working_locker:
                    with self.conn.transaction() as t:
                        parent_revnum = self._get_one_revnum(t, parent_cset)
                    if parent_revnum:
                        # Already known, nothing to backfill
                        continue

                    with self.conn.transaction() as t:
                        _, oldest_revision = self.get_tail(t)

                    self._fill_in_range(
                        parent_cset,
                        oldest_revision,
                        timestamp=timestamp,
                        number_forward=False
                    )
                Log.note("Finished {{cset}}", cset=parent_cset)
            except Exception as e:
                Log.warning("Unknown error occurred during backfill: ", cause=e)

    def update_tip(self):
        """
        Returns False if the tip is already at the newest, or True
        if an update has taken place.

        :return:
        """
        clog_obj = self._get_clog(
            str(HG_URL) + "/" + self.config.hg.branch + "/json-log/tip"
        )

        # Get current tip in DB
        with self.conn.transaction() as t:
            _, newest_known_rev = self.get_tip(t)

        # If we are still at the newest, wait for CSET_TIP_WAIT_TIME seconds
        # before checking again.
        first_clog_entry = clog_obj['changesets'][0]['node'][:12]
        if newest_known_rev == first_clog_entry:
            return False

        csets_to_gather = None
        if not newest_known_rev:
            Log.note(
                "No revisions found in table, adding {{minim}} entries...",
                minim=MINIMUM_PERMANENT_CSETS
            )
            csets_to_gather = MINIMUM_PERMANENT_CSETS

        found_newest_known = False
        csets_to_add = []
        csets_found = 0
        clogs_seen = 0
        Log.note("Found new revisions. Updating csetLog tip to {{rev}}...", rev=first_clog_entry)
        while not found_newest_known and clogs_seen < MAX_TIPFILL_CLOGS:
            clog_csets_list = list(clog_obj['changesets'])
            for clog_cset in clog_csets_list[:-1]:
                nodes_cset = clog_cset['node'][:12]
                if not csets_to_gather:
                    if nodes_cset == newest_known_rev:
                        found_newest_known = True
                        break
                else:
                    if csets_found >= csets_to_gather:
                        found_newest_known = True
                        break
                csets_found += 1
                csets_to_add.append(nodes_cset)

            if not found_newest_known:
                # Get the next page
                clogs_seen += 1
                final_rev = clog_csets_list[-1]['node'][:12]
                clog_url = str(HG_URL) + "/" + self.config.hg.branch + "/json-log/" + final_rev
                clog_obj = self._get_clog(clog_url)

        if clogs_seen >= MAX_TIPFILL_CLOGS:
            Log.error(
                "Too many changesets, can't find last tip or the number is too high: {{rev}}. "
                "Maximum possible to request is {{maxnum}}",
                rev=coalesce(newest_known_rev, csets_to_gather),
                maxnum=MAX_TIPFILL_CLOGS * CHANGESETS_PER_CLOG
            )
            return False

        with self.working_locker:
            Log.note("Adding {{csets}}", csets=csets_to_add)
            self.add_cset_entries(csets_to_add, timestamp=False)
        return True

    def fill_forward_continuous(self, please_stop=None):
        """Worker loop: keep calling `update_tip` until caught up, then wait."""
        while not please_stop:
            try:
                while not please_stop and not self.disable_tipfilling and self.update_tip():
                    pass
                (please_stop | Till(seconds=CSET_TIP_WAIT_TIME)).wait()
            except Exception as e:
                Log.warning("Unknown error occurred during tip filling:", cause=e)

    def csetLog_maintenance(self, please_stop=None):
        """
        Handles deleting old csetLog entries and timestamping
        revisions once they pass the length for permanent
        storage for deletion later.

        :param please_stop:
        :return:
        """
        while not please_stop:
            try:
                # Wait until something signals the maintenance cycle
                # to begin (or end).
                (self.maintenance_signal | please_stop).wait()

                if please_stop:
                    break
                if self.disable_maintenance:
                    continue

                Log.warning(
                    "Starting clog maintenance. Since this doesn't start often, "
                    "we need to explicitly see when it's started with this warning."
                )

                # Reset signal so we don't request
                # maintenance infinitely.
                with self.maintenance_signal.lock:
                    self.maintenance_signal._go = False

                with self.working_locker:
                    all_data = None
                    with self.conn.transaction() as t:
                        all_data = sorted(
                            t.get("SELECT revnum, revision, timestamp FROM csetLog"),
                            key=lambda x: int(x[0])
                        )

                    # Restore maximum permanents (if overflowing)
                    new_data = []
                    modified = False
                    for count, (revnum, revision, timestamp) in enumerate(all_data[::-1]):
                        if count < MINIMUM_PERMANENT_CSETS:
                            if timestamp != -1:
                                modified = True
                                new_data.append((revnum, revision, -1))
                            else:
                                new_data.append((revnum, revision, timestamp))
                        elif type(timestamp) != int or timestamp == -1:
                            modified = True
                            new_data.append((revnum, revision, int(time.time())))
                        else:
                            new_data.append((revnum, revision, timestamp))

                    # Delete annotations at revisions with timestamps
                    # that are too old. The csetLog entries will have
                    # their timestamps reset here.
                    new_data1 = []
                    annrevs_to_del = []
                    current_time = time.time()
                    for count, (revnum, revision, timestamp) in enumerate(new_data[::-1]):
                        new_timestamp = timestamp
                        if timestamp != -1:
                            if current_time >= timestamp + TIME_TO_KEEP_ANNOTATIONS.seconds:
                                modified = True
                                new_timestamp = current_time
                                annrevs_to_del.append(revision)
                        new_data1.append((revnum, revision, new_timestamp))

                    if len(annrevs_to_del) > 0:
                        # Delete any latestFileMod and annotation entries
                        # that are too old.
                        Log.note(
                            "Deleting annotations and latestFileMod for revisions for being "
                            "older than {{oldest}}: {{revisions}}",
                            oldest=TIME_TO_KEEP_ANNOTATIONS,
                            revisions=annrevs_to_del
                        )
                        with self.conn.transaction() as t:
                            t.execute(
                                "DELETE FROM latestFileMod WHERE revision IN " +
                                quote_set(annrevs_to_del)
                            )
                            t.execute(
                                "DELETE FROM annotations WHERE revision IN " +
                                quote_set(annrevs_to_del)
                            )

                    # Delete any overflowing entries
                    new_data2 = new_data1
                    reved_all_data = all_data[::-1]
                    deleted_data = reved_all_data[MAXIMUM_NONPERMANENT_CSETS:]
                    delete_overflowing_revstart = None
                    if len(deleted_data) > 0:
                        _, delete_overflowing_revstart, _ = deleted_data[0]
                        # NOTE(review): this is built from `all_data`, so the
                        # timestamp changes collected in `new_data1` are not
                        # written back on this path - confirm that is intended.
                        new_data2 = set(all_data) - set(deleted_data)

                        # Update old frontiers if requested, otherwise
                        # they will all get deleted by the csetLog_deleter
                        # worker
                        if UPDATE_VERY_OLD_FRONTIERS:
                            _, max_revision, _ = all_data[-1]
                            for _, revision, _ in deleted_data:
                                with self.conn.transaction() as t:
                                    old_files = t.get(
                                        "SELECT file FROM latestFileMod WHERE revision=?",
                                        (revision, )
                                    )
                                if old_files is None or len(old_files) <= 0:
                                    continue

                                self.tuid_service.get_tuids_from_files(
                                    old_files,
                                    max_revision,
                                    going_forward=True,
                                )

                                # Poll until no frontier remains at this revision
                                still_exist = True
                                while still_exist and not please_stop:
                                    Till(seconds=TUID_EXISTENCE_WAIT_TIME).wait()
                                    with self.conn.transaction() as t:
                                        old_files = t.get(
                                            "SELECT file FROM latestFileMod WHERE revision=?",
                                            (revision, )
                                        )
                                    if old_files is None or len(old_files) <= 0:
                                        still_exist = False

                    # Update table and schedule a deletion
                    if modified:
                        with self.conn.transaction() as t:
                            insert_into_db_chunked(
                                t,
                                new_data2,
                                "INSERT OR REPLACE INTO csetLog (revnum, revision, timestamp) VALUES "
                            )
                    if not deleted_data:
                        continue

                    Log.note("Scheduling {{num_csets}} for deletion", num_csets=len(deleted_data))
                    self.deletions_todo.add(delete_overflowing_revstart)
            except Exception as e:
                Log.warning(
                    "Unexpected error occured while maintaining csetLog, continuing to try: ",
                    cause=e
                )
        return

    def csetLog_deleter(self, please_stop=None):
        """
        Deletes changesets from the csetLog table
        and also changesets from the annotation table
        that have revisions matching the given changesets.
        Accepts lists of csets from self.deletions_todo.

        :param please_stop:
        :return:
        """
        while not please_stop:
            try:
                request = self.deletions_todo.pop(till=please_stop)
                if please_stop:
                    break

                # If deletion is disabled, ignore the current
                # request - it will need to be re-requested.
                if self.disable_deletion:
                    # `seconds=` (a duration), matching the wait in the
                    # error handler below.
                    Till(seconds=CSET_DELETION_WAIT_TIME).wait()
                    continue

                with self.working_locker:
                    first_cset = request

                    # Since we are deleting and moving stuff around in the
                    # TUID tables, we need everything to be contained in
                    # one transaction with no interruptions.
                    with self.conn.transaction() as t:
                        revnum = self._get_one_revnum(t, first_cset)[0]
                        csets_to_del = t.get(
                            "SELECT revnum, revision FROM csetLog WHERE revnum <= ?",
                            (revnum, )
                        )
                        csets_to_del = [cset for _, cset in csets_to_del]
                        existing_frontiers = t.query(
                            "SELECT revision FROM latestFileMod WHERE revision IN " +
                            quote_set(csets_to_del)
                        ).data
                        existing_frontiers = [row[0] for row in existing_frontiers]
                        Log.note(
                            "Deleting all annotations and changeset log entries with revisions in the list: {{csets}}",
                            csets=csets_to_del
                        )

                        if len(existing_frontiers) > 0:
                            # This handles files which no longer exist anymore in
                            # the main branch.
                            Log.note(
                                "Deleting existing frontiers for revisions: {{revisions}}",
                                revisions=existing_frontiers
                            )
                            t.execute(
                                "DELETE FROM latestFileMod WHERE revision IN " +
                                quote_set(existing_frontiers)
                            )

                        Log.note("Deleting annotations...")
                        t.execute(
                            "DELETE FROM annotations WHERE revision IN " +
                            quote_set(csets_to_del)
                        )

                        Log.note(
                            "Deleting {{num_entries}} csetLog entries...",
                            num_entries=len(csets_to_del)
                        )
                        t.execute(
                            "DELETE FROM csetLog WHERE revision IN " +
                            quote_set(csets_to_del)
                        )

                    # Recalculate the revnums
                    self.recompute_table_revnums()
            except Exception as e:
                Log.warning("Unexpected error occured while deleting from csetLog:", cause=e)
                Till(seconds=CSET_DELETION_WAIT_TIME).wait()
        return

    def get_old_cset_revnum(self, revision):
        """
        Request a backfill down to `revision` and poll until it has a revnum.

        :param revision: revision to look up
        :return: the revnum row for `revision`; Log.error is called when the
                 wait exceeds BACKFILL_REVNUM_TIMEOUT seconds
        """
        self.csets_todo_backwards.add((revision, True))

        revnum = None
        timeout = Till(seconds=BACKFILL_REVNUM_TIMEOUT)
        while not timeout:
            with self.conn.transaction() as t:
                revnum = self._get_one_revnum(t, revision)

            if revnum and revnum[0] >= 0:
                break
            elif revnum and revnum[0] < 0:
                # Guarded with `revnum and`: while the backfill is still
                # pending there is no row to index into.
                Log.note("Waiting for table to recompute...")
            else:
                Log.note("Waiting for backfill to complete...")
            Till(seconds=CSET_BACKFILL_WAIT_TIME).wait()

        if timeout:
            Log.error(
                "Cannot find revision {{rev}} after waiting {{timeout}} seconds",
                rev=revision,
                timeout=BACKFILL_REVNUM_TIMEOUT
            )
        return revnum

    def get_revnnums_from_range(self, revision1, revision2):
        """
        :return: (revnum, revision) rows between the two revisions
                 (inclusive), sorted by revnum. Missing revisions trigger a
                 tip update and, if still missing, a backfill.
        """
        with self.conn.transaction() as t:
            revnum1 = self._get_one_revnum(t, revision1)
            revnum2 = self._get_one_revnum(t, revision2)

        if not revnum1 or not revnum2:
            did_an_update = self.update_tip()
            if did_an_update:
                with self.conn.transaction() as t:
                    revnum1 = self._get_one_revnum(t, revision1)
                    revnum2 = self._get_one_revnum(t, revision2)

            if not revnum1:
                revnum1 = self.get_old_cset_revnum(revision1)
                # Refresh the second entry
                with self.conn.transaction() as t:
                    revnum2 = self._get_one_revnum(t, revision2)

            if not revnum2:
                revnum2 = self.get_old_cset_revnum(revision2)

                # The first revnum might change also
                with self.conn.transaction() as t:
                    revnum1 = self._get_one_revnum(t, revision1)

        with self.conn.transaction() as t:
            result = self._get_revnum_range(t, revnum1[0], revnum2[0])
        return sorted(result, key=lambda x: int(x[0]))
class FromESMetadata(Schema):
    """
    QUERY THE METADATA

    SINGLETON: __new__ ALWAYS RETURNS THE SAME INSTANCE, AND __init__ ONLY
    DOES WORK THE FIRST TIME (IT RETURNS EARLY ONCE `settings` IS SET)

    NOTE: COMPARISONS IN THIS CLASS USE `== None` / `!= None` ON PURPOSE;
    NOTE(review): PRESUMABLY SO THE PROJECT'S Null OBJECT ALSO MATCHES - CONFIRM
    """

    def __new__(cls, *args, **kwargs):
        global singlton
        if singlton:
            return singlton
        else:
            singlton = object.__new__(cls)
            return singlton

    @override
    def __init__(self, host, index, alias=None, name=None, port=9200, kwargs=None):
        global _elasticsearch
        if hasattr(self, "settings"):
            # ALREADY INITIALIZED (SINGLETON)
            return

        from pyLibrary.queries.containers.list_usingPythonList import ListContainer
        from pyLibrary.env import elasticsearch as _elasticsearch

        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.default_es = _elasticsearch.Cluster(kwargs=kwargs)
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.es_metadata = Null
        self.last_es_metadata = Date.now()-OLD_METADATA

        self.meta = Data()
        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.meta.tables = ListContainer("meta.tables", [], wrap({c.names["."]: c for c in table_columns}))
        self.meta.columns = ColumnList()
        self.meta.columns.insert(column_columns)
        self.meta.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return

    @property
    def query_path(self):
        return None

    @property
    def url(self):
        return self.default_es.path + "/" + self.default_name.replace(".", "/")

    def get_table(self, table_name):
        """
        RETURN (WRAPPED) LIST OF KNOWN TABLES WITH THE GIVEN NAME
        """
        with self.meta.tables.locker:
            return wrap([t for t in self.meta.tables.data if t.name == table_name])

    def _upsert_column(self, c):
        """
        ADD COLUMN c, OR COPY ITS FIELDS ONTO THE EXISTING (CANONICAL) COLUMN,
        THEN QUEUE IT FOR A METADATA REFRESH
        """
        # ASSUMING THE self.meta.columns.locker IS HAD
        existing_columns = self.meta.columns.find(c.es_index, c.names["."])
        if not existing_columns:
            self.meta.columns.add(c)
            self.todo.add(c)

            if ENABLE_META_SCAN:
                if DEBUG:
                    Log.note("todo: {{table}}::{{column}}", table=c.es_index, column=c.es_column)
                # MARK meta.columns AS DIRTY TOO
                cols = self.meta.columns.find("meta.columns", None)
                for cc in cols:
                    cc.partitions = cc.cardinality = None
                    cc.last_updated = Date.now()
                self.todo.extend(cols)
        else:
            canonical = existing_columns[0]
            if canonical is not c:
                set_default(c.names, canonical.names)
                for key in Column.__slots__:
                    canonical[key] = c[key]
            if DEBUG:
                Log.note("todo: {{table}}::{{column}}", table=canonical.es_index, column=canonical.es_column)
            self.todo.add(canonical)

    def _get_columns(self, table=None):
        """
        (RE)LOAD THE ES MAPPINGS FOR THE INDEX OF `table` AND UPSERT ITS COLUMNS
        """
        # TODO: HANDLE MORE THEN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
        table_path = split_field(table)
        es_index = table_path[0]
        meta = self.es_metadata.indices[es_index]
        # NOTE(review): last_es_metadata IS NOT ADVANCED ANYWHERE IN THIS CLASS,
        # SO THIS AGE TEST LOOKS LIKE IT IS ALWAYS TRUE - CONFIRM
        if not meta or self.last_es_metadata < Date.now() - OLD_METADATA:
            self.es_metadata = self.default_es.get_metadata(force=True)
            meta = self.es_metadata.indices[es_index]

        for _, properties in meta.mappings.items():
            properties.properties["_id"] = {"type": "string", "index": "not_analyzed"}
            self._parse_properties(meta.index, properties, meta)

    def _parse_properties(self, abs_index, properties, meta):
        """
        TURN AN ES MAPPING INTO COLUMNS, ONE PER (COLUMN, QUERY PATH) PAIR,
        FOR THE INDEX AND EACH OF ITS ALIASES
        """
        # IT IS IMPORTANT THAT NESTED PROPERTIES NAME ALL COLUMNS, AND
        # ALL COLUMNS ARE GIVEN NAMES FOR ALL NESTED PROPERTIES
        abs_columns = _elasticsearch.parse_properties(abs_index, None, properties.properties)
        abs_columns = abs_columns.filter(  # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED
            lambda r: not r.es_column.startswith("other.") and
                      not r.es_column.startswith("previous_values.cf_") and
                      not r.es_index.startswith("debug") and
                      r.es_column.find("=") == -1 and
                      r.es_column.find(" ") == -1
        )

        def add_column(c, query_path):
            c.last_updated = Date.now()
            if query_path[0] != ".":
                c.names[query_path[0]] = relative_field(c.names["."], query_path[0])

            with self.meta.columns.locker:
                self._upsert_column(c)
                for alias in meta.aliases:
                    # EACH ALIAS GETS ITS OWN COPY OF THE COLUMN
                    c = copy(c)
                    c.es_index = alias
                    self._upsert_column(c)

        with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, debug=DEBUG):
            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns if c.type == "nested"]
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(".")
            query_paths.append(ROOT_PATH)

            # ADD RELATIVE COLUMNS
            for abs_column in abs_columns:
                for query_path in query_paths:
                    add_column(abs_column, query_path)

    def query(self, _query):
        return self.meta.columns.query(QueryOp(set_default(
            {
                "from": self.meta.columns,
                "sort": ["table", "name"]
            },
            _query.__data__()
        )))

    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS
        """
        table_path = split_field(table_name)
        es_index_name = table_path[0]
        query_path = join_field(table_path[1:])
        table = self.get_table(es_index_name)[0]
        abs_column_name = None if column_name == None else concat_field(query_path, column_name)

        try:
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = Table(
                    name=es_index_name,
                    url=None,
                    query_path=None,
                    timestamp=Date.now()
                )
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._get_columns(table=es_index_name)
            elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
                table.timestamp = Date.now()
                self._get_columns(table=es_index_name)

            with self.meta.columns.locker:
                columns = self.meta.columns.find(es_index_name, column_name)
            if columns:
                # SAME RUNTIME STRING AS BEFORE, WRITTEN WITH A VALID ESCAPE
                columns = jx.sort(columns, "names.\\.")
                # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                while len(self.todo) and not all(columns.get("last_updated")):
                    if DEBUG:
                        Log.note(
                            "waiting for columns to update {{columns|json}}",
                            columns=[c.es_index+"."+c.es_column for c in columns if not c.last_updated]
                        )
                    Till(seconds=1).wait()
                return columns
        except Exception as e:
            Log.error("Not expected", cause=e)

        if abs_column_name:
            Log.error("no columns matching {{table}}.{{column}}", table=table_name, column=abs_column_name)
        else:
            self._get_columns(table=table_name)  # TO TEST WHAT HAPPENED
            Log.error("no columns for {{table}}?!", table=table_name)

    def _update_cardinality(self, c):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
        """
        if c.type in STRUCT:
            Log.error("not supported")
        try:
            if c.es_index == "meta.columns":
                with self.meta.columns.locker:
                    partitions = jx.sort([g[c.es_column] for g, _ in jx.groupby(self.meta.columns, c.es_column) if g[c.es_column] != None])
                    self.meta.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.meta.columns),
                            "cardinality": len(partitions),
                            "last_updated": Date.now()
                        },
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return
            if c.es_index == "meta.tables":
                with self.meta.columns.locker:
                    partitions = jx.sort([g[c.es_column] for g, _ in jx.groupby(self.meta.tables, c.es_column) if g[c.es_column] != None])
                    self.meta.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.meta.tables),
                            "cardinality": len(partitions),
                            "last_updated": Date.now()
                        },
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return

            es_index = c.es_index.split(".")[0]
            result = self.default_es.post("/" + es_index + "/_search", data={
                "aggs": {c.names["."]: _counting_query(c)},
                "size": 0
            })
            r = result.aggregations.values()[0]
            count = result.hits.total
            cardinality = coalesce(r.value, r._nested.value, 0 if r.doc_count==0 else None)
            if cardinality == None:
                Log.error("logic error")

            query = Data(size=0)
            if cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
                # TOO MANY DISTINCT VALUES: RECORD THE COUNTS, BUT NO PARTITIONS
                if DEBUG:
                    Log.note("{{table}}.{{field}} has {{num}} parts", table=c.es_index, field=c.es_column, num=cardinality)
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return
            elif c.type in _elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                if DEBUG:
                    Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return
            elif len(c.nested_path) != 1:
                query.aggs[literal_field(c.names["."])] = {
                    "nested": {"path": c.nested_path[0]},
                    "aggs": {"_nested": {"terms": {"field": c.es_column, "size": 0}}}
                }
            else:
                query.aggs[literal_field(c.names["."])] = {"terms": {"field": c.es_column, "size": 0}}

            result = self.default_es.post("/" + es_index + "/_search", data=query)

            aggs = result.aggregations.values()[0]
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            if DEBUG:
                Log.note("{{field}} has {{parts}}", field=c.name, parts=parts)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "partitions": parts,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
        except Exception as e:
            if "IndexMissingException" in e and c.es_index.startswith(TEST_TABLE_PREFIX):
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": 0,
                            "cardinality": 0,
                            "last_updated": Date.now()
                        },
                        "clear": [
                            "partitions"
                        ],
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
            else:
                # HOLD THE LOCKER, LIKE EVERY OTHER UPDATE OF meta.columns
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "last_updated": Date.now()
                        },
                        "clear": [
                            "count",
                            "cardinality",
                            "partitions",
                        ],
                        "where": {"eq": {"names.\\.": ".", "es_index": c.es_index, "es_column": c.es_column}}
                    })
                Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=c, cause=e)

    def monitor(self, please_stop):
        """
        WORKER: REFRESH CARDINALITY/PARTITIONS OF QUEUED (AND STALE) COLUMNS
        """
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    with self.meta.columns.locker:
                        old_columns = [
                            c
                            for c in self.meta.columns
                            if (c.last_updated == None or c.last_updated < Date.now()-TOO_OLD) and c.type not in STRUCT
                        ]
                        if old_columns:
                            if DEBUG:
                                Log.note("Old columns wth dates {{dates|json}}", dates=wrap(old_columns).last_updated)
                            self.todo.extend(old_columns)
                            # TEST CONSISTENCY
                            for c, d in product(list(self.todo.queue), list(self.todo.queue)):
                                if c.es_column == d.es_column and c.es_index == d.es_index and c != d:
                                    Log.error("")
                        else:
                            if DEBUG:
                                Log.note("no more metatdata to update")

                column = self.todo.pop(Till(seconds=(10*MINUTE).seconds))
                if column == THREAD_STOP:
                    # THE STOP SENTINEL ADDED BY on_go ABOVE IS NOT A COLUMN
                    break

                if column:
                    if DEBUG:
                        Log.note("update {{table}}.{{column}}", table=column.es_index, column=column.es_column)
                    if column.type in STRUCT:
                        with self.meta.columns.locker:
                            column.last_updated = Date.now()
                        continue
                    elif column.last_updated >= Date.now()-TOO_OLD:
                        continue
                    try:
                        self._update_cardinality(column)
                        if DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX):
                            Log.note("updated {{column.name}}", column=column)
                    except Exception as e:
                        Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)

    def not_monitor(self, please_stop):
        """
        WORKER USED WHEN THE SCAN IS DISABLED: DRAIN THE QUEUE, ONLY MARKING
        COLUMNS AS UPDATED (AND CLEARING THEIR STATISTICS)
        """
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            c = self.todo.pop()
            if c == THREAD_STOP:
                break

            if not c.last_updated or c.last_updated >= Date.now()-TOO_OLD:
                continue

            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "partitions",
                    ],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
            if DEBUG:
                Log.note("Could not get {{col.es_index}}.{{col.es_column}} info", col=c)
class ElasticsearchMetadata(Namespace):
    """
    MANAGE SNOWFLAKE SCHEMAS FOR EACH OF THE ALIASES FOUND IN THE CLUSTER

    ONE INSTANCE IS KEPT PER CLUSTER (SEE __new__); A WORKER THREAD
    (monitor OR not_monitor) CONSUMES self.todo TO REFRESH COLUMN METADATA
    """

    @override
    def __new__(cls, kwargs, *args, **_kwargs):
        """
        RETURN THE INSTANCE ALREADY REGISTERED IN known_clusters FOR THIS
        CLUSTER (KEYED BY id() OF THE Cluster OBJECT), OR MAKE AND REGISTER ONE
        """
        es_cluster = elasticsearch.Cluster(kwargs)
        output = known_clusters.get(id(es_cluster))
        if output is None:
            output = object.__new__(cls)
            known_clusters[id(es_cluster)] = output
        return output

    @override
    def __init__(self, host, index, sql_file='metadata.sqlite', alias=None, name=None, port=9200, kwargs=None):
        """
        SETUP THE METADATA CONTAINERS AND START THE WORKER THREAD

        :param host: NOT READ DIRECTLY HERE (THE CLUSTER IS BUILT FROM kwargs)
        :param index: FALLBACK FOR default_name
        :param sql_file: NOT READ HERE
        :param alias: SECOND CHOICE FOR default_name
        :param name: FIRST CHOICE FOR default_name
        :param port: NOT READ DIRECTLY HERE
        :param kwargs: ALL SETTINGS; KEPT AS self.settings AND GIVEN TO elasticsearch.Cluster
        """
        # __new__ MAY HAND BACK AN EXISTING INSTANCE; DO NOT INITIALIZE IT TWICE
        if hasattr(self, "settings"):
            return

        self.too_old = TOO_OLD
        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)
        self.index_does_not_exist = set()
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.index_to_alias = {}

        self.es_metadata = Null
        self.metadata_last_updated = Date.now() - OLD_METADATA

        self.meta = Data()
        self.meta.columns = ColumnList(URL(self.es_cluster.settings.host).host)

        self.alias_to_query_paths = {
            "meta.columns": [ROOT_PATH],
            "meta.tables": [ROOT_PATH]
        }
        self.alias_last_updated = {
            "meta.columns": Date.now(),
            "meta.tables": Date.now()
        }
        table_columns = metadata_tables()
        self.meta.tables = ListContainer(
            "meta.tables",
            [],
            jx_base.Schema(".", table_columns)
        )
        self.meta.columns.extend(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("not refresh metadata", self.not_monitor)
        return

    @property
    def namespace(self):
        """
        THE namespace OF THE COLUMN CONTAINER (self.meta.columns)
        """
        return self.meta.columns.namespace

    @property
    def url(self):
        """
        CLUSTER URL EXTENDED WITH default_name, DOTS REPLACED BY SLASHES
        """
        return self.es_cluster.url / self.default_name.replace(".", "/")

    def _reload_columns(self, table_desc):
        """
        RE-READ THE ES MAPPINGS FOR ONE TABLE AND REGISTER ITS COLUMNS

        :param table_desc: TableDesc; name IS A REAL ALIAS (OR NAME OF INDEX
                           THAT HAS NO ALIAS), timestamp IS SET TO THE
                           CLUSTER'S metatdata_last_updated WHEN DONE
        :return: THE COLUMNS RETURNED BY _parse_properties()
        """
        # FIND ALL INDEXES OF ALIAS
        es_last_updated = self.es_cluster.metatdata_last_updated
        alias = table_desc.name
        canonical_index = self.es_cluster.get_best_matching_index(alias).index
        es_metadata_update_required = not (table_desc.timestamp < es_last_updated)
        metadata = self.es_cluster.get_metadata(force=es_metadata_update_required)

        props = [
            (self.es_cluster.get_index(index=i, type=t, debug=DEBUG), t, m.properties)
            for i, d in metadata.indices.items()
            if alias in d.aliases
            for t, m in [_get_best_type_from_mapping(d.mappings)]
        ]

        # CONFIRM ALL COLUMNS ARE SAME, FIX IF NOT
        dirty = False
        all_comparisions = list(jx.pairwise(props)) + list(jx.pairwise(jx.reverse(props)))
        # NOTICE THE SAME (index, type, properties) TRIPLE FROM ABOVE
        for (i1, t1, p1), (i2, t2, p2) in all_comparisions:
            diff = elasticsearch.diff_schema(p2, p1)
            if not self.settings.read_only:
                for d in diff:
                    dirty = True
                    i1.add_property(*d)
        # FORCE A METADATA RELOAD ONLY IF A PROPERTY WAS ADDED ABOVE
        meta = self.es_cluster.get_metadata(force=dirty).indices[canonical_index]

        data_type, mapping = _get_best_type_from_mapping(meta.mappings)
        mapping.properties["_id"] = {"type": "string", "index": "not_analyzed"}
        columns = self._parse_properties(alias, mapping)
        table_desc.timestamp = es_last_updated
        return columns

    def _parse_properties(self, alias, mapping):
        """
        CONVERT AN ES MAPPING TO COLUMNS, RECORD THE QUERY PATHS OF THE ALIAS,
        REGISTER THE COLUMNS IN self.meta.columns AND QUEUE THEM ON self.todo

        :param alias: NAME OF THE TABLE THE COLUMNS BELONG TO
        :param mapping: ES MAPPING; ONLY mapping.properties IS READ
        :return: LIST OF COLUMNS AS RETURNED BY self.meta.columns.add()
        """
        abs_columns = elasticsearch.parse_properties(alias, ".", ROOT_PATH, mapping.properties)
        if DEBUG and any(c.cardinality == 0 and c.name != '_id' for c in abs_columns):
            Log.warning(
                "Some columns are not stored in {{url}} {{index|quote}} table:\n{{names}}",
                url=self.es_cluster.url,
                index=alias,
                names=[
                    ".".join((c.es_index, c.name))
                    for c in abs_columns
                    if c.cardinality == 0
                ]
            )

        with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, silent=not DEBUG):
            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns if c.es_type == "nested"]
            # FOR EACH PATH b, INSERT THE NESTED PATHS THAT PREFIX IT
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(".")
            query_paths.append(ROOT_PATH)

            # ENSURE ALL TABLES HAVE THE QUERY PATHS SET
            self.alias_to_query_paths[alias] = query_paths
            for i, a in self.index_to_alias.items():
                if a == alias:
                    self.alias_to_query_paths[i] = query_paths

            # ENSURE COLUMN HAS CORRECT jx_type
            # PICK DEEPEST NESTED PROPERTY AS REPRESENTATIVE
            output = []
            best = {}
            for abs_column in abs_columns:
                abs_column.jx_type = jx_type(abs_column)
                if abs_column.jx_type not in STRUCT:
                    clean_name = unnest_path(abs_column.name)
                    other = best.get(clean_name)
                    if other:
                        if len(other.nested_path) < len(abs_column.nested_path):
                            # THE SHALLOWER COLUMN LOSES: DROP IT HERE AND FROM meta.columns
                            output.remove(other)
                            self.meta.columns.update({"clear": ".", "where": {"eq": {"es_column": other.es_column, "es_index": other.es_index}}})
                        else:
                            continue
                    best[clean_name] = abs_column
                output.append(abs_column)

            # REGISTER ALL COLUMNS
            canonicals = []
            for abs_column in output:
                canonical = self.meta.columns.add(abs_column)
                canonicals.append(canonical)

            self.todo.extend(canonicals)
            return canonicals

    def query(self, _query):
        """
        RUN _query AGAINST self.meta.columns; "from" AND "sort" ARE SUPPLIED
        AS DEFAULTS (VIA set_default) TO THE DATA OF _query
        """
        return self.meta.columns.query(QueryOp(set_default(
            {
                "from": self.meta.columns,
                "sort": ["table", "name"]
            },
            _query.__data__()
        )))

    def _find_alias(self, name):
        """
        :param name: AN ALIAS, OR AN INDEX NAME
        :return: name IF IT IS A KNOWN ALIAS, OTHERWISE THE ALIAS RECORDED
                 FOR THAT INDEX (None IF THERE IS NO SUCH INDEX)
        """
        # NOTE(review): self.metadata_last_updated IS NOT ADVANCED ANYWHERE IN
        # THIS CLASS, SO THIS REFRESH LOOKS LIKE IT RUNS ON EVERY CALL - CONFIRM
        if self.metadata_last_updated < self.es_cluster.metatdata_last_updated:
            for a in self.es_cluster.get_aliases():
                self.index_to_alias[a.index] = coalesce(a.alias, a.index)
                self.alias_last_updated.setdefault(a.alias, Date.MIN)
        if name in self.alias_last_updated:
            return name
        else:
            return self.index_to_alias.get(name)

    def get_columns(self, table_name, column_name=None, after=None, timeout=None):
        """
        RETURN METADATA COLUMNS

        :param table_name: TABLE WE WANT COLUMNS FOR
        :param column_name: OPTIONAL NAME, IF INTERESTED IN ONLY ONE COLUMN
        :param after: FORCE LOAD, WAITING FOR last_updated TO BE AFTER THIS TIME
        :param timeout: Signal; True when should give up
        :return: COLUMNS SORTED BY name
        """
        DEBUG and after and Log.note("getting columns for after {{time}}", time=after)
        table_path = split_field(table_name)
        root_table_name = table_path[0]

        alias = self._find_alias(root_table_name)
        if not alias:
            # MAYBE THE TABLE IS NEW: REFRESH THE CLUSTER METADATA AND TRY ONCE MORE
            self.es_cluster.get_metadata(force=True)
            alias = self._find_alias(root_table_name)
            if not alias:
                Log.error("{{table|quote}} does not exist", table=table_name)

        try:
            table = self.get_table(alias)[0]
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = TableDesc(
                    name=alias,
                    url=None,
                    query_path=["."],
                    timestamp=Date.MIN
                )
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                columns = self._reload_columns(table)
                DEBUG and Log.note("columns from reload")
            elif after or table.timestamp < self.es_cluster.metatdata_last_updated:
                columns = self._reload_columns(table)
                DEBUG and Log.note("columns from reload")
            else:
                columns = self.meta.columns.find(alias, column_name)
                DEBUG and Log.note("columns from find()")

            DEBUG and Log.note("columns are {{ids}}", ids=[id(c) for c in columns])

            columns = jx.sort(columns, "name")

            if after is None:
                return columns  # DO NOT WAIT FOR COMPLETE COLUMNS

            # WAIT FOR THE COLUMNS TO UPDATE
            while True:
                pending = [c for c in columns if after >= c.last_updated or (c.cardinality == None and c.jx_type not in STRUCT)]
                if not pending:
                    break
                if timeout:
                    Log.error("trying to gets columns timed out")
                if DEBUG:
                    if len(pending) > 10:
                        Log.note("waiting for {{num}} columns to update by {{timestamp}}", num=len(pending), timestamp=after)
                    else:
                        Log.note("waiting for columns to update by {{timestamp}}; {{columns|json}}", timestamp=after, columns=[c.es_index + "." + c.es_column + " id="+text_type(id(c)) for c in pending])
                Till(seconds=1).wait()
            return columns
        except Exception as e:
            Log.error("Failure to get columns for {{table}}", table=table_name, cause=e)

        # NOTE(review): PRESUMABLY UNREACHABLE, ASSUMING Log.error() RAISES - CONFIRM
        return []

    def _update_cardinality(self, column):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN

        THE RESULT IS WRITTEN TO self.meta.columns; NOTHING IS RETURNED.
        PARTITIONS ARE ONLY COLLECTED FOR COLUMNS WITH FEW DISTINCT VALUES
        (SEE THE THRESHOLDS BELOW), OTHERWISE THEY ARE CLEARED.

        :param column: THE COLUMN TO UPDATE (MUST NOT BE A STRUCT TYPE)
        """
        now = Date.now()
        if column.es_index in self.index_does_not_exist:
            return

        if column.jx_type in STRUCT:
            Log.error("not supported")
        try:
            # THE TWO METADATA TABLES ARE IN MEMORY: NO NEED TO ASK ES
            if column.es_index == "meta.columns":
                partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.columns, column.es_column) if g[column.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.columns),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": now
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            if column.es_index == "meta.tables":
                partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.tables, column.es_column) if g[column.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.tables),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": now
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return

            es_index = column.es_index.split(".")[0]

            is_text = [cc for cc in self.meta.columns if cc.es_column == column.es_column and cc.es_type == "text"]
            if is_text:
                # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": {"filter": {"match_all": {}}}
                    },
                    "size": 0
                })
                count = result.hits.total
                cardinality = max(1001, count)
                multi = 1001
            elif column.es_column == "_id":
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "query": {"match_all": {}},
                    "size": 0
                })
                count = cardinality = result.hits.total
                multi = 1
            elif column.es_type == BOOLEAN:
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": _counting_query(column)
                    },
                    "size": 0
                })
                count = result.hits.total
                cardinality = 2

                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                # BOOLEAN PARTITIONS ARE KNOWN WITHOUT ASKING; "partitions" IS NOT
                # ALSO LISTED UNDER "clear", THAT WOULD CONTRADICT THE "set"
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "partitions": [False, True],
                        "multi": 1,
                        "last_updated": now
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            else:
                es_query = {
                    "aggs": {
                        "count": _counting_query(column),
                        "_filter": {
                            # "multi" IS THE MAX NUMBER OF VALUES FOUND IN ONE DOCUMENT, AMONG
                            # DOCUMENTS FROM THE LAST WEEK, OR HAVING NO etl.timestamp
                            "aggs": {"multi": {"max": {"script": "doc[" + quote(column.es_column) + "].values.size()"}}},
                            "filter": {"bool": {"should": [
                                {"range": {"etl.timestamp.~n~": {"gte": (Date.today() - WEEK)}}},
                                {"bool": {"must_not": {"exists": {"field": "etl.timestamp.~n~"}}}}
                            ]}}
                        }
                    },
                    "size": 0
                }

                result = self.es_cluster.post("/" + es_index + "/_search", data=es_query)
                agg_results = result.aggregations
                count = result.hits.total
                cardinality = coalesce(agg_results.count.value, agg_results.count._nested.value, agg_results.count.doc_count)
                multi = int(coalesce(agg_results._filter.multi.value, 1))
                # `== None` IS KEPT AS WRITTEN (NOT `is None`); PRESUMABLY THE
                # VALUE CAN BE THE PROJECT'S Null OBJECT - CONFIRM BEFORE CHANGING
                if cardinality == None:
                    Log.error("logic error")

            query = Data(size=0)

            if column.es_column == "_id":
                self.meta.columns.update({
                    "set": {
                        "count": cardinality,
                        "cardinality": cardinality,
                        "multi": 1,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
                # TOO MANY, OR (ALMOST) UNIQUE, VALUES: DO NOT COLLECT PARTITIONS
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                # NUMERIC COLUMNS GET A LOWER LIMIT BEFORE PARTITIONS ARE SKIPPED
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif len(column.nested_path) != 1:
                query.aggs["_"] = {
                    "nested": {"path": column.nested_path[0]},
                    "aggs": {"_nested": {"terms": {"field": column.es_column}}}
                }
            elif cardinality == 0:  # WHEN DOES THIS HAPPEN?
                query.aggs["_"] = {"terms": {"field": column.es_column}}
            else:
                query.aggs["_"] = {"terms": {"field": column.es_column, "size": cardinality}}

            # SECOND QUERY: COLLECT THE DISTINCT VALUES (THE PARTITIONS)
            result = self.es_cluster.post("/" + es_index + "/_search", data=query)

            aggs = result.aggregations._
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            DEBUG and Log.note("update metadata for {{column.es_index}}.{{column.es_column}} (id={{id}}) at {{time}}", id=id(column), column=column, time=now)
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "partitions": parts,
                    "last_updated": now
                },
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
        except Exception as e:
            e = Except.wrap(e)
            is_missing_index = any(w in e for w in ["IndexMissingException", "index_not_found_exception"])
            if is_missing_index:
                # WE EXPECT TEST TABLES TO DISAPPEAR
                Log.warning("Missing index {{col.es_index}}", col=column, cause=e)
                # FORGET EVERY COLUMN OF THE MISSING INDEX, AND DO NOT ASK AGAIN
                self.meta.columns.update({
                    "clear": ".",
                    "where": {"eq": {"es_index": column.es_index}}
                })
                self.index_does_not_exist.add(column.es_index)
            elif "No field found for" in e:
                # FORGET JUST THIS COLUMN
                self.meta.columns.update({
                    "clear": ".",
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                Log.warning("Could not get column {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)
            else:
                # UNKNOWN PROBLEM: CLEAR THE STATISTICS, BUT MARK THE COLUMN AS UPDATED
                self.meta.columns.update({
                    "set": {
                        "last_updated": now
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)

    def monitor(self, please_stop):
        """
        WORKER LOOP (USED WHEN ENABLE_META_SCAN): KEEP COLUMN METADATA FRESH

        WHEN self.todo IS EMPTY IT IS REFILLED WITH THE NON-STRUCT COLUMNS THAT
        ARE OLDER THAN MAX_COLUMN_METADATA_AGE, OR HAVE NO CARDINALITY; EACH
        COLUMN POPPED IS GIVEN TO _update_cardinality()

        :param please_stop: SIGNAL TO STOP THE LOOP
        """
        # WAKE UP THE BLOCKING pop() BELOW WHEN ASKED TO STOP
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    old_columns = [
                        c
                        for c in self.meta.columns
                        if ((c.last_updated < Date.now() - MAX_COLUMN_METADATA_AGE) or c.cardinality == None) and c.jx_type not in STRUCT
                    ]
                    if old_columns:
                        DEBUG and Log.note(
                            "Old columns {{names|json}} last updated {{dates|json}}",
                            names=wrap(old_columns).es_column,
                            dates=[Date(t).format() for t in wrap(old_columns).last_updated]
                        )
                        self.todo.extend(old_columns)
                    else:
                        DEBUG and Log.note("no more metatdata to update")

                column = self.todo.pop(Till(seconds=(10*MINUTE).seconds))
                if column:
                    if column is THREAD_STOP:
                        continue

                    with Timer("update {{table}}.{{column}}", param={"table": column.es_index, "column": column.es_column}, silent=not DEBUG):
                        if column.es_index in self.index_does_not_exist:
                            DEBUG and Log.note("{{column.es_column}} does not exist", column=column)
                            self.meta.columns.update({
                                "clear": ".",
                                "where": {"eq": {"es_index": column.es_index}}
                            })
                            continue
                        if column.jx_type in STRUCT or split_field(column.es_column)[-1] == EXISTS_TYPE:
                            DEBUG and Log.note("{{column.es_column}} is a struct", column=column)
                            column.last_updated = Date.now()
                            continue
                        elif column.last_updated > Date.now() - TOO_OLD and column.cardinality is not None:
                            # DO NOT UPDATE FRESH COLUMN METADATA
                            DEBUG and Log.note("{{column.es_column}} is still fresh ({{ago}} ago)", column=column, ago=(Date.now()-Date(column.last_updated)).seconds)
                            continue
                        try:
                            self._update_cardinality(column)
                            (DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX)) and Log.note("updated {{column.name}}", column=column)
                        except Exception as e:
                            if '"status":404' in e:
                                # ES REPORTS NOT FOUND: FORGET THE COLUMN
                                self.meta.columns.update({
                                    "clear": ".",
                                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                                })
                            else:
                                Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)

    def not_monitor(self, please_stop):
        """
        WORKER LOOP (USED WHEN NOT ENABLE_META_SCAN): DRAIN self.todo, BUT
        ONLY QUERY ES FOR COLUMNS NAMED "build.type" OR "run.type"; ALL OTHER
        COLUMNS ARE JUST MARKED AS UPDATED NOW

        :param please_stop: SIGNAL TO STOP THE LOOP
        """
        Log.alert("metadata scan has been disabled")
        # WAKE UP THE BLOCKING pop() BELOW WHEN ASKED TO STOP
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            column = self.todo.pop()
            if column == THREAD_STOP:
                break

            if column.jx_type in STRUCT or split_field(column.es_column)[-1] == EXISTS_TYPE:
                DEBUG and Log.note("{{column.es_column}} is a struct", column=column)
                column.last_updated = Date.now()
                continue
            elif column.last_updated > Date.now() - TOO_OLD and column.cardinality is not None:
                # DO NOT UPDATE FRESH COLUMN METADATA
                DEBUG and Log.note("{{column.es_column}} is still fresh ({{ago}} ago)", column=column, ago=(Date.now()-Date(column.last_updated)).seconds)
                continue

            with Timer("Update {{col.es_index}}.{{col.es_column}}", param={"col": column}, silent=not DEBUG, too_long=0.05):
                if untype_path(column.name) in ["build.type", "run.type"]:
                    try:
                        self._update_cardinality(column)
                    except Exception as e:
                        Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
                else:
                    column.last_updated = Date.now()

    def get_table(self, name):
        """
        :param name: TABLE NAME
        :return: self.meta.columns WHEN name IS "meta.columns", OTHERWISE THE
                 (WRAPPED) LIST OF TABLES IN self.meta.tables WITH THAT NAME
        """
        if name == "meta.columns":
            return self.meta.columns

        with self.meta.tables.locker:
            return wrap([t for t in self.meta.tables.data if t.name == name])

    def get_snowflake(self, fact_table_name):
        """
        :return: A Snowflake FOR fact_table_name, BACKED BY THIS NAMESPACE
        """
        return Snowflake(fact_table_name, self)

    def get_schema(self, name):
        """
        :param name: TABLE NAME, OPTIONALLY FOLLOWED BY A PATH INTO THE TABLE
        :return: SCHEMA FOR THE NAMED TABLE
        """
        if name == "meta.columns":
            return self.meta.columns.schema
        if name == "meta.tables":
            # NOTE(review): THIS RETURNS THE CONTAINER, NOT ITS .schema AS THE
            # "meta.columns" CASE DOES - CONFIRM THAT IS INTENDED
            return self.meta.tables
        root, rest = tail_field(name)
        return self.get_snowflake(root).get_schema(rest)
class ElasticsearchMetadata(Namespace):
    """
    MANAGE SNOWFLAKE SCHEMAS FOR EACH OF THE ALIASES FOUND IN THE CLUSTER

    THIS IS A SINGLETON (SEE __new__); A WORKER THREAD (monitor OR
    not_monitor) CONSUMES self.todo TO REFRESH COLUMN METADATA
    """

    def __new__(cls, *args, **kwargs):
        """
        RETURN THE ONE INSTANCE, KEPT IN jx_base_meta.singlton; MAKE IT IF MISSING
        """
        if jx_base_meta.singlton:
            return jx_base_meta.singlton
        else:
            jx_base_meta.singlton = object.__new__(cls)
            return jx_base_meta.singlton

    @override
    def __init__(self, host, index, sql_file='metadata.sqlite', alias=None, name=None, port=9200, kwargs=None):
        """
        SETUP THE METADATA CONTAINERS AND START THE WORKER THREAD

        :param host: NOT READ DIRECTLY HERE (THE CLUSTER IS BUILT FROM kwargs)
        :param index: FALLBACK FOR default_name
        :param sql_file: NOT READ HERE
        :param alias: SECOND CHOICE FOR default_name
        :param name: FIRST CHOICE FOR default_name
        :param port: NOT READ DIRECTLY HERE
        :param kwargs: ALL SETTINGS; KEPT AS self.settings AND GIVEN TO elasticsearch.Cluster
        """
        # __new__ HANDS BACK THE SAME INSTANCE; DO NOT INITIALIZE IT TWICE
        if hasattr(self, "settings"):
            return

        self.too_old = TOO_OLD
        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)
        self.index_does_not_exist = set()
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.index_to_alias = Relation_usingList()

        self.es_metadata = Null
        self.metadata_last_updated = Date.now() - OLD_METADATA

        self.meta = Data()
        self.meta.columns = ColumnList()

        self.alias_to_query_paths = {
            "meta.columns": [['.']],
            "meta.tables": [['.']]
        }
        self.alias_last_updated = {
            "meta.columns": Date.now(),
            "meta.tables": Date.now()
        }
        table_columns = metadata_tables()
        self.meta.tables = ListContainer("meta.tables", [], jx_base.Schema(".", table_columns))
        self.meta.columns.extend(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return

    @property
    def url(self):
        """
        CLUSTER PATH, THEN "/", THEN default_name WITH DOTS REPLACED BY SLASHES
        """
        return self.es_cluster.path + "/" + self.default_name.replace(".", "/")

    def _reload_columns(self, table_desc):
        """
        RE-READ THE ES MAPPINGS FOR ONE TABLE AND REGISTER ITS COLUMNS

        :param table_desc: TableDesc; name IS A REAL ALIAS (OR NAME OF INDEX
                           THAT HAS NO ALIAS), timestamp IS SET TO THE
                           CLUSTER'S metatdata_last_updated WHEN DONE
        :return: None
        """
        # FIND ALL INDEXES OF ALIAS
        es_last_updated = self.es_cluster.metatdata_last_updated

        alias = table_desc.name
        canonical_index = self.es_cluster.get_best_matching_index(alias).index
        update_required = not (table_desc.timestamp < es_last_updated)
        metadata = self.es_cluster.get_metadata(force=update_required)

        indexes = self.index_to_alias.get_domain(alias)
        props = [
            (self.es_cluster.get_index(index=i, type=t, debug=DEBUG), t, m.properties)
            for i, d in metadata.indices.items()
            if i in indexes
            for t, m in [_get_best_type_from_mapping(d.mappings)]
        ]

        # CONFIRM ALL COLUMNS ARE SAME, FIX IF NOT
        dirty = False
        all_comparisions = list(jx.pairwise(props)) + list(jx.pairwise(jx.reverse(props)))
        # NOTICE THE SAME (index, type, properties) TRIPLE FROM ABOVE
        for (i1, t1, p1), (i2, t2, p2) in all_comparisions:
            diff = elasticsearch.diff_schema(p2, p1)
            if not self.settings.read_only:
                for d in diff:
                    dirty = True
                    i1.add_property(*d)
        # FORCE A METADATA RELOAD ONLY IF A PROPERTY WAS ADDED ABOVE
        meta = self.es_cluster.get_metadata(force=dirty).indices[canonical_index]

        data_type, mapping = _get_best_type_from_mapping(meta.mappings)
        mapping.properties["_id"] = {"type": "string", "index": "not_analyzed"}
        self._parse_properties(alias, mapping, meta)
        table_desc.timestamp = es_last_updated

    def _parse_properties(self, alias, mapping, meta):
        """
        CONVERT AN ES MAPPING TO COLUMNS, RECORD THE QUERY PATHS OF THE ALIAS,
        GIVE EVERY COLUMN A NAME RELATIVE TO EACH QUERY PATH, THEN REGISTER
        THE COLUMN IN self.meta.columns AND QUEUE IT ON self.todo

        :param alias: NAME OF THE TABLE THE COLUMNS BELONG TO
        :param mapping: ES MAPPING; ONLY mapping.properties IS READ
        :param meta: NOT USED (KEPT FOR THE CALLERS)
        """
        abs_columns = elasticsearch.parse_properties(alias, None, mapping.properties)

        with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, debug=DEBUG):
            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns if c.es_type == "nested"]
            # FOR EACH PATH b, INSERT THE NESTED PATHS THAT PREFIX IT
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(SELF_PATH)
            query_paths.append(ROOT_PATH)
            self.alias_to_query_paths[alias] = query_paths

            # ADD RELATIVE NAMES
            for abs_column in abs_columns:
                # NO last_updated: THE WORKER HAS NOT LOOKED AT THIS COLUMN YET
                abs_column.last_updated = None
                abs_column.jx_type = es_type_to_json_type[abs_column.es_type]
                for query_path in query_paths:
                    abs_column.names[query_path[0]] = relative_field(abs_column.names["."], query_path[0])
                self.todo.add(self.meta.columns.add(abs_column))

    def query(self, _query):
        """
        RUN _query AGAINST self.meta.columns; "from" AND "sort" ARE SUPPLIED
        AS DEFAULTS (VIA set_default) TO THE DATA OF _query
        """
        return self.meta.columns.query(QueryOp(set_default(
            {
                "from": self.meta.columns,
                "sort": ["table", "name"]
            },
            _query.__data__()
        )))

    def _find_alias(self, name):
        """
        :param name: AN ALIAS, OR AN INDEX NAME
        :return: name IF IT IS A KNOWN ALIAS, OTHERWISE WHATEVER
                 self.index_to_alias HOLDS FOR THAT INDEX
        """
        # NOTE(review): self.metadata_last_updated IS NOT ADVANCED ANYWHERE IN
        # THIS CLASS, SO THIS REFRESH LOOKS LIKE IT RUNS ON EVERY CALL - CONFIRM
        if self.metadata_last_updated < self.es_cluster.metatdata_last_updated:
            for a in self.es_cluster.get_aliases():
                self.index_to_alias[a.index] = coalesce(a.alias, a.index)
                self.alias_last_updated.setdefault(a.alias, Date.MIN)
        if name in self.alias_last_updated:
            return name
        else:
            return self.index_to_alias[name]

    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS

        :param table_name: TABLE WE WANT COLUMNS FOR
        :param column_name: OPTIONAL NAME, IF INTERESTED IN ONLY ONE COLUMN
        :param force: True TO RELOAD THE COLUMNS FROM ES, EVEN IF NOT STALE
        :return: COLUMNS SORTED BY "names.\\."
        """
        table_path = split_field(table_name)
        root_table_name = table_path[0]

        alias = self._find_alias(root_table_name)
        if not alias:
            # MAYBE THE TABLE IS NEW: REFRESH THE CLUSTER METADATA AND TRY ONCE MORE
            self.es_cluster.get_metadata(force=True)
            alias = self._find_alias(root_table_name)
            if not alias:
                Log.error("{{table|quote}} does not exist", table=table_name)

        try:
            # MOST RECENT CHANGE AMONG ALL THE INDEXES OF THIS ALIAS
            last_update = MAX([
                self.es_cluster.index_last_updated[i]
                for i in self.index_to_alias.get_domain(alias)
            ])

            table = self.get_table(alias)[0]
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = TableDesc(name=alias, url=None, query_path=['.'], timestamp=Date.MIN)
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._reload_columns(table)
            elif force or table.timestamp < last_update:
                self._reload_columns(table)

            columns = self.meta.columns.find(alias, column_name)
            columns = jx.sort(columns, "names.\\.")
            # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
            while len(self.todo) and not all(columns.get("last_updated")):
                if DEBUG:
                    if len(columns) > 10:
                        Log.note("waiting for {{num}} columns to update", num=len([c for c in columns if not c.last_updated]))
                    else:
                        Log.note(
                            "waiting for columns to update {{columns|json}}",
                            columns=[c.es_index + "." + c.es_column for c in columns if not c.last_updated]
                        )
                Till(seconds=1).wait()
            return columns
        except Exception as e:
            Log.error("Not expected", cause=e)

        # NOTE(review): PRESUMABLY UNREACHABLE, ASSUMING Log.error() RAISES - CONFIRM
        return []

    def _update_cardinality(self, column):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN

        THE RESULT IS WRITTEN TO self.meta.columns; NOTHING IS RETURNED.
        PARTITIONS ARE ONLY COLLECTED FOR COLUMNS WITH FEW DISTINCT VALUES
        (SEE THE THRESHOLDS BELOW), OTHERWISE THEY ARE CLEARED.

        :param column: THE COLUMN TO UPDATE (MUST NOT BE A STRUCT TYPE)
        """
        if column.es_index in self.index_does_not_exist:
            return

        if column.jx_type in STRUCT:
            Log.error("not supported")
        try:
            # THE TWO METADATA TABLES ARE IN MEMORY: NO NEED TO ASK ES
            if column.es_index == "meta.columns":
                partitions = jx.sort([
                    g[column.es_column]
                    for g, _ in jx.groupby(self.meta.columns, column.es_column)
                    if g[column.es_column] != None
                ])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.columns),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            if column.es_index == "meta.tables":
                partitions = jx.sort([
                    g[column.es_column]
                    for g, _ in jx.groupby(self.meta.tables, column.es_column)
                    if g[column.es_column] != None
                ])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.tables),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return

            es_index = column.es_index.split(".")[0]

            is_text = [
                cc
                for cc in self.meta.columns
                if cc.es_column == column.es_column and cc.es_type == "text"
            ]
            if is_text:
                # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": {"filter": {"match_all": {}}}
                    },
                    "size": 0
                })
                count = result.hits.total
                cardinality = 1001
                multi = 1001
            elif column.es_column == "_id":
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "query": {"match_all": {}},
                    "size": 0
                })
                count = cardinality = result.hits.total
                multi = 1
            elif column.es_type == BOOLEAN:
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": _counting_query(column)
                    },
                    "size": 0
                })
                count = result.hits.total
                cardinality = 2
                multi = 1
            else:
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": _counting_query(column),
                        # "multi" IS THE MAX NUMBER OF VALUES FOUND IN ONE DOCUMENT
                        "multi": {"max": {"script": "doc[" + quote(column.es_column) + "].values.size()"}}
                    },
                    "size": 0
                })
                agg_results = result.aggregations
                count = result.hits.total
                cardinality = coalesce(agg_results.count.value, agg_results.count._nested.value, agg_results.count.doc_count)
                multi = int(coalesce(agg_results.multi.value, 1))
                # `== None` IS KEPT AS WRITTEN (NOT `is None`); PRESUMABLY THE
                # VALUE CAN BE THE PROJECT'S Null OBJECT - CONFIRM BEFORE CHANGING
                if cardinality == None:
                    Log.error("logic error")

            query = Data(size=0)

            if column.es_column == "_id":
                self.meta.columns.update({
                    "set": {
                        "count": cardinality,
                        "cardinality": cardinality,
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
                # TOO MANY, OR (ALMOST) UNIQUE, VALUES: DO NOT COLLECT PARTITIONS
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                # NUMERIC COLUMNS GET A LOWER LIMIT BEFORE PARTITIONS ARE SKIPPED
                # (THE NOTE NOW NAMES TABLE AND COLUMN, SAME AS THE BRANCH ABOVE)
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif len(column.nested_path) != 1:
                query.aggs["_"] = {
                    "nested": {"path": column.nested_path[0]},
                    "aggs": {"_nested": {"terms": {"field": column.es_column}}}
                }
            elif cardinality == 0:
                query.aggs["_"] = {"terms": {"field": column.es_column}}
            else:
                query.aggs["_"] = {"terms": {"field": column.es_column, "size": cardinality}}

            # SECOND QUERY: COLLECT THE DISTINCT VALUES (THE PARTITIONS)
            result = self.es_cluster.post("/" + es_index + "/_search", data=query)

            aggs = result.aggregations._
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "partitions": parts,
                    "last_updated": Date.now()
                },
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
        except Exception as e:
            # CAN NOT IMPORT: THE TEST MODULES SETS UP LOGGING
            # from tests.test_jx import TEST_TABLE
            TEST_TABLE = "testdata"
            is_missing_index = any(w in e for w in ["IndexMissingException", "index_not_found_exception"])
            is_test_table = column.es_index.startswith((TEST_TABLE_PREFIX, TEST_TABLE))
            if is_missing_index and is_test_table:
                # WE EXPECT TEST TABLES TO DISAPPEAR
                # FORGET EVERY COLUMN OF THE MISSING INDEX, AND DO NOT ASK AGAIN
                self.meta.columns.update({
                    "clear": ".",
                    "where": {"eq": {"es_index": column.es_index}}
                })
                self.index_does_not_exist.add(column.es_index)
            else:
                # UNKNOWN PROBLEM: CLEAR THE STATISTICS, BUT MARK THE COLUMN AS UPDATED
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {"eq": {"names.\\.": ".", "es_index": column.es_index, "es_column": column.es_column}}
                })
                Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)

    def monitor(self, please_stop):
        """
        WORKER LOOP (USED WHEN ENABLE_META_SCAN): KEEP COLUMN METADATA FRESH

        WHEN self.todo IS EMPTY IT IS REFILLED WITH THE NON-STRUCT COLUMNS THAT
        WERE NEVER UPDATED, OR ARE OLDER THAN TOO_OLD; EACH COLUMN POPPED IS
        GIVEN TO _update_cardinality()

        :param please_stop: SIGNAL TO STOP THE LOOP
        """
        # WAKE UP THE BLOCKING pop() BELOW WHEN ASKED TO STOP
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    old_columns = [
                        c
                        for c in self.meta.columns
                        if (c.last_updated == None or c.last_updated < Date.now() - TOO_OLD) and c.jx_type not in STRUCT
                    ]
                    if old_columns:
                        DEBUG and Log.note(
                            "Old columns {{names|json}} last updated {{dates|json}}",
                            names=wrap(old_columns).es_column,
                            dates=[Date(t).format() for t in wrap(old_columns).last_updated]
                        )
                        self.todo.extend(old_columns)
                        # TEST CONSISTENCY
                        # (NO TWO DIFFERENT OBJECTS IN THE QUEUE MAY DESCRIBE THE SAME COLUMN)
                        for c, d in product(list(self.todo.queue), list(self.todo.queue)):
                            if c.es_column == d.es_column and c.es_index == d.es_index and c != d:
                                Log.error(
                                    "todo queue has two different columns for {{table}}.{{column}}",
                                    table=c.es_index,
                                    column=c.es_column
                                )
                    else:
                        DEBUG and Log.note("no more metatdata to update")

                column = self.todo.pop(Till(seconds=(10 * MINUTE).seconds))
                if column:
                    if column is THREAD_STOP:
                        continue

                    DEBUG and Log.note("update {{table}}.{{column}}", table=column.es_index, column=column.es_column)
                    if column.es_index in self.index_does_not_exist:
                        self.meta.columns.update({
                            "clear": ".",
                            "where": {"eq": {"es_index": column.es_index}}
                        })
                        continue
                    if column.jx_type in STRUCT or column.es_column.endswith("." + EXISTS_TYPE):
                        column.last_updated = Date.now()
                        continue
                    elif column.last_updated >= Date.now() - TOO_OLD:
                        # STILL FRESH
                        continue
                    try:
                        self._update_cardinality(column)
                        (DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX)) and Log.note("updated {{column.name}}", column=column)
                    except Exception as e:
                        Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)

    def not_monitor(self, please_stop):
        """
        WORKER LOOP (USED WHEN NOT ENABLE_META_SCAN): DRAIN self.todo WITHOUT
        ASKING ES; EVERY STALE COLUMN HAS ITS STATISTICS CLEARED AND IS
        MARKED AS UPDATED NOW

        :param please_stop: SIGNAL TO STOP THE LOOP
        """
        Log.alert("metadata scan has been disabled")
        # WAKE UP THE BLOCKING pop() BELOW WHEN ASKED TO STOP
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            c = self.todo.pop()
            if c == THREAD_STOP:
                break

            if c.last_updated >= Date.now() - TOO_OLD:
                continue

            self.meta.columns.update({
                "set": {
                    "last_updated": Date.now()
                },
                "clear": [
                    "count",
                    "cardinality",
                    "multi",
                    "partitions",
                ],
                "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
            })
            DEBUG and Log.note("Did not get {{col.es_index}}.{{col.es_column}} info", col=c)

    def get_table(self, alias_name):
        """
        :param alias_name: TABLE NAME
        :return: THE (WRAPPED) LIST OF TABLES IN self.meta.tables WITH THAT NAME
        """
        with self.meta.tables.locker:
            return wrap([t for t in self.meta.tables.data if t.name == alias_name])

    def get_snowflake(self, fact_table_name):
        """
        :return: A Snowflake FOR fact_table_name, BACKED BY THIS NAMESPACE
        """
        return Snowflake(fact_table_name, self)

    def get_schema(self, name):
        """
        :param name: TABLE NAME, OPTIONALLY FOLLOWED BY A PATH INTO THE TABLE
        :return: SCHEMA OF self.meta.columns FOR "meta.columns", OTHERWISE THE
                 SCHEMA THE SNOWFLAKE OF THE ROOT TABLE GIVES FOR THE REST OF THE PATH
        """
        if name == "meta.columns":
            return self.meta.columns.schema
        query_path = split_field(name)
        root, rest = query_path[0], join_field(query_path[1:])
        return self.get_snowflake(root).get_schema(rest)
class FromESMetadata(Schema):
    """
    QUERY THE METADATA

    PROCESS-WIDE SINGLETON (SEE __new__) THAT KEEPS `meta.tables` AND
    `meta.columns` CONTAINERS, AND A BACKGROUND THREAD THAT REFRESHES
    COLUMN STATISTICS (count, cardinality, multi, partitions)
    """

    def __new__(cls, *args, **kwargs):
        """
        RETURN THE INSTANCE STORED ON jx_base_meta.singlton, CREATING IT ON FIRST USE
        """
        if jx_base_meta.singlton:
            return jx_base_meta.singlton
        else:
            jx_base_meta.singlton = object.__new__(cls)
            return jx_base_meta.singlton

    @override
    def __init__(self,
                 host,
                 index,
                 sql_file='metadata.sqlite',
                 alias=None,
                 name=None,
                 port=9200,
                 kwargs=None):
        """
        SETUP THE CONTAINERS AND START THE REFRESH THREAD

        :param index: USED AS THE DEFAULT NAME WHEN name AND alias ARE MISSING
        :param alias: USED AS THE DEFAULT NAME WHEN name IS MISSING
        :param name: PREFERRED DEFAULT NAME
        :param kwargs: SETTINGS, KEPT AS self.settings AND PASSED TO elasticsearch.Cluster
        (host, sql_file AND port ARE NOT READ DIRECTLY BY THIS METHOD)
        """
        # __new__ MAY RETURN AN ALREADY-INITIALIZED SINGLETON; DO NOT SET UP TWICE
        if hasattr(self, "settings"):
            return

        self.too_old = TOO_OLD
        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.default_es = elasticsearch.Cluster(kwargs=kwargs)
        self.index_does_not_exist = set()
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.es_metadata = Null
        self.abs_columns = set()
        self.last_es_metadata = Date.now() - OLD_METADATA

        self.meta = Data()
        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.meta.tables = ListContainer(
            "meta.tables", [], wrap({c.names["."]: c for c in table_columns}))
        self.meta.columns = ColumnList()
        self.meta.columns.insert(column_columns)
        self.meta.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return

    @property
    def query_path(self):
        return None

    @property
    def url(self):
        return self.default_es.path + "/" + self.default_name.replace(".", "/")

    def get_table(self, table_name):
        """
        RETURN (WRAPPED) LIST OF TABLES IN meta.tables WITH THE GIVEN NAME
        """
        with self.meta.tables.locker:
            return wrap(
                [t for t in self.meta.tables.data if t.name == table_name])

    def _upsert_column(self, c):
        """
        MERGE c INTO A MATCHING EXISTING COLUMN, OR ADD IT; EITHER WAY THE
        COLUMN IS QUEUED FOR A STATISTICS REFRESH
        """
        # ASSUMING THE self.meta.columns.locker IS HAD
        existing_columns = self.meta.columns.find(c.es_index, c.names["."])
        for canonical in existing_columns:
            if canonical.type == c.type and canonical is not c:
                set_default(c.names, canonical.names)
                for key in Column.__slots__:
                    canonical[key] = c[key]
                if DEBUG:
                    Log.note("todo: {{table}}::{{column}}",
                             table=canonical.es_index,
                             column=canonical.es_column)
                self.todo.add(canonical)
                break
        else:
            # NO EXISTING COLUMN WAS UPDATED
            self.meta.columns.add(c)
            self.todo.add(c)

            if ENABLE_META_SCAN:
                if DEBUG:
                    Log.note("todo: {{table}}::{{column}}",
                             table=c.es_index,
                             column=c.es_column)
                # MARK meta.columns AS DIRTY TOO
                cols = self.meta.columns.find("meta.columns", None)
                for cc in cols:
                    cc.partitions = cc.cardinality = None
                    cc.last_updated = Date.now() - TOO_OLD
                self.todo.extend(cols)

    def _get_columns(self, table=None):
        """
        LOAD THE ES MAPPINGS FOR THE INDEX NAMED BY THE FIRST STEP OF table,
        AND REGISTER ALL THE COLUMNS FOUND
        """
        # TODO: HANDLE MORE THEN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
        table_path = split_field(table)
        es_index = table_path[0]
        meta = self.es_metadata.indices[es_index]
        if not meta or self.last_es_metadata < Date.now() - OLD_METADATA:
            self.es_metadata = self.default_es.get_metadata(force=True)
            meta = self.es_metadata.indices[es_index]

        for data_type, properties in meta.mappings.items():
            if data_type == "_default_":
                continue
            properties.properties["_id"] = {
                "type": "string",
                "index": "not_analyzed"
            }
            self._parse_properties(meta.index, properties, meta)

    def _parse_properties(self, abs_index, properties, meta):
        """
        CONVERT ES MAPPING properties INTO COLUMNS, AND UPSERT THEM FOR THE
        INDEX AND EVERY ONE OF ITS ALIASES (meta.aliases)
        """

        # IT IS IMPORTANT THAT NESTED PROPERTIES NAME ALL COLUMNS, AND
        # ALL COLUMNS ARE GIVEN NAMES FOR ALL NESTED PROPERTIES
        def add_column(c, query_path):
            c.last_updated = Date.now() - TOO_OLD
            if query_path[0] != ".":
                c.names[query_path[0]] = relative_field(
                    c.names["."], query_path[0])

            with self.meta.columns.locker:
                for alias in meta.aliases:
                    c_ = copy(c)
                    c_.es_index = alias
                    self._upsert_column(c_)
                self._upsert_column(c)

        abs_columns = elasticsearch.parse_properties(abs_index, None,
                                                     properties.properties)
        self.abs_columns.update(abs_columns)
        with Timer("upserting {{num}} columns", {"num": len(abs_columns)},
                   debug=DEBUG):
            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns
                           if c.type == "nested"]
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(".")
            query_paths.append(SELF_PATH)

            # ADD RELATIVE COLUMNS
            for abs_column in abs_columns:
                abs_column = abs_column.__copy__()
                abs_column.type = es_type_to_json_type[abs_column.type]
                for query_path in query_paths:
                    add_column(abs_column, query_path)
        pass

    def query(self, _query):
        """
        RUN _query AGAINST meta.columns (DEFAULT SORT IS table, name)
        """
        return self.meta.columns.query(
            QueryOp(
                set_default(
                    {
                        "from": self.meta.columns,
                        "sort": ["table", "name"]
                    }, _query.__data__())))

    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS

        :param table_name: TABLE (FIRST STEP IS THE ES INDEX, REMAINDER IS THE QUERY PATH)
        :param column_name: OPTIONAL NAME, IF INTERESTED IN ONLY ONE COLUMN
        :param force: RELOAD THE COLUMNS FROM ES EVEN IF THE TABLE IS NOT STALE
        :return: SORTED COLUMNS; Log.error() IS CALLED WHEN NONE ARE FOUND
        """
        table_path = split_field(table_name)
        es_index_name = table_path[0]
        query_path = join_field(table_path[1:])
        table = self.get_table(es_index_name)[0]
        abs_column_name = None if column_name == None else concat_field(
            query_path, column_name)

        try:
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = Table(name=es_index_name,
                              url=None,
                              query_path=['.'],
                              timestamp=Date.now())
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._get_columns(table=es_index_name)
            elif (force or table.timestamp == None or
                  table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE):
                table.timestamp = Date.now()
                self._get_columns(table=es_index_name)

            with self.meta.columns.locker:
                columns = self.meta.columns.find(es_index_name, column_name)
            if columns:
                # ESCAPED BACKSLASH: SAME VALUE AS BEFORE, WITHOUT THE
                # INVALID "\." ESCAPE SEQUENCE
                columns = jx.sort(columns, "names.\\.")
                # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                while len(self.todo) and not all(columns.get("last_updated")):
                    if DEBUG:
                        Log.note(
                            "waiting for columns to update {{columns|json}}",
                            columns=[
                                c.es_index + "." + c.es_column
                                for c in columns
                                if not c.last_updated
                            ])
                    Till(seconds=1).wait()
                return columns
        except Exception as e:
            Log.error("Not expected", cause=e)

        if abs_column_name:
            Log.error("no columns matching {{table}}.{{column}}",
                      table=table_name,
                      column=abs_column_name)
        else:
            self._get_columns(table=table_name)  # TO TEST WHAT HAPPENED
            Log.error("no columns for {{table}}?!", table=table_name)

    def _update_cardinality(self, column):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN

        THE RESULT IS WRITTEN TO meta.columns; FAILURES ARE LOGGED AS
        WARNINGS (AND THE COLUMN STATISTICS CLEARED), NOT RAISED
        """
        if column.es_index in self.index_does_not_exist:
            return

        if column.type in STRUCT:
            Log.error("not supported")
        try:
            if column.es_index == "meta.columns":
                with self.meta.columns.locker:
                    partitions = jx.sort([
                        g[column.es_column]
                        for g, _ in jx.groupby(self.meta.columns,
                                               column.es_column)
                        if g[column.es_column] != None
                    ])
                    self.meta.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.meta.columns),
                            "cardinality": len(partitions),
                            "multi": 1,
                            "last_updated": Date.now()
                        },
                        "where": {
                            "eq": {
                                "es_index": column.es_index,
                                "es_column": column.es_column
                            }
                        }
                    })
                return
            if column.es_index == "meta.tables":
                with self.meta.columns.locker:
                    partitions = jx.sort([
                        g[column.es_column]
                        for g, _ in jx.groupby(self.meta.tables,
                                               column.es_column)
                        if g[column.es_column] != None
                    ])
                    self.meta.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.meta.tables),
                            "cardinality": len(partitions),
                            "multi": 1,
                            "last_updated": Date.now()
                        },
                        "where": {
                            "eq": {
                                "es_index": column.es_index,
                                "es_column": column.es_column
                            }
                        }
                    })
                return

            es_index = column.es_index.split(".")[0]

            is_text = [
                cc for cc in self.abs_columns
                if cc.es_column == column.es_column and cc.type == "text"
            ]
            if is_text:
                # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
                result = self.default_es.post(
                    "/" + es_index + "/_search",
                    data={
                        "aggs": {
                            "count": {
                                "filter": {
                                    "match_all": {}
                                }
                            }
                        },
                        "size": 0
                    })
                count = result.hits.total
                cardinality = 1001
                multi = 1001
            elif column.es_column == "_id":
                result = self.default_es.post(
                    "/" + es_index + "/_search",
                    data={
                        "query": {
                            "match_all": {}
                        },
                        "size": 0
                    })
                count = cardinality = result.hits.total
                multi = 1
            else:
                result = self.default_es.post(
                    "/" + es_index + "/_search",
                    data={
                        "aggs": {
                            "count": _counting_query(column),
                            "multi": {
                                "max": {
                                    "script":
                                        "doc[" + quote(column.es_column) +
                                        "].values.size()"
                                }
                            }
                        },
                        "size": 0
                    })
                r = result.aggregations.count
                count = result.hits.total
                cardinality = coalesce(r.value, r._nested.value, r.doc_count)
                # "multi" IS REQUESTED AS A SIBLING OF "count", SO IT MUST BE
                # READ FROM THE TOP-LEVEL AGGREGATIONS (NOT FROM INSIDE "count")
                multi = int(coalesce(result.aggregations.multi.value, 1))
                if cardinality == None:
                    Log.error("logic error")

            query = Data(size=0)

            if column.es_column == "_id":
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": cardinality,
                            "cardinality": cardinality,
                            "multi": 1,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {
                            "eq": {
                                "es_index": column.es_index,
                                "es_column": column.es_column
                            }
                        }
                    })
                return
            elif (cardinality > 1000 or
                  (count >= 30 and cardinality == count) or
                  (count >= 1000 and cardinality / count > 0.99)):
                # TOO MANY DISTINCT VALUES TO KEEP PARTITIONS
                if DEBUG:
                    Log.note("{{table}}.{{field}} has {{num}} parts",
                             table=column.es_index,
                             field=column.es_column,
                             num=cardinality)
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "multi": multi,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {
                            "eq": {
                                "es_index": column.es_index,
                                "es_column": column.es_column
                            }
                        }
                    })
                return
            elif (column.type in elasticsearch.ES_NUMERIC_TYPES and
                  cardinality > 30):
                if DEBUG:
                    Log.note("{{field}} has {{num}} parts",
                             field=column.es_index,
                             num=cardinality)
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "multi": multi,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {
                            "eq": {
                                "es_index": column.es_index,
                                "es_column": column.es_column
                            }
                        }
                    })
                return
            elif len(column.nested_path) != 1:
                query.aggs["_"] = {
                    "nested": {
                        "path": column.nested_path[0]
                    },
                    "aggs": {
                        "_nested": {
                            "terms": {
                                "field": column.es_column
                            }
                        }
                    }
                }
            elif cardinality == 0:
                query.aggs["_"] = {"terms": {"field": column.es_column}}
            else:
                query.aggs["_"] = {
                    "terms": {
                        "field": column.es_column,
                        "size": cardinality
                    }
                }

            result = self.default_es.post("/" + es_index + "/_search",
                                          data=query)

            aggs = result.aggregations._
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            if DEBUG:
                Log.note("{{field}} has {{parts}}",
                         field=column.names["."],
                         parts=parts)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "partitions": parts,
                        "last_updated": Date.now()
                    },
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
        except Exception as e:
            # CAN NOT IMPORT: THE TEST MODULES SETS UP LOGGING
            # from tests.test_jx import TEST_TABLE
            TEST_TABLE = "testdata"
            is_missing_index = any(
                w in e
                for w in ["IndexMissingException", "index_not_found_exception"])
            is_test_table = any(
                column.es_index.startswith(t)
                for t in [TEST_TABLE_PREFIX, TEST_TABLE])
            if is_missing_index and is_test_table:
                # WE EXPECT TEST TABLES TO DISAPPEAR
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "clear": ".",
                        "where": {
                            "eq": {
                                "es_index": column.es_index
                            }
                        }
                    })
                self.index_does_not_exist.add(column.es_index)
            else:
                # TAKE THE LOCKER, LIKE EVERY OTHER UPDATE IN THIS METHOD
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "last_updated": Date.now()
                        },
                        "clear": [
                            "count",
                            "cardinality",
                            "multi",
                            "partitions",
                        ],
                        "where": {
                            "eq": {
                                "names.\\.": ".",
                                "es_index": column.es_index,
                                "es_column": column.es_column
                            }
                        }
                    })
                Log.warning(
                    "Could not get {{col.es_index}}.{{col.es_column}} info",
                    col=column,
                    cause=e)

    def monitor(self, please_stop):
        """
        THREAD BODY: REFRESH STATISTICS OF QUEUED (OR OLD) COLUMNS UNTIL please_stop
        """
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    with self.meta.columns.locker:
                        old_columns = [
                            c for c in self.meta.columns
                            if (c.last_updated == None or
                                c.last_updated < Date.now() - TOO_OLD) and
                            c.type not in STRUCT
                        ]
                        if old_columns:
                            if DEBUG:
                                Log.note(
                                    "Old columns {{names|json}} last updated {{dates|json}}",
                                    names=wrap(old_columns).es_column,
                                    dates=[
                                        Date(t).format()
                                        for t in wrap(old_columns).last_updated
                                    ])
                            self.todo.extend(old_columns)
                            # TEST CONSISTENCY
                            for c, d in product(list(self.todo.queue),
                                                list(self.todo.queue)):
                                if (c.es_column == d.es_column and
                                        c.es_index == d.es_index and c != d):
                                    Log.error("")
                        else:
                            if DEBUG:
                                Log.note("no more metatdata to update")

                column = self.todo.pop(Till(seconds=(10 * MINUTE).seconds))
                if column:
                    # THE STOP MARKER IS NOT A COLUMN; LET THE LOOP CONDITION END THE THREAD
                    if column is THREAD_STOP:
                        continue

                    # ONLY DEREFERENCE column AFTER KNOWING pop() RETURNED ONE
                    if DEBUG:
                        Log.note("update {{table}}.{{column}}",
                                 table=column.es_index,
                                 column=column.es_column)

                    if column.es_index in self.index_does_not_exist:
                        with self.meta.columns.locker:
                            self.meta.columns.update({
                                "clear": ".",
                                "where": {
                                    "eq": {
                                        "es_index": column.es_index
                                    }
                                }
                            })
                        continue
                    if column.type in STRUCT or column.es_column.endswith(
                            "." + EXISTS_TYPE):
                        with self.meta.columns.locker:
                            column.last_updated = Date.now()
                        continue
                    elif column.last_updated >= Date.now() - TOO_OLD:
                        continue
                    try:
                        self._update_cardinality(column)
                        if DEBUG and not column.es_index.startswith(
                                TEST_TABLE_PREFIX):
                            Log.note("updated {{column.name}}", column=column)
                    except Exception as e:
                        Log.warning(
                            "problem getting cardinality for {{column.name}}",
                            column=column,
                            cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)

    def not_monitor(self, please_stop):
        """
        THREAD BODY WHEN THE METADATA SCAN IS DISABLED: DRAIN THE QUEUE AND
        CLEAR THE STATISTICS, WITHOUT QUERYING ES
        """
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            c = self.todo.pop()
            if c == THREAD_STOP:
                break

            if not c.last_updated or c.last_updated >= Date.now() - TOO_OLD:
                continue

            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {
                        "eq": {
                            "es_index": c.es_index,
                            "es_column": c.es_column
                        }
                    }
                })
            if DEBUG:
                Log.note(
                    "Could not get {{col.es_index}}.{{col.es_column}} info",
                    col=c)
class ElasticsearchMetadata(Namespace):
    """
    MANAGE SNOWFLAKE SCHEMAS FOR EACH OF THE ALIASES FOUND IN THE CLUSTER
    """

    @override
    def __new__(cls, kwargs, *args, **_kwargs):
        """
        RETURN THE INSTANCE ALREADY REGISTERED IN known_clusters FOR THIS
        CLUSTER OBJECT, OR MAKE (AND REGISTER) A NEW ONE
        """
        es_cluster = elasticsearch.Cluster(kwargs)
        output = known_clusters.get(id(es_cluster))
        if output is None:
            output = object.__new__(cls)
            known_clusters[id(es_cluster)] = output
        return output

    @override
    def __init__(self,
                 host,
                 index,
                 sql_file='metadata.sqlite',
                 alias=None,
                 name=None,
                 port=9200,
                 kwargs=None):
        """
        SETUP THE CONTAINERS AND START THE REFRESH THREAD

        :param index: USED AS THE DEFAULT NAME WHEN name AND alias ARE MISSING
        :param alias: USED AS THE DEFAULT NAME WHEN name IS MISSING
        :param name: PREFERRED DEFAULT NAME
        :param kwargs: SETTINGS, KEPT AS self.settings AND PASSED TO elasticsearch.Cluster
        (host, sql_file AND port ARE NOT READ DIRECTLY BY THIS METHOD)
        """
        # __new__ MAY RETURN AN ALREADY-INITIALIZED INSTANCE; DO NOT SET UP TWICE
        if hasattr(self, "settings"):
            return

        self.too_old = TOO_OLD
        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)
        self.index_does_not_exist = set()
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.index_to_alias = {}

        self.es_metadata = Null
        self.metadata_last_updated = Date.now() - OLD_METADATA

        self.meta = Data()
        self.meta.columns = ColumnList(URL(self.es_cluster.settings.host).host)

        self.alias_to_query_paths = {
            "meta.columns": [ROOT_PATH],
            "meta.tables": [ROOT_PATH]
        }
        self.alias_last_updated = {
            "meta.columns": Date.now(),
            "meta.tables": Date.now()
        }
        table_columns = metadata_tables()
        self.meta.tables = ListContainer("meta.tables", [],
                                         jx_base.Schema(".", table_columns))
        self.meta.columns.extend(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("not refresh metadata", self.not_monitor)
        return

    @property
    def namespace(self):
        return self.meta.columns.namespace

    @property
    def url(self):
        return self.es_cluster.url / self.default_name.replace(".", "/")

    def _reload_columns(self, table_desc):
        """
        RELOAD THE COLUMNS OF ONE TABLE FROM THE ES MAPPINGS

        :param table_desc: TABLE DESCRIPTION; table_desc.name IS A REAL ALIAS
                           (OR NAME OF INDEX THAT HAS NO ALIAS), AND
                           table_desc.timestamp IS UPDATED BY THIS CALL
        :return: THE COLUMNS REGISTERED BY _parse_properties()
        """
        # FIND ALL INDEXES OF ALIAS
        es_last_updated = self.es_cluster.metatdata_last_updated

        alias = table_desc.name
        canonical_index = self.es_cluster.get_best_matching_index(alias).index
        es_metadata_update_required = not (table_desc.timestamp <
                                           es_last_updated)
        metadata = self.es_cluster.get_metadata(
            force=es_metadata_update_required)

        props = [
            (self.es_cluster.get_index(index=i, type=t, debug=DEBUG), t,
             m.properties)
            for i, d in metadata.indices.items()
            if alias in d.aliases
            for t, m in [_get_best_type_from_mapping(d.mappings)]
        ]

        # CONFIRM ALL COLUMNS ARE SAME, FIX IF NOT
        dirty = False
        all_comparisions = list(jx.pairwise(props)) + list(
            jx.pairwise(jx.reverse(props)))
        # NOTICE THE SAME (index, type, properties) TRIPLE FROM ABOVE
        for (i1, t1, p1), (i2, t2, p2) in all_comparisions:
            diff = elasticsearch.diff_schema(p2, p1)
            if not self.settings.read_only:
                for d in diff:
                    dirty = True
                    i1.add_property(*d)
        meta = self.es_cluster.get_metadata(
            force=dirty).indices[canonical_index]

        data_type, mapping = _get_best_type_from_mapping(meta.mappings)
        mapping.properties["_id"] = {"type": "string", "index": "not_analyzed"}
        columns = self._parse_properties(alias, mapping)
        table_desc.timestamp = es_last_updated
        return columns

    def _parse_properties(self, alias, mapping):
        """
        CONVERT THE ES mapping INTO COLUMNS, RECORD THE QUERY PATHS FOR alias,
        REGISTER THE COLUMNS, AND QUEUE THEM FOR A STATISTICS REFRESH

        :return: THE CANONICAL COLUMNS, AS RETURNED BY meta.columns.add()
        """
        abs_columns = elasticsearch.parse_properties(alias, ".", ROOT_PATH,
                                                     mapping.properties)
        if DEBUG and any(
                c.cardinality == 0 and c.name != '_id' for c in abs_columns):
            Log.warning(
                "Some columns are not stored in {{url}} {{index|quote}} table:\n{{names}}",
                url=self.es_cluster.url,
                index=alias,
                names=[
                    ".".join((c.es_index, c.name))
                    for c in abs_columns
                    if c.cardinality == 0
                ])

        with Timer("upserting {{num}} columns", {"num": len(abs_columns)},
                   silent=not DEBUG):
            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column]
                           for c in abs_columns
                           if c.es_type == "nested"]
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(".")
            query_paths.append(ROOT_PATH)

            # ENSURE ALL TABLES HAVE THE QUERY PATHS SET
            self.alias_to_query_paths[alias] = query_paths
            for i, a in self.index_to_alias.items():
                if a == alias:
                    self.alias_to_query_paths[i] = query_paths

            # ENSURE COLUMN HAS CORRECT jx_type
            # PICK DEEPEST NESTED PROPERTY AS REPRESENTATIVE
            output = []
            best = {}
            for abs_column in abs_columns:
                abs_column.jx_type = jx_type(abs_column)
                if abs_column.jx_type not in STRUCT:
                    clean_name = unnest_path(abs_column.name)
                    other = best.get(clean_name)
                    if other:
                        if len(other.nested_path) < len(
                                abs_column.nested_path):
                            output.remove(other)
                            self.meta.columns.update({
                                "clear": ".",
                                "where": {
                                    "eq": {
                                        "es_column": other.es_column,
                                        "es_index": other.es_index
                                    }
                                }
                            })
                        else:
                            continue
                    best[clean_name] = abs_column
                output.append(abs_column)

            # REGISTER ALL COLUMNS
            canonicals = []
            for abs_column in output:
                canonical = self.meta.columns.add(abs_column)
                canonicals.append(canonical)

            self.todo.extend(canonicals)
            return canonicals

    def query(self, _query):
        """
        RUN _query AGAINST meta.columns (DEFAULT SORT IS table, name)
        """
        return self.meta.columns.query(
            QueryOp(
                set_default(
                    {
                        "from": self.meta.columns,
                        "sort": ["table", "name"]
                    }, _query.__data__())))

    def _find_alias(self, name):
        """
        RETURN name IF IT IS A KNOWN ALIAS, OTHERWISE THE ALIAS RECORDED FOR
        THE INDEX CALLED name (None WHEN NOT FOUND)
        """
        if self.metadata_last_updated < self.es_cluster.metatdata_last_updated:
            for a in self.es_cluster.get_aliases():
                self.index_to_alias[a.index] = coalesce(a.alias, a.index)
                self.alias_last_updated.setdefault(a.alias, Date.MIN)
        if name in self.alias_last_updated:
            return name
        else:
            return self.index_to_alias.get(name)

    def get_columns(self,
                    table_name,
                    column_name=None,
                    after=None,
                    timeout=None):
        """
        RETURN METADATA COLUMNS

        :param table_name: TABLE WE WANT COLUMNS FOR
        :param column_name: OPTIONAL NAME, IF INTERESTED IN ONLY ONE COLUMN
        :param after: FORCE LOAD, WAITING FOR last_updated TO BE AFTER THIS TIME
        :param timeout: Signal; True when should give up
        :return:
        """
        DEBUG and after and Log.note("getting columns for after {{time}}",
                                     time=after)
        table_path = split_field(table_name)
        root_table_name = table_path[0]

        alias = self._find_alias(root_table_name)
        if not alias:
            # THE TABLE MAY BE NEW; REFRESH THE CLUSTER METADATA AND TRY AGAIN
            self.es_cluster.get_metadata(force=True)
            alias = self._find_alias(root_table_name)
            if not alias:
                Log.error("{{table|quote}} does not exist", table=table_name)

        try:
            table = self.get_table(alias)[0]
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = TableDesc(name=alias,
                                  url=None,
                                  query_path=["."],
                                  timestamp=Date.MIN)
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                columns = self._reload_columns(table)
                DEBUG and Log.note("columns from reload")
            elif (after or
                  table.timestamp < self.es_cluster.metatdata_last_updated):
                columns = self._reload_columns(table)
                DEBUG and Log.note("columns from reload")
            else:
                columns = self.meta.columns.find(alias, column_name)
                DEBUG and Log.note("columns from find()")

            DEBUG and Log.note("columns are {{ids}}",
                               ids=[id(c) for c in columns])

            columns = jx.sort(columns, "name")

            if after is None:
                return columns  # DO NOT WAIT FOR COMPLETE COLUMNS

            # WAIT FOR THE COLUMNS TO UPDATE
            while True:
                pending = [
                    c for c in columns
                    if after >= c.last_updated or
                    (c.cardinality == None and c.jx_type not in STRUCT)
                ]
                if not pending:
                    break
                if timeout:
                    Log.error("trying to gets columns timed out")
                if DEBUG:
                    if len(pending) > 10:
                        Log.note(
                            "waiting for {{num}} columns to update by {{timestamp}}",
                            num=len(pending),
                            timestamp=after)
                    else:
                        Log.note(
                            "waiting for columns to update by {{timestamp}}; {{columns|json}}",
                            timestamp=after,
                            columns=[
                                c.es_index + "." + c.es_column + " id=" +
                                text_type(id(c)) for c in pending
                            ])
                Till(seconds=1).wait()
            return columns
        except Exception as e:
            Log.error("Failure to get columns for {{table}}",
                      table=table_name,
                      cause=e)

        return []

    def _update_cardinality(self, column):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN

        THE RESULT IS WRITTEN TO meta.columns; FAILURES ARE LOGGED AS
        WARNINGS (AND THE COLUMN METADATA CLEARED), NOT RAISED
        """
        now = Date.now()
        if column.es_index in self.index_does_not_exist:
            return

        if column.jx_type in STRUCT:
            Log.error("not supported")
        try:
            if column.es_index == "meta.columns":
                partitions = jx.sort([
                    g[column.es_column]
                    for g, _ in jx.groupby(self.meta.columns, column.es_column)
                    if g[column.es_column] != None
                ])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.columns),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": now
                    },
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return
            if column.es_index == "meta.tables":
                partitions = jx.sort([
                    g[column.es_column]
                    for g, _ in jx.groupby(self.meta.tables, column.es_column)
                    if g[column.es_column] != None
                ])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.tables),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": now
                    },
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return

            es_index = column.es_index.split(".")[0]

            is_text = [
                cc for cc in self.meta.columns
                if cc.es_column == column.es_column and cc.es_type == "text"
            ]
            if is_text:
                # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
                result = self.es_cluster.post(
                    "/" + es_index + "/_search",
                    data={
                        "aggs": {
                            "count": {
                                "filter": {
                                    "match_all": {}
                                }
                            }
                        },
                        "size": 0
                    })
                count = result.hits.total
                cardinality = max(1001, count)
                multi = 1001
            elif column.es_column == "_id":
                result = self.es_cluster.post(
                    "/" + es_index + "/_search",
                    data={
                        "query": {
                            "match_all": {}
                        },
                        "size": 0
                    })
                count = cardinality = result.hits.total
                multi = 1
            elif column.es_type == BOOLEAN:
                result = self.es_cluster.post(
                    "/" + es_index + "/_search",
                    data={
                        "aggs": {
                            "count": _counting_query(column)
                        },
                        "size": 0
                    })
                count = result.hits.total
                cardinality = 2

                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts",
                                   table=column.es_index,
                                   field=column.es_column,
                                   num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "partitions": [False, True],
                        "multi": 1,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return
            else:
                es_query = {
                    "aggs": {
                        "count": _counting_query(column),
                        "_filter": {
                            "aggs": {
                                "multi": {
                                    "max": {
                                        "script":
                                            "doc[" + quote(column.es_column) +
                                            "].values.size()"
                                    }
                                }
                            },
                            "filter": {
                                "bool": {
                                    "should": [{
                                        "range": {
                                            "etl.timestamp.~n~": {
                                                "gte": (Date.today() - WEEK)
                                            }
                                        }
                                    }, {
                                        "bool": {
                                            "must_not": {
                                                "exists": {
                                                    "field": "etl.timestamp.~n~"
                                                }
                                            }
                                        }
                                    }]
                                }
                            }
                        }
                    },
                    "size": 0
                }

                result = self.es_cluster.post("/" + es_index + "/_search",
                                              data=es_query)
                agg_results = result.aggregations
                count = result.hits.total
                cardinality = coalesce(agg_results.count.value,
                                       agg_results.count._nested.value,
                                       agg_results.count.doc_count)
                multi = int(coalesce(agg_results._filter.multi.value, 1))
                if cardinality == None:
                    Log.error("logic error")

            query = Data(size=0)

            if column.es_column == "_id":
                self.meta.columns.update({
                    "set": {
                        "count": cardinality,
                        "cardinality": cardinality,
                        "multi": 1,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return
            elif (cardinality > 1000 or
                  (count >= 30 and cardinality == count) or
                  (count >= 1000 and cardinality / count > 0.99)):
                # TOO MANY DISTINCT VALUES TO KEEP PARTITIONS
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts",
                                   table=column.es_index,
                                   field=column.es_column,
                                   num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return
            elif (column.es_type in elasticsearch.ES_NUMERIC_TYPES and
                  cardinality > 30):
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts",
                                   table=column.es_index,
                                   field=column.es_column,
                                   num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return
            elif len(column.nested_path) != 1:
                query.aggs["_"] = {
                    "nested": {
                        "path": column.nested_path[0]
                    },
                    "aggs": {
                        "_nested": {
                            "terms": {
                                "field": column.es_column
                            }
                        }
                    }
                }
            elif cardinality == 0:  # WHEN DOES THIS HAPPEN?
                query.aggs["_"] = {"terms": {"field": column.es_column}}
            else:
                query.aggs["_"] = {
                    "terms": {
                        "field": column.es_column,
                        "size": cardinality
                    }
                }

            result = self.es_cluster.post("/" + es_index + "/_search",
                                          data=query)

            aggs = result.aggregations._
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            DEBUG and Log.note(
                "update metadata for {{column.es_index}}.{{column.es_column}} (id={{id}}) at {{time}}",
                id=id(column),
                column=column,
                time=now)
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "partitions": parts,
                    "last_updated": now
                },
                "where": {
                    "eq": {
                        "es_index": column.es_index,
                        "es_column": column.es_column
                    }
                }
            })
        except Exception as e:
            # WRAP SO THE `in` TESTS BELOW WORK ON ANY EXCEPTION TYPE
            e = Except.wrap(e)
            is_missing_index = any(
                w in e
                for w in ["IndexMissingException", "index_not_found_exception"])
            if is_missing_index:
                # WE EXPECT TEST TABLES TO DISAPPEAR
                Log.warning("Missing index {{col.es_index}}",
                            col=column,
                            cause=e)
                self.meta.columns.update({
                    "clear": ".",
                    "where": {
                        "eq": {
                            "es_index": column.es_index
                        }
                    }
                })
                self.index_does_not_exist.add(column.es_index)
            elif "No field found for" in e:
                self.meta.columns.update({
                    "clear": ".",
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                Log.warning(
                    "Could not get column {{col.es_index}}.{{col.es_column}} info",
                    col=column,
                    cause=e)
            else:
                self.meta.columns.update({
                    "set": {
                        "last_updated": now
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                Log.warning(
                    "Could not get {{col.es_index}}.{{col.es_column}} info",
                    col=column,
                    cause=e)

    def monitor(self, please_stop):
        """
        THREAD BODY: REFRESH STATISTICS OF QUEUED (OR OLD) COLUMNS UNTIL please_stop
        """
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    old_columns = [
                        c for c in self.meta.columns
                        if ((c.last_updated < Date.now() -
                             MAX_COLUMN_METADATA_AGE) or c.cardinality == None)
                        and c.jx_type not in STRUCT
                    ]
                    if old_columns:
                        DEBUG and Log.note(
                            "Old columns {{names|json}} last updated {{dates|json}}",
                            names=wrap(old_columns).es_column,
                            dates=[
                                Date(t).format()
                                for t in wrap(old_columns).last_updated
                            ])
                        self.todo.extend(old_columns)
                    else:
                        DEBUG and Log.note("no more metatdata to update")

                column = self.todo.pop(Till(seconds=(10 * MINUTE).seconds))
                if column:
                    if column is THREAD_STOP:
                        continue

                    with Timer("update {{table}}.{{column}}",
                               param={
                                   "table": column.es_index,
                                   "column": column.es_column
                               },
                               silent=not DEBUG):
                        if column.es_index in self.index_does_not_exist:
                            DEBUG and Log.note(
                                "{{column.es_column}} does not exist",
                                column=column)
                            self.meta.columns.update({
                                "clear": ".",
                                "where": {
                                    "eq": {
                                        "es_index": column.es_index
                                    }
                                }
                            })
                            continue
                        if column.jx_type in STRUCT or split_field(
                                column.es_column)[-1] == EXISTS_TYPE:
                            DEBUG and Log.note(
                                "{{column.es_column}} is a struct",
                                column=column)
                            column.last_updated = Date.now()
                            continue
                        elif (column.last_updated > Date.now() - TOO_OLD and
                              column.cardinality is not None):
                            # DO NOT UPDATE FRESH COLUMN METADATA
                            DEBUG and Log.note(
                                "{{column.es_column}} is still fresh ({{ago}} ago)",
                                column=column,
                                ago=(Date.now() -
                                     Date(column.last_updated)).seconds)
                            continue
                        try:
                            self._update_cardinality(column)
                            (DEBUG and not column.es_index.startswith(
                                TEST_TABLE_PREFIX)) and Log.note(
                                    "updated {{column.name}}", column=column)
                        except Exception as e:
                            # WRAP FIRST: `in` ON A PLAIN EXCEPTION RAISES
                            # TypeError AND WOULD HIDE THE REAL CAUSE
                            e = Except.wrap(e)
                            if '"status":404' in e:
                                self.meta.columns.update({
                                    "clear": ".",
                                    "where": {
                                        "eq": {
                                            "es_index": column.es_index,
                                            "es_column": column.es_column
                                        }
                                    }
                                })
                            else:
                                Log.warning(
                                    "problem getting cardinality for {{column.name}}",
                                    column=column,
                                    cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)

    def not_monitor(self, please_stop):
        """
        THREAD BODY WHEN THE METADATA SCAN IS DISABLED: DRAIN THE QUEUE, ONLY
        QUERYING ES FOR THE "build.type" AND "run.type" COLUMNS
        """
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            column = self.todo.pop()
            if column == THREAD_STOP:
                break

            if column.jx_type in STRUCT or split_field(
                    column.es_column)[-1] == EXISTS_TYPE:
                DEBUG and Log.note("{{column.es_column}} is a struct",
                                   column=column)
                column.last_updated = Date.now()
                continue
            elif (column.last_updated > Date.now() - TOO_OLD and
                  column.cardinality is not None):
                # DO NOT UPDATE FRESH COLUMN METADATA
                DEBUG and Log.note(
                    "{{column.es_column}} is still fresh ({{ago}} ago)",
                    column=column,
                    ago=(Date.now() - Date(column.last_updated)).seconds)
                continue

            with Timer("Update {{col.es_index}}.{{col.es_column}}",
                       param={"col": column},
                       silent=not DEBUG,
                       too_long=0.05):
                if untype_path(column.name) in ["build.type", "run.type"]:
                    try:
                        self._update_cardinality(column)
                    except Exception as e:
                        Log.warning(
                            "problem getting cardinality for {{column.name}}",
                            column=column,
                            cause=e)
                else:
                    column.last_updated = Date.now()

    def get_table(self, name):
        """
        RETURN meta.columns FOR "meta.columns", OTHERWISE THE (WRAPPED) LIST
        OF TABLES IN meta.tables WITH THE GIVEN NAME
        """
        if name == "meta.columns":
            return self.meta.columns

        with self.meta.tables.locker:
            return wrap([t for t in self.meta.tables.data if t.name == name])

    def get_snowflake(self, fact_table_name):
        return Snowflake(fact_table_name, self)

    def get_schema(self, name):
        """
        RETURN THE SCHEMA FOR name; THE FIRST STEP OF name PICKS THE
        SNOWFLAKE, THE REMAINDER IS PASSED TO ITS get_schema()
        """
        if name == "meta.columns":
            return self.meta.columns.schema
        if name == "meta.tables":
            return self.meta.tables
        root, rest = tail_field(name)
        return self.get_snowflake(root).get_schema(rest)