class StructuredLogger_usingThread(StructuredLogger):
    def __init__(self, logger):
        if not isinstance(logger, StructuredLogger):
            Log.error("Expecting a StructuredLogger")

        self.queue = Queue(
            "Queue for " + self.__class__.__name__,
            max=10000,
            silent=True,
            allow_add_after_close=True,
        )
        self.logger = logger

        def worker(logger, please_stop):
            try:
                while not please_stop:
                    logs = self.queue.pop_all()
                    if not logs:
                        (Till(seconds=1) | please_stop).wait()
                        continue
                    for log in logs:
                        if log is THREAD_STOP:
                            please_stop.go()
                        else:
                            logger.write(**log)
            except Exception as e:
                print("problem in " + StructuredLogger_usingThread.__name__ + ": " + str(e))
            finally:
                Log.note("stop the child")
                logger.stop()

        self.thread = Thread("Thread for " + self.__class__.__name__, worker, logger)
        self.thread.parent.remove_child(self.thread)  # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
        self.thread.start()

    def write(self, template, params):
        try:
            self.queue.add({"template": template, "params": params})
            return self
        except Exception as e:
            e = Except.wrap(e)
            raise e  # OH NO!

    def stop(self):
        try:
            self.queue.add(THREAD_STOP)  # BE PATIENT, LET REST OF MESSAGE BE SENT
            self.thread.join()
            Log.note("joined on thread")
        except Exception as e:
            Log.note("problem in threaded logger: " + str(e))
        with suppress_exception:
            self.queue.close()
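# ------------------------------------------------------------------------------
# Usage sketch (added for illustration, not part of the original source).
# StructuredLogger_usingThread decouples callers from a slow destination: write()
# only enqueues, and the wrapped logger is serviced by the background worker.
# `_ConsoleLogger` below is a hypothetical stand-in for any concrete StructuredLogger.
def _example_threaded_logger():
    class _ConsoleLogger(StructuredLogger):
        def write(self, template, params):
            print(template, params)

        def stop(self):
            pass

    log = StructuredLogger_usingThread(_ConsoleLogger())
    log.write("hello {{name}}", {"name": "world"})  # returns immediately; handled by the worker
    log.stop()  # drains the queue, joins the worker thread, stops the child logger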
class StructuredLogger_usingElasticSearch(StructuredLogger):
    @override
    def __init__(
        self,
        host,
        index,
        port=9200,
        type="log",
        queue_size=1000,
        batch_size=100,
        kwargs=None,
    ):
        """
        settings ARE FOR THE ELASTICSEARCH INDEX
        """
        kwargs.timeout = Duration(coalesce(kwargs.timeout, "30second")).seconds
        kwargs.retry.times = coalesce(kwargs.retry.times, 3)
        kwargs.retry.sleep = Duration(coalesce(kwargs.retry.sleep, MINUTE)).seconds
        kwargs.host = Random.sample(listwrap(host), 1)[0]

        rollover_interval = coalesce(kwargs.rollover.interval, kwargs.rollover.max, "year")
        rollover_max = coalesce(kwargs.rollover.max, kwargs.rollover.interval, "year")

        schema = set_default(
            kwargs.schema,
            {"mappings": {kwargs.type: {"properties": {"~N~": {"type": "nested"}}}}},
            json2value(value2json(SCHEMA), leaves=True),
        )

        self.es = RolloverIndex(
            rollover_field={"get": [{"first": "."}, {"literal": "timestamp"}]},
            rollover_interval=rollover_interval,
            rollover_max=rollover_max,
            schema=schema,
            limit_replicas=True,
            typed=True,
            read_only=False,
            kwargs=kwargs,
        )
        self.batch_size = batch_size
        self.queue = Queue("debug logs to es", max=queue_size, silent=True)
        self.worker = Thread.run("add debug logs to es", self._insert_loop)

    def write(self, template, params):
        try:
            params.template = strings.limit(params.template, 2000)
            params.format = None
            self.queue.add({"value": _deep_json_to_string(params, 3)}, timeout=3 * 60)
        except Exception as e:
            sys.stdout.write(text(Except.wrap(e)))
        return self

    def _insert_loop(self, please_stop=None):
        bad_count = 0
        while not please_stop:
            try:
                messages = wrap(self.queue.pop_all())
                if not messages:
                    Till(seconds=PAUSE_AFTER_GOOD_INSERT).wait()
                    continue

                for g, mm in jx.chunk(messages, size=self.batch_size):
                    scrubbed = []
                    for i, message in enumerate(mm):
                        if message is THREAD_STOP:
                            please_stop.go()
                            continue
                        try:
                            chain = flatten_causal_chain(message.value)
                            scrubbed.append(
                                {"value": [_deep_json_to_string(link, depth=3) for link in chain]}
                            )
                        except Exception as e:
                            Log.warning("Problem adding to scrubbed list", cause=e)
                    self.es.extend(scrubbed)
                    bad_count = 0
            except Exception as f:
                Log.warning("Problem inserting logs into ES", cause=f)
                bad_count += 1
                if bad_count > MAX_BAD_COUNT:
                    Log.warning(
                        "Given up trying to write debug logs to ES index {{index}}",
                        index=self.es.settings.index,
                    )
                    break
                Till(seconds=PAUSE_AFTER_BAD_INSERT).wait()

        # CONTINUE TO DRAIN THIS QUEUE
        while not please_stop:
            try:
                Till(seconds=PAUSE_AFTER_GOOD_INSERT).wait()
                self.queue.pop_all()
            except Exception as e:
                Log.warning("Should not happen", cause=e)

    def stop(self):
        with suppress_exception:
            self.queue.add(THREAD_STOP)  # BE PATIENT, LET REST OF MESSAGE BE SENT
        with suppress_exception:
            self.queue.close()
        self.worker.join()
class ColumnList(Table, jx_base.Container):
    """
    OPTIMIZED FOR THE PARTICULAR ACCESS PATTERNS USED
    """

    def __init__(self, es_cluster):
        Table.__init__(self, META_COLUMNS_NAME)
        self.data = {}  # MAP FROM ES_INDEX TO (abs_column_name to COLUMNS)
        self.locker = Lock()
        self._schema = None
        self.dirty = False
        self.es_cluster = es_cluster
        self.es_index = None
        self.last_load = Null
        self.todo = Queue(
            "update columns to es"
        )  # HOLD (action, column) PAIR, WHERE action in ['insert', 'update']
        self._db_load()
        Thread.run(
            "update " + META_COLUMNS_NAME, self._update_from_es, parent_thread=MAIN_THREAD
        )

    def _query(self, query):
        result = Data()
        curr = self.es_cluster.execute(query)
        result.meta.format = "table"
        result.header = [d[0] for d in curr.description] if curr.description else None
        result.data = curr.fetchall()
        return result

    def _db_create(self):
        schema = {
            "settings": {"index.number_of_shards": 1, "index.number_of_replicas": 6},
            "mappings": {META_COLUMNS_TYPE_NAME: {}},
        }

        self.es_index = self.es_cluster.create_index(
            id=ID, index=META_COLUMNS_NAME, schema=schema
        )
        self.es_index.add_alias(META_COLUMNS_NAME)

        for c in META_COLUMNS_DESC.columns:
            self._add(c)
            self.es_index.add({"value": c.__dict__()})

    def _db_load(self):
        self.last_load = Date.now()

        try:
            self.es_index = self.es_cluster.get_index(
                id=ID, index=META_COLUMNS_NAME, type=META_COLUMNS_TYPE_NAME, read_only=False
            )

            result = self.es_index.search(
                {
                    "query": {
                        "bool": {
                            "should": [
                                {
                                    "bool": {
                                        "must_not": {"exists": {"field": "cardinality.~n~"}}
                                    }
                                },
                                {
                                    # ASSUME UNUSED COLUMNS DO NOT EXIST
                                    "range": {"cardinality.~n~": {"gt": 0}}
                                },
                            ]
                        }
                    },
                    "sort": ["es_index.~s~", "name.~s~", "es_column.~s~"],
                    "size": 10000,
                }
            )

            Log.note("{{num}} columns loaded", num=result.hits.total)
            with self.locker:
                for r in result.hits.hits._source:
                    self._add(doc_to_column(r))

        except Exception as e:
            Log.warning("no {{index}} exists, making one", index=META_COLUMNS_NAME, cause=e)
            self._db_create()

    def _update_from_es(self, please_stop):
        try:
            last_extract = Date.now()
            while not please_stop:
                now = Date.now()
                try:
                    if (now - last_extract).seconds > COLUMN_EXTRACT_PERIOD:
                        result = self.es_index.search(
                            {
                                "query": {
                                    "range": {"last_updated.~n~": {"gte": self.last_load}}
                                },
                                "sort": ["es_index.~s~", "name.~s~", "es_column.~s~"],
                                "from": 0,
                                "size": 10000,
                            }
                        )
                        last_extract = now

                        with self.locker:
                            for r in result.hits.hits._source:
                                c = doc_to_column(r)
                                self._add(c)
                                self.last_load = MAX((self.last_load, c.last_updated))

                    while not please_stop:
                        updates = self.todo.pop_all()
                        if not updates:
                            break

                        DEBUG and updates and Log.note(
                            "{{num}} columns to push to db", num=len(updates)
                        )
                        self.es_index.extend(
                            [{"value": column.__dict__()} for column in updates]
                        )
                except Exception as e:
                    Log.warning("problem updating database", cause=e)

                (Till(seconds=COLUMN_LOAD_PERIOD) | please_stop).wait()
        finally:
            Log.note("done")

    def find(self, es_index, abs_column_name=None):
        with self.locker:
            if es_index.startswith("meta."):
                self._update_meta()

            if not abs_column_name:
                return [c for cs in self.data.get(es_index, {}).values() for c in cs]
            else:
                return self.data.get(es_index, {}).get(abs_column_name, [])

    def extend(self, columns):
        self.dirty = True
        with self.locker:
            for column in columns:
                self._add(column)

    def add(self, column):
        self.dirty = True
        with self.locker:
            canonical = self._add(column)
        if canonical == None:
            return column  # ALREADY ADDED
        self.todo.add(canonical)
        return canonical

    def remove_table(self, table_name):
        del self.data[table_name]

    def _add(self, column):
        """
        :param column: ANY COLUMN OBJECT
        :return: None IF column IS canonical ALREADY (NET-ZERO EFFECT)
        """
        columns_for_table = self.data.setdefault(column.es_index, {})
        existing_columns = columns_for_table.setdefault(column.name, [])

        for canonical in existing_columns:
            if canonical is column:
                return None
            if canonical.es_type == column.es_type:
                if column.last_updated > canonical.last_updated:
                    for key in Column.__slots__:
                        old_value = canonical[key]
                        new_value = column[key]
                        if new_value == old_value:
                            pass  # NO NEED TO UPDATE WHEN NO CHANGE MADE (COMMON CASE)
                        else:
                            canonical[key] = new_value
                return canonical
        existing_columns.append(column)
        return column

    def _update_meta(self):
        if not self.dirty:
            return

        now = Date.now()
        for mc in META_COLUMNS_DESC.columns:
            count = 0
            values = set()
            objects = 0
            multi = 1
            for column in self._all_columns():
                value = column[mc.name]
                if value == None:
                    pass
                else:
                    count += 1
                    if is_list(value):
                        multi = max(multi, len(value))
                        try:
                            values |= set(value)
                        except Exception:
                            objects += len(value)
                    elif is_data(value):
                        objects += 1
                    else:
                        values.add(value)
            mc.count = count
            mc.cardinality = len(values) + objects
            mc.partitions = jx.sort(values)
            mc.multi = multi
            mc.last_updated = now
        META_COLUMNS_DESC.last_updated = now
        self.dirty = False

    def _all_columns(self):
        return [
            column
            for t, cs in self.data.items()
            for _, css in cs.items()
            for column in css
        ]

    def __iter__(self):
        with self.locker:
            self._update_meta()
            return iter(self._all_columns())

    def __len__(self):
        return self.data[META_COLUMNS_NAME]["es_index"].count

    def update(self, command):
        self.dirty = True
        try:
            command = wrap(command)
            DEBUG and Log.note(
                "Update {{timestamp}}: {{command|json}}",
                command=command,
                timestamp=Date(command["set"].last_updated),
            )
            eq = command.where.eq
            if eq.es_index:
                if len(eq) == 1:
                    if unwraplist(command.clear) == ".":
                        d = self.data
                        i = eq.es_index
                        with self.locker:
                            cols = d[i]
                            del d[i]

                        for c in cols:
                            mark_as_deleted(c)
                            self.todo.add(c)
                        return

                    # FASTEST
                    all_columns = self.data.get(eq.es_index, {}).values()
                    with self.locker:
                        columns = [c for cs in all_columns for c in cs]
                elif eq.es_column and len(eq) == 2:
                    # FASTER
                    all_columns = self.data.get(eq.es_index, {}).values()
                    with self.locker:
                        columns = [
                            c
                            for cs in all_columns
                            for c in cs
                            if c.es_column == eq.es_column
                        ]
                else:
                    # SLOWER
                    all_columns = self.data.get(eq.es_index, {}).values()
                    with self.locker:
                        columns = [
                            c
                            for cs in all_columns
                            for c in cs
                            if all(c[k] == v for k, v in eq.items())  # THIS LINE IS VERY SLOW
                        ]
            else:
                columns = list(self)
                columns = jx.filter(columns, command.where)

            with self.locker:
                for col in columns:
                    DEBUG and Log.note(
                        "update column {{table}}.{{column}}",
                        table=col.es_index,
                        column=col.es_column,
                    )
                    for k in command["clear"]:
                        if k == ".":
                            mark_as_deleted(col)
                            self.todo.add(col)
                            lst = self.data[col.es_index]
                            cols = lst[col.name]
                            cols.remove(col)
                            if len(cols) == 0:
                                del lst[col.name]
                            if len(lst) == 0:
                                del self.data[col.es_index]
                            break
                        else:
                            col[k] = None
                    else:
                        # DID NOT DELETE COLUMN ("."), CONTINUE TO SET PROPERTIES
                        for k, v in command.set.items():
                            col[k] = v
                        self.todo.add(col)
        except Exception as e:
            Log.error("should not happen", cause=e)

    def query(self, query):
        # NOT EXPECTED TO BE RUN
        Log.error("not")
        with self.locker:
            self._update_meta()
            if not self._schema:
                self._schema = Schema(
                    ".", [c for cs in self.data[META_COLUMNS_NAME].values() for c in cs]
                )
            snapshot = self._all_columns()

        from jx_python.containers.list_usingPythonList import ListContainer

        query.frum = ListContainer(META_COLUMNS_NAME, snapshot, self._schema)
        return jx.run(query)

    def groupby(self, keys):
        with self.locker:
            self._update_meta()
            return jx.groupby(self.__iter__(), keys)

    def window(self, window):
        raise NotImplementedError()

    @property
    def schema(self):
        if not self._schema:
            with self.locker:
                self._update_meta()
                self._schema = Schema(
                    ".", [c for cs in self.data[META_COLUMNS_NAME].values() for c in cs]
                )
        return self._schema

    @property
    def namespace(self):
        return self

    def get_table(self, table_name):
        if table_name != META_COLUMNS_NAME:
            Log.error("this container has only the " + META_COLUMNS_NAME)
        return self

    def get_columns(self, table_name):
        if table_name != META_COLUMNS_NAME:
            Log.error("this container has only the " + META_COLUMNS_NAME)
        return self._all_columns()

    def denormalized(self):
        """
        THE INTERNAL STRUCTURE FOR THE COLUMN METADATA IS VERY DIFFERENT FROM
        THE DENORMALIZED PERSPECTIVE. THIS PROVIDES THAT PERSPECTIVE FOR QUERIES
        """
        with self.locker:
            self._update_meta()
            output = [
                {
                    "table": c.es_index,
                    "name": untype_path(c.name),
                    "cardinality": c.cardinality,
                    "es_column": c.es_column,
                    "es_index": c.es_index,
                    "last_updated": c.last_updated,
                    "count": c.count,
                    "nested_path": [unnest_path(n) for n in c.nested_path],
                    "es_type": c.es_type,
                    "type": c.jx_type,
                }
                for tname, css in self.data.items()
                for cname, cs in css.items()
                for c in cs
                if c.jx_type not in STRUCT  # and c.es_column != "_id"
            ]

        from jx_python.containers.list_usingPythonList import ListContainer

        return ListContainer(
            self.name,
            data=output,
            schema=jx_base.Schema(META_COLUMNS_NAME, SIMPLE_METADATA_COLUMNS),
        )
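# ------------------------------------------------------------------------------
# Usage sketch (added for illustration, not part of the original source).
# ColumnList keeps an in-memory map of Column objects per index, mirrors them in
# the backing metadata index, and answers lookups under its lock. `es_cluster`
# is assumed to be an already-configured cluster handle, and "my_index" is a
# hypothetical index name.
def _example_column_lookup(es_cluster):
    columns = ColumnList(es_cluster)      # loads (or creates) the metadata index
    for col in columns.find("my_index"):  # every known column of the given index
        print(col.name, col.es_type)
    return columns.denormalized()         # flattened per-column view, suitable for queries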
class StructuredLogger_usingElasticSearch(StructuredLogger):
    @override
    def __init__(self, host, index, port=9200, type="log", max_size=1000, batch_size=100, kwargs=None):
        """
        settings ARE FOR THE ELASTICSEARCH INDEX
        """
        self.es = Cluster(kwargs).get_or_create_index(
            schema=mo_json.json2value(value2json(SCHEMA), leaves=True),
            limit_replicas=True,
            tjson=True,
            kwargs=kwargs,
        )
        self.batch_size = batch_size
        self.es.add_alias(coalesce(kwargs.alias, kwargs.index))
        self.queue = Queue("debug logs to es", max=max_size, silent=True)
        self.es.settings.retry.times = coalesce(self.es.settings.retry.times, 3)
        self.es.settings.retry.sleep = Duration(coalesce(self.es.settings.retry.sleep, MINUTE))
        Thread.run("add debug logs to es", self._insert_loop)

    def write(self, template, params):
        if params.get("template"):
            # DETECTED INNER TEMPLATE, ASSUME TRACE IS ON, SO DO NOT NEED THE OUTER TEMPLATE
            self.queue.add({"value": params})
        else:
            template = strings.limit(template, 2000)
            self.queue.add({"value": {"template": template, "params": params}}, timeout=3 * MINUTE)
        return self

    def _insert_loop(self, please_stop=None):
        bad_count = 0
        while not please_stop:
            try:
                Till(seconds=1).wait()
                messages = wrap(self.queue.pop_all())
                if not messages:
                    continue

                for g, mm in jx.groupby(messages, size=self.batch_size):
                    scrubbed = []
                    try:
                        for i, message in enumerate(mm):
                            if message is THREAD_STOP:
                                please_stop.go()
                                return
                            scrubbed.append(_deep_json_to_string(message, depth=3))
                    finally:
                        self.es.extend(scrubbed)
                bad_count = 0
            except Exception as e:
                Log.warning("Problem inserting logs into ES", cause=e)
                bad_count += 1
                if bad_count > MAX_BAD_COUNT:
                    Log.warning(
                        "Given up trying to write debug logs to ES index {{index}}",
                        index=self.es.settings.index,
                    )
                Till(seconds=30).wait()

        # CONTINUE TO DRAIN THIS QUEUE
        while not please_stop:
            try:
                Till(seconds=1).wait()
                self.queue.pop_all()
            except Exception as e:
                Log.warning("Should not happen", cause=e)

    def stop(self):
        with suppress_exception:
            self.queue.add(THREAD_STOP)  # BE PATIENT, LET REST OF MESSAGE BE SENT
        with suppress_exception:
            self.queue.close()
class Table(BaseFacts):
    @override
    def __init__(
        self,
        table,
        typed,
        read_only,
        sharded,
        container,
        id=Null,
        partition=Null,
        cluster=Null,
        top_level_fields=Null,
        kwargs=None,
    ):
        self.short_name = table
        self.typed = typed
        self.read_only = read_only
        self.cluster = cluster
        self.id = id
        self.top_level_fields = top_level_fields
        self.config = Data(  # USED TO REPLICATE THIS
            typed=typed,
            read_only=read_only,
            sharded=sharded,
            id=id,
            partition=partition,
            cluster=cluster,
            top_level_fields=top_level_fields,
        )

        esc_name = escape_name(table)
        self.full_name = container.full_name + esc_name
        self.alias_view = alias_view = container.client.get_table(text(self.full_name))
        self.partition = partition
        self.container = container

        if not sharded:
            if not read_only and alias_view.table_type == "VIEW":
                Log.error("Expecting a table, not a view")
            self.shard = alias_view
            self._flake = Snowflake.parse(
                alias_view.schema,
                text(self.full_name),
                self.top_level_fields,
                partition,
            )
        else:
            if alias_view.table_type != "VIEW":
                Log.error("Sharded tables require a view")
            current_view = container.client.get_table(text(self.full_name))
            view_sql = current_view.view_query
            shard_name = _extract_primary_shard_name(view_sql)
            try:
                self.shard = container.client.get_table(
                    text(container.full_name + shard_name)
                )
                self._flake = Snowflake.parse(
                    alias_view.schema,
                    text(self.full_name),
                    self.top_level_fields,
                    partition,
                )
            except Exception as e:
                Log.warning("view {{name}} is invalid", name=shard_name, cause=e)
                self._flake = Snowflake.parse(
                    alias_view.schema,
                    text(self.full_name),
                    self.top_level_fields,
                    partition,
                )
                # REMOVE STALE VIEW
                container.client.delete_table(current_view)

                # MAKE NEW VIEW POINTING TO NEW SHARD
                self._create_new_shard()
                container.create_view(
                    self.full_name,
                    self.container.full_name + ApiName(self.shard.table_id),
                )

        self.last_extend = Date.now() - EXTEND_LIMIT
        self.extend_locker = Lock()
        self.extend_queue = Queue("wait for extend")

    def all_records(self):
        """
        MOSTLY FOR TESTING, RETURN ALL RECORDS IN TABLE
        :return:
        """
        return self.sql_query(sql_query({"from": text(self.full_name)}, self.schema))

    def jx_query(self, jx_query):
        docs = self.sql_query(
            sql_query(
                dict_to_data({"from": text(self.full_name)}) | jx_query, self.schema
            )
        )
        data = []
        for d in docs:
            u = untyped(from_data(leaves_to_data(d)))
            data.append(u)
        return Data(data=data, format="list")

    @property
    def schema(self):
        return self._flake

    def sql_query(self, sql):
        """
        :param sql: SQL QUERY
        :return: GENERATOR OF DOCUMENTS as dict
        """
        query_job = self.container.query_and_wait(sql)

        # WE WILL REACH INTO THE _flake, SINCE THIS IS THE FIRST PLACE WE ARE ACTUALLY PULLING RECORDS OUT
        # TODO: WITH MORE CODE THIS LOGIC GOES ELSEWHERE
        _ = self._flake.columns  # ENSURE schema HAS BEEN PROCESSED

        if not self._flake._top_level_fields.keys():
            for row in query_job:
                yield untyped(dict(row))
        else:
            top2deep = {
                name: path for path, name in self._flake._top_level_fields.items()
            }
            for row in query_job:
                output = {}
                doc = dict(row)
                # COPY ALL BUT TOP LEVEL FIELDS
                for k, v in doc.items():
                    deep = top2deep.get(k)
                    if deep is None:
                        output[k] = v
                # INSERT TOP LEVEL FIELDS
                reach = wrap(output)
                for k, p in top2deep.items():
                    try:
                        reach[p] = doc.get(k)
                    except Exception as cause:
                        raise cause
                yield untyped(output)

    @property
    def flake(self):
        return self._flake

    def _create_new_shard(self):
        primary_shard = self.container.create_table(
            table=self.short_name + "_" + "".join(randoms.sample(ALLOWED, 20)),
            sharded=False,
            schema=self._flake.schema,
            kwargs=self.config,
        )
        self.shard = primary_shard.shard

    def extend(self, docs):
        self.extend_queue.extend(docs)
        with self.extend_locker:
            docs = self.extend_queue.pop_all()
            self._extend(docs)

    def _extend(self, rows):
        if self.read_only:
            Log.error("not for writing")
        if len(rows) == 0:
            return

        try:
            update = {}
            with Timer("encoding", verbose=DEBUG):
                while True:
                    typed_rows = []
                    for rownum, row in enumerate(rows):
                        typed_row, more, add_nested = typed_encode(row, self.flake)
                        set_default(update, more)
                        if add_nested:
                            # row HAS NEW NESTED COLUMN!
                            # GO OVER THE rows AGAIN SO "RECORD" GET MAPPED TO "REPEATED"
                            DEBUG and Log.note("New nested document found, retrying")
                            break
                        typed_rows.append(typed_row)
                    else:
                        break

            if update or not self.shard:
                # BATCH HAS ADDITIONAL COLUMNS!!
                # WE CAN NOT USE THE EXISTING SHARD, MAKE A NEW ONE:
                self._create_new_shard()
                DEBUG and Log.note(
                    "added new shard with name: {{shard}}", shard=self.shard.table_id
                )

            with Timer("insert {{num}} rows to bq", param={"num": len(rows)}, verbose=DEBUG):
                failures = self.container.client.insert_rows_json(
                    self.shard,
                    json_rows=typed_rows,
                    row_ids=[None] * len(typed_rows),
                    skip_invalid_rows=False,
                    ignore_unknown_values=False,
                )
            if failures:
                if all(r == "stopped" for r in wrap(failures).errors.reason):
                    self._create_new_shard()
                    DEBUG and Log.note(
                        "STOPPED encountered: Added new shard with name: {{shard}}",
                        shard=self.shard.table_id,
                    )
                Log.error(
                    "Got {{num}} failures:\n{{failures|json}}",
                    num=len(failures),
                    failures=failures[:5],
                )
            else:
                self.last_extend = Date.now()
                DEBUG and Log.note("{{num}} rows added", num=len(typed_rows))
        except Exception as cause:
            cause = Except.wrap(cause)
            if (
                len(typed_rows) < 2
                and "Your client has issued a malformed or illegal request." in cause
            ):
                Log.error(
                    "big query complains about:\n{{data|json}}",
                    data=typed_rows,
                    cause=cause,
                )
            elif len(rows) > 1 and (
                "Request payload size exceeds the limit" in cause
                or "An existing connection was forcibly closed by the remote host" in cause
                or "Your client has issued a malformed or illegal request." in cause
                or "BrokenPipeError(32, 'Broken pipe')" in cause
                or "ConnectionResetError(104, 'Connection reset by peer')" in cause
            ):
                Log.warning(
                    "problem with batch of size {{size}}", size=len(rows), cause=cause
                )
                batch_size = ceiling(len(rows) / 10)
                try:
                    DEBUG and Log.note(
                        "attempt smaller batches of size {{batch_size}}",
                        batch_size=batch_size,
                    )
                    for _, chunk in jx.chunk(rows, batch_size):
                        self._extend(chunk)
                    return
                except Exception as cause2:
                    Log.error(
                        "smaller batches of size {{batch_size}} did not work",
                        batch_size=batch_size,
                        cause=cause2,
                    )
            elif len(rows) == 1:
                Log.error(
                    "Could not insert document\n{{doc|json|indent}}",
                    doc=rows[0],
                    cause=cause,
                )
            else:
                Log.error("Do not know how to handle", cause=cause)

    def add(self, row):
        self.extend([row])

    def merge_shards(self):
        shards = []
        tables = list(self.container.client.list_tables(self.container.dataset))
        current_view = Null  # VIEW THAT POINTS TO PRIMARY SHARD
        primary_shard_name = None  # PRIMARY SHARD
        api_name = escape_name(self.short_name)

        for table_item in tables:
            table = table_item.reference
            table_api_name = ApiName(table.table_id)
            if text(table_api_name).startswith(text(api_name)):
                if table_api_name == api_name:
                    if table_item.table_type != "VIEW":
                        Log.error("expecting {{table}} to be a view", table=api_name)
                    current_view = self.container.client.get_table(table)
                    view_sql = current_view.view_query
                    primary_shard_name = _extract_primary_shard_name(view_sql)
                elif SUFFIX_PATTERN.match(text(table_api_name)[len(text(api_name)):]):
                    try:
                        known_table = self.container.client.get_table(table)
                        shards.append(known_table)
                    except Exception as e:
                        Log.warning(
                            "could not merge table {{table}}", table=table, cause=e
                        )

        if not current_view:
            Log.error(
                "expecting {{table}} to be a view pointing to a table", table=api_name
            )

        shard_flakes = [
            Snowflake.parse(
                big_query_schema=shard.schema,
                es_index=text(self.container.full_name + ApiName(shard.table_id)),
                top_level_fields=self.top_level_fields,
                partition=self.partition,
            )
            for shard in shards
        ]
        total_flake = snowflakes.merge(
            shard_flakes,
            es_index=text(self.full_name),
            top_level_fields=self.top_level_fields,
            partition=self.partition,
        )

        for i, s in enumerate(shards):
            if ApiName(s.table_id) == primary_shard_name:
                if total_flake == shard_flakes[i]:
                    # USE THE CURRENT PRIMARY SHARD AS A DESTINATION
                    del shards[i]
                    del shard_flakes[i]
                    break
        else:
            name = self.short_name + "_" + "".join(randoms.sample(ALLOWED, 20))
            primary_shard_name = escape_name(name)
            self.container.create_table(
                table=name,
                schema=total_flake.schema,
                sharded=False,
                read_only=False,
                kwargs=self.config,
            )

        primary_full_name = self.container.full_name + primary_shard_name

        selects = []
        for flake, table in zip(shard_flakes, shards):
            q = ConcatSQL(
                SQL_SELECT,
                JoinSQL(ConcatSQL(SQL_COMMA, SQL_CR), gen_select(total_flake, flake)),
                SQL_FROM,
                quote_column(ApiName(table.dataset_id, table.table_id)),
            )
            selects.append(q)

        DEBUG and Log.note(
            "inserting into table {{table}}", table=text(primary_shard_name)
        )
        matched = []
        unmatched = []
        for sel, shard, flake in zip(selects, shards, shard_flakes):
            if flake == total_flake:
                matched.append((sel, shard, flake))
            else:
                unmatched.append((sel, shard, flake))

        # EVERYTHING THAT IS IDENTICAL TO PRIMARY CAN BE MERGED WITH SIMPLE UNION ALL
        if matched:
            for g, merge_chunk in jx.chunk(matched, MAX_MERGE):
                command = ConcatSQL(
                    SQL_INSERT,
                    quote_column(primary_full_name),
                    JoinSQL(
                        SQL_UNION_ALL,
                        (
                            sql_query(
                                {
                                    "from": text(
                                        self.container.full_name + ApiName(shard.table_id)
                                    )
                                },
                                schema,
                            )
                            for _, shard, schema in merge_chunk
                        ),
                    ),
                )
                DEBUG and Log.note("{{sql}}", sql=text(command))
                job = self.container.query_and_wait(command)
                DEBUG and Log.note(
                    "job {{id}} state = {{state}}", id=job.job_id, state=job.state
                )

                if job.errors:
                    Log.error(
                        "\n{{sql}}\nDid not fill table:\n{{reason|json|indent}}",
                        sql=command.sql,
                        reason=job.errors,
                    )
                for _, shard, _ in merge_chunk:
                    self.container.client.delete_table(shard)

        # ALL OTHER SCHEMAS MISMATCH
        for s, shard, _ in unmatched:
            try:
                command = ConcatSQL(SQL_INSERT, quote_column(primary_full_name), s)
                DEBUG and Log.note("{{sql}}", sql=text(command))
                job = self.container.query_and_wait(command)
                DEBUG and Log.note(
                    "from {{shard}}, job {{id}}, state {{state}}",
                    id=job.job_id,
                    shard=shard.table_id,
                    state=job.state,
                )

                if job.errors:
                    if all(
                        " does not have a schema." in m
                        for m in wrap(job.errors).message
                    ):
                        pass  # NOTHING TO DO
                    else:
                        Log.error(
                            "\n{{sql}}\nDid not fill table:\n{{reason|json|indent}}",
                            sql=command.sql,
                            reason=job.errors,
                        )

                self.container.client.delete_table(shard)
            except Exception as e:
                Log.warning("failure to merge {{shard}}", shard=shard, cause=e)

        # REMOVE OLD VIEW
        view_full_name = self.container.full_name + api_name
        if current_view:
            self.container.client.delete_table(current_view)

        # CREATE NEW VIEW
        self.container.create_view(view_full_name, primary_full_name)

    def condense(self):
        """
        :return:
        """
        # MAKE NEW SHARD
        partition = JoinSQL(
            SQL_COMMA,
            [
                quote_column(c.es_field)
                for f in listwrap(self.id.field)
                for c in self.flake.leaves(f)
            ],
        )
        order_by = JoinSQL(
            SQL_COMMA,
            [
                ConcatSQL(quote_column(c.es_field), SQL_DESC)
                for f in listwrap(self.id.version)
                for c in self.flake.leaves(f)
            ],
        )
        # WRAP WITH etl.timestamp BEST SELECTION

        self.container.query_and_wait(
            ConcatSQL(
                SQL(  # SOME KEYWORDS: ROWNUM RANK
                    "SELECT * EXCEPT (_rank) FROM (SELECT *, ROW_NUMBER() OVER (PARTITION BY "
                ),
                partition,
                SQL_ORDERBY,
                order_by,
                SQL(") AS _rank FROM "),
                quote_column(self.full_name),
                SQL(") a WHERE _rank=1"),
            )
        )
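# ------------------------------------------------------------------------------
# Usage sketch (added for illustration, not part of the original source).
# Table.extend() typed-encodes documents and streams them into the current shard;
# a batch that introduces new columns forces a fresh shard, and merge_shards()
# later folds all shards back into one primary table behind the alias view.
# `facts` is assumed to be a Table instance already obtained from its container.
def _example_table_extend(facts):
    facts.add({"name": "example", "value": 1})
    facts.extend([{"name": "a"}, {"name": "b", "extra": 2}])  # "extra" may trigger a new shard
    facts.merge_shards()              # consolidate shards behind the view
    return list(facts.all_records())  # drain the generator of untyped documents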
class StructuredLogger_usingElasticSearch(StructuredLogger):
    @override
    def __init__(
        self,
        host,
        index,
        port=9200,
        type="log",
        queue_size=1000,
        batch_size=100,
        kwargs=None,
    ):
        """
        settings ARE FOR THE ELASTICSEARCH INDEX
        """
        kwargs.timeout = Duration(coalesce(kwargs.timeout, "30second")).seconds
        kwargs.retry.times = coalesce(kwargs.retry.times, 3)
        kwargs.retry.sleep = Duration(coalesce(kwargs.retry.sleep, MINUTE)).seconds

        self.es = Cluster(kwargs).get_or_create_index(
            schema=json2value(value2json(SCHEMA), leaves=True),
            limit_replicas=True,
            typed=True,
            kwargs=kwargs,
        )
        self.batch_size = batch_size
        self.es.add_alias(coalesce(kwargs.alias, kwargs.index))
        self.queue = Queue("debug logs to es", max=queue_size, silent=True)
        self.worker = Thread.run("add debug logs to es", self._insert_loop)

    def write(self, template, params):
        try:
            params.template = strings.limit(params.template, 2000)
            params.format = None
            self.queue.add({"value": _deep_json_to_string(params, 3)}, timeout=3 * 60)
        except Exception as e:
            sys.stdout.write(text_type(Except.wrap(e)))
        return self

    def _insert_loop(self, please_stop=None):
        bad_count = 0
        while not please_stop:
            try:
                messages = wrap(self.queue.pop_all())
                if not messages:
                    Till(seconds=PAUSE_AFTER_GOOD_INSERT).wait()
                    continue

                for g, mm in jx.groupby(messages, size=self.batch_size):
                    scrubbed = []
                    for i, message in enumerate(mm):
                        if message is THREAD_STOP:
                            please_stop.go()
                            continue
                        try:
                            messages = flatten_causal_chain(message.value)
                            scrubbed.append(
                                {"value": [_deep_json_to_string(m, depth=3) for m in messages]}
                            )
                        except Exception as e:
                            Log.warning("Problem adding to scrubbed list", cause=e)
                    self.es.extend(scrubbed)
                    bad_count = 0
            except Exception as f:
                Log.warning("Problem inserting logs into ES", cause=f)
                bad_count += 1
                if bad_count > MAX_BAD_COUNT:
                    Log.warning(
                        "Given up trying to write debug logs to ES index {{index}}",
                        index=self.es.settings.index,
                    )
                Till(seconds=PAUSE_AFTER_BAD_INSERT).wait()

        self.es.flush()

        # CONTINUE TO DRAIN THIS QUEUE
        while not please_stop:
            try:
                Till(seconds=PAUSE_AFTER_GOOD_INSERT).wait()
                self.queue.pop_all()
            except Exception as e:
                Log.warning("Should not happen", cause=e)

    def stop(self):
        with suppress_exception:
            self.queue.add(THREAD_STOP)  # BE PATIENT, LET REST OF MESSAGE BE SENT
        with suppress_exception:
            self.queue.close()
        self.worker.join()
class ColumnList(Table, jx_base.Container):
    """
    OPTIMIZED FOR THE PARTICULAR ACCESS PATTERNS USED
    """

    def __init__(self, name):
        Table.__init__(self, "meta.columns")
        self.db_file = File("metadata." + name + ".sqlite")
        self.data = {}  # MAP FROM ES_INDEX TO (abs_column_name to COLUMNS)
        self.locker = Lock()
        self._schema = None
        self.db = sqlite3.connect(
            database=self.db_file.abspath, check_same_thread=False, isolation_level=None
        )
        self.last_load = Null
        self.todo = Queue(
            "update columns to db"
        )  # HOLD (action, column) PAIR, WHERE action in ['insert', 'update']
        self._db_load()
        Thread.run("update " + name, self._db_worker)

    @contextmanager
    def _db_transaction(self):
        self.db.execute(str("BEGIN"))
        try:
            yield
            self.db.execute(str("COMMIT"))
        except Exception as e:
            e = Except.wrap(e)
            self.db.execute(str("ROLLBACK"))
            Log.error("Transaction failed", cause=e)

    def _query(self, query):
        result = Data()
        curr = self.db.execute(query)
        result.meta.format = "table"
        result.header = [d[0] for d in curr.description] if curr.description else None
        result.data = curr.fetchall()
        return result

    def _db_create(self):
        with self._db_transaction():
            self.db.execute(
                "CREATE TABLE "
                + db_table_name
                + sql_iso(
                    sql_list(
                        [
                            quote_column(c.name) + " " + json_type_to_sqlite_type[c.jx_type]
                            for c in METADATA_COLUMNS
                        ]
                        + [
                            "PRIMARY KEY"
                            + sql_iso(sql_list(map(quote_column, ["es_index", "es_column"])))
                        ]
                    )
                )
            )

            for c in METADATA_COLUMNS:
                self._add(c)
                self._db_insert_column(c)

    def _db_load(self):
        self.last_load = Date.now()

        result = self._query(
            SQL_SELECT
            + "name"
            + SQL_FROM
            + "sqlite_master"
            + SQL_WHERE
            + SQL_AND.join(["name=" + db_table_name, "type=" + quote_value("table")])
        )
        if not result.data:
            self._db_create()
            return

        result = self._query(
            SQL_SELECT
            + all_columns
            + SQL_FROM
            + db_table_name
            + SQL_ORDERBY
            + sql_list(map(quote_column, ["es_index", "name", "es_column"]))
        )

        with self.locker:
            for r in result.data:
                c = row_to_column(result.header, r)
                self._add(c)

    def _db_worker(self, please_stop):
        while not please_stop:
            try:
                with self._db_transaction():
                    result = self._query(
                        SQL_SELECT
                        + all_columns
                        + SQL_FROM
                        + db_table_name
                        + SQL_WHERE
                        + "last_updated > "
                        + quote_value(self.last_load)
                        + SQL_ORDERBY
                        + sql_list(map(quote_column, ["es_index", "name", "es_column"]))
                    )

                with self.locker:
                    for r in result.data:
                        c = row_to_column(result.header, r)
                        self._add(c)
                        if c.last_updated > self.last_load:
                            self.last_load = c.last_updated

                updates = self.todo.pop_all()
                DEBUG and updates and Log.note(
                    "{{num}} columns to push to db", num=len(updates)
                )
                for action, column in updates:
                    while not please_stop:
                        try:
                            with self._db_transaction():
                                DEBUG and Log.note(
                                    "{{action}} db for {{table}}.{{column}}",
                                    action=action,
                                    table=column.es_index,
                                    column=column.es_column,
                                )
                                if action is EXECUTE:
                                    self.db.execute(column)
                                elif action is UPDATE:
                                    self.db.execute(
                                        "UPDATE"
                                        + db_table_name
                                        + "SET"
                                        + sql_list(
                                            [
                                                "count=" + quote_value(column.count),
                                                "cardinality=" + quote_value(column.cardinality),
                                                "multi=" + quote_value(column.multi),
                                                "partitions="
                                                + quote_value(value2json(column.partitions)),
                                                "last_updated=" + quote_value(column.last_updated),
                                            ]
                                        )
                                        + SQL_WHERE
                                        + SQL_AND.join(
                                            [
                                                "es_index = " + quote_value(column.es_index),
                                                "es_column = " + quote_value(column.es_column),
                                                "last_updated < " + quote_value(column.last_updated),
                                            ]
                                        )
                                    )
                                elif action is DELETE:
                                    self.db.execute(
                                        "DELETE FROM"
                                        + db_table_name
                                        + SQL_WHERE
                                        + SQL_AND.join(
                                            [
                                                "es_index = " + quote_value(column.es_index),
                                                "es_column = " + quote_value(column.es_column),
                                            ]
                                        )
                                    )
                                else:
                                    self._db_insert_column(column)
                            break
                        except Exception as e:
                            e = Except.wrap(e)
                            if "database is locked" in e:
                                Log.note("metadata database is locked")
                                Till(seconds=1).wait()
                                break
                            else:
                                Log.warning("problem updating database", cause=e)
            except Exception as e:
                Log.warning("problem updating database", cause=e)

            (Till(seconds=10) | please_stop).wait()

    def _db_insert_column(self, column):
        try:
            self.db.execute(
                "INSERT INTO"
                + db_table_name
                + sql_iso(all_columns)
                + "VALUES"
                + sql_iso(
                    sql_list(
                        [
                            quote_value(column[c.name])
                            if c.name not in ("nested_path", "partitions")
                            else quote_value(value2json(column[c.name]))
                            for c in METADATA_COLUMNS
                        ]
                    )
                )
            )
        except Exception as e:
            e = Except.wrap(e)
            if "UNIQUE constraint failed" in e or " are not unique" in e:
                # THIS CAN HAPPEN BECAUSE todo HAS OLD COLUMN DATA
                self.todo.add((UPDATE, column), force=True)
            else:
                Log.error("do not know how to handle", cause=e)

    def __copy__(self):
        output = object.__new__(ColumnList)
        Table.__init__(output, "meta.columns")
        output.data = {
            t: {c: list(cs) for c, cs in dd.items()} for t, dd in self.data.items()
        }
        output.locker = Lock()
        output._schema = None
        return output

    def find(self, es_index, abs_column_name=None):
        with self.locker:
            if es_index.startswith("meta."):
                self._update_meta()

            if not abs_column_name:
                return [c for cs in self.data.get(es_index, {}).values() for c in cs]
            else:
                return self.data.get(es_index, {}).get(abs_column_name, [])

    def extend(self, columns):
        self.dirty = True
        with self.locker:
            for column in columns:
                self._add(column)

    def add(self, column):
        self.dirty = True
        with self.locker:
            canonical = self._add(column)
        if canonical == None:
            return column  # ALREADY ADDED
        self.todo.add((INSERT if canonical is column else UPDATE, canonical))
        return canonical

    def remove_table(self, table_name):
        del self.data[table_name]

    def _add(self, column):
        """
        :param column: ANY COLUMN OBJECT
        :return: None IF column IS canonical ALREADY (NET-ZERO EFFECT)
        """
        columns_for_table = self.data.setdefault(column.es_index, {})
        existing_columns = columns_for_table.setdefault(column.name, [])

        for canonical in existing_columns:
            if canonical is column:
                return None
            if canonical.es_type == column.es_type:
                if column.last_updated > canonical.last_updated:
                    for key in Column.__slots__:
                        old_value = canonical[key]
                        new_value = column[key]
                        if new_value == None:
                            pass  # DO NOT BOTHER CLEARING OLD VALUES (LIKE cardinality AND partitions)
                        elif new_value == old_value:
                            pass  # NO NEED TO UPDATE WHEN NO CHANGE MADE (COMMON CASE)
                        else:
                            canonical[key] = new_value
                return canonical
        existing_columns.append(column)
        return column

    def _update_meta(self):
        if not self.dirty:
            return

        for mcl in self.data.get("meta.columns").values():
            for mc in mcl:
                count = 0
                values = set()
                objects = 0
                multi = 1
                for column in self._all_columns():
                    value = column[mc.name]
                    if value == None:
                        pass
                    else:
                        count += 1
                        if is_list(value):
                            multi = max(multi, len(value))
                            try:
                                values |= set(value)
                            except Exception:
                                objects += len(value)
                        elif is_data(value):
                            objects += 1
                        else:
                            values.add(value)
                mc.count = count
                mc.cardinality = len(values) + objects
                mc.partitions = jx.sort(values)
                mc.multi = multi
                mc.last_updated = Date.now()

        self.dirty = False

    def _all_columns(self):
        return [
            column
            for t, cs in self.data.items()
            for _, css in cs.items()
            for column in css
        ]

    def __iter__(self):
        with self.locker:
            self._update_meta()
            return iter(self._all_columns())

    def __len__(self):
        return self.data["meta.columns"]["es_index"].count

    def update(self, command):
        self.dirty = True
        try:
            command = wrap(command)
            DEBUG and Log.note(
                "Update {{timestamp}}: {{command|json}}",
                command=command,
                timestamp=Date(command["set"].last_updated),
            )
            eq = command.where.eq
            if eq.es_index:
                if len(eq) == 1:
                    if unwraplist(command.clear) == ".":
                        with self.locker:
                            del self.data[eq.es_index]
                        self.todo.add(
                            (
                                EXECUTE,
                                "DELETE FROM "
                                + db_table_name
                                + SQL_WHERE
                                + " es_index="
                                + quote_value(eq.es_index),
                            )
                        )
                        return

                    # FASTEST
                    all_columns = self.data.get(eq.es_index, {}).values()
                    with self.locker:
                        columns = [c for cs in all_columns for c in cs]
                elif eq.es_column and len(eq) == 2:
                    # FASTER
                    all_columns = self.data.get(eq.es_index, {}).values()
                    with self.locker:
                        columns = [
                            c
                            for cs in all_columns
                            for c in cs
                            if c.es_column == eq.es_column
                        ]
                else:
                    # SLOWER
                    all_columns = self.data.get(eq.es_index, {}).values()
                    with self.locker:
                        columns = [
                            c
                            for cs in all_columns
                            for c in cs
                            if all(c[k] == v for k, v in eq.items())  # THIS LINE IS VERY SLOW
                        ]
            else:
                columns = list(self)
                columns = jx.filter(columns, command.where)

            with self.locker:
                for col in columns:
                    DEBUG and Log.note(
                        "update column {{table}}.{{column}}",
                        table=col.es_index,
                        column=col.es_column,
                    )
                    for k in command["clear"]:
                        if k == ".":
                            self.todo.add((DELETE, col))
                            lst = self.data[col.es_index]
                            cols = lst[col.name]
                            cols.remove(col)
                            if len(cols) == 0:
                                del lst[col.name]
                            if len(lst) == 0:
                                del self.data[col.es_index]
                            break
                        else:
                            col[k] = None
                    else:
                        # DID NOT DELETE COLUMN ("."), CONTINUE TO SET PROPERTIES
                        for k, v in command.set.items():
                            col[k] = v
                        self.todo.add((UPDATE, col))
        except Exception as e:
            Log.error("should not happen", cause=e)

    def query(self, query):
        # NOT EXPECTED TO BE RUN
        Log.error("not")
        with self.locker:
            self._update_meta()
            if not self._schema:
                self._schema = Schema(
                    ".", [c for cs in self.data["meta.columns"].values() for c in cs]
                )
            snapshot = self._all_columns()

        from jx_python.containers.list_usingPythonList import ListContainer

        query.frum = ListContainer("meta.columns", snapshot, self._schema)
        return jx.run(query)

    def groupby(self, keys):
        with self.locker:
            self._update_meta()
            return jx.groupby(self.__iter__(), keys)

    @property
    def schema(self):
        if not self._schema:
            with self.locker:
                self._update_meta()
                self._schema = Schema(
                    ".", [c for cs in self.data["meta.columns"].values() for c in cs]
                )
        return self._schema

    @property
    def namespace(self):
        return self

    def get_table(self, table_name):
        if table_name != "meta.columns":
            Log.error("this container has only the meta.columns")
        return self

    def denormalized(self):
        """
        THE INTERNAL STRUCTURE FOR THE COLUMN METADATA IS VERY DIFFERENT FROM
        THE DENORMALIZED PERSPECTIVE. THIS PROVIDES THAT PERSPECTIVE FOR QUERIES
        """
        with self.locker:
            self._update_meta()
            output = [
                {
                    "table": c.es_index,
                    "name": untype_path(c.name),
                    "cardinality": c.cardinality,
                    "es_column": c.es_column,
                    "es_index": c.es_index,
                    "last_updated": c.last_updated,
                    "count": c.count,
                    "nested_path": [unnest_path(n) for n in c.nested_path],
                    "es_type": c.es_type,
                    "type": c.jx_type,
                }
                for tname, css in self.data.items()
                for cname, cs in css.items()
                for c in cs
                if c.jx_type not in STRUCT  # and c.es_column != "_id"
            ]

        from jx_python.containers.list_usingPythonList import ListContainer

        return ListContainer(
            self.name,
            data=output,
            schema=jx_base.Schema("meta.columns", SIMPLE_METADATA_COLUMNS),
        )
class StructuredLogger_usingElasticSearch(StructuredLogger):
    @override
    def __init__(
        self,
        host,
        index,
        port=9200,
        type="log",
        queue_size=1000,
        batch_size=100,
        kwargs=None,
    ):
        """
        settings ARE FOR THE ELASTICSEARCH INDEX
        """
        kwargs.timeout = Duration(coalesce(kwargs.timeout, "30second")).seconds
        kwargs.retry.times = coalesce(kwargs.retry.times, 3)
        kwargs.retry.sleep = Duration(coalesce(kwargs.retry.sleep, MINUTE)).seconds
        kwargs.host = Random.sample(listwrap(host), 1)[0]

        schema = json2value(value2json(SCHEMA), leaves=True)
        schema.mappings[type].properties["~N~"].type = "nested"

        self.es = Cluster(kwargs).get_or_create_index(
            schema=schema,
            limit_replicas=True,
            typed=True,
            kwargs=kwargs,
        )
        self.batch_size = batch_size
        self.es.add_alias(coalesce(kwargs.alias, kwargs.index))
        self.queue = Queue("debug logs to es", max=queue_size, silent=True)
        self.worker = Thread.run("add debug logs to es", self._insert_loop)

    def write(self, template, params):
        try:
            params.template = strings.limit(params.template, 2000)
            params.format = None
            self.queue.add({"value": _deep_json_to_string(params, 3)}, timeout=3 * 60)
        except Exception as e:
            sys.stdout.write(text_type(Except.wrap(e)))
        return self

    def _insert_loop(self, please_stop=None):
        bad_count = 0
        while not please_stop:
            try:
                messages = wrap(self.queue.pop_all())
                if not messages:
                    Till(seconds=PAUSE_AFTER_GOOD_INSERT).wait()
                    continue

                for g, mm in jx.groupby(messages, size=self.batch_size):
                    scrubbed = []
                    for i, message in enumerate(mm):
                        if message is THREAD_STOP:
                            please_stop.go()
                            continue
                        try:
                            messages = flatten_causal_chain(message.value)
                            scrubbed.append(
                                {"value": [_deep_json_to_string(m, depth=3) for m in messages]}
                            )
                        except Exception as e:
                            Log.warning("Problem adding to scrubbed list", cause=e)
                    self.es.extend(scrubbed)
                    bad_count = 0
            except Exception as f:
                Log.warning("Problem inserting logs into ES", cause=f)
                bad_count += 1
                if bad_count > MAX_BAD_COUNT:
                    Log.warning(
                        "Given up trying to write debug logs to ES index {{index}}",
                        index=self.es.settings.index,
                    )
                Till(seconds=PAUSE_AFTER_BAD_INSERT).wait()

        self.es.flush()

        # CONTINUE TO DRAIN THIS QUEUE
        while not please_stop:
            try:
                Till(seconds=PAUSE_AFTER_GOOD_INSERT).wait()
                self.queue.pop_all()
            except Exception as e:
                Log.warning("Should not happen", cause=e)

    def stop(self):
        with suppress_exception:
            self.queue.add(THREAD_STOP)  # BE PATIENT, LET REST OF MESSAGE BE SENT
        with suppress_exception:
            self.queue.close()
        self.worker.join()
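# ------------------------------------------------------------------------------
# Usage sketch (added for illustration, not part of the original source).
# write() only enqueues; the worker thread batches messages into ES and stop()
# lets queued messages drain before closing. The host/index values are
# hypothetical, and this assumes the @override decorator (mo-kwargs style)
# collects the named arguments into the `kwargs` settings object.
def _example_es_logger():
    sink = StructuredLogger_usingElasticSearch(host="http://localhost", index="debug-logs")
    sink.write("timer {{name}} took {{duration}}ms", {"name": "load", "duration": 42})
    sink.stop()  # be patient: queued messages reach the worker before the queue closes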