def __init__(self, rate=None, amortization_period=None, source=None, database=None, kwargs=None):
    """
    Prepare the caches, queues, sqlite store and background threads.

    :param rate: request rate; defaults to HG_REQUEST_PER_SECOND
    :param amortization_period: defaults to AMORTIZATION_PERIOD; its
        `.seconds` multiplied by `rate` bounds the request queue
    :param source: object whose `.url` is wrapped with URL()
    :param database: handed directly to Sqlite()
    :param kwargs: not read by this constructor
    """
    self.amortization_period = coalesce(amortization_period, AMORTIZATION_PERIOD)
    self.rate = coalesce(rate, HG_REQUEST_PER_SECOND)

    self.cache_locker = Lock()
    self.cache = {}  # MAP FROM url TO (ready, headers, response, timestamp) PAIR
    self.no_cache = {}  # VERY SHORT TERM CACHE
    self.workers = []

    self.todo = Queue(APP_NAME+" todo")
    request_limit = int(self.rate * self.amortization_period.seconds)
    self.requests = Queue(APP_NAME + " requests", max=request_limit)

    self.url = URL(source.url)
    self.db = Sqlite(database)
    self.inbound_rate = RateLogger("Inbound")
    self.outbound_rate = RateLogger("hg.mo")

    # CREATE THE cache TABLE ONLY WHEN THE DATABASE HAS NO TABLES AT ALL
    existing_tables = self.db.query("SELECT name FROM sqlite_master WHERE type='table'").data
    if not existing_tables:
        with self.db.transaction() as t:
            t.execute(
                "CREATE TABLE cache ("
                " path TEXT PRIMARY KEY, "
                " headers TEXT, "
                " response TEXT, "
                " timestamp REAL "
                ")"
            )

    started = []
    for i in range(CONCURRENCY):
        started.append(Thread.run(APP_NAME+" worker" + text_type(i), self._worker))
    self.threads = started

    self.limiter = Thread.run(APP_NAME+" limiter", self._rate_limiter)
    self.cleaner = Thread.run(APP_NAME+" cleaner", self._cache_cleaner)
def _find_revision(self, revision):
    """
    Search a fixed set of branches for `revision` using three threads.

    Only branches with the DEFAULT_LOCALE named "try", "mozilla-inbound"
    or "autoland" are searched.

    :param revision: changeset id to look up
    :return: list of the values self.get_revision() returned for every
        branch where the lookup did not raise
    """
    please_stop = False
    locker = Lock()
    output = []
    problems = []  # EXCEPTIONS ARE COLLECTED HERE BUT NOT REPORTED

    wanted_names = ["try", "mozilla-inbound", "autoland"]
    queue = Queue("branches", max=2000)
    queue.extend(
        b
        for b in self.branches
        if b.locale == DEFAULT_LOCALE and b.name in wanted_names
    )
    queue.add(THREAD_STOP)

    def _find(please_stop):
        for branch in queue:
            if please_stop:
                return
            try:
                url = branch.url + "json-info?node=" + revision
                found = self.get_revision(Revision(branch=branch, changeset={"id": revision}))
                with locker:
                    output.append(found)
                Log.note("Revision found at {{url}}", url=url)
            except Exception as f:
                problems.append(f)

    threads = [
        Thread.run("find changeset " + text_type(i), _find, please_stop=please_stop)
        for i in range(3)
    ]
    for thread in threads:
        with assert_no_exception:
            thread.join()

    return output
def __init__(self, stream):
    """
    Start a background thread that pushes queued log lines to `stream`.

    :param stream: an object with a write() method, or text that is
        eval()'d to get one (e.g. "sys.stdout")
    """
    assert stream

    if not is_text(stream):
        name = "stream"
        self.stream = stream
    else:
        name = stream
        # NOTE(review): eval() of a caller-supplied string; confirm callers
        # only pass trusted configuration values
        stream = self.stream = eval(stream)
        if name.startswith("sys.") and PY3:
            # PY3 sys.* STREAMS WANT text, SO DECODE THE utf8 BYTES AGAIN
            self.stream = Data(write=lambda d: stream.write(d.decode('utf8')))

    # WRITE TO STREAMS CAN BE *REALLY* SLOW, WE WILL USE A THREAD
    from mo_threads import Queue

    def utf8_appender(value):
        if is_text(value):
            value = value.encode('utf8')
        self.stream.write(value)

    label = self.__class__.__name__ + "(" + name + ")"
    self.queue = Queue("queue for " + label, max=10000, silent=True)
    self.thread = Thread(
        "log to " + label,
        time_delta_pusher,
        appender=utf8_appender,
        queue=self.queue,
        interval=0.3,
    )
    # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
    self.thread.parent.remove_child(self.thread)
    self.thread.start()
def __init__(
    self,
    host,
    index,
    port=9200,
    type="log",
    queue_size=1000,
    batch_size=100,
    kwargs=None,
):
    """
    Open (or create) the index that receives the logs and start the
    insert thread.

    settings ARE FOR THE ELASTICSEARCH INDEX

    :param queue_size: max number of pending items in the queue
    :param batch_size: stored on self.batch_size
    :param kwargs: settings object; timeout and retry defaults are written
        back into it before it is handed to Cluster
    """
    # FILL IN DEFAULTS, NORMALIZING DURATIONS TO SECONDS
    timeout = coalesce(kwargs.timeout, "30second")
    kwargs.timeout = Duration(timeout).seconds
    kwargs.retry.times = coalesce(kwargs.retry.times, 3)
    retry_sleep = coalesce(kwargs.retry.sleep, MINUTE)
    kwargs.retry.sleep = Duration(retry_sleep).seconds

    cluster = Cluster(kwargs)
    schema = json2value(value2json(SCHEMA), leaves=True)
    self.es = cluster.get_or_create_index(
        schema=schema,
        limit_replicas=True,
        typed=True,
        kwargs=kwargs,
    )
    self.batch_size = batch_size
    self.es.add_alias(coalesce(kwargs.alias, kwargs.index))

    self.queue = Queue("debug logs to es", max=queue_size, silent=True)
    self.worker = Thread.run("add debug logs to es", self._insert_loop)
def __init__(self, logger):
    """
    Wrap `logger` so that writes are queued and delivered by a thread.

    :param logger: a StructuredLogger; Log.error() is called otherwise
    """
    if not isinstance(logger, StructuredLogger):
        Log.error("Expecting a StructuredLogger")

    class_name = self.__class__.__name__
    self.queue = Queue(
        "Queue for " + class_name,
        max=10000,
        silent=True,
        allow_add_after_close=True,
    )
    self.logger = logger

    def worker(logger, please_stop):
        # WAKE ONCE A SECOND, DRAIN THE QUEUE, FORWARD EACH ENTRY
        try:
            while not please_stop:
                Till(seconds=1).wait()
                for entry in self.queue.pop_all():
                    if entry is THREAD_STOP:
                        please_stop.go()
                    else:
                        logger.write(**entry)
        finally:
            logger.stop()

    self.thread = Thread("Thread for " + class_name, worker, logger)
    # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
    self.thread.parent.remove_child(self.thread)
    self.thread.start()
class StructuredLogger_usingQueue(StructuredLogger):
    """
    Logger that puts expanded messages on a Queue, to be read with pop().
    """

    def __init__(self, name=None):
        """
        :param name: optional suffix for the queue's name
        """
        suffix = (" " + name) if name else ""
        self.queue = Queue("log messages to queue" + suffix)

    def write(self, template, params):
        """Expand `template` with `params` and queue the resulting text."""
        self.queue.add(expand_template(template, params))

    def stop(self):
        """Close the queue."""
        self.queue.close()

    def pop(self):
        """
        Take one message off the queue and return it cleaned up.

        For every line: the part matched by DATE_PATTERN at the start is
        removed (THE TIMESTAMP, e.g. "2019-01-06 19:13:49.937542 - "),
        then blank lines and lines starting with "File" are dropped.

        :return: remaining lines joined with CR, stripped
        """
        message = self.queue.pop()
        kept = []
        for line in message.split(CR):
            prefix = re.match(DATE_PATTERN, line)
            if prefix:
                # match() ANCHORS AT ZERO, SO end() IS THE PREFIX LENGTH
                line = line[prefix.end():]
            stripped = line.strip()
            if not stripped or stripped.startswith("File"):
                continue
            kept.append(line)
        return CR.join(kept).strip()
def __init__(self, stream):
    """
    Start a background thread that pushes queued log lines to `stream`.

    :param stream: an object with a write() method, or a string that is
        eval()'d to get one (e.g. "sys.stdout")
    """
    assert stream

    if isinstance(stream, basestring):
        # sys.* ARE OLD AND CAN NOT HANDLE unicode
        use_UTF8 = stream.startswith("sys.")
        # NOTE(review): eval() of a caller-supplied string; confirm callers
        # only pass trusted configuration values
        self.stream = eval(stream)
        name = stream
    else:
        use_UTF8 = False
        self.stream = stream
        name = "stream"

    # WRITE TO STREAMS CAN BE *REALLY* SLOW, WE WILL USE A THREAD
    from mo_threads import Queue

    def utf8_appender(value):
        if isinstance(value, unicode):
            value = value.encode('utf8')
        self.stream.write(value)

    appender = utf8_appender if use_UTF8 else self.stream.write

    label = self.__class__.__name__ + "(" + name + ")"
    self.queue = Queue("queue for " + label, max=10000, silent=True)
    self.thread = Thread(
        "log to " + label,
        time_delta_pusher,
        appender=appender,
        queue=self.queue,
        interval=0.3,
    )
    # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
    self.thread.parent.remove_child(self.thread)
    self.thread.start()
class StructuredLogger_usingThread(StructuredLogger):
    """
    Queue log entries and let a separate thread hand them to `logger`.
    """

    def __init__(self, logger, period=PERIOD):
        """
        :param logger: a StructuredLogger; Log.error() is called otherwise
        :param period: passed through to the module-level `worker`
        """
        if not isinstance(logger, StructuredLogger):
            Log.error("Expecting a StructuredLogger")

        class_name = self.__class__.__name__
        self.logger = logger
        self.queue = Queue(
            "Queue for " + class_name,
            max=10000,
            silent=True,
            allow_add_after_close=True,
        )
        self.thread = Thread(
            "Thread for " + class_name, worker, logger, self.queue, period
        )
        # worker WILL BE RESPONSIBLE FOR THREAD stop()
        self.thread.parent.remove_child(self.thread)
        self.thread.start()

    def write(self, template, params):
        """
        Queue one log entry.

        :return: self
        :raises: the failure from the queue, wrapped with Except.wrap()
        """
        try:
            entry = {"template": template, "params": params}
            self.queue.add(entry)
            return self
        except Exception as e:
            raise Except.wrap(e)  # OH NO!

    def stop(self):
        """Ask the thread to stop, then wait for it; problems are only noted."""
        try:
            # BE PATIENT, LET REST OF MESSAGE BE SENT
            self.queue.add(THREAD_STOP)
            self.thread.join()
        except Exception as e:
            Log.note("problem in threaded logger" + str(e))
def __init__(self, host, index, alias=None, name=None, port=9200, kwargs=None):
    """
    Build the in-memory meta tables/columns and start the refresh thread.

    Does nothing when the instance already has a `settings` attribute.

    :param index: last fallback for the default name
    :param alias: second choice for the default name
    :param name: first choice for the default name
    :param kwargs: settings; stored and passed to the Cluster
    """
    global _elasticsearch
    if hasattr(self, "settings"):
        return

    from pyLibrary.queries.containers.list_usingPythonList import ListContainer
    from pyLibrary.env import elasticsearch as _elasticsearch

    self.settings = kwargs
    self.default_name = coalesce(name, alias, index)
    self.default_es = _elasticsearch.Cluster(kwargs=kwargs)
    self.todo = Queue("refresh metadata", max=100000, unique=True)

    self.es_metadata = Null
    self.last_es_metadata = Date.now()-OLD_METADATA

    self.meta = Data()
    table_columns = metadata_tables()
    column_columns = metadata_columns()
    tables_schema = wrap({c.names["."]: c for c in table_columns})
    self.meta.tables = ListContainer("meta.tables", [], tables_schema)
    self.meta.columns = ColumnList()
    self.meta.columns.insert(column_columns)
    self.meta.columns.insert(table_columns)

    # TODO: fix monitor so it does not bring down ES
    target = self.monitor if ENABLE_META_SCAN else self.not_monitor
    self.worker = Thread.run("refresh metadata", target)
def __init__(self, conn=None, tuid_service=None, start_workers=True, new_table=False, kwargs=None):
    """
    Set up the changeset log: connections, queues, and a minimum fill of
    the csetLog table.

    Any exception raised during setup is reported with Log.warning() and
    not re-raised.

    :param conn: existing connection; otherwise sql.Sql() is opened on
        config.database.name
    :param tuid_service: existing service; otherwise a TUIDService is made
    :param start_workers: call self.start_workers() when setup is done
    :param new_table: drop csetLog before init_db()
    :param kwargs: configuration
    """
    try:
        self.config = kwargs
        self.conn = conn or sql.Sql(self.config.database.name)

        if self.config.hg_cache:
            self.hg_cache = HgMozillaOrg(kwargs=self.config.hg_cache, use_cache=True)
        else:
            self.hg_cache = Null

        if tuid_service:
            self.tuid_service = tuid_service
        else:
            self.tuid_service = tuid.service.TUIDService(
                kwargs=self.config.tuid, conn=self.conn, clogger=self
            )

        self.rev_locker = Lock()
        self.working_locker = Lock()

        if new_table:
            with self.conn.transaction() as t:
                t.execute("DROP TABLE IF EXISTS csetLog")

        self.init_db()
        max_plus_one = self.conn.get_one("SELECT max(revnum)+1 FROM csetLog")[0]
        self.next_revnum = coalesce(max_plus_one, 1)
        self.csets_todo_backwards = Queue(name="Clogger.csets_todo_backwards")
        self.deletions_todo = Queue(name="Clogger.deletions_todo")
        self.maintenance_signal = Signal(name="Clogger.maintenance_signal")

        if 'tuid' in self.config:
            self.config = self.config.tuid

        self.disable_backfilling = False
        self.disable_tipfilling = False
        self.disable_deletion = False
        self.disable_maintenance = False

        self.backfill_thread = None
        self.tipfill_thread = None
        self.deletion_thread = None
        self.maintenance_thread = None

        # Make sure we are filled before allowing queries
        numrevs = self.conn.get_one("SELECT count(revnum) FROM csetLog")[0]
        if numrevs < MINIMUM_PERMANENT_CSETS:
            Log.note("Filling in csets to hold {{minim}} csets.", minim=MINIMUM_PERMANENT_CSETS)
            oldest_rev = 'tip'
            with self.conn.transaction() as t:
                found = t.query("SELECT min(revnum), revision FROM csetLog").data[0][1]
                if found:
                    oldest_rev = found
            missing_count = MINIMUM_PERMANENT_CSETS - numrevs
            self._fill_in_range(missing_count, oldest_rev, timestamp=False)

        Log.note(
            "Table is filled with atleast {{minim}} entries.",
            minim=MINIMUM_PERMANENT_CSETS
        )

        if start_workers:
            self.start_workers()
    except Exception as e:
        Log.warning("Cannot setup clogger: {{cause}}", cause=str(e))
def update_local_database(config, deviant_summary, candidates, since):
    """
    Refresh the local summary for candidate series that are missing or stale.

    Nothing is done when `deviant_summary` is a bigquery.Table.

    :param config: `display.download_limit` caps the number of series
        updated (default 100); `database` is passed to process()
    :param deviant_summary: container to query and to pass to process()
    :param candidates: must expose `.signature_hash`
    :param since: passed through to process()
    """
    if isinstance(deviant_summary, bigquery.Table):
        Log.note("Only the ETL process should fill the bigquery table")
        return

    # GET EVERYTHING WE HAVE SO FAR
    known_filter = {
        "and": [
            {"in": {"signature_hash": candidates.signature_hash}},
            {"exists": "num_pushes"},
        ]
    }
    exists = deviant_summary.query({
        "select": ["signature_hash", "last_updated"],
        "where": known_filter,
        "sort": "last_updated",
        "limit": 100000,
        "format": "list",
    }).data

    # CHOOSE MISSING, THEN OLDEST, UP TO "RECENT"
    missing = list(set(candidates.signature_hash) - set(exists.signature_hash))
    too_old = Date.today() - parse(LOCAL_RETENTION)
    stale = [e.signature_hash for e in exists if e.last_updated < too_old.unix]
    needs_update = missing + stale
    Log.alert("{{num}} series are candidates for local update", num=len(needs_update))

    limited_update = Queue("sigs")
    download_limit = coalesce(config.display.download_limit, 100)
    limited_update.extend(left(needs_update, download_limit))
    Log.alert("Updating local database with {{num}} series", num=len(limited_update))

    with Timer("Updating local database"):
        def loop(please_stop):
            # EACH THREAD PULLS FROM THE SHARED QUEUE UNTIL IT IS EMPTY
            while not please_stop:
                signature_hash = limited_update.pop_one()
                if not signature_hash:
                    return
                process(
                    signature_hash,
                    since,
                    source=config.database,
                    deviant_summary=deviant_summary,
                )

        threads = []
        for i in range(3):
            threads.append(Thread.run(text(i), loop))
        for thread in threads:
            thread.join()

    Log.note("Local database is up to date")
def __init__(self, conn=None, tuid_service=None, kwargs=None):
    """
    Set up the changeset log, fill csetLog to its minimum size and start
    the four worker threads.

    Any exception raised during setup is reported with Log.warning() and
    not re-raised.

    :param conn: existing connection; otherwise sql.Sql() is opened on
        config.database.name
    :param tuid_service: existing service; otherwise a TUIDService is made
    :param kwargs: configuration
    """
    try:
        self.config = kwargs
        self.conn = conn or sql.Sql(self.config.database.name)

        if self.config.hg_cache:
            self.hg_cache = HgMozillaOrg(kwargs=self.config.hg_cache, use_cache=True)
        else:
            self.hg_cache = Null

        if tuid_service:
            self.tuid_service = tuid_service
        else:
            self.tuid_service = tuid.service.TUIDService(
                database=None,
                hg=None,
                kwargs=self.config,
                conn=self.conn,
                clogger=self,
            )

        self.rev_locker = Lock()
        self.working_locker = Lock()

        self.init_db()
        max_plus_one = self.conn.get_one("SELECT max(revnum)+1 FROM csetLog")[0]
        self.next_revnum = coalesce(max_plus_one, 1)
        self.csets_todo_backwards = Queue(name="Clogger.csets_todo_backwards")
        self.deletions_todo = Queue(name="Clogger.deletions_todo")
        self.maintenance_signal = Signal(name="Clogger.maintenance_signal")
        self.config = self.config.tuid

        self.disable_backfilling = False
        self.disable_tipfilling = False
        self.disable_deletion = False
        self.disable_maintenance = False

        # Make sure we are filled before allowing queries
        numrevs = self.conn.get_one("SELECT count(revnum) FROM csetLog")[0]
        if numrevs < MINIMUM_PERMANENT_CSETS:
            Log.note("Filling in csets to hold {{minim}} csets.", minim=MINIMUM_PERMANENT_CSETS)
            oldest_rev = 'tip'
            with self.conn.transaction() as t:
                found = t.query("SELECT min(revnum), revision FROM csetLog").data[0][1]
                if found:
                    oldest_rev = found
            missing_count = MINIMUM_PERMANENT_CSETS - numrevs
            self._fill_in_range(missing_count, oldest_rev, timestamp=False)

        Log.note(
            "Table is filled with atleast {{minim}} entries. Starting workers...",
            minim=MINIMUM_PERMANENT_CSETS
        )

        Thread.run('clogger-tip', self.fill_forward_continuous)
        Thread.run('clogger-backfill', self.fill_backward_with_list)
        Thread.run('clogger-maintenance', self.csetLog_maintenance)
        Thread.run('clogger-deleter', self.csetLog_deleter)
        Log.note("Started clogger workers.")
    except Exception as e:
        Log.warning("Cannot setup clogger: {{cause}}", cause=str(e))
def __init__(
    self,
    host,
    index,
    port=9200,
    type="log",
    queue_size=1000,
    batch_size=100,
    refresh_interval="1second",
    kwargs=None,
):
    """
    Create the rollover index that receives the logs and start the insert
    thread.

    settings ARE FOR THE ELASTICSEARCH INDEX

    :param host: one host, or several; one is picked at random
    :param queue_size: max number of pending items in the queue
    :param batch_size: stored on self.batch_size
    :param kwargs: settings object; timeout, retry and host are written
        back into it before it is handed to RolloverIndex
    """
    # FILL IN DEFAULTS, NORMALIZING DURATIONS TO SECONDS
    timeout = coalesce(kwargs.timeout, "30second")
    kwargs.timeout = Duration(timeout).seconds
    kwargs.retry.times = coalesce(kwargs.retry.times, 3)
    retry_sleep = coalesce(kwargs.retry.sleep, MINUTE)
    kwargs.retry.sleep = Duration(retry_sleep).seconds
    kwargs.host = randoms.sample(listwrap(host), 1)[0]

    # interval AND max DEFAULT TO EACH OTHER, THEN TO "year"
    rollover = kwargs.rollover
    rollover_interval = coalesce(rollover.interval, rollover.max, "year")
    rollover_max = coalesce(rollover.max, rollover.interval, "year")

    nested_mapping = {
        "mappings": {
            kwargs.type: {
                "properties": {
                    "~N~": {"type": "nested"}
                }
            }
        }
    }
    schema = set_default(
        kwargs.schema,
        nested_mapping,
        json2value(value2json(SCHEMA), leaves=True),
    )

    self.es = RolloverIndex(
        rollover_field={"get": [{"first": "."}, {"literal": "timestamp"}]},
        rollover_interval=rollover_interval,
        rollover_max=rollover_max,
        schema=schema,
        limit_replicas=True,
        typed=True,
        read_only=False,
        kwargs=kwargs,
    )
    self.batch_size = batch_size
    self.queue = Queue("debug logs to es", max=queue_size, silent=True)
    self.worker = Thread.run("add debug logs to es", self._insert_loop)
def __init__(self, filename=None, db=None):
    """
    Wrap a sqlite database so commands are run by one dedicated thread.

    :param filename: stored on self.filename
    :param db: Optional, wrap a sqlite db in a thread
    :return: Multithread safe database
    """
    if not _upgraded:
        _upgrade()

    self.filename = filename
    self.db = db
    self.queue = Queue("sql commands")  # HOLD (command, result, signal) PAIRS
    self.get_trace = DEBUG
    # START THE WORKER LAST, SO IT NEVER SEES A HALF-INITIALIZED INSTANCE
    self.worker = Thread.run("sqlite db thread", self._worker)
def __init__(self, db):
    """
    Create an empty column list and load it from the database.

    :param db: stored on self.db before self._load_from_database() runs
    """
    Table.__init__(self, META_COLUMNS_NAME)

    self.db = db
    self.es_index = None
    self.last_load = Null
    self.dirty = False
    self._schema = None
    self._snowflakes = Data()

    self.locker = Lock()
    self.data = {}  # MAP FROM fact_name TO (abs_column_name to COLUMNS)
    # HOLD (action, column) PAIR, WHERE action in ['insert', 'update']
    self.todo = Queue("update columns to es")

    self._load_from_database()
def __init__(self, host, index, sql_file='metadata.sqlite', alias=None, name=None, port=9200, kwargs=None):
    """
    Build the in-memory meta tables/columns and start the refresh thread.

    Does nothing when the instance already has a `settings` attribute.

    :param index: last fallback for the default name
    :param alias: second choice for the default name
    :param name: first choice for the default name
    :param kwargs: settings; stored and passed to the Cluster
    """
    if hasattr(self, "settings"):
        return

    self.too_old = TOO_OLD
    self.settings = kwargs
    self.default_name = coalesce(name, alias, index)
    self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)
    self.index_does_not_exist = set()
    self.todo = Queue("refresh metadata", max=100000, unique=True)

    self.index_to_alias = Relation_usingList()
    self.es_metadata = Null
    self.metadata_last_updated = Date.now() - OLD_METADATA

    self.meta = Data()
    self.meta.columns = ColumnList()

    # THE TWO META TABLES ARE KNOWN FROM THE START
    self.alias_to_query_paths = {
        "meta.columns": [['.']],
        "meta.tables": [['.']],
    }
    self.alias_last_updated = {
        "meta.columns": Date.now(),
        "meta.tables": Date.now(),
    }

    table_columns = metadata_tables()
    tables_schema = jx_base.Schema(".", table_columns)
    self.meta.tables = ListContainer("meta.tables", [], tables_schema)
    self.meta.columns.extend(table_columns)

    # TODO: fix monitor so it does not bring down ES
    target = self.monitor if ENABLE_META_SCAN else self.not_monitor
    self.worker = Thread.run("refresh metadata", target)
def __init__(self, filename=None, db=None, get_trace=None, upgrade=True, load_functions=False, kwargs=None):
    """
    Open (or adopt) a sqlite connection and start the thread that runs
    all commands against it.

    :param filename: FILE TO USE FOR DATABASE
    :param db: AN EXISTING sqlite3 DB YOU WOULD LIKE TO USE (INSTEAD OF USING filename)
    :param get_trace: GET THE STACK TRACE AND THREAD FOR EVERY DB COMMAND (GOOD FOR DEBUGGING)
    :param upgrade: REPLACE PYTHON sqlite3 DLL WITH MORE RECENT ONE, WITH MORE FUNCTIONS (NOT WORKING)
    :param load_functions: LOAD EXTENDED MATH FUNCTIONS (MAY REQUIRE upgrade)
    :param kwargs: stored on self.settings
    """
    if upgrade and not _upgraded:
        _upgrade()

    self.settings = kwargs
    self.filename = File(filename).abspath
    if known_databases.get(self.filename):
        Log.error("Not allowed to create more than one Sqlite instance for {{file}}", file=self.filename)

    # SETUP DATABASE
    if DEBUG:
        Log.note("Sqlite version {{version}}", version=sqlite3.sqlite_version)
    try:
        if db == None:
            target = coalesce(self.filename, ":memory:")
            self.db = sqlite3.connect(
                database=target,
                check_same_thread=False,
                isolation_level=None
            )
        else:
            self.db = db
    except Exception as e:
        Log.error("could not open file {{filename}}", filename=self.filename, cause=e)

    if load_functions:
        self._load_functions()

    self.locker = Lock()
    self.available_transactions = []  # LIST OF ALL THE TRANSACTIONS BEING MANAGED
    self.queue = Queue("sql commands")  # HOLD (command, result, signal, stacktrace) TUPLES

    self.get_trace = coalesce(get_trace, TRACE)
    self.upgrade = upgrade
    self.closed = False

    # WORKER VARIABLES
    self.transaction_stack = []  # THE TRANSACTION OBJECT WE HAVE PARTIALLY RUN
    self.last_command_item = None  # USE THIS TO HELP BLAME current_transaction FOR HANGING ON TOO LONG
    self.too_long = None
    self.delayed_queries = []
    self.delayed_transactions = []

    self.worker = Thread.run("sqlite db thread", self._worker)

    if DEBUG:
        version = self.query("select sqlite_version()").data[0][0]
        Log.note("Sqlite version {{version}}", version=version)
def __init__(self, name):
    """
    Open the per-name sqlite metadata file, load it, and start the thread
    that runs self._db_worker.

    :param name: used in the file name ("metadata.<name>.sqlite") and in
        the worker thread's name
    """
    Table.__init__(self, "meta.columns")

    self.db_file = File("metadata." + name + ".sqlite")
    self.data = {}  # MAP FROM ES_INDEX TO (abs_column_name to COLUMNS)
    self.locker = Lock()
    self._schema = None

    db_path = self.db_file.abspath
    self.db = sqlite3.connect(
        database=db_path,
        check_same_thread=False,
        isolation_level=None
    )
    self.last_load = Null
    # HOLD (action, column) PAIR, WHERE action in ['insert', 'update']
    self.todo = Queue("update columns to db")

    self._db_load()
    Thread.run("update " + name, self._db_worker)
def __init__(self, logger):
    """
    Wrap `logger` so that writes are queued and delivered by a thread.

    :param logger: a StructuredLogger; Log.error() is called otherwise
    """
    if not isinstance(logger, StructuredLogger):
        Log.error("Expecting a StructuredLogger")

    class_name = self.__class__.__name__
    self.queue = Queue(
        "Queue for " + class_name,
        max=10000,
        silent=True,
        allow_add_after_close=True,
    )
    self.logger = logger

    def worker(logger, please_stop):
        try:
            while not please_stop:
                pending = self.queue.pop_all()
                if pending:
                    for entry in pending:
                        if entry is THREAD_STOP:
                            please_stop.go()
                        else:
                            logger.write(**entry)
                else:
                    # NOTHING TO DO: WAIT A SECOND, OR UNTIL ASKED TO STOP
                    (Till(seconds=1) | please_stop).wait()
        except Exception as e:
            print("problem in " + StructuredLogger_usingThread.__name__ + ": " + str(e))
        finally:
            Log.note("stop the child")
            logger.stop()

    self.thread = Thread("Thread for " + class_name, worker, logger)
    # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
    self.thread.parent.remove_child(self.thread)
    self.thread.start()
def __init__(
    self,
    host,
    index,
    port=9200,
    type="log",
    queue_size=1000,
    batch_size=100,
    kwargs=None,
):
    """
    Open (or create) the index that receives the logs and start the
    insert thread.

    settings ARE FOR THE ELASTICSEARCH INDEX

    :param host: one host, or several; one is picked at random
    :param type: mapping name that gets the nested "~N~" property
    :param queue_size: max number of pending items in the queue
    :param batch_size: stored on self.batch_size
    :param kwargs: settings object; timeout, retry and host are written
        back into it before it is handed to Cluster
    """
    # FILL IN DEFAULTS, NORMALIZING DURATIONS TO SECONDS
    timeout = coalesce(kwargs.timeout, "30second")
    kwargs.timeout = Duration(timeout).seconds
    kwargs.retry.times = coalesce(kwargs.retry.times, 3)
    retry_sleep = coalesce(kwargs.retry.sleep, MINUTE)
    kwargs.retry.sleep = Duration(retry_sleep).seconds
    kwargs.host = Random.sample(listwrap(host), 1)[0]

    # WORK ON A COPY OF SCHEMA (JSON ROUND TRIP)
    schema = json2value(value2json(SCHEMA), leaves=True)
    schema.mappings[type].properties["~N~"].type = "nested"

    cluster = Cluster(kwargs)
    self.es = cluster.get_or_create_index(
        schema=schema,
        limit_replicas=True,
        typed=True,
        kwargs=kwargs,
    )
    self.batch_size = batch_size
    self.es.add_alias(coalesce(kwargs.alias, kwargs.index))

    self.queue = Queue("debug logs to es", max=queue_size, silent=True)
    self.worker = Thread.run("add debug logs to es", self._insert_loop)
def stop(cls):
    """
    DECONSTRUCTS ANY LOGGING, AND RETURNS TO DIRECT-TO-stdout LOGGING
    EXECUTING MULTIPLE TIMES IN A ROW IS SAFE, IT HAS NO NET EFFECT, IT STILL LOGS TO stdout

    When a cprofiler is active and settings exist, its stats are added to
    the accumulated stats, which are then written out.

    :return: NOTHING
    """
    from mo_threads import profiles

    if cls.cprofiler and hasattr(cls, "settings"):
        # FIX: TEST THE ACCUMULATOR, NOT THE PROFILER. cls.cprofiler IS
        # ALREADY KNOWN TO BE TRUTHY HERE, SO THE ORIGINAL
        # `cls.cprofiler == None` NEVER CREATED THE QUEUE
        if getattr(cls, "cprofiler_stats", None) == None:
            from mo_threads import Queue

            # ACCUMULATION OF STATS FROM ALL THREADS
            cls.cprofiler_stats = Queue("cprofiler stats")

        import pstats

        cls.cprofiler_stats.add(pstats.Stats(cls.cprofiler))
        write_profile(cls.settings.cprofile, cls.cprofiler_stats.pop_all())

    if profiles.ON and hasattr(cls, "settings"):
        profiles.write(cls.settings.profile)

    cls.main_log.stop()
    cls.main_log = StructuredLogger_usingStream(sys.stdout)
def __init__(self, logger, period=PERIOD):
    """
    Wrap `logger` so that writes are queued and delivered by a thread.

    :param logger: a StructuredLogger; Log.error() is called otherwise
    :param period: passed through to the module-level `worker`
    """
    if not isinstance(logger, StructuredLogger):
        Log.error("Expecting a StructuredLogger")

    class_name = self.__class__.__name__
    self.logger = logger
    self.queue = Queue(
        "Queue for " + class_name,
        max=10000,
        silent=True,
        allow_add_after_close=True,
    )
    self.thread = Thread(
        "Thread for " + class_name, worker, logger, self.queue, period
    )
    # worker WILL BE RESPONSIBLE FOR THREAD stop()
    self.thread.parent.remove_child(self.thread)
    self.thread.start()
def __init__(self, es_cluster):
    """
    Create an empty column list, load it, and start the thread that runs
    self._update_from_es.

    :param es_cluster: stored on self.es_cluster
    """
    Table.__init__(self, META_COLUMNS_NAME)

    self.es_cluster = es_cluster
    self.es_index = None
    self.last_load = Null
    self.dirty = False
    self._schema = None

    self.locker = Lock()
    self.data = {}  # MAP FROM ES_INDEX TO (abs_column_name to COLUMNS)
    # HOLD (action, column) PAIR, WHERE action in ['insert', 'update']
    self.todo = Queue("update columns to es")

    self._db_load()
    Thread.run(
        "update " + META_COLUMNS_NAME,
        self._update_from_es,
        parent_thread=MAIN_THREAD,
    )
def __init__(self, stream):
    """
    Start a background thread that pushes queued log lines to `stream`.

    :param stream: an object with a write() method, or text that is
        eval()'d to get one (e.g. "sys.stdout")
    """
    assert stream

    if not isinstance(stream, text_type):
        name = "stream"
        self.stream = stream
    else:
        name = stream
        # NOTE(review): eval() of a caller-supplied string; confirm callers
        # only pass trusted configuration values
        stream = self.stream = eval(stream)
        if name.startswith("sys.") and PY3:
            # PY3 sys.* STREAMS WANT text, SO DECODE THE utf8 BYTES AGAIN
            self.stream = Data(write=lambda d: stream.write(d.decode('utf8')))

    # WRITE TO STREAMS CAN BE *REALLY* SLOW, WE WILL USE A THREAD
    from mo_threads import Queue

    def utf8_appender(value):
        if isinstance(value, text_type):
            value = value.encode('utf8')
        self.stream.write(value)

    label = self.__class__.__name__ + "(" + name + ")"
    self.queue = Queue("queue for " + label, max=10000, silent=True)
    self.thread = Thread(
        "log to " + label,
        time_delta_pusher,
        appender=utf8_appender,
        queue=self.queue,
        interval=0.3,
    )
    # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
    self.thread.parent.remove_child(self.thread)
    self.thread.start()
def update_local_database():
    """
    Refresh the local summary for candidate series that are missing or stale.

    Reads `summary_table`, `candidates`, `config` and `process` from the
    enclosing scope. At most `config.analysis.download_limit` series
    (default 100) are processed, by three threads.
    """
    # GET EVERYTHING WE HAVE SO FAR
    exists = summary_table.query({
        "select": ["id", "last_updated"],
        "where": {
            "and": [
                {"in": {"id": candidates.id}},
                {"exists": "num_pushes"},
            ]
        },
        "sort": "last_updated",
        "limit": 100000,
        "format": "list",
    }).data

    # CHOOSE MISSING, THEN OLDEST, UP TO "RECENT"
    missing = list(set(candidates.id) - set(exists.id))
    too_old = Date.today() - parse(LOCAL_RETENTION)
    # FIX: COLLECT THE id OF EACH STALE ROW, NOT THE ROW ITSELF, SO THE
    # QUEUE HOLDS ONLY ids (SAME AS missing) AND process() ALWAYS GETS AN id
    needs_update = missing + [
        e.id for e in exists if e.last_updated < too_old.unix
    ]
    Log.alert("{{num}} series are candidates for local update", num=len(needs_update))

    limited_update = Queue("sigs")
    limited_update.extend(
        left(needs_update, coalesce(config.analysis.download_limit, 100))
    )
    Log.alert("Updating local database with {{num}} series", num=len(limited_update))

    with Timer("Updating local database"):
        def loop(please_stop):
            # EACH THREAD PULLS FROM THE SHARED QUEUE UNTIL IT IS EMPTY
            while not please_stop:
                sig_id = limited_update.pop_one()
                if not sig_id:
                    return
                process(sig_id)

        threads = [Thread.run(text(i), loop) for i in range(3)]
        for t in threads:
            t.join()

    Log.note("Local database is up to date")
def __init__(self, filename=None, db=None, upgrade=True, load_functions=False, kwargs=None):
    """
    Wrap a sqlite database so commands are run by one dedicated thread.

    :param filename: resolved to an absolute path with File()
    :param db: Optional, wrap a sqlite db in a thread
    :param upgrade: run _upgrade() first, unless it has already been done
    :param load_functions: not read directly by this constructor
    :param kwargs: stored on self.settings
    :return: Multithread-safe database
    """
    if upgrade and not _upgraded:
        _upgrade()

    self.settings = kwargs
    self.filename = File(filename).abspath
    self.db = db
    self.queue = Queue("sql commands")  # HOLD (command, result, signal) PAIRS
    self.get_trace = TRACE
    self.upgrade = upgrade
    self.closed = False
    # START THE WORKER LAST: ANYTHING IT SETS (SUCH AS closed) MUST NOT BE
    # OVERWRITTEN BY THE CONSTRUCTOR, AND IT MUST NOT SEE MISSING ATTRIBUTES
    self.worker = Thread.run("sqlite db thread", self._worker)

    if DEBUG:
        Log.note(
            "Sqlite version {{version}}",
            version=self.query("select sqlite_version()").data[0][0],
        )
def _late_import():
    """
    Bind the module-level `_Log` to mo_logs.Log and make sure it has a
    queue for the profiler stats.
    """
    global _Log

    from mo_logs import Log as _Log
    from mo_threads import Queue

    stats_missing = _Log.cprofiler_stats == None
    if not stats_missing:
        return
    # ACCUMULATION OF STATS FROM ALL THREADS
    _Log.cprofiler_stats = Queue("cprofiler stats")
def __init__(self, host, index, alias=None, name=None, port=9200, kwargs=None):
    """
    Build the meta tables/columns from the cluster's current indices and
    start the refresh thread.

    Does nothing when the instance already has a `settings` attribute.
    Every index must have at most one alias; Log.error() is called
    otherwise.

    :param kwargs: settings; stored and passed to the Cluster
    """
    if hasattr(self, "settings"):
        return

    self.settings = kwargs
    self.too_old = TOO_OLD
    self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)
    self.index_does_not_exist = set()
    self.todo = Queue("refresh metadata", max=100000, unique=True)

    self.meta = Data()
    self.meta.columns = ColumnList(self.es_cluster)
    self.meta.columns.extend(META_TABLES_DESC.columns)
    self.meta.tables = ListContainer(
        META_TABLES_NAME, [], jx_base.Schema(".", META_TABLES_DESC.columns)
    )
    # FIX: THE ORIGINAL WROTE TO self.meta.table (NO "s"), WHICH IS NOT THE
    # CONTAINER CREATED ABOVE, SO THESE DESCRIPTORS WERE NEVER REGISTERED
    for meta_desc in (META_COLUMNS_DESC, META_TABLES_DESC):
        self.meta.tables.add(meta_desc)

    self.alias_to_query_paths = {}
    for index_name, index_settings in self.es_cluster.get_metadata().indices.items():
        alias_count = len(index_settings.aliases)
        if alias_count == 0:
            index_alias = index_name
        elif alias_count == 1:
            index_alias = first(index_settings.aliases)
        else:
            Log.error("expecting only one alias per index")

        desc = TableDesc(
            name=index_alias,
            url=None,
            query_path=ROOT_PATH,
            last_updated=Date.MIN,
            columns=[]
        )
        self.meta.tables.add(desc)
        self.alias_to_query_paths[index_alias] = [desc.query_path]
        self.alias_to_query_paths[self._find_alias(index_alias)] = [desc.query_path]

    # WE MUST PAUSE?

    # TODO: fix monitor so it does not bring down ES
    if ENABLE_META_SCAN:
        self.worker = Thread.run("refresh metadata", self.monitor)
    else:
        self.worker = Thread.run("not refresh metadata", self.not_monitor)
    return
class StructuredLogger_usingThread(StructuredLogger):
    """
    Queue log entries and let a separate thread hand them to `logger`.
    """

    def __init__(self, logger):
        """
        :param logger: a StructuredLogger; Log.error() is called otherwise
        """
        if not isinstance(logger, StructuredLogger):
            Log.error("Expecting a StructuredLogger")

        class_name = self.__class__.__name__
        self.queue = Queue(
            "Queue for " + class_name,
            max=10000,
            silent=True,
            allow_add_after_close=True,
        )
        self.logger = logger

        def worker(logger, please_stop):
            try:
                while not please_stop:
                    pending = self.queue.pop_all()
                    if pending:
                        for entry in pending:
                            if entry is THREAD_STOP:
                                please_stop.go()
                            else:
                                logger.write(**entry)
                    else:
                        # NOTHING TO DO: WAIT A SECOND, OR UNTIL ASKED TO STOP
                        (Till(seconds=1) | please_stop).wait()
            except Exception as e:
                print("problem in " + StructuredLogger_usingThread.__name__ + ": " + str(e))
            finally:
                Log.note("stop the child")
                logger.stop()

        self.thread = Thread("Thread for " + class_name, worker, logger)
        # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
        self.thread.parent.remove_child(self.thread)
        self.thread.start()

    def write(self, template, params):
        """
        Queue one log entry.

        :return: self
        :raises: the failure from the queue, wrapped with Except.wrap()
        """
        try:
            entry = {"template": template, "params": params}
            self.queue.add(entry)
            return self
        except Exception as e:
            raise Except.wrap(e)  # OH NO!

    def stop(self):
        """
        Ask the thread to stop and wait for it, then close the queue.
        Problems are noted or suppressed, not raised.
        """
        try:
            # BE PATIENT, LET REST OF MESSAGE BE SENT
            self.queue.add(THREAD_STOP)
            self.thread.join()
            Log.note("joined on thread")
        except Exception as e:
            Log.note("problem in threaded logger" + str(e))

        with suppress_exception:
            self.queue.close()
def __init__(self, filename=None, db=None, upgrade=True):
    """
    Wrap a sqlite database so commands are run by one dedicated thread.

    :param filename: stored on self.filename
    :param db: Optional, wrap a sqlite db in a thread
    :param upgrade: run _upgrade() first, unless it has already been done
    :return: Multithread-safe database
    """
    if upgrade and not _upgraded:
        _upgrade()

    self.filename = filename
    self.db = db
    self.queue = Queue("sql commands")  # HOLD (command, result, signal) PAIRS
    self.get_trace = DEBUG
    self.upgrade = upgrade
    # START THE WORKER LAST, SO IT NEVER SEES A HALF-INITIALIZED INSTANCE
    self.worker = Thread.run("sqlite db thread", self._worker)
class StructuredLogger_usingQueue(StructuredLogger):
    """
    Logger that puts expanded messages on a Queue, to be read with pop().
    """

    def __init__(self, name=None):
        """
        :param name: optional suffix for the queue's name
        """
        suffix = (" " + name) if name else ""
        self.queue = Queue("log messages to queue" + suffix)

    def write(self, template, params):
        """Expand `template` with `params` and queue the resulting text."""
        self.queue.add(expand_template(template, params))

    def stop(self):
        """Close the queue."""
        self.queue.close()

    def pop(self):
        """
        Take one message off the queue and return it cleaned up.

        For every line: when characters 19..21 are " - " the first 22
        characters are dropped (presumably a timestamp prefix), then lines
        starting with "File" are skipped.

        :return: remaining lines joined with newlines, stripped
        """
        message = self.queue.pop()
        kept = []
        for line in message.split("\n"):
            if line[19:22] == " - ":
                line = line[22:]
            if not line.strip().startswith("File"):
                kept.append(line)
        return "\n".join(kept).strip()
def __init__(self, host, index, sql_file='metadata.sqlite', alias=None, name=None, port=9200, kwargs=None):
    """
    Build the in-memory meta tables/columns and start the refresh thread.

    Does nothing when the instance already has a `settings` attribute.

    :param index: last fallback for the default name
    :param alias: second choice for the default name
    :param name: first choice for the default name
    :param kwargs: settings; stored and passed to the Cluster
    """
    if hasattr(self, "settings"):
        return

    self.too_old = TOO_OLD
    self.settings = kwargs
    self.default_name = coalesce(name, alias, index)
    self.default_es = elasticsearch.Cluster(kwargs=kwargs)
    self.index_does_not_exist = set()
    self.todo = Queue("refresh metadata", max=100000, unique=True)

    self.es_metadata = Null
    self.abs_columns = set()
    self.last_es_metadata = Date.now() - OLD_METADATA

    self.meta = Data()
    table_columns = metadata_tables()
    column_columns = metadata_columns()
    tables_schema = wrap({c.names["."]: c for c in table_columns})
    self.meta.tables = ListContainer("meta.tables", [], tables_schema)
    self.meta.columns = ColumnList()
    self.meta.columns.insert(column_columns)
    self.meta.columns.insert(table_columns)

    # TODO: fix monitor so it does not bring down ES
    target = self.monitor if ENABLE_META_SCAN else self.not_monitor
    self.worker = Thread.run("refresh metadata", target)
def __init__(
    self,
    name,
    broker=None,
    include=None,
    **kwargs
):
    """
    Create the request/response queues, the response thread and the Worker.

    :param name: prefix for the two queue names
    :param broker: stored on self.broker
    :param include: stored on self.include
    :param kwargs: stored on self.kwargs
    """
    self.Task = MethodCaller
    self.name = name
    self.request_queue = Queue(name=name+" requests")
    self.response_queue = Queue(name=name+" responses")
    self.kwargs = kwargs
    self.include = include
    self.broker = broker
    self._config = {}
    self._tasks = {}
    self.on_init()

    # FIX: CREATE ALL SHARED STATE BEFORE ANY THREAD IS STARTED; THE
    # ORIGINAL STARTED THE RESPONSE WORKER BEFORE THESE ATTRIBUTES EXISTED
    self.responses = {}
    self.responses_lock = Lock()
    self.id_lock = Lock()
    self.next_id = 1

    self.response_worker = Thread.run("response worker", self._response_worker)
    self.worker = Worker(self.request_queue, self.response_queue, celery=self)
class StructuredLogger_usingQueue(StructuredLogger):
    """
    Logger that puts expanded messages on a Queue, to be read with pop().
    """

    def __init__(self, name=None):
        """
        :param name: optional suffix for the queue's name
        """
        parts = ["log messages to queue"]
        if name:
            parts.append(name)
        self.queue = Queue(" ".join(parts))

    def write(self, template, params):
        """Expand `template` with `params` and queue the resulting text."""
        text = expand_template(template, params)
        self.queue.add(text)

    def stop(self):
        """Close the queue."""
        self.queue.close()

    def pop(self):
        """
        Take one message off the queue and return it cleaned up.

        For every line: when characters 19..21 are " - " the first 22
        characters are dropped (presumably a timestamp prefix), then lines
        starting with "File" are skipped.

        :return: remaining lines joined with newlines, stripped
        """
        def clean(line):
            return line[22:] if line[19:22] == " - " else line

        cleaned = (clean(line) for line in self.queue.pop().split("\n"))
        kept = [line for line in cleaned if not line.strip().startswith("File")]
        return "\n".join(kept).strip()
def __init__(self, host, index, port=9200, type="log", max_size=1000, batch_size=100, kwargs=None):
    """
    Open (or create) the index that receives the logs and start the
    insert thread.

    settings ARE FOR THE ELASTICSEARCH INDEX

    :param max_size: max number of pending items in the queue
    :param batch_size: stored on self.batch_size
    :param kwargs: settings handed to Cluster and get_or_create_index
    """
    cluster = Cluster(kwargs)
    schema = mo_json.json2value(value2json(SCHEMA), leaves=True)
    self.es = cluster.get_or_create_index(
        schema=schema,
        limit_replicas=True,
        tjson=True,
        kwargs=kwargs
    )
    self.batch_size = batch_size
    self.es.add_alias(coalesce(kwargs.alias, kwargs.index))
    self.queue = Queue("debug logs to es", max=max_size, silent=True)

    # RETRY DEFAULTS ARE SET ON THE INDEX'S OWN SETTINGS
    retry = self.es.settings.retry
    self.es.settings.retry.times = coalesce(retry.times, 3)
    self.es.settings.retry.sleep = Duration(coalesce(self.es.settings.retry.sleep, MINUTE))

    Thread.run("add debug logs to es", self._insert_loop)
class StructuredLogger_usingThreadedStream(StructuredLogger):
    """
    Logger that queues entries and writes them to a stream from a thread.
    """
    # stream CAN BE AN OBJECT WITH write() METHOD, OR A STRING
    # WHICH WILL eval() TO ONE

    def __init__(self, stream):
        """
        :param stream: an object with a write() method, or a string that
            is eval()'d to get one (e.g. "sys.stdout")
        """
        assert stream

        use_UTF8 = False

        if isinstance(stream, basestring):
            if stream.startswith("sys."):
                use_UTF8 = True  # sys.* ARE OLD AND CAN NOT HANDLE unicode
            # NOTE(review): eval() of a caller-supplied string; confirm
            # callers only pass trusted configuration values
            self.stream = eval(stream)
            name = stream
        else:
            self.stream = stream
            name = "stream"

        # WRITE TO STREAMS CAN BE *REALLY* SLOW, WE WILL USE A THREAD
        from mo_threads import Queue

        if use_UTF8:
            def utf8_appender(value):
                if isinstance(value, unicode):
                    value = value.encode('utf8')
                self.stream.write(value)

            appender = utf8_appender
        else:
            appender = self.stream.write

        self.queue = Queue("queue for " + self.__class__.__name__ + "(" + name + ")", max=10000, silent=True)
        self.thread = Thread("log to " + self.__class__.__name__ + "(" + name + ")", time_delta_pusher, appender=appender, queue=self.queue, interval=0.3)
        self.thread.parent.remove_child(self.thread)  # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
        self.thread.start()

    def write(self, template, params):
        """
        Queue one log entry.

        :return: self
        :raises: whatever the queue raised
        """
        try:
            self.queue.add({"template": template, "params": params})
            return self
        except Exception as e:
            raise e  # OH NO!

    def stop(self):
        """
        Ask the thread to stop and wait for it, then close the queue.
        Failures are re-raised only when DEBUG_LOGGING is set.
        """
        try:
            self.queue.add(THREAD_STOP)  # BE PATIENT, LET REST OF MESSAGE BE SENT
            self.thread.join()
        except Exception as e:
            if DEBUG_LOGGING:
                raise e

        try:
            self.queue.close()
        # FIX: `except Exception, f` IS PYTHON-2-ONLY SYNTAX; THE `as` FORM
        # WORKS ON 2.6+ AND 3, AND MATCHES THE HANDLER ABOVE
        except Exception as f:
            if DEBUG_LOGGING:
                raise f
class StructuredLogger_usingThread(StructuredLogger):
    """
    WRAP ANOTHER StructuredLogger: write() ONLY QUEUES THE MESSAGE, AND A
    DEDICATED THREAD PASSES THE QUEUED MESSAGES TO THE WRAPPED LOGGER
    """

    def __init__(self, logger):
        """
        :param logger: THE StructuredLogger THAT WILL RECEIVE THE MESSAGES
        """
        if not isinstance(logger, StructuredLogger):
            Log.error("Expecting a StructuredLogger")

        class_name = self.__class__.__name__
        self.queue = Queue("Queue for " + class_name, max=10000, silent=True, allow_add_after_close=True)
        self.logger = logger

        def drain(logger, please_stop):
            # EMPTY THE QUEUE INTO THE WRAPPED LOGGER, UNTIL TOLD TO STOP
            try:
                while not please_stop:
                    batch = self.queue.pop_all()
                    if batch:
                        for message in batch:
                            if message is THREAD_STOP:
                                please_stop.go()
                            else:
                                logger.write(**message)
                    else:
                        # NOTHING QUEUED: WAIT A SECOND, OR UNTIL STOPPED
                        (Till(seconds=1) | please_stop).wait()
            except Exception as cause:
                print("problem in " + StructuredLogger_usingThread.__name__ + ": " + str(cause))
            finally:
                Log.note("stop the child")
                logger.stop()

        self.thread = Thread("Thread for " + class_name, drain, logger)
        self.thread.parent.remove_child(self.thread)  # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
        self.thread.start()

    def write(self, template, params):
        """
        QUEUE THE MESSAGE FOR THE THREAD

        :return: self
        """
        message = {"template": template, "params": params}
        try:
            self.queue.add(message)
            return self
        except Exception as cause:
            raise Except.wrap(cause)  # OH NO!

    def stop(self):
        """
        SEND THREAD_STOP THROUGH THE QUEUE, WAIT FOR THE THREAD, THEN CLOSE THE QUEUE
        """
        try:
            self.queue.add(THREAD_STOP)  # BE PATIENT, LET REST OF MESSAGE BE SENT
            self.thread.join()
            Log.note("joined on thread")
        except Exception as cause:
            Log.note("problem in threaded logger" + str(cause))

        with suppress_exception:
            self.queue.close()
def __init__(self, host, index, sql_file='metadata.sqlite', alias=None, name=None, port=9200, kwargs=None):
    """
    SET UP THE METADATA CONTAINERS AND START THE THREAD THAT REFRESHES THEM
    DOES NOTHING WHEN THIS INSTANCE ALREADY HAS settings

    :param kwargs: ALL THE PARAMETERS; KEPT AS self.settings AND USED TO MAKE THE CLUSTER
    """
    already_initialized = hasattr(self, "settings")
    if already_initialized:
        return

    self.too_old = TOO_OLD
    self.settings = kwargs
    self.default_name = coalesce(name, alias, index)
    self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)
    self.index_does_not_exist = set()
    self.todo = Queue("refresh metadata", max=100000, unique=True)

    self.index_to_alias = Relation_usingList()

    self.es_metadata = Null
    self.metadata_last_updated = Date.now() - OLD_METADATA

    self.meta = Data()
    self.meta.columns = ColumnList()

    root_only = ['.']
    self.alias_to_query_paths = {"meta.columns": [root_only], "meta.tables": [['.']]}
    self.alias_last_updated = {"meta.columns": Date.now(), "meta.tables": Date.now()}

    table_columns = metadata_tables()
    no_tables = [
        # TableDesc("meta.columns", None, ".", Date.now()),
        # TableDesc("meta.tables", None, ".", Date.now())
    ]
    tables_schema = jx_base.Schema(".", table_columns)
    self.meta.tables = ListContainer("meta.tables", no_tables, tables_schema)
    self.meta.columns.extend(table_columns)

    # TODO: fix monitor so it does not bring down ES
    worker_loop = self.monitor if ENABLE_META_SCAN else self.not_monitor
    self.worker = Thread.run("refresh metadata", worker_loop)
def stop(cls):
    """
    WRITE ANY PROFILING RESULTS, STOP THE MAIN LOG, AND REPLACE IT WITH A
    StructuredLogger_usingStream ON sys.stdout

    PROFILES ARE ONLY WRITTEN WHEN cls HAS settings
    """
    from mo_logs import profiles

    if cls.cprofiler and hasattr(cls, "settings"):
        # FIX: THIS USED TO TEST `cls.cprofiler == None`, WHICH CAN NOT BE TRUE
        # INSIDE A BRANCH THAT REQUIRES cls.cprofiler; THE QUEUE IS WHAT MAY BE MISSING
        if getattr(cls, "cprofiler_stats", None) == None:
            from mo_threads import Queue

            cls.cprofiler_stats = Queue("cprofiler stats")  # ACCUMULATION OF STATS FROM ALL THREADS

        import pstats

        cls.cprofiler_stats.add(pstats.Stats(cls.cprofiler))
        write_profile(cls.settings.cprofile, cls.cprofiler_stats.pop_all())

    if profiles.ON and hasattr(cls, "settings"):
        profiles.write(cls.settings.profile)

    cls.main_log.stop()
    cls.main_log = StructuredLogger_usingStream(sys.stdout)
class StructuredLogger_usingThreadedStream(StructuredLogger):
    """
    LOG TO A STREAM, USING A THREAD: write() ONLY QUEUES THE MESSAGE, AND A
    SEPARATE THREAD (RUNNING time_delta_pusher) IS GIVEN THE QUEUE AND THE
    FUNCTION THAT WRITES utf8 BYTES TO THE STREAM

    stream CAN BE AN OBJECT WITH write() METHOD, OR A STRING WHICH WILL eval() TO ONE
    """

    def __init__(self, stream):
        assert stream

        if not isinstance(stream, text_type):
            name = "stream"
            self.stream = stream
        else:
            name = stream
            stream = self.stream = eval(stream)
            if name.startswith("sys.") and PY3:
                # GIVE THE sys.* STREAM TEXT, NOT THE BYTES MADE BY THE APPENDER
                def decode_then_write(d):
                    return stream.write(d.decode('utf8'))

                self.stream = Data(write=decode_then_write)

        # WRITE TO STREAMS CAN BE *REALLY* SLOW, WE WILL USE A THREAD
        from mo_threads import Queue

        def write_utf8(value):
            # TEXT IS ENCODED; ANYTHING ELSE IS WRITTEN AS GIVEN
            if isinstance(value, text_type):
                value = value.encode('utf8')
            self.stream.write(value)

        label = self.__class__.__name__ + "(" + name + ")"
        self.queue = Queue("queue for " + label, max=10000, silent=True)
        self.thread = Thread(
            "log to " + label,
            time_delta_pusher,
            appender=write_utf8,
            queue=self.queue,
            interval=0.3
        )
        self.thread.parent.remove_child(self.thread)  # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
        self.thread.start()

    def write(self, template, params):
        """
        QUEUE THE MESSAGE FOR THE THREAD

        :return: self
        """
        message = {"template": template, "params": params}
        try:
            self.queue.add(message)
            return self
        except Exception as cause:
            raise cause  # OH NO!

    def stop(self):
        """
        ASK THE THREAD TO STOP, WAIT FOR IT, THEN CLOSE THE QUEUE
        PROBLEMS ARE ONLY RAISED WHEN DEBUG_LOGGING IS SET
        """
        try:
            self.queue.add(THREAD_STOP)  # BE PATIENT, LET REST OF MESSAGE BE SENT
            self.thread.join()
        except Exception as cause:
            if DEBUG_LOGGING:
                raise cause

        try:
            self.queue.close()
        except Exception as cause:
            if DEBUG_LOGGING:
                raise cause
def __init__(self, name=None):
    """
    MAKE THE QUEUE THAT WILL HOLD THE LOG MESSAGES

    :param name: OPTIONAL, APPENDED TO THE NAME OF THE QUEUE
    """
    suffix = " " + name if name else ""
    self.queue = Queue("log messages to queue" + suffix)
class ElasticsearchMetadata(Namespace):
    """
    MANAGE SNOWFLAKE SCHEMAS FOR EACH OF THE ALIASES FOUND IN THE CLUSTER

    COLUMN METADATA IS KEPT IN self.meta.columns, TABLE DESCRIPTIONS IN
    self.meta.tables, AND A WORKER THREAD (monitor OR not_monitor) CONSUMES
    self.todo TO KEEP COLUMN STATISTICS UP TO DATE
    """

    @override
    def __new__(cls, kwargs, *args, **_kwargs):
        """
        RETURN THE INSTANCE ALREADY REGISTERED IN known_clusters FOR THIS
        CLUSTER (KEYED BY id() OF THE Cluster OBJECT), OR MAKE AND REGISTER ONE
        """
        es_cluster = elasticsearch.Cluster(kwargs)
        output = known_clusters.get(id(es_cluster))
        if output is None:
            output = object.__new__(cls)
            known_clusters[id(es_cluster)] = output
        return output

    @override
    def __init__(self, host, index, sql_file='metadata.sqlite', alias=None, name=None, port=9200, kwargs=None):
        """
        SET UP THE METADATA CONTAINERS AND START THE WORKER THREAD
        DOES NOTHING WHEN THIS INSTANCE ALREADY HAS settings (__new__ CAN
        RETURN AN EXISTING INSTANCE)
        """
        if hasattr(self, "settings"):
            return

        self.too_old = TOO_OLD
        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)
        self.index_does_not_exist = set()  # es_index NAMES FOUND MISSING BY _update_cardinality
        self.todo = Queue("refresh metadata", max=100000, unique=True)  # COLUMNS WAITING FOR THE WORKER

        self.index_to_alias = {}  # FILLED BY _find_alias

        self.es_metadata = Null
        self.metadata_last_updated = Date.now() - OLD_METADATA

        self.meta = Data()
        self.meta.columns = ColumnList(URL(self.es_cluster.settings.host).host)

        self.alias_to_query_paths = {
            "meta.columns": [ROOT_PATH],
            "meta.tables": [ROOT_PATH]
        }
        self.alias_last_updated = {
            "meta.columns": Date.now(),
            "meta.tables": Date.now()
        }
        table_columns = metadata_tables()
        self.meta.tables = ListContainer(
            "meta.tables",
            [],
            jx_base.Schema(".", table_columns)
        )
        self.meta.columns.extend(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("not refresh metadata", self.not_monitor)
        return

    @property
    def namespace(self):
        """
        THE namespace OF THE COLUMN LIST
        """
        return self.meta.columns.namespace

    @property
    def url(self):
        """
        CLUSTER URL EXTENDED WITH default_name (DOTS BECOME SLASHES)
        """
        return self.es_cluster.url / self.default_name.replace(".", "/")

    def _reload_columns(self, table_desc):
        """
        GET THE MAPPINGS FROM ES AND (RE)REGISTER THE COLUMNS OF ONE TABLE

        :param table_desc: TABLE DESCRIPTION; ITS name IS USED AS THE ALIAS
                           (A REAL ALIAS, OR NAME OF INDEX THAT HAS NO ALIAS),
                           AND ITS timestamp IS SET TO THE CLUSTER'S metatdata_last_updated
        :return: THE COLUMNS, AS RETURNED BY _parse_properties
        """
        # FIND ALL INDEXES OF ALIAS
        es_last_updated = self.es_cluster.metatdata_last_updated

        alias = table_desc.name
        canonical_index = self.es_cluster.get_best_matching_index(alias).index
        es_metadata_update_required = not (table_desc.timestamp < es_last_updated)
        metadata = self.es_cluster.get_metadata(force=es_metadata_update_required)

        props = [
            (self.es_cluster.get_index(index=i, type=t, debug=DEBUG), t, m.properties)
            for i, d in metadata.indices.items()
            if alias in d.aliases
            for t, m in [_get_best_type_from_mapping(d.mappings)]
        ]

        # CONFIRM ALL COLUMNS ARE SAME, FIX IF NOT
        dirty = False
        all_comparisions = list(jx.pairwise(props)) + list(jx.pairwise(jx.reverse(props)))
        # NOTICE THE SAME (index, type, properties) TRIPLE FROM ABOVE
        for (i1, t1, p1), (i2, t2, p2) in all_comparisions:
            diff = elasticsearch.diff_schema(p2, p1)
            if not self.settings.read_only:
                for d in diff:
                    dirty = True
                    i1.add_property(*d)
        # FORCE A METADATA RELOAD ONLY IF PROPERTIES WERE ADDED ABOVE
        meta = self.es_cluster.get_metadata(force=dirty).indices[canonical_index]

        data_type, mapping = _get_best_type_from_mapping(meta.mappings)
        mapping.properties["_id"] = {"type": "string", "index": "not_analyzed"}
        columns = self._parse_properties(alias, mapping)
        table_desc.timestamp = es_last_updated
        return columns

    def _parse_properties(self, alias, mapping):
        """
        TURN AN ES MAPPING INTO COLUMNS, RECORD THE QUERY PATHS FOR THE ALIAS,
        REGISTER THE COLUMNS IN self.meta.columns AND QUEUE THEM ON self.todo

        :param alias: NAME THE COLUMNS ARE REGISTERED UNDER
        :param mapping: ES MAPPING; ONLY mapping.properties IS READ
        :return: LIST OF WHAT self.meta.columns.add() RETURNED FOR EACH COLUMN
        """
        abs_columns = elasticsearch.parse_properties(alias, ".", ROOT_PATH, mapping.properties)
        if DEBUG and any(c.cardinality == 0 and c.name != '_id' for c in abs_columns):
            Log.warning(
                "Some columns are not stored in {{url}} {{index|quote}} table:\n{{names}}",
                url=self.es_cluster.url,
                index=alias,
                names=[
                    ".".join((c.es_index, c.name))
                    for c in abs_columns
                    if c.cardinality == 0
                ]
            )

        with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, silent=not DEBUG):
            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns if c.es_type == "nested"]
            # INSERT EACH NESTED PATH INTO THE LISTS OF THE PATHS IT IS A PREFIX OF
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(".")
            query_paths.append(ROOT_PATH)

            # ENSURE ALL TABLES HAVE THE QUERY PATHS SET
            self.alias_to_query_paths[alias] = query_paths
            for i, a in self.index_to_alias.items():
                if a == alias:
                    self.alias_to_query_paths[i] = query_paths

            # ENSURE COLUMN HAS CORRECT jx_type
            # PICK DEEPEST NESTED PROPERTY AS REPRESENTATIVE
            output = []
            best = {}  # MAP FROM unnest_path(name) TO THE COLUMN KEPT FOR THAT NAME
            for abs_column in abs_columns:
                abs_column.jx_type = jx_type(abs_column)
                if abs_column.jx_type not in STRUCT:
                    clean_name = unnest_path(abs_column.name)
                    other = best.get(clean_name)
                    if other:
                        if len(other.nested_path) < len(abs_column.nested_path):
                            # THIS COLUMN IS DEEPER: DROP THE SHALLOWER ONE
                            output.remove(other)
                            self.meta.columns.update({"clear": ".", "where": {"eq": {"es_column": other.es_column, "es_index": other.es_index}}})
                        else:
                            continue
                    best[clean_name] = abs_column
                output.append(abs_column)

            # REGISTER ALL COLUMNS
            canonicals = []
            for abs_column in output:
                canonical = self.meta.columns.add(abs_column)
                canonicals.append(canonical)

            self.todo.extend(canonicals)
            return canonicals

    def query(self, _query):
        """
        QUERY THE COLUMN METADATA; "from" AND "sort" ARE DEFAULTS THAT _query CAN NOT REMOVE
        NOTE(review): PRECEDENCE IS DECIDED BY set_default, WHICH IS NOT VISIBLE HERE
        """
        return self.meta.columns.query(QueryOp(set_default(
            {
                "from": self.meta.columns,
                "sort": ["table", "name"]
            },
            _query.__data__()
        )))

    def _find_alias(self, name):
        """
        :param name: AN ALIAS, OR AN INDEX NAME
        :return: name WHEN IT IS A KNOWN ALIAS, OTHERWISE THE ALIAS RECORDED FOR
                 THE INDEX (None WHEN NOT FOUND)
        """
        # NOTE(review): metadata_last_updated IS NOT REASSIGNED ANYWHERE IN THIS
        # CLASS, SO THIS REFRESH LOOKS LIKE IT RUNS ON EVERY CALL - CONFIRM INTENT
        if self.metadata_last_updated < self.es_cluster.metatdata_last_updated:
            for a in self.es_cluster.get_aliases():
                self.index_to_alias[a.index] = coalesce(a.alias, a.index)
                self.alias_last_updated.setdefault(a.alias, Date.MIN)
        if name in self.alias_last_updated:
            return name
        else:
            return self.index_to_alias.get(name)

    def get_columns(self, table_name, column_name=None, after=None, timeout=None):
        """
        RETURN METADATA COLUMNS

        :param table_name: TABLE WE WANT COLUMNS FOR
        :param column_name: OPTIONAL NAME, IF INTERESTED IN ONLY ONE COLUMN
        :param after: FORCE LOAD, WAITING FOR last_updated TO BE AFTER THIS TIME
        :param timeout: Signal; True when should give up
        :return: COLUMNS SORTED BY name
        """
        DEBUG and after and Log.note("getting columns for after {{time}}", time=after)
        table_path = split_field(table_name)
        root_table_name = table_path[0]

        alias = self._find_alias(root_table_name)
        if not alias:
            # NOT FOUND: FORCE A METADATA RELOAD AND TRY ONCE MORE
            self.es_cluster.get_metadata(force=True)
            alias = self._find_alias(root_table_name)
            if not alias:
                Log.error("{{table|quote}} does not exist", table=table_name)

        try:
            table = self.get_table(alias)[0]
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = TableDesc(
                    name=alias,
                    url=None,
                    query_path=["."],
                    timestamp=Date.MIN
                )
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                columns = self._reload_columns(table)
                DEBUG and Log.note("columns from reload")
            elif after or table.timestamp < self.es_cluster.metatdata_last_updated:
                columns = self._reload_columns(table)
                DEBUG and Log.note("columns from reload")
            else:
                columns = self.meta.columns.find(alias, column_name)
                DEBUG and Log.note("columns from find()")

            DEBUG and Log.note("columns are {{ids}}", ids=[id(c) for c in columns])

            columns = jx.sort(columns, "name")

            if after is None:
                return columns  # DO NOT WAIT FOR COMPLETE COLUMNS

            # WAIT FOR THE COLUMNS TO UPDATE
            while True:
                pending = [c for c in columns if after >= c.last_updated or (c.cardinality == None and c.jx_type not in STRUCT)]
                if not pending:
                    break
                if timeout:
                    Log.error("trying to gets columns timed out")
                if DEBUG:
                    if len(pending) > 10:
                        Log.note("waiting for {{num}} columns to update by {{timestamp}}", num=len(pending), timestamp=after)
                    else:
                        Log.note("waiting for columns to update by {{timestamp}}; {{columns|json}}", timestamp=after, columns=[c.es_index + "." + c.es_column + " id="+text_type(id(c)) for c in pending])
                Till(seconds=1).wait()
            return columns
        except Exception as e:
            Log.error("Failure to get columns for {{table}}", table=table_name, cause=e)

        return []

    def _update_cardinality(self, column):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN

        THE RESULT IS WRITTEN TO self.meta.columns; NOTHING IS RETURNED
        partitions ARE ONLY RECORDED WHEN THE COLUMN HAS FEW ENOUGH DISTINCT VALUES
        """
        now = Date.now()
        if column.es_index in self.index_does_not_exist:
            return

        if column.jx_type in STRUCT:
            Log.error("not supported")
        try:
            # THE TWO METADATA TABLES ARE IN MEMORY, NO NEED TO ASK ES
            if column.es_index == "meta.columns":
                partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.columns, column.es_column) if g[column.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.columns),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": now
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            if column.es_index == "meta.tables":
                partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.tables, column.es_column) if g[column.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.tables),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": now
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return

            es_index = column.es_index.split(".")[0]

            is_text = [cc for cc in self.meta.columns if cc.es_column == column.es_column and cc.es_type == "text"]
            if is_text:
                # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": {"filter": {"match_all": {}}}
                    },
                    "size": 0
                })
                count = result.hits.total
                # AT LEAST 1001, SO THE "cardinality > 1000" TEST BELOW APPLIES
                cardinality = max(1001, count)
                multi = 1001
            elif column.es_column == "_id":
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "query": {"match_all": {}},
                    "size": 0
                })
                count = cardinality = result.hits.total
                multi = 1
            elif column.es_type == BOOLEAN:
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": _counting_query(column)
                    },
                    "size": 0
                })
                count = result.hits.total
                cardinality = 2

                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                # NOTE(review): partitions IS BOTH SET AND CLEARED IN THIS UPDATE;
                # WHICH ONE WINS DEPENDS ON ColumnList.update - CONFIRM INTENT
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "partitions": [False, True],
                        "multi": 1,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            else:
                # "multi" IS THE MAXIMUM NUMBER OF VALUES FOUND IN ONE DOCUMENT, AMONG
                # DOCUMENTS FROM THE LAST WEEK, OR WITHOUT ANY etl.timestamp
                es_query = {
                    "aggs": {
                        "count": _counting_query(column),
                        "_filter": {
                            "aggs": {"multi": {"max": {"script": "doc[" + quote(column.es_column) + "].values.size()"}}},
                            "filter": {"bool": {"should": [
                                {"range": {"etl.timestamp.~n~": {"gte": (Date.today() - WEEK)}}},
                                {"bool": {"must_not": {"exists": {"field": "etl.timestamp.~n~"}}}}
                            ]}}
                        }
                    },
                    "size": 0
                }

                result = self.es_cluster.post("/" + es_index + "/_search", data=es_query)
                agg_results = result.aggregations
                count = result.hits.total
                cardinality = coalesce(agg_results.count.value, agg_results.count._nested.value, agg_results.count.doc_count)
                multi = int(coalesce(agg_results._filter.multi.value, 1))
                if cardinality == None:
                    Log.error("logic error")

            query = Data(size=0)

            if column.es_column == "_id":
                self.meta.columns.update({
                    "set": {
                        "count": cardinality,
                        "cardinality": cardinality,
                        "multi": 1,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
                # TOO MANY DISTINCT VALUES (OR NEARLY ALL DISTINCT): DO NOT KEEP PARTITIONS
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                # NUMERIC COLUMNS GET PARTITIONS ONLY WHEN THEY HAVE 30 OR FEWER VALUES
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif len(column.nested_path) != 1:
                query.aggs["_"] = {
                    "nested": {"path": column.nested_path[0]},
                    "aggs": {"_nested": {"terms": {"field": column.es_column}}}
                }
            elif cardinality == 0:  # WHEN DOES THIS HAPPEN?
                query.aggs["_"] = {"terms": {"field": column.es_column}}
            else:
                query.aggs["_"] = {"terms": {"field": column.es_column, "size": cardinality}}

            # SECOND REQUEST: GET THE DISTINCT VALUES THEMSELVES
            result = self.es_cluster.post("/" + es_index + "/_search", data=query)

            aggs = result.aggregations._
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            DEBUG and Log.note("update metadata for {{column.es_index}}.{{column.es_column}} (id={{id}}) at {{time}}", id=id(column), column=column, time=now)
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "partitions": parts,
                    "last_updated": now
                },
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
        except Exception as e:
            # CAN NOT IMPORT: THE TEST MODULES SETS UP LOGGING
            # from tests.test_jx import TEST_TABLE
            e = Except.wrap(e)
            TEST_TABLE = "testdata"
            is_missing_index = any(w in e for w in ["IndexMissingException", "index_not_found_exception"])
            # NOTE(review): is_test_table IS ASSIGNED BUT NOT READ IN THIS METHOD
            is_test_table = column.es_index.startswith((TEST_TABLE_PREFIX, TEST_TABLE))
            if is_missing_index:
                # WE EXPECT TEST TABLES TO DISAPPEAR
                Log.warning("Missing index {{col.es_index}}", col=column, cause=e)
                self.meta.columns.update({
                    "clear": ".",
                    "where": {"eq": {"es_index": column.es_index}}
                })
                self.index_does_not_exist.add(column.es_index)
            elif "No field found for" in e:
                self.meta.columns.update({
                    "clear": ".",
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                Log.warning("Could not get column {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)
            else:
                # UNKNOWN PROBLEM: FORGET THE STATISTICS, BUT MARK THE COLUMN AS VISITED
                self.meta.columns.update({
                    "set": {
                        "last_updated": now
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)

    def monitor(self, please_stop):
        """
        WORKER LOOP USED WHEN ENABLE_META_SCAN IS SET: TAKE COLUMNS FROM
        self.todo AND UPDATE THEIR CARDINALITY; WHEN self.todo IS EMPTY, REFILL
        IT WITH COLUMNS THAT ARE OLD OR HAVE NO CARDINALITY

        :param please_stop: SIGNAL TO STOP THE LOOP
        """
        # WAKE THE BLOCKED pop() BELOW WHEN ASKED TO STOP
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    old_columns = [
                        c
                        for c in self.meta.columns
                        if ((c.last_updated < Date.now() - MAX_COLUMN_METADATA_AGE) or c.cardinality == None) and c.jx_type not in STRUCT
                    ]
                    if old_columns:
                        DEBUG and Log.note(
                            "Old columns {{names|json}} last updated {{dates|json}}",
                            names=wrap(old_columns).es_column,
                            dates=[Date(t).format() for t in wrap(old_columns).last_updated]
                        )
                        self.todo.extend(old_columns)
                    else:
                        DEBUG and Log.note("no more metatdata to update")

                column = self.todo.pop(Till(seconds=(10*MINUTE).seconds))
                if column:
                    if column is THREAD_STOP:
                        continue  # THE while CONDITION WILL END THE LOOP

                    with Timer("update {{table}}.{{column}}", param={"table": column.es_index, "column": column.es_column}, silent=not DEBUG):
                        if column.es_index in self.index_does_not_exist:
                            DEBUG and Log.note("{{column.es_column}} does not exist", column=column)
                            self.meta.columns.update({
                                "clear": ".",
                                "where": {"eq": {"es_index": column.es_index}}
                            })
                            continue
                        if column.jx_type in STRUCT or split_field(column.es_column)[-1] == EXISTS_TYPE:
                            DEBUG and Log.note("{{column.es_column}} is a struct", column=column)
                            column.last_updated = Date.now()
                            continue
                        elif column.last_updated > Date.now() - TOO_OLD and column.cardinality is not None:
                            # DO NOT UPDATE FRESH COLUMN METADATA
                            DEBUG and Log.note("{{column.es_column}} is still fresh ({{ago}} ago)", column=column, ago=(Date.now()-Date(column.last_updated)).seconds)
                            continue
                        try:
                            self._update_cardinality(column)
                            (DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX)) and Log.note("updated {{column.name}}", column=column)
                        except Exception as e:
                            if '"status":404' in e:
                                self.meta.columns.update({
                                    "clear": ".",
                                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                                })
                            else:
                                Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)

    def not_monitor(self, please_stop):
        """
        WORKER LOOP USED WHEN ENABLE_META_SCAN IS NOT SET: ONLY build.type AND
        run.type GET THEIR CARDINALITY UPDATED, ALL OTHER COLUMNS ARE JUST
        MARKED AS UPDATED

        :param please_stop: SIGNAL TO STOP THE LOOP
        """
        Log.alert("metadata scan has been disabled")
        # WAKE THE BLOCKED pop() BELOW WHEN ASKED TO STOP
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            column = self.todo.pop()
            if column == THREAD_STOP:
                break

            if column.jx_type in STRUCT or split_field(column.es_column)[-1] == EXISTS_TYPE:
                DEBUG and Log.note("{{column.es_column}} is a struct", column=column)
                column.last_updated = Date.now()
                continue
            elif column.last_updated > Date.now() - TOO_OLD and column.cardinality is not None:
                # DO NOT UPDATE FRESH COLUMN METADATA
                DEBUG and Log.note("{{column.es_column}} is still fresh ({{ago}} ago)", column=column, ago=(Date.now()-Date(column.last_updated)).seconds)
                continue

            with Timer("Update {{col.es_index}}.{{col.es_column}}", param={"col": column}, silent=not DEBUG, too_long=0.05):
                if untype_path(column.name) in ["build.type", "run.type"]:
                    try:
                        self._update_cardinality(column)
                    except Exception as e:
                        Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
                else:
                    column.last_updated = Date.now()

    def get_table(self, name):
        """
        :return: THE COLUMN LIST FOR "meta.columns", OTHERWISE THE (WRAPPED) LIST
                 OF TABLE DESCRIPTIONS WITH THE GIVEN name
        """
        if name == "meta.columns":
            return self.meta.columns

        with self.meta.tables.locker:
            return wrap([t for t in self.meta.tables.data if t.name == name])

    def get_snowflake(self, fact_table_name):
        """
        :return: A Snowflake FOR THE NAMED FACT TABLE, BACKED BY THIS NAMESPACE
        """
        return Snowflake(fact_table_name, self)

    def get_schema(self, name):
        """
        :param name: "meta.columns", "meta.tables", OR A (POSSIBLY NESTED) TABLE NAME
        :return: THE SCHEMA FOR THE NAMED TABLE
        """
        if name == "meta.columns":
            return self.meta.columns.schema
        if name == "meta.tables":
            # NOTE(review): RETURNS THE CONTAINER, NOT ITS .schema LIKE THE BRANCH ABOVE - CONFIRM
            return self.meta.tables
        root, rest = tail_field(name)
        return self.get_snowflake(root).get_schema(rest)
class FromESMetadata(Schema):
    """
    QUERY THE METADATA

    ONE INSTANCE IS SHARED (SEE __new__); COLUMN METADATA IS KEPT IN
    self.meta.columns, AND A WORKER THREAD (monitor OR not_monitor) CONSUMES
    self.todo
    """

    def __new__(cls, *args, **kwargs):
        """
        RETURN THE MODULE-LEVEL singlton, MAKING IT ON FIRST USE
        """
        global singlton
        if singlton:
            return singlton
        else:
            singlton = object.__new__(cls)
            return singlton

    @override
    def __init__(self, host, index, alias=None, name=None, port=9200, kwargs=None):
        """
        SET UP THE METADATA CONTAINERS AND START THE WORKER THREAD
        DOES NOTHING WHEN THIS INSTANCE ALREADY HAS settings (__new__ RETURNS
        THE SAME INSTANCE EVERY TIME)
        """
        global _elasticsearch
        if hasattr(self, "settings"):
            return

        # IMPORTED HERE, NOT AT MODULE LEVEL; _elasticsearch IS PUBLISHED AS A GLOBAL
        from pyLibrary.queries.containers.list_usingPythonList import ListContainer
        from pyLibrary.env import elasticsearch as _elasticsearch

        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.default_es = _elasticsearch.Cluster(kwargs=kwargs)
        self.todo = Queue("refresh metadata", max=100000, unique=True)  # COLUMNS WAITING FOR THE WORKER

        self.es_metadata = Null
        self.last_es_metadata = Date.now()-OLD_METADATA

        self.meta=Data()
        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.meta.tables = ListContainer("meta.tables", [], wrap({c.names["."]: c for c in table_columns}))
        self.meta.columns = ColumnList()
        self.meta.columns.insert(column_columns)
        self.meta.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return

    @property
    def query_path(self):
        """
        ALWAYS None
        """
        return None

    @property
    def url(self):
        """
        CLUSTER PATH FOLLOWED BY default_name (DOTS BECOME SLASHES)
        """
        return self.default_es.path + "/" + self.default_name.replace(".", "/")

    def get_table(self, table_name):
        """
        :return: (WRAPPED) LIST OF TABLES WITH THE GIVEN NAME
        """
        with self.meta.tables.locker:
            return wrap([t for t in self.meta.tables.data if t.name == table_name])

    def _upsert_column(self, c):
        """
        ADD c TO self.meta.columns, OR COPY ITS VALUES OVER THE EXISTING COLUMN;
        EITHER WAY THE REGISTERED COLUMN IS QUEUED ON self.todo
        """
        # ASSUMING THE  self.meta.columns.locker IS HAD
        existing_columns = self.meta.columns.find(c.es_index, c.names["."])
        if not existing_columns:
            self.meta.columns.add(c)
            self.todo.add(c)

            if ENABLE_META_SCAN:
                if DEBUG:
                    Log.note("todo: {{table}}::{{column}}", table=c.es_index, column=c.es_column)
                # MARK meta.columns AS DIRTY TOO
                cols = self.meta.columns.find("meta.columns", None)
                for cc in cols:
                    cc.partitions = cc.cardinality = None
                    cc.last_updated = Date.now()
                self.todo.extend(cols)
        else:
            canonical = existing_columns[0]
            if canonical is not c:
                # KEEP THE EXISTING OBJECT, BUT GIVE IT THE NEW VALUES
                set_default(c.names, canonical.names)
                for key in Column.__slots__:
                    canonical[key] = c[key]
            if DEBUG:
                Log.note("todo: {{table}}::{{column}}", table=canonical.es_index, column=canonical.es_column)
            self.todo.add(canonical)

    def _get_columns(self, table=None):
        """
        PARSE THE ES MAPPINGS OF THE INDEX NAMED BY THE FIRST STEP OF table,
        RELOADING THE CLUSTER METADATA WHEN IT IS MISSING OR OLD
        """
        # TODO: HANDLE MORE THEN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
        table_path = split_field(table)
        es_index = table_path[0]
        query_path = join_field(table_path[1:])
        meta = self.es_metadata.indices[es_index]
        # NOTE(review): last_es_metadata IS ONLY ASSIGNED IN __init__ WITHIN THIS CLASS
        if not meta or self.last_es_metadata < Date.now() - OLD_METADATA:
            self.es_metadata = self.default_es.get_metadata(force=True)
            meta = self.es_metadata.indices[es_index]

        for _, properties in meta.mappings.items():
            properties.properties["_id"] = {"type": "string", "index": "not_analyzed"}
            self._parse_properties(meta.index, properties, meta)

    def _parse_properties(self, abs_index, properties, meta):
        """
        TURN THE PROPERTIES OF ONE MAPPING INTO COLUMNS, AND UPSERT EACH COLUMN
        ONCE FOR EVERY QUERY PATH, FOR THE INDEX AND FOR EACH OF meta.aliases
        """
        # IT IS IMPORTANT THAT NESTED PROPERTIES NAME ALL COLUMNS, AND
        # ALL COLUMNS ARE GIVEN NAMES FOR ALL NESTED PROPERTIES
        abs_columns = _elasticsearch.parse_properties(abs_index, None, properties.properties)
        abs_columns = abs_columns.filter(  # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED
            lambda r: not r.es_column.startswith("other.") and
                      not r.es_column.startswith("previous_values.cf_") and
                      not r.es_index.startswith("debug") and
                      r.es_column.find("=") == -1 and
                      r.es_column.find(" ") == -1
        )

        def add_column(c, query_path):
            # GIVE c A NAME RELATIVE TO THE QUERY PATH, THEN UPSERT IT FOR THE INDEX AND ALL ALIASES
            c.last_updated = Date.now()
            if query_path[0] != ".":
                c.names[query_path[0]] = relative_field(c.names["."], query_path[0])

            with self.meta.columns.locker:
                self._upsert_column(c)
                for alias in meta.aliases:
                    c = copy(c)
                    c.es_index = alias
                    self._upsert_column(c)

        with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, debug=DEBUG):
            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns if c.type == "nested"]
            # INSERT EACH NESTED PATH INTO THE LISTS OF THE PATHS IT IS A PREFIX OF
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(".")
            query_paths.append(ROOT_PATH)

            # ADD RELATIVE COLUMNS
            for abs_column in abs_columns:
                for query_path in query_paths:
                    add_column(abs_column, query_path)

    def query(self, _query):
        """
        QUERY THE COLUMN METADATA; "from" AND "sort" ARE COMBINED WITH _query BY set_default
        """
        return self.meta.columns.query(QueryOp(set_default(
            {
                "from": self.meta.columns,
                "sort": ["table", "name"]
            },
            _query.__data__()
        )))

    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS

        :param table_name: TABLE WE WANT COLUMNS FOR; ONLY THE FIRST STEP NAMES THE INDEX
        :param column_name: OPTIONAL NAME, IF INTERESTED IN ONLY ONE COLUMN
        :param force: RELOAD THE COLUMNS, EVEN IF THE TABLE WAS LOADED RECENTLY
        :return: THE COLUMNS, SORTED; AN ERROR IS LOGGED WHEN NONE ARE FOUND
        """
        table_path = split_field(table_name)
        es_index_name = table_path[0]
        query_path = join_field(table_path[1:])
        table = self.get_table(es_index_name)[0]
        abs_column_name = None if column_name == None else concat_field(query_path, column_name)

        try:
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = Table(
                    name=es_index_name,
                    url=None,
                    query_path=None,
                    timestamp=Date.now()
                )
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._get_columns(table=es_index_name)
            elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
                table.timestamp = Date.now()
                self._get_columns(table=es_index_name)

            with self.meta.columns.locker:
                columns = self.meta.columns.find(es_index_name, column_name)
            if columns:
                # NOTE(review): "\." IS NOT A RECOGNIZED ESCAPE IN A PLAIN STRING; A RAW STRING WOULD BE CLEARER
                columns = jx.sort(columns, "names.\.")
                # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                while len(self.todo) and not all(columns.get("last_updated")):
                    if DEBUG:
                        Log.note("waiting for columns to update {{columns|json}}", columns=[c.es_index+"."+c.es_column for c in columns if not c.last_updated])
                    Till(seconds=1).wait()
                return columns
        except Exception as e:
            Log.error("Not expected", cause=e)

        # REACHED ONLY WHEN NO COLUMNS WERE FOUND
        if abs_column_name:
            Log.error("no columns matching {{table}}.{{column}}", table=table_name, column=abs_column_name)
        else:
            self._get_columns(table=table_name)  # TO TEST WHAT HAPPENED
            Log.error("no columns for {{table}}?!", table=table_name)

    def _update_cardinality(self, c):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN

        THE RESULT IS WRITTEN TO self.meta.columns; NOTHING IS RETURNED
        """
        if c.type in STRUCT:
            Log.error("not supported")
        try:
            # THE TWO METADATA TABLES ARE IN MEMORY, NO NEED TO ASK ES
            if c.es_index == "meta.columns":
                with self.meta.columns.locker:
                    partitions = jx.sort([g[c.es_column] for g, _ in jx.groupby(self.meta.columns, c.es_column) if g[c.es_column] != None])
                    self.meta.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.meta.columns),
                            "cardinality": len(partitions),
                            "last_updated": Date.now()
                        },
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return
            if c.es_index == "meta.tables":
                with self.meta.columns.locker:
                    partitions = jx.sort([g[c.es_column] for g, _ in jx.groupby(self.meta.tables, c.es_column) if g[c.es_column] != None])
                    self.meta.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.meta.tables),
                            "cardinality": len(partitions),
                            "last_updated": Date.now()
                        },
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return

            es_index = c.es_index.split(".")[0]
            result = self.default_es.post("/" + es_index + "/_search", data={
                "aggs": {c.names["."]: _counting_query(c)},
                "size": 0
            })
            # NOTE(review): INDEXING values() ASSUMES IT RETURNS A LIST (PYTHON 2 dict) - CONFIRM
            r = result.aggregations.values()[0]
            count = result.hits.total
            cardinality = coalesce(r.value, r._nested.value, 0 if r.doc_count==0 else None)
            if cardinality == None:
                Log.error("logic error")

            query = Data(size=0)
            if cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
                # TOO MANY DISTINCT VALUES (OR NEARLY ALL DISTINCT): DO NOT KEEP PARTITIONS
                if DEBUG:
                    Log.note("{{table}}.{{field}} has {{num}} parts", table=c.es_index, field=c.es_column, num=cardinality)
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return
            elif c.type in _elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                # NUMERIC COLUMNS GET PARTITIONS ONLY WHEN THEY HAVE 30 OR FEWER VALUES
                if DEBUG:
                    Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return
            elif len(c.nested_path) != 1:
                query.aggs[literal_field(c.names["."])] = {
                    "nested": {"path": c.nested_path[0]},
                    "aggs": {"_nested": {"terms": {"field": c.es_column, "size": 0}}}
                }
            else:
                query.aggs[literal_field(c.names["."])] = {"terms": {"field": c.es_column, "size": 0}}

            # SECOND REQUEST: GET THE DISTINCT VALUES THEMSELVES
            result = self.default_es.post("/" + es_index + "/_search", data=query)

            aggs = result.aggregations.values()[0]
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            if DEBUG:
                Log.note("{{field}} has {{parts}}", field=c.name, parts=parts)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "partitions": parts,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
        except Exception as e:
            if "IndexMissingException" in e and c.es_index.startswith(TEST_TABLE_PREFIX):
                # A TEST TABLE THAT IS GONE: RECORD IT AS EMPTY, WITHOUT A WARNING
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": 0,
                            "cardinality": 0,
                            "last_updated": Date.now()
                        },
                        "clear":[
                            "partitions"
                        ],
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
            else:
                # NOTE(review): UNLIKE EVERY OTHER UPDATE IN THIS METHOD, THIS ONE IS NOT MADE UNDER THE locker
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "partitions",
                    ],
                    "where": {"eq": {"names.\\.": ".", "es_index": c.es_index, "es_column": c.es_column}}
                })
                Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=c, cause=e)

    def monitor(self, please_stop):
        """
        WORKER LOOP USED WHEN ENABLE_META_SCAN IS SET: TAKE COLUMNS FROM
        self.todo AND UPDATE THEIR CARDINALITY; WHEN self.todo IS EMPTY, REFILL
        IT WITH COLUMNS THAT WERE NEVER UPDATED, OR ARE TOO OLD

        :param please_stop: SIGNAL TO STOP THE LOOP
        """
        # WAKE THE BLOCKED pop() BELOW WHEN ASKED TO STOP
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    with self.meta.columns.locker:
                        old_columns = [
                            c
                            for c in self.meta.columns
                            if (c.last_updated == None or c.last_updated < Date.now()-TOO_OLD) and c.type not in STRUCT
                        ]
                        if old_columns:
                            if DEBUG:
                                Log.note("Old columns wth dates {{dates|json}}", dates=wrap(old_columns).last_updated)
                            self.todo.extend(old_columns)
                            # TEST CONSISTENCY
                            for c, d in product(list(self.todo.queue), list(self.todo.queue)):
                                if c.es_column == d.es_column and c.es_index == d.es_index and c != d:
                                    Log.error("")
                        else:
                            if DEBUG:
                                Log.note("no more metatdata to update")

                column = self.todo.pop(Till(seconds=(10*MINUTE).seconds))
                if column:
                    # NOTE(review): THREAD_STOP IS NOT TESTED FOR HERE; IT IS HANDLED LIKE ANY COLUMN - CONFIRM
                    if DEBUG:
                        Log.note("update {{table}}.{{column}}", table=column.es_index, column=column.es_column)
                    if column.type in STRUCT:
                        with self.meta.columns.locker:
                            column.last_updated = Date.now()
                        continue
                    elif column.last_updated >= Date.now()-TOO_OLD:
                        continue
                    try:
                        self._update_cardinality(column)
                        if DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX):
                            Log.note("updated {{column.name}}", column=column)
                    except Exception as e:
                        Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)

    def not_monitor(self, please_stop):
        """
        WORKER LOOP USED WHEN ENABLE_META_SCAN IS NOT SET: NO QUERIES ARE SENT;
        OLD COLUMNS HAVE THEIR STATISTICS CLEARED AND ARE MARKED AS UPDATED

        :param please_stop: SIGNAL TO STOP THE LOOP
        """
        Log.alert("metadata scan has been disabled")
        # WAKE THE BLOCKED pop() BELOW WHEN ASKED TO STOP
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            c = self.todo.pop()
            if c == THREAD_STOP:
                break

            if not c.last_updated or c.last_updated >= Date.now()-TOO_OLD:
                continue

            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear":[
                        "count",
                        "cardinality",
                        "partitions",
                    ],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
            if DEBUG:
                Log.note("Could not get {{col.es_index}}.{{col.es_column}} info", col=c)
class Cache(object):
    """
    Rate-limited, caching proxy for hg.mo requests.

    Responses are kept in an in-memory map (`self.cache`) and, for paths
    accepted by `please_cache()`, in a sqlite table named `cache`.
    Outbound requests flow `todo` -> `_rate_limiter` -> `requests` -> `_worker`.
    """

    @override
    def __init__(self, rate=None, amortization_period=None, source=None, database=None, kwargs=None):
        """
        :param rate: maximum outbound requests per second (default HG_REQUEST_PER_SECOND)
        :param amortization_period: window over which `rate` is enforced (default AMORTIZATION_PERIOD)
        :param source: settings object; only `source.url` is read here
        :param database: passed straight to Sqlite()
        :param kwargs: not read in this method
        """
        self.amortization_period = coalesce(amortization_period, AMORTIZATION_PERIOD)
        self.rate = coalesce(rate, HG_REQUEST_PER_SECOND)
        self.cache_locker = Lock()
        self.cache = {}  # MAP FROM url TO (ready, headers, response, timestamp) PAIR
        self.no_cache = {}  # VERY SHORT TERM CACHE
        self.workers = []
        self.todo = Queue(APP_NAME+" todo")
        self.requests = Queue(APP_NAME + " requests", max=int(self.rate * self.amortization_period.seconds))
        self.url = URL(source.url)
        self.db = Sqlite(database)
        self.inbound_rate = RateLogger("Inbound")
        self.outbound_rate = RateLogger("hg.mo")

        # CREATE THE PERSISTENT TABLE ONLY WHEN THE DATABASE HAS NO TABLES AT ALL
        if not self.db.query("SELECT name FROM sqlite_master WHERE type='table'").data:
            with self.db.transaction() as t:
                t.execute(
                    "CREATE TABLE cache ("
                    " path TEXT PRIMARY KEY, "
                    " headers TEXT, "
                    " response TEXT, "
                    " timestamp REAL "
                    ")"
                )

        self.threads = [
            Thread.run(APP_NAME+" worker" + text_type(i), self._worker)
            for i in range(CONCURRENCY)
        ]
        self.limiter = Thread.run(APP_NAME+" limiter", self._rate_limiter)
        self.cleaner = Thread.run(APP_NAME+" cleaner", self._cache_cleaner)

    def _rate_limiter(self, please_stop):
        """
        Move requests from `self.todo` to `self.requests`, allowing at most
        `self.requests.max` of them per amortization period.
        """
        try:
            max_requests = self.requests.max
            recent_requests = []

            while not please_stop:
                now = Date.now()
                too_old = now - self.amortization_period

                # FORGET REQUESTS THAT HAVE FALLEN OUT OF THE WINDOW
                recent_requests = [t for t in recent_requests if t > too_old]

                num_recent = len(recent_requests)
                if num_recent >= max_requests:
                    # WINDOW IS FULL, SLEEP UNTIL THE OLDEST REQUEST EXPIRES
                    space_free_at = recent_requests[0] + self.amortization_period
                    (please_stop | Till(till=space_free_at.unix)).wait()
                    continue

                for _ in xrange(num_recent, max_requests):
                    request = self.todo.pop()
                    now = Date.now()
                    recent_requests.append(now)
                    self.requests.add(request)
        except Exception as e:
            Log.warning("failure", cause=e)

    def _cache_cleaner(self, please_stop):
        """
        Periodically drop in-memory entries older than CACHE_RETENTION.
        """
        while not please_stop:
            now = Date.now()
            too_old = now-CACHE_RETENTION

            remove = set()
            with self.cache_locker:
                # .items() IS REQUIRED: ITERATING THE dict DIRECTLY YIELDS ONLY THE KEYS
                for path, (ready, headers, response, timestamp) in self.cache.items():
                    if timestamp < too_old:
                        remove.add(path)
                for r in remove:
                    del self.cache[r]
            (please_stop | Till(seconds=CACHE_RETENTION.seconds / 2)).wait()

    def please_cache(self, path):
        """
        :return: False if `path` is not to be cached
        """
        if path.endswith("/tip"):
            # THE TIP MOVES, NEVER PERSIST IT
            return False
        if any(k in path for k in ["/json-annotate/", "/json-info/", "/json-log/", "/json-rev/", "/rev/", "/raw-rev/", "/raw-file/", "/json-pushes", "/pushloghtml", "/file/"]):
            return True
        return False

    def request(self, method, path, headers):
        """
        Serve `path` from the in-memory cache, then the sqlite cache, and
        finally by queueing a network request and waiting for a worker.

        :param method: HTTP method forwarded to the worker
        :param path: path relative to `self.url`; also the cache key
        :param headers: request headers forwarded to the worker
        :return: Response with status 200
        NOTE: when the worker fails, the cache entry is removed, so the lookup
        after `ready.wait()` raises instead of returning a Response.
        """
        now = Date.now()
        self.inbound_rate.add(now)
        ready = Signal(path)

        # TEST CACHE
        with self.cache_locker:
            pair = self.cache.get(path)
            if pair is None:
                # PLACEHOLDER, SO CONCURRENT REQUESTS FOR THE SAME path WAIT ON ready
                self.cache[path] = (ready, None, None, now)

        if pair is not None:
            # REQUEST IS IN THE QUEUE ALREADY, WAIT
            ready, headers, response, then = pair
            if response is None:
                ready.wait()
                with self.cache_locker:
                    ready, headers, response, timestamp = self.cache.get(path)
            with self.db.transaction() as t:
                t.execute("UPDATE cache SET timestamp=" + quote_value(now) + " WHERE path=" + quote_value(path) + " AND timestamp<" + quote_value(now))
            return Response(
                response,
                status=200,
                headers=json.loads(headers)
            )

        # TEST DB
        db_response = self.db.query("SELECT headers, response FROM cache WHERE path=" + quote_value(path)).data
        if db_response:
            headers, response = db_response[0]
            with self.db.transaction() as t:
                t.execute("UPDATE cache SET timestamp=" + quote_value(now) + " WHERE path=" + quote_value(path) + " AND timestamp<" + quote_value(now))
            with self.cache_locker:
                self.cache[path] = (ready, headers, response.encode('latin1'), now)
            ready.go()

            return Response(
                response,
                status=200,
                headers=json.loads(headers)
            )

        # MAKE A NETWORK REQUEST
        self.todo.add((ready, method, path, headers, now))
        ready.wait()
        with self.cache_locker:
            ready, headers, response, timestamp = self.cache[path]
        return Response(
            response,
            status=200,
            headers=json.loads(headers)
        )

    def _worker(self, please_stop):
        """
        Pop rate-limited requests, perform them against `self.url`, and publish
        the result in `self.cache` (and in sqlite when `please_cache(path)`).
        The request's `ready` signal is always triggered, success or not.
        """
        while not please_stop:
            pair = self.requests.pop(till=please_stop)
            if please_stop:
                break
            ready, method, path, req_headers, timestamp = pair

            try:
                url = self.url / path
                self.outbound_rate.add(Date.now())
                response = http.request(method, url, req_headers)

                del response.headers['transfer-encoding']
                resp_headers = value2json(response.headers)
                resp_content = response.raw.read()

                please_cache = self.please_cache(path)
                if please_cache:
                    with self.db.transaction() as t:
                        t.execute("INSERT INTO cache (path, headers, response, timestamp) VALUES" + quote_list((path, resp_headers, resp_content.decode('latin1'), timestamp)))
                with self.cache_locker:
                    self.cache[path] = (ready, resp_headers, resp_content, timestamp)
            except Exception as e:
                Log.warning("problem with request to {{path}}", path=path, cause=e)
                with self.cache_locker:
                    # DROP THE PLACEHOLDER SO A LATER REQUEST CAN TRY AGAIN.
                    # pop() (NOT A 3-NAME UNPACK OF THE 4-TUPLE) SO THIS HANDLER CAN NOT RAISE
                    self.cache.pop(path, None)
            finally:
                ready.go()
class Sqlite(DB):
    """
    Allows multi-threaded access
    Loads extension functions (like SQRT)

    All statements are sent through `self.queue` to a single worker thread,
    which owns the sqlite3 connection.
    """

    canonical = None

    def __init__(self, filename=None, db=None, upgrade=True):
        """
        :param filename: database file; the worker uses ':memory:' when missing
        :param db: Optional, wrap a sqlite db in a thread
        :param upgrade: when true, run the module's `_upgrade()` (once) and
                        let the worker try to load the functions extension
        :return: Multithread-safe database
        """
        if upgrade and not _upgraded:
            _upgrade()

        self.filename = filename
        self.db = db
        self.queue = Queue("sql commands")  # HOLD (command, result, signal) PAIRS
        # EVERYTHING THE WORKER READS MUST BE ASSIGNED BEFORE THE WORKER STARTS
        self.get_trace = DEBUG
        self.upgrade = upgrade
        self.worker = Thread.run("sqlite db thread", self._worker)

    def _enhancements(self):
        """
        Register the `regex(pattern, value)` function and the
        `percentile(value, percentile)` aggregate on the connection.
        """
        def regex(pattern, value):
            return 1 if re.match(pattern+"$", value) else 0

        # create_function() RETURNS None, SO KEEP USING THE CONNECTION ITSELF
        con = self.db
        con.create_function("regex", 2, regex)

        class Percentile(object):
            # sqlite3 INSTANTIATES THE AGGREGATE WITH NO ARGUMENTS AND
            # PASSES BOTH SQL ARGUMENTS TO step()
            def __init__(self):
                self.percentile = None
                self.acc = []

            def step(self, value, percentile):
                self.acc.append(value)
                self.percentile = percentile

            def finalize(self):
                return percentile(self.acc, self.percentile)

        con.create_aggregate("percentile", 2, Percentile)

    def execute(self, command):
        """
        COMMANDS WILL BE EXECUTED IN THE ORDER THEY ARE GIVEN
        BUT CAN INTERLEAVE WITH OTHER TREAD COMMANDS
        :param command: COMMAND FOR SQLITE
        :return: None
        """
        if DEBUG:  # EXECUTE IMMEDIATELY FOR BETTER STACK TRACE
            return self.query(command)

        if self.get_trace:
            trace = extract_stack(1)
        else:
            trace = None
        # NO result AND NO signal: THE WORKER ONLY LOGS A WARNING ON FAILURE
        self.queue.add((command, None, None, trace))

    def query(self, command):
        """
        WILL BLOCK CALLING THREAD UNTIL THE command IS COMPLETED
        :param command: COMMAND FOR SQLITE
        :return: list OF RESULTS
        """
        if not self.worker:
            self.worker = Thread.run("sqlite db thread", self._worker)

        signal = Signal()
        result = Data()
        self.queue.add((command, result, signal, None))
        signal.wait()
        if result.exception:
            Log.error("Problem with Sqlite call", cause=result.exception)
        return result

    def _worker(self, please_stop):
        """
        Open the connection (or reuse `Sqlite.canonical`), try to load the
        functions extension, then run queued commands until `please_stop`.
        The connection is committed and closed on exit.
        """
        global _load_extension_warning_sent

        if DEBUG:
            Log.note("Sqlite version {{version}}", version=sqlite3.sqlite_version)
        if Sqlite.canonical:
            self.db = Sqlite.canonical
        else:
            self.db = sqlite3.connect(coalesce(self.filename, ':memory:'))

            library_loc = File.new_instance(sys.modules[__name__].__file__, "../..")
            full_path = File.new_instance(library_loc, "vendor/sqlite/libsqlitefunctions.so").abspath
            try:
                trace = extract_stack(0)[0]
                if self.upgrade:
                    if os.name == 'nt':
                        file = File.new_instance(trace["file"], "../../vendor/sqlite/libsqlitefunctions.so")
                    else:
                        file = File.new_instance(trace["file"], "../../vendor/sqlite/libsqlitefunctions")

                    full_path = file.abspath
                    self.db.enable_load_extension(True)
                    self.db.execute("SELECT load_extension(" + self.quote_value(full_path) + ")")
            except Exception as e:
                # WARN ONLY ONCE PER PROCESS
                if not _load_extension_warning_sent:
                    _load_extension_warning_sent = True
                    Log.warning("Could not load {{file}}, doing without. (no SQRT for you!)", file=full_path, cause=e)

        try:
            while not please_stop:
                command, result, signal, trace = self.queue.pop(till=please_stop)

                if DEBUG_INSERT and command.strip().lower().startswith("insert"):
                    Log.note("Running command\n{{command|indent}}", command=command)
                if DEBUG and not command.strip().lower().startswith("insert"):
                    Log.note("Running command\n{{command|indent}}", command=command)
                with Timer("Run command", debug=DEBUG):
                    if signal is not None:
                        # query(): THE CALLER IS BLOCKED ON signal, REPORT THROUGH result
                        try:
                            curr = self.db.execute(command)
                            self.db.commit()
                            result.meta.format = "table"
                            result.header = [d[0] for d in curr.description] if curr.description else None
                            result.data = curr.fetchall()
                            if DEBUG and result.data:
                                text = convert.table2csv(list(result.data))
                                Log.note("Result:\n{{data}}", data=text)
                        except Exception as e:
                            e = Except.wrap(e)
                            result.exception = Except(ERROR, "Problem with\n{{command|indent}}", command=command, cause=e)
                        finally:
                            signal.go()
                    else:
                        # execute(): NOBODY IS WAITING, JUST LOG THE FAILURE
                        try:
                            self.db.execute(command)
                            self.db.commit()
                        except Exception as e:
                            e = Except.wrap(e)
                            e.cause = Except(
                                type=ERROR,
                                template="Bad call to Sqlite",
                                trace=trace
                            )
                            Log.warning("Failure to execute", cause=e)

        except Exception as e:
            # FAILURES WHILE SHUTTING DOWN ARE EXPECTED (pop() HAS NOTHING TO UNPACK)
            if not please_stop:
                Log.error("Problem with sql thread", cause=e)
        finally:
            if DEBUG:
                Log.note("Database is closed")
            self.db.commit()
            self.db.close()

    def quote_column(self, column_name, table=None):
        """Delegate to the module-level quote_column()."""
        return quote_column(column_name, table)

    def quote_value(self, value):
        """Delegate to the module-level quote_value()."""
        return quote_value(value)
class ColumnList(Table, jx_base.Container):
    """
    OPTIMIZED FOR THE PARTICULAR ACCESS PATTERNS USED

    In-memory index of column metadata (`self.data`), mirrored to a sqlite
    file by a background thread that consumes `self.todo`.
    """

    def __init__(self, name):
        """
        :param name: used to build the file name "metadata.<name>.sqlite" and
                     the name of the background update thread
        """
        Table.__init__(self, "meta.columns")
        self.db_file = File("metadata." + name + ".sqlite")
        self.data = {}  # MAP FROM ES_INDEX TO (abs_column_name to COLUMNS)
        self.locker = Lock()
        self._schema = None
        # _update_meta() READS THIS FLAG; STATISTICS HAVE NOT BEEN COMPUTED YET
        self.dirty = True
        self.db = sqlite3.connect(
            database=self.db_file.abspath, check_same_thread=False, isolation_level=None
        )
        self.last_load = Null
        self.todo = Queue(
            "update columns to db"
        )  # HOLD (action, column) PAIR, WHERE action in ['insert', 'update']
        self._db_load()
        Thread.run("update " + name, self._db_worker)

    @contextmanager
    def _db_transaction(self):
        """
        Wrap the body in BEGIN/COMMIT; on any exception ROLLBACK and raise
        through Log.error.
        """
        self.db.execute(str("BEGIN"))
        try:
            yield
            self.db.execute(str("COMMIT"))
        except Exception as e:
            e = Except.wrap(e)
            self.db.execute(str("ROLLBACK"))
            Log.error("Transaction failed", cause=e)

    def _query(self, query):
        """
        :return: Data with `meta.format`, `header` and `data` (all rows)
        """
        result = Data()
        curr = self.db.execute(query)
        result.meta.format = "table"
        result.header = [d[0] for d in curr.description] if curr.description else None
        result.data = curr.fetchall()
        return result

    def _db_create(self):
        """
        Create the metadata table and seed it (and memory) with METADATA_COLUMNS.
        """
        with self._db_transaction():
            self.db.execute(
                "CREATE TABLE "
                + db_table_name
                + sql_iso(
                    sql_list(
                        [
                            quote_column(c.name)
                            + " "
                            + json_type_to_sqlite_type[c.jx_type]
                            for c in METADATA_COLUMNS
                        ]
                        + [
                            "PRIMARY KEY"
                            + sql_iso(
                                sql_list(map(quote_column, ["es_index", "es_column"]))
                            )
                        ]
                    )
                )
            )

            for c in METADATA_COLUMNS:
                self._add(c)
                self._db_insert_column(c)

    def _db_load(self):
        """
        Load all columns from the database, creating the table when missing.
        """
        self.last_load = Date.now()

        result = self._query(
            SQL_SELECT
            + "name"
            + SQL_FROM
            + "sqlite_master"
            + SQL_WHERE
            + SQL_AND.join(["name=" + db_table_name, "type=" + quote_value("table")])
        )
        if not result.data:
            self._db_create()
            return

        result = self._query(
            SQL_SELECT
            + all_columns
            + SQL_FROM
            + db_table_name
            + SQL_ORDERBY
            + sql_list(map(quote_column, ["es_index", "name", "es_column"]))
        )

        with self.locker:
            for r in result.data:
                c = row_to_column(result.header, r)
                self._add(c)

    def _db_worker(self, please_stop):
        """
        Every 10 seconds: pull rows updated since `self.last_load` into memory,
        then push everything queued in `self.todo` to the database.
        """
        while not please_stop:
            try:
                with self._db_transaction():
                    result = self._query(
                        SQL_SELECT
                        + all_columns
                        + SQL_FROM
                        + db_table_name
                        + SQL_WHERE
                        + "last_updated > "
                        + quote_value(self.last_load)
                        + SQL_ORDERBY
                        + sql_list(map(quote_column, ["es_index", "name", "es_column"]))
                    )

                with self.locker:
                    for r in result.data:
                        c = row_to_column(result.header, r)
                        self._add(c)
                        if c.last_updated > self.last_load:
                            self.last_load = c.last_updated

                updates = self.todo.pop_all()
                DEBUG and updates and Log.note(
                    "{{num}} columns to push to db", num=len(updates)
                )
                for action, column in updates:
                    # RETRY LOOP FOR A SINGLE UPDATE
                    while not please_stop:
                        try:
                            with self._db_transaction():
                                DEBUG and Log.note(
                                    "{{action}} db for {{table}}.{{column}}",
                                    action=action,
                                    table=column.es_index,
                                    column=column.es_column,
                                )

                                if action is EXECUTE:
                                    # FOR EXECUTE, column IS THE SQL TEXT
                                    self.db.execute(column)
                                elif action is UPDATE:
                                    self.db.execute(
                                        "UPDATE"
                                        + db_table_name
                                        + "SET"
                                        + sql_list(
                                            [
                                                "count=" + quote_value(column.count),
                                                "cardinality="
                                                + quote_value(column.cardinality),
                                                "multi=" + quote_value(column.multi),
                                                "partitions="
                                                + quote_value(
                                                    value2json(column.partitions)
                                                ),
                                                "last_updated="
                                                + quote_value(column.last_updated),
                                            ]
                                        )
                                        + SQL_WHERE
                                        + SQL_AND.join(
                                            [
                                                "es_index = "
                                                + quote_value(column.es_index),
                                                "es_column = "
                                                + quote_value(column.es_column),
                                                "last_updated < "
                                                + quote_value(column.last_updated),
                                            ]
                                        )
                                    )
                                elif action is DELETE:
                                    self.db.execute(
                                        "DELETE FROM"
                                        + db_table_name
                                        + SQL_WHERE
                                        + SQL_AND.join(
                                            [
                                                "es_index = "
                                                + quote_value(column.es_index),
                                                "es_column = "
                                                + quote_value(column.es_column),
                                            ]
                                        )
                                    )
                                else:
                                    self._db_insert_column(column)
                            break
                        except Exception as e:
                            e = Except.wrap(e)
                            if "database is locked" in e:
                                # TRANSIENT: PAUSE, THEN RETRY THE SAME UPDATE
                                Log.note("metadata database is locked")
                                Till(seconds=1).wait()
                            else:
                                # NOT TRANSIENT: REPORT AND MOVE TO THE NEXT UPDATE
                                # (RETRYING WOULD LOOP FOREVER WITHOUT A PAUSE)
                                Log.warning("problem updataing database", cause=e)
                                break

            except Exception as e:
                Log.warning("problem updating database", cause=e)

            (Till(seconds=10) | please_stop).wait()

    def _db_insert_column(self, column):
        """
        INSERT one column row; a uniqueness violation is turned into a queued UPDATE.
        """
        try:
            self.db.execute(
                "INSERT INTO"
                + db_table_name
                + sql_iso(all_columns)
                + "VALUES"
                + sql_iso(
                    sql_list(
                        [
                            quote_value(column[c.name])
                            if c.name not in ("nested_path", "partitions")
                            else quote_value(value2json(column[c.name]))
                            for c in METADATA_COLUMNS
                        ]
                    )
                )
            )
        except Exception as e:
            e = Except.wrap(e)
            if "UNIQUE constraint failed" in e or " are not unique" in e:
                # THIS CAN HAPPEN BECAUSE todo HAS OLD COLUMN DATA
                self.todo.add((UPDATE, column), force=True)
            else:
                Log.error("do not know how to handle", cause=e)

    def __copy__(self):
        """
        :return: ColumnList with its own dicts/lists and lock, sharing the
                 column objects; it has no database connection or todo queue
        """
        output = object.__new__(ColumnList)
        Table.__init__(output, "meta.columns")
        output.data = {
            t: {c: list(cs) for c, cs in dd.items()} for t, dd in self.data.items()
        }
        output.locker = Lock()
        output._schema = None
        # find()/__iter__ ON THE COPY CALL _update_meta(), WHICH READS THIS FLAG
        output.dirty = self.dirty
        return output

    def find(self, es_index, abs_column_name=None):
        """
        :param es_index: table to look in
        :param abs_column_name: optional column name; all columns when missing
        :return: list of matching columns (empty when nothing matches)
        """
        with self.locker:
            if es_index.startswith("meta."):
                self._update_meta()

            if not abs_column_name:
                return [c for cs in self.data.get(es_index, {}).values() for c in cs]
            else:
                return self.data.get(es_index, {}).get(abs_column_name, [])

    def extend(self, columns):
        """Add many columns to memory (nothing is queued for the database)."""
        self.dirty = True
        with self.locker:
            for column in columns:
                self._add(column)

    def add(self, column):
        """
        Add (or merge) `column` and queue the matching database action.
        :return: the canonical column object
        """
        self.dirty = True
        with self.locker:
            canonical = self._add(column)
        if canonical == None:
            return column  # ALREADY ADDED
        self.todo.add((INSERT if canonical is column else UPDATE, canonical))
        return canonical

    def remove_table(self, table_name):
        """Forget all in-memory columns of `table_name` (KeyError when unknown)."""
        del self.data[table_name]

    def _add(self, column):
        """
        :param column: ANY COLUMN OBJECT
        :return: None IF column IS canonical ALREADY (NET-ZERO EFFECT)
        """
        columns_for_table = self.data.setdefault(column.es_index, {})
        existing_columns = columns_for_table.setdefault(column.name, [])

        for canonical in existing_columns:
            if canonical is column:
                return None
            if canonical.es_type == column.es_type:
                if column.last_updated > canonical.last_updated:
                    for key in Column.__slots__:
                        old_value = canonical[key]
                        new_value = column[key]
                        if new_value == None:
                            pass  # DO NOT BOTHER CLEARING OLD VALUES (LIKE cardinality AND paritiions)
                        elif new_value == old_value:
                            pass  # NO NEED TO UPDATE WHEN NO CHANGE MADE (COMMON CASE)
                        else:
                            canonical[key] = new_value
                return canonical
        existing_columns.append(column)
        return column

    def _update_meta(self):
        """
        Recompute count/cardinality/partitions/multi of the "meta.columns"
        columns from all known columns; a no-op unless `self.dirty`.
        """
        if not self.dirty:
            return

        for mcl in self.data.get("meta.columns", {}).values():
            for mc in mcl:
                count = 0
                values = set()
                objects = 0
                multi = 1
                for column in self._all_columns():
                    value = column[mc.name]
                    if value == None:
                        pass
                    else:
                        count += 1
                        if is_list(value):
                            multi = max(multi, len(value))
                            try:
                                values |= set(value)
                            except Exception:
                                # UNHASHABLE MEMBERS ARE COUNTED, NOT COLLECTED
                                objects += len(value)
                        elif is_data(value):
                            objects += 1
                        else:
                            values.add(value)
                mc.count = count
                mc.cardinality = len(values) + objects
                mc.partitions = jx.sort(values)
                mc.multi = multi
                mc.last_updated = Date.now()
        self.dirty = False

    def _all_columns(self):
        """:return: flat list of every column of every table"""
        return [
            column
            for t, cs in self.data.items()
            for _, css in cs.items()
            for column in css
        ]

    def __iter__(self):
        with self.locker:
            self._update_meta()
            return iter(self._all_columns())

    def __len__(self):
        # NUMBER OF KNOWN COLUMNS. (THE VALUES OF self.data[table] ARE PLAIN
        # LISTS, SO READING `.count` FROM ONE GIVES A METHOD, NOT A NUMBER)
        return len(self._all_columns())

    def update(self, command):
        """
        Apply an update command to the matching columns and queue the
        database changes.

        :param command: mapping with `where.eq` (match on column properties),
                        `set` (properties to assign) and `clear` (properties
                        to null; "." deletes the column)
        """
        self.dirty = True
        try:
            command = wrap(command)
            DEBUG and Log.note(
                "Update {{timestamp}}: {{command|json}}",
                command=command,
                timestamp=Date(command["set"].last_updated),
            )
            eq = command.where.eq
            if eq.es_index:
                if len(eq) == 1:
                    if unwraplist(command.clear) == ".":
                        # DELETE THE WHOLE TABLE
                        with self.locker:
                            del self.data[eq.es_index]
                        self.todo.add(
                            (
                                EXECUTE,
                                "DELETE FROM "
                                + db_table_name
                                + SQL_WHERE
                                + " es_index="
                                + quote_value(eq.es_index),
                            )
                        )
                        return

                    # FASTEST
                    with self.locker:
                        all_columns = self.data.get(eq.es_index, {}).values()
                        columns = [c for cs in all_columns for c in cs]
                elif eq.es_column and len(eq) == 2:
                    # FASTER
                    with self.locker:
                        all_columns = self.data.get(eq.es_index, {}).values()
                        columns = [
                            c
                            for cs in all_columns
                            for c in cs
                            if c.es_column == eq.es_column
                        ]

                else:
                    # SLOWER
                    with self.locker:
                        all_columns = self.data.get(eq.es_index, {}).values()
                        columns = [
                            c
                            for cs in all_columns
                            for c in cs
                            if all(
                                c[k] == v for k, v in eq.items()
                            )  # THIS LINE IS VERY SLOW
                        ]
            else:
                columns = list(self)
                columns = jx.filter(columns, command.where)

            with self.locker:
                for col in columns:
                    DEBUG and Log.note(
                        "update column {{table}}.{{column}}",
                        table=col.es_index,
                        column=col.es_column,
                    )
                    for k in command["clear"]:
                        if k == ".":
                            self.todo.add((DELETE, col))
                            lst = self.data[col.es_index]
                            cols = lst[col.name]
                            cols.remove(col)
                            if len(cols) == 0:
                                del lst[col.name]
                                if len(lst) == 0:
                                    del self.data[col.es_index]
                            break
                        else:
                            col[k] = None
                    else:
                        # DID NOT DELETE COLUMNM ("."), CONTINUE TO SET PROPERTIES
                        for k, v in command.set.items():
                            col[k] = v
                        self.todo.add((UPDATE, col))

        except Exception as e:
            Log.error("should not happen", cause=e)

    def query(self, query):
        # NOT EXPECTED TO BE RUN
        Log.error("not")
        # NOTE(review): Log.error() APPEARS TO ALWAYS RAISE, WHICH MAKES THE
        # REST OF THIS METHOD UNREACHABLE - CONFIRM BEFORE RELYING ON IT
        with self.locker:
            self._update_meta()
            if not self._schema:
                self._schema = Schema(
                    ".", [c for cs in self.data["meta.columns"].values() for c in cs]
                )
            snapshot = self._all_columns()

        from jx_python.containers.list_usingPythonList import ListContainer

        query.frum = ListContainer("meta.columns", snapshot, self._schema)
        return jx.run(query)

    def groupby(self, keys):
        """
        :return: jx.groupby over all columns, after refreshing the meta statistics
        """
        with self.locker:
            self._update_meta()
            # DO NOT CALL self.__iter__() HERE: IT TAKES self.locker AGAIN
            return jx.groupby(iter(self._all_columns()), keys)

    @property
    def schema(self):
        """Schema built (once) from the "meta.columns" columns."""
        if not self._schema:
            with self.locker:
                self._update_meta()
                self._schema = Schema(
                    ".", [c for cs in self.data["meta.columns"].values() for c in cs]
                )
        return self._schema

    @property
    def namespace(self):
        return self

    def get_table(self, table_name):
        """:return: self; any name other than "meta.columns" is an error"""
        if table_name != "meta.columns":
            Log.error("this container has only the meta.columns")
        return self

    def denormalized(self):
        """
        THE INTERNAL STRUCTURE FOR THE COLUMN METADATA IS VERY DIFFERENT FROM
        THE DENORMALIZED PERSPECITVE. THIS PROVIDES THAT PERSPECTIVE FOR QUERIES
        """
        with self.locker:
            self._update_meta()
            output = [
                {
                    "table": c.es_index,
                    "name": untype_path(c.name),
                    "cardinality": c.cardinality,
                    "es_column": c.es_column,
                    "es_index": c.es_index,
                    "last_updated": c.last_updated,
                    "count": c.count,
                    "nested_path": [unnest_path(n) for n in c.nested_path],
                    "es_type": c.es_type,
                    "type": c.jx_type,
                }
                for tname, css in self.data.items()
                for cname, cs in css.items()
                for c in cs
                if c.jx_type not in STRUCT  # and c.es_column != "_id"
            ]

        from jx_python.containers.list_usingPythonList import ListContainer

        return ListContainer(
            self.name,
            data=output,
            schema=jx_base.Schema("meta.columns", SIMPLE_METADATA_COLUMNS),
        )
class StructuredLogger_usingElasticSearch(StructuredLogger):
    """
    Structured logger that queues messages and sends them to an
    Elasticsearch index in batches from a background thread.
    """

    @override
    def __init__(self, host, index, port=9200, type="log", max_size=1000, batch_size=100, kwargs=None):
        """
        settings ARE FOR THE ELASTICSEARCH INDEX

        :param max_size: capacity of the in-memory message queue
        :param batch_size: maximum number of messages per call to es.extend()
        :param kwargs: all settings; passed to Cluster and get_or_create_index
        (host, index, port and type are not read directly in this method)
        """
        self.es = Cluster(kwargs).get_or_create_index(
            schema=mo_json.json2value(value2json(SCHEMA), leaves=True),
            limit_replicas=True,
            tjson=True,
            kwargs=kwargs
        )
        self.batch_size = batch_size
        self.es.add_alias(coalesce(kwargs.alias, kwargs.index))
        self.queue = Queue("debug logs to es", max=max_size, silent=True)
        self.es.settings.retry.times = coalesce(self.es.settings.retry.times, 3)
        self.es.settings.retry.sleep = Duration(coalesce(self.es.settings.retry.sleep, MINUTE))
        Thread.run("add debug logs to es", self._insert_loop)

    def write(self, template, params):
        """
        Queue one log message.
        :return: self
        """
        if params.get("template"):
            # DETECTED INNER TEMPLATE, ASSUME TRACE IS ON, SO DO NOT NEED THE OUTER TEMPLATE
            self.queue.add({"value": params})
        else:
            template = strings.limit(template, 2000)
            self.queue.add({"value": {"template": template, "params": params}}, timeout=3 * MINUTE)
        return self

    def _insert_loop(self, please_stop=None):
        """
        Once a second, move everything queued to Elasticsearch in batches.
        After more than MAX_BAD_COUNT consecutive failures, stop sending and
        only keep the queue empty.
        """
        bad_count = 0
        while not please_stop:
            try:
                Till(seconds=1).wait()
                messages = wrap(self.queue.pop_all())
                if not messages:
                    continue

                for g, mm in jx.groupby(messages, size=self.batch_size):
                    scrubbed = []
                    try:
                        for i, message in enumerate(mm):
                            if message is THREAD_STOP:
                                please_stop.go()
                                return
                            scrubbed.append(_deep_json_to_string(message, depth=3))
                    finally:
                        # SEND WHAT WAS SCRUBBED, EVEN WHEN STOPPING OR FAILING PART WAY
                        self.es.extend(scrubbed)
                bad_count = 0
            except Exception as e:
                Log.warning("Problem inserting logs into ES", cause=e)
                bad_count += 1
                if bad_count > MAX_BAD_COUNT:
                    Log.warning("Given up trying to write debug logs to ES index {{index}}", index=self.es.settings.index)
                    # LEAVE THE SEND LOOP; WITHOUT THIS THE DRAIN LOOP BELOW IS NEVER REACHED
                    break
                Till(seconds=30).wait()

        # CONTINUE TO DRAIN THIS QUEUE
        while not please_stop:
            try:
                Till(seconds=1).wait()
                self.queue.pop_all()
            except Exception as e:
                Log.warning("Should not happen", cause=e)

    def stop(self):
        """Ask the insert loop to stop and close the queue; errors are suppressed."""
        with suppress_exception:
            self.queue.add(THREAD_STOP)  # BE PATIENT, LET REST OF MESSAGE BE SENT

        with suppress_exception:
            self.queue.close()
class Clogger: # Singleton of the look-ahead scanner Clogger SINGLE_CLOGGER = None def __new__(cls, *args, **kwargs): if cls.SINGLE_CLOGGER is None: cls.SINGLE_CLOGGER = object.__new__(cls) return cls.SINGLE_CLOGGER def __init__(self, conn=None, tuid_service=None, start_workers=True, new_table=False, kwargs=None): try: self.config = kwargs self.conn = conn if conn else sql.Sql(self.config.database.name) self.hg_cache = HgMozillaOrg(kwargs=self.config.hg_cache, use_cache=True) if self.config.hg_cache else Null self.tuid_service = tuid_service if tuid_service else tuid.service.TUIDService( kwargs=self.config.tuid, conn=self.conn, clogger=self ) self.rev_locker = Lock() self.working_locker = Lock() if new_table: with self.conn.transaction() as t: t.execute("DROP TABLE IF EXISTS csetLog") self.init_db() self.next_revnum = coalesce(self.conn.get_one("SELECT max(revnum)+1 FROM csetLog")[0], 1) self.csets_todo_backwards = Queue(name="Clogger.csets_todo_backwards") self.deletions_todo = Queue(name="Clogger.deletions_todo") self.maintenance_signal = Signal(name="Clogger.maintenance_signal") if 'tuid' in self.config: self.config = self.config.tuid self.disable_backfilling = False self.disable_tipfilling = False self.disable_deletion = False self.disable_maintenance = False self.backfill_thread = None self.tipfill_thread = None self.deletion_thread = None self.maintenance_thread = None # Make sure we are filled before allowing queries numrevs = self.conn.get_one("SELECT count(revnum) FROM csetLog")[0] if numrevs < MINIMUM_PERMANENT_CSETS: Log.note("Filling in csets to hold {{minim}} csets.", minim=MINIMUM_PERMANENT_CSETS) oldest_rev = 'tip' with self.conn.transaction() as t: tmp = t.query("SELECT min(revnum), revision FROM csetLog").data[0][1] if tmp: oldest_rev = tmp self._fill_in_range( MINIMUM_PERMANENT_CSETS - numrevs, oldest_rev, timestamp=False ) Log.note( "Table is filled with atleast {{minim}} entries.", minim=MINIMUM_PERMANENT_CSETS ) if start_workers: 
self.start_workers() except Exception as e: Log.warning("Cannot setup clogger: {{cause}}", cause=str(e)) def start_backfilling(self): if not self.backfill_thread: self.backfill_thread = Thread.run('clogger-backfill', self.fill_backward_with_list) def start_tipfillling(self): if not self.tipfill_thread: self.tipfill_thread = Thread.run('clogger-tip', self.fill_forward_continuous) def start_maintenance(self): if not self.maintenance_thread: self.maintenance_thread = Thread.run('clogger-maintenance', self.csetLog_maintenance) def start_deleter(self): if not self.deletion_thread: self.deletion_thread = Thread.run('clogger-deleter', self.csetLog_deleter) def start_workers(self): self.start_tipfillling() self.start_backfilling() self.start_maintenance() self.start_deleter() Log.note("Started clogger workers.") def init_db(self): with self.conn.transaction() as t: t.execute(''' CREATE TABLE IF NOT EXISTS csetLog ( revnum INTEGER PRIMARY KEY, revision CHAR(12) NOT NULL, timestamp INTEGER );''') def disable_all(self): self.disable_tipfilling = True self.disable_backfilling = True self.disable_maintenance = True self.disable_deletion = True def revnum(self): """ :return: max revnum that was added """ return coalesce(self.conn.get_one("SELECT max(revnum) as revnum FROM csetLog")[0], 0) def get_tip(self, transaction): return transaction.get_one( "SELECT max(revnum) as revnum, revision FROM csetLog" ) def get_tail(self, transaction): return transaction.get_one( "SELECT min(revnum) as revnum, revision FROM csetLog" ) def _get_clog(self, clog_url): try: Log.note("Searching through changelog {{url}}", url=clog_url) clog_obj = http.get_json(clog_url, retry=RETRY) return clog_obj except Exception as e: Log.error( "Unexpected error getting changset-log for {{url}}: {{error}}", url=clog_url, error=e ) def _get_one_revision(self, transaction, cset_entry): # Returns a single revision if it exists _, rev, _ = cset_entry return transaction.get_one("SELECT revision FROM csetLog WHERE 
revision=?", (rev,)) def _get_one_revnum(self, transaction, rev): # Returns a single revnum if it exists return transaction.get_one("SELECT revnum FROM csetLog WHERE revision=?", (rev,)) def _get_revnum_range(self, transaction, revnum1, revnum2): # Returns a range of revision numbers (that is inclusive) high_num = max(revnum1, revnum2) low_num = min(revnum1, revnum2) return transaction.query( "SELECT revnum, revision FROM csetLog WHERE " "revnum >= " + str(low_num) + " AND revnum <= " + str(high_num) ).data def recompute_table_revnums(self): ''' Recomputes the revnums for the csetLog table by creating a new table, and copying csetLog to it. The INTEGER PRIMARY KEY in the temp table auto increments as rows are added. IMPORTANT: Only call this after acquiring the lock `self.working_locker`. :return: ''' with self.conn.transaction() as t: t.execute(''' CREATE TABLE temp ( revnum INTEGER PRIMARY KEY, revision CHAR(12) NOT NULL, timestamp INTEGER );''') t.execute( "INSERT INTO temp (revision, timestamp) " "SELECT revision, timestamp FROM csetlog ORDER BY revnum ASC" ) t.execute("DROP TABLE csetLog;") t.execute("ALTER TABLE temp RENAME TO csetLog;") def check_for_maintenance(self): ''' Returns True if the maintenance worker should be run now, and False otherwise. :return: ''' numrevs = self.conn.get_one("SELECT count(revnum) FROM csetLog")[0] Log.note("Number of csets in csetLog table: {{num}}", num=numrevs) if numrevs >= SIGNAL_MAINTENANCE_CSETS: return True return False def add_cset_entries(self, ordered_rev_list, timestamp=False, number_forward=True): ''' Adds a list of revisions to the table. Assumes ordered_rev_list is an ordered based on how changesets are found in the changelog. Going forwards or backwards is dealt with by flipping the list :param ordered_cset_list: Order given from changeset log searching. 
:param timestamp: If false, records are kept indefinitely but if holes exist: (delete, None, delete, None) those delete's with None's around them will not be deleted. :param numbered: If True, this function will number the revision list by going forward from max(revNum), else it'll go backwards from revNum, then add X to all revnums and self.next_revnum where X is the length of ordered_rev_list :return: ''' with self.conn.transaction() as t: current_min = t.get_one("SELECT min(revnum) FROM csetlog")[0] current_max = t.get_one("SELECT max(revnum) FROM csetlog")[0] if not current_min or not current_max: current_min = 0 current_max = 0 direction = -1 start = current_min - 1 if number_forward: direction = 1 start = current_max + 1 ordered_rev_list = ordered_rev_list[::-1] insert_list = [ ( start + direction * count, rev, int(time.time()) if timestamp else -1 ) for count, rev in enumerate(ordered_rev_list) ] # In case of overlapping requests fmt_insert_list = [] for cset_entry in insert_list: tmp = self._get_one_revision(t, cset_entry) if not tmp: fmt_insert_list.append(cset_entry) for _, tmp_insert_list in jx.groupby(fmt_insert_list, size=SQL_CSET_BATCH_SIZE): t.execute( "INSERT INTO csetLog (revnum, revision, timestamp)" + " VALUES " + sql_list( quote_set((revnum, revision, timestamp)) for revnum, revision, timestamp in tmp_insert_list ) ) # Move the revision numbers forward if needed self.recompute_table_revnums() # Start a maintenance run if needed if self.check_for_maintenance(): Log.note("Scheduling maintenance run on clogger.") self.maintenance_signal.go() def _fill_in_range(self, parent_cset, child_cset, timestamp=False, number_forward=True): ''' Fills cset logs in a certain range. 'parent_cset' can be an int and in that case, we get that many changesets instead. If parent_cset is an int, then we consider that we are going backwards (number_forward is False) and we ignore the first changeset of the first log, and we ignore the setting for number_forward. 
Otherwise, we continue until we find the given 'parent_cset'. :param parent_cset: :param child_cset: :param timestamp: :param number_forward: :return: ''' csets_to_add = [] found_parent = False find_parent = False if type(parent_cset) != int: find_parent = True elif parent_cset >= MAX_BACKFILL_CLOGS * CHANGESETS_PER_CLOG: Log.warning( "Requested number of new changesets {{num}} is too high. " "Max number that can be requested is {{maxnum}}.", num=parent_cset, maxnum=MAX_BACKFILL_CLOGS * CHANGESETS_PER_CLOG ) return None csets_found = 0 clogs_seen = 0 final_rev = child_cset while not found_parent and clogs_seen < MAX_BACKFILL_CLOGS: clog_url = str(HG_URL) + "/" + self.config.hg.branch + "/json-log/" + final_rev clog_obj = self._get_clog(clog_url) clog_csets_list = list(clog_obj['changesets']) for clog_cset in clog_csets_list[:-1]: if not number_forward and csets_found <= 0: # Skip this entry it already exists csets_found += 1 continue nodes_cset = clog_cset['node'][:12] if find_parent: if nodes_cset == parent_cset: found_parent = True if not number_forward: # When going forward this entry is # the given parent csets_to_add.append(nodes_cset) break else: if csets_found + 1 > parent_cset: found_parent = True if not number_forward: # When going forward this entry is # the given parent (which is supposed # to already exist) csets_to_add.append(nodes_cset) break csets_found += 1 csets_to_add.append(nodes_cset) if found_parent == True: break clogs_seen += 1 final_rev = clog_csets_list[-1]['node'][:12] if found_parent: self.add_cset_entries(csets_to_add, timestamp=timestamp, number_forward=number_forward) else: Log.warning( "Couldn't find the end of the request for {{request}}. 
" "Max number that can be requested through _fill_in_range is {{maxnum}}.", request={ 'parent_cset': parent_cset, 'child_cset':child_cset, 'number_forward': number_forward }, maxnum=MAX_BACKFILL_CLOGS * CHANGESETS_PER_CLOG ) return None return csets_to_add def initialize_to_range(self, old_rev, new_rev, delete_old=True): ''' Used in service testing to get to very old changesets quickly. :param old_rev: The oldest revision to keep :param new_rev: The revision to start searching from :return: ''' old_settings = [ self.disable_tipfilling, self.disable_backfilling, self.disable_maintenance, self.disable_deletion ] self.disable_tipfilling = True self.disable_backfilling = True self.disable_maintenance = True self.disable_deletion = True old_rev = old_rev[:12] new_rev = new_rev[:12] with self.working_locker: if delete_old: with self.conn.transaction() as t: t.execute("DELETE FROM csetLog") with self.conn.transaction() as t: t.execute( "INSERT INTO csetLog (revision, timestamp) VALUES " + quote_set((new_rev, -1)) ) self._fill_in_range(old_rev, new_rev, timestamp=True, number_forward=False) self.disable_tipfilling = old_settings[0] self.disable_backfilling = old_settings[1] self.disable_maintenance = old_settings[2] self.disable_deletion = old_settings[3] def fill_backward_with_list(self, please_stop=None): ''' Expects requests of the tuple form: (parent_cset, timestamp) parent_cset can be an int X to go back by X changesets, or a string to search for going backwards in time. If timestamp is false, no timestamps will be added to the entries. :param please_stop: :return: ''' while not please_stop: try: request = self.csets_todo_backwards.pop(till=please_stop) if please_stop: break # If backfilling is disabled, all requests # are ignored. 
if self.disable_backfilling: Till(till=CSET_BACKFILL_WAIT_TIME).wait() continue if request: parent_cset, timestamp = request else: continue with self.working_locker: with self.conn.transaction() as t: parent_revnum = self._get_one_revnum(t, parent_cset) if parent_revnum: continue with self.conn.transaction() as t: _, oldest_revision = self.get_tail(t) self._fill_in_range( parent_cset, oldest_revision, timestamp=timestamp, number_forward=False ) Log.note("Finished {{cset}}", cset=parent_cset) except Exception as e: Log.warning("Unknown error occurred during backfill: ", cause=e) def update_tip(self): ''' Returns False if the tip is already at the newest, or True if an update has taken place. :return: ''' clog_obj = self._get_clog( str(HG_URL) + "/" + self.config.hg.branch + "/json-log/tip" ) # Get current tip in DB with self.conn.transaction() as t: _, newest_known_rev = self.get_tip(t) # If we are still at the newest, wait for CSET_TIP_WAIT_TIME seconds # before checking again. first_clog_entry = clog_obj['changesets'][0]['node'][:12] if newest_known_rev == first_clog_entry: return False csets_to_gather = None if not newest_known_rev: Log.note( "No revisions found in table, adding {{minim}} entries...", minim=MINIMUM_PERMANENT_CSETS ) csets_to_gather = MINIMUM_PERMANENT_CSETS found_newest_known = False csets_to_add = [] csets_found = 0 clogs_seen = 0 Log.note("Found new revisions. 
Updating csetLog tip to {{rev}}...", rev=first_clog_entry) while not found_newest_known and clogs_seen < MAX_TIPFILL_CLOGS: clog_csets_list = list(clog_obj['changesets']) for clog_cset in clog_csets_list[:-1]: nodes_cset = clog_cset['node'][:12] if not csets_to_gather: if nodes_cset == newest_known_rev: found_newest_known = True break else: if csets_found >= csets_to_gather: found_newest_known = True break csets_found += 1 csets_to_add.append(nodes_cset) if not found_newest_known: # Get the next page clogs_seen += 1 final_rev = clog_csets_list[-1]['node'][:12] clog_url = str(HG_URL) + "/" + self.config.hg.branch + "/json-log/" + final_rev clog_obj = self._get_clog(clog_url) if clogs_seen >= MAX_TIPFILL_CLOGS: Log.error( "Too many changesets, can't find last tip or the number is too high: {{rev}}. " "Maximum possible to request is {{maxnum}}", rev=coalesce(newest_known_rev, csets_to_gather), maxnum=MAX_TIPFILL_CLOGS * CHANGESETS_PER_CLOG ) return False with self.working_locker: Log.note("Adding {{csets}}", csets=csets_to_add) self.add_cset_entries(csets_to_add, timestamp=False) return True def fill_forward_continuous(self, please_stop=None): while not please_stop: try: while not please_stop and not self.disable_tipfilling and self.update_tip(): pass (please_stop | Till(seconds=CSET_TIP_WAIT_TIME)).wait() except Exception as e: Log.warning("Unknown error occurred during tip filling:", cause=e) def csetLog_maintenance(self, please_stop=None): ''' Handles deleting old csetLog entries and timestamping revisions once they pass the length for permanent storage for deletion later. :param please_stop: :return: ''' while not please_stop: try: # Wait until something signals the maintenance cycle # to begin (or end). (self.maintenance_signal | please_stop).wait() if please_stop: break if self.disable_maintenance: continue Log.warning( "Starting clog maintenance. Since this doesn't start often, " "we need to explicitly see when it's started with this warning." 
) # Reset signal so we don't request # maintenance infinitely. with self.maintenance_signal.lock: self.maintenance_signal._go = False with self.working_locker: all_data = None with self.conn.transaction() as t: all_data = sorted( t.get("SELECT revnum, revision, timestamp FROM csetLog"), key=lambda x: int(x[0]) ) # Restore maximum permanents (if overflowing) new_data = [] modified = False for count, (revnum, revision, timestamp) in enumerate(all_data[::-1]): if count < MINIMUM_PERMANENT_CSETS: if timestamp != -1: modified = True new_data.append((revnum, revision, -1)) else: new_data.append((revnum, revision, timestamp)) elif type(timestamp) != int or timestamp == -1: modified = True new_data.append((revnum, revision, int(time.time()))) else: new_data.append((revnum, revision, timestamp)) # Delete annotations at revisions with timestamps # that are too old. The csetLog entries will have # their timestamps reset here. new_data1 = [] annrevs_to_del = [] current_time = time.time() for count, (revnum, revision, timestamp) in enumerate(new_data[::-1]): new_timestamp = timestamp if timestamp != -1: if current_time >= timestamp + TIME_TO_KEEP_ANNOTATIONS.seconds: modified = True new_timestamp = current_time annrevs_to_del.append(revision) new_data1.append((revnum, revision, new_timestamp)) if len(annrevs_to_del) > 0: # Delete any latestFileMod and annotation entries # that are too old. 
Log.note( "Deleting annotations and latestFileMod for revisions for being " "older than {{oldest}}: {{revisions}}", oldest=TIME_TO_KEEP_ANNOTATIONS, revisions=annrevs_to_del ) with self.conn.transaction() as t: t.execute( "DELETE FROM latestFileMod WHERE revision IN " + quote_set(annrevs_to_del) ) t.execute( "DELETE FROM annotations WHERE revision IN " + quote_set(annrevs_to_del) ) # Delete any overflowing entries new_data2 = new_data1 reved_all_data = all_data[::-1] deleted_data = reved_all_data[MAXIMUM_NONPERMANENT_CSETS:] delete_overflowing_revstart = None if len(deleted_data) > 0: _, delete_overflowing_revstart, _ = deleted_data[0] new_data2 = set(all_data) - set(deleted_data) # Update old frontiers if requested, otherwise # they will all get deleted by the csetLog_deleter # worker if UPDATE_VERY_OLD_FRONTIERS: _, max_revision, _ = all_data[-1] for _, revision, _ in deleted_data: with self.conn.transaction() as t: old_files = t.get( "SELECT file FROM latestFileMod WHERE revision=?", (revision,) ) if old_files is None or len(old_files) <= 0: continue self.tuid_service.get_tuids_from_files( old_files, max_revision, going_forward=True, ) still_exist = True while still_exist and not please_stop: Till(seconds=TUID_EXISTENCE_WAIT_TIME).wait() with self.conn.transaction() as t: old_files = t.get( "SELECT file FROM latestFileMod WHERE revision=?", (revision,) ) if old_files is None or len(old_files) <= 0: still_exist = False # Update table and schedule a deletion if modified: with self.conn.transaction() as t: insert_into_db_chunked( t, new_data2, "INSERT OR REPLACE INTO csetLog (revnum, revision, timestamp) VALUES " ) if not deleted_data: continue Log.note("Scheduling {{num_csets}} for deletion", num_csets=len(deleted_data)) self.deletions_todo.add(delete_overflowing_revstart) except Exception as e: Log.warning("Unexpected error occured while maintaining csetLog, continuing to try: ", cause=e) return def csetLog_deleter(self, please_stop=None): ''' Deletes changesets 
from the csetLog table and also changesets from the annotation table that have revisions matching the given changesets. Accepts lists of csets from self.deletions_todo. :param please_stop: :return: ''' while not please_stop: try: request = self.deletions_todo.pop(till=please_stop) if please_stop: break # If deletion is disabled, ignore the current # request - it will need to be re-requested. if self.disable_deletion: Till(till=CSET_DELETION_WAIT_TIME).wait() continue with self.working_locker: first_cset = request # Since we are deleting and moving stuff around in the # TUID tables, we need everything to be contained in # one transaction with no interruptions. with self.conn.transaction() as t: revnum = self._get_one_revnum(t, first_cset)[0] csets_to_del = t.get( "SELECT revnum, revision FROM csetLog WHERE revnum <= ?", (revnum,) ) csets_to_del = [cset for _, cset in csets_to_del] existing_frontiers = t.query( "SELECT revision FROM latestFileMod WHERE revision IN " + quote_set(csets_to_del) ).data existing_frontiers = [existing_frontiers[i][0] for i, _ in enumerate(existing_frontiers)] Log.note( "Deleting all annotations and changeset log entries with revisions in the list: {{csets}}", csets=csets_to_del ) if len(existing_frontiers) > 0: # This handles files which no longer exist anymore in # the main branch. 
Log.note( "Deleting existing frontiers for revisions: {{revisions}}", revisions=existing_frontiers ) t.execute( "DELETE FROM latestFileMod WHERE revision IN " + quote_set(existing_frontiers) ) Log.note("Deleting annotations...") t.execute( "DELETE FROM annotations WHERE revision IN " + quote_set(csets_to_del) ) Log.note( "Deleting {{num_entries}} csetLog entries...", num_entries=len(csets_to_del) ) t.execute( "DELETE FROM csetLog WHERE revision IN " + quote_set(csets_to_del) ) # Recalculate the revnums self.recompute_table_revnums() except Exception as e: Log.warning("Unexpected error occured while deleting from csetLog:", cause=e) Till(seconds=CSET_DELETION_WAIT_TIME).wait() return def get_old_cset_revnum(self, revision): self.csets_todo_backwards.add((revision, True)) revnum = None timeout = Till(seconds=BACKFILL_REVNUM_TIMEOUT) while not timeout: with self.conn.transaction() as t: revnum = self._get_one_revnum(t, revision) if revnum and revnum[0] >= 0: break elif revnum[0] < 0: Log.note("Waiting for table to recompute...") else: Log.note("Waiting for backfill to complete...") Till(seconds=CSET_BACKFILL_WAIT_TIME).wait() if timeout: Log.error( "Cannot find revision {{rev}} after waiting {{timeout}} seconds", rev=revision, timeout=BACKFILL_REVNUM_TIMEOUT ) return revnum def get_revnnums_from_range(self, revision1, revision2): with self.conn.transaction() as t: revnum1 = self._get_one_revnum(t, revision1) revnum2 = self._get_one_revnum(t, revision2) if not revnum1 or not revnum2: did_an_update = self.update_tip() if did_an_update: with self.conn.transaction() as t: revnum1 = self._get_one_revnum(t, revision1) revnum2 = self._get_one_revnum(t, revision2) if not revnum1: revnum1 = self.get_old_cset_revnum(revision1) # Refresh the second entry with self.conn.transaction() as t: revnum2 = self._get_one_revnum(t, revision2) if not revnum2: revnum2 = self.get_old_cset_revnum(revision2) # The first revnum might change also with self.conn.transaction() as t: revnum1 = 
self._get_one_revnum(t, revision1) with self.conn.transaction() as t: result = self._get_revnum_range(t, revnum1[0], revnum2[0]) return sorted( result, key=lambda x: int(x[0]) )
class ElasticsearchMetadata(Namespace):
    """
    MANAGE SNOWFLAKE SCHEMAS FOR EACH OF THE ALIASES FOUND IN THE CLUSTER

    COLUMN METADATA IS KEPT IN self.meta.columns, TABLE DESCRIPTORS IN
    self.meta.tables.  COLUMNS THAT NEED THEIR STATISTICS (count, cardinality,
    multi, partitions) REFRESHED ARE PUT ON self.todo AND HANDLED BY A WORKER
    THREAD (monitor() OR not_monitor(), DEPENDING ON ENABLE_META_SCAN)
    """

    @override
    def __new__(cls, kwargs, *args, **_kwargs):
        """
        RETURN THE INSTANCE REGISTERED IN known_clusters UNDER THE id() OF THE
        elasticsearch.Cluster BUILT FROM kwargs; MAKE AND REGISTER A NEW
        INSTANCE WHEN THERE IS NONE
        """
        # PRESUMABLY elasticsearch.Cluster HANDS BACK THE SAME OBJECT FOR THE SAME
        # SETTINGS, OTHERWISE id() WOULD NOT MATCH A PREVIOUS ENTRY - TODO CONFIRM
        es_cluster = elasticsearch.Cluster(kwargs)
        output = known_clusters.get(id(es_cluster))
        if output is None:
            output = object.__new__(cls)
            known_clusters[id(es_cluster)] = output
        return output

    @override
    def __init__(self, host, index, sql_file='metadata.sqlite', alias=None, name=None, port=9200, kwargs=None):
        """
        SETUP EMPTY METADATA CONTAINERS AND START THE WORKER THREAD

        :param host: NOT READ DIRECTLY IN THIS METHOD (kwargs IS GIVEN TO elasticsearch.Cluster)
        :param index: LAST CHOICE FOR default_name
        :param sql_file: NOT READ IN THIS METHOD
        :param alias: SECOND CHOICE FOR default_name
        :param name: FIRST CHOICE FOR default_name
        :param port: NOT READ DIRECTLY IN THIS METHOD
        :param kwargs: ALL SETTINGS AS ONE OBJECT; STORED AS self.settings
        """
        # __new__ CAN RETURN AN EXISTING INSTANCE, AND __init__ IS STILL CALLED
        # ON IT: DO NOT INITIALIZE A SECOND TIME
        if hasattr(self, "settings"):
            return

        self.too_old = TOO_OLD
        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)
        # NAMES OF INDEXES FOUND MISSING BY _update_cardinality()
        self.index_does_not_exist = set()
        # COLUMNS WAITING FOR A STATISTICS UPDATE
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.index_to_alias = Relation_usingList()

        self.es_metadata = Null
        # COMPARED TO es_cluster.metatdata_last_updated IN _find_alias()
        self.metadata_last_updated = Date.now() - OLD_METADATA

        self.meta = Data()
        self.meta.columns = ColumnList()

        # MAP FROM ALIAS (OR INDEX NAME) TO LIST OF QUERY PATHS
        self.alias_to_query_paths = {
            "meta.columns": [['.']],
            "meta.tables": [['.']]
        }
        # KEYS ARE THE KNOWN ALIASES, SEE _find_alias()
        self.alias_last_updated = {
            "meta.columns": Date.now(),
            "meta.tables": Date.now()
        }
        table_columns = metadata_tables()
        self.meta.tables = ListContainer(
            "meta.tables",
            [
                # TableDesc("meta.columns", None, ".", Date.now()),
                # TableDesc("meta.tables", None, ".", Date.now())
            ],
            jx_base.Schema(".", table_columns)
        )
        self.meta.columns.extend(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return

    @property
    def namespace(self):
        """
        THE namespace OF THE COLUMN LIST
        """
        return self.meta.columns.namespace

    @property
    def url(self):
        """
        CLUSTER URL EXTENDED WITH default_name (DOTS REPLACED BY SLASHES)
        """
        return self.es_cluster.url / self.default_name.replace(".", "/")

    def _reload_columns(self, table_desc):
        """
        RELOAD THE COLUMNS OF ONE TABLE FROM THE CLUSTER MAPPINGS

        :param table_desc: TABLE DESCRIPTOR; name IS USED AS THE ALIAS (A REAL
               ALIAS, OR NAME OF INDEX THAT HAS NO ALIAS), timestamp IS SET TO
               THE CLUSTER'S metatdata_last_updated WHEN DONE
        :return: None
        """
        # FIND ALL INDEXES OF ALIAS
        es_last_updated = self.es_cluster.metatdata_last_updated

        alias = table_desc.name
        canonical_index = self.es_cluster.get_best_matching_index(alias).index
        # FORCE A METADATA FETCH UNLESS THE TABLE TIMESTAMP IS OLDER THAN THE
        # CLUSTER METADATA ALREADY IN HAND
        update_required = not (table_desc.timestamp < es_last_updated)
        metadata = self.es_cluster.get_metadata(force=update_required)

        indexes = self.index_to_alias.get_domain(alias)
        # ONE (index object, type name, properties) TRIPLE FOR EACH INDEX OF THE ALIAS
        props = [
            (self.es_cluster.get_index(index=i, type=t, debug=DEBUG), t, m.properties)
            for i, d in metadata.indices.items()
            if i in indexes
            for t, m in [_get_best_type_from_mapping(d.mappings)]
        ]

        # CONFIRM ALL COLUMNS ARE SAME, FIX IF NOT
        dirty = False
        # NEIGHBOURING PAIRS, IN BOTH DIRECTIONS
        all_comparisions = list(jx.pairwise(props)) + list(jx.pairwise(jx.reverse(props)))
        # NOTICE THE SAME (index, type, properties) TRIPLE FROM ABOVE
        for (i1, t1, p1), (i2, t2, p2) in all_comparisions:
            # PRESUMABLY THE PROPERTIES OF p2 THAT p1 LACKS - TODO CONFIRM
            diff = elasticsearch.diff_schema(p2, p1)
            if not self.settings.read_only:
                for d in diff:
                    dirty = True
                    i1.add_property(*d)
        # RELOAD THE METADATA ONLY IF A PROPERTY WAS ADDED ABOVE
        meta = self.es_cluster.get_metadata(force=dirty).indices[canonical_index]

        data_type, mapping = _get_best_type_from_mapping(meta.mappings)
        # _id IS ALWAYS GIVEN A COLUMN
        mapping.properties["_id"] = {"type": "string", "index": "not_analyzed"}
        self._parse_properties(alias, mapping, meta)
        table_desc.timestamp = es_last_updated

    def _parse_properties(self, alias, mapping, meta):
        """
        TURN THE MAPPING PROPERTIES INTO COLUMNS, RECORD THE QUERY PATHS OF THE
        ALIAS, AND ADD EVERY COLUMN TO self.meta.columns AND self.todo

        :param alias: NAME OF THE ALIAS THE COLUMNS BELONG TO
        :param mapping: MAPPING WITH THE properties TO PARSE
        :param meta: NOT USED IN THIS METHOD
        :return: None
        """
        abs_columns = elasticsearch.parse_properties(alias, None, mapping.properties)
        if any(c.cardinality == 0 and c.names['.'] != '_id' for c in abs_columns):
            Log.warning(
                "Some columns are not stored {{names}}",
                names=[
                    ".".join((c.es_index, c.names['.']))
                    for c in abs_columns
                    if c.cardinality == 0
                ]
            )

        with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, silent=not DEBUG):
            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns if c.es_type == "nested"]
            # FOR EACH PAIR WHERE b's FIRST PATH STARTS WITH a's FIRST PATH:
            # LOOK FOR THE FIRST ENTRY OF b THAT IS NOT LONGER THAN aa, AND PUT aa
            # IN FRONT OF IT, UNLESS THAT ENTRY IS aa ITSELF
            # NOTE(review): WHEN EVERY ENTRY OF b IS LONGER THAN aa THE INNER LOOP
            # ENDS WITHOUT INSERTING ANYTHING - CONFIRM THAT IS INTENDED
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(SELF_PATH)
            query_paths.append(ROOT_PATH)
            # THE SAME LIST OBJECT IS SHARED BY THE ALIAS AND ALL ITS INDEXES
            self.alias_to_query_paths[alias] = query_paths
            for i in self.index_to_alias.get_domain(alias):
                self.alias_to_query_paths[i] = query_paths

            # ADD RELATIVE NAMES
            for abs_column in abs_columns:
                # NO last_updated MARKS THE COLUMN AS NEEDING A STATISTICS UPDATE
                abs_column.last_updated = None
                abs_column.jx_type = jx_type(abs_column)
                for query_path in query_paths:
                    abs_column.names[query_path[0]] = relative_field(abs_column.names["."], query_path[0])
                # QUEUE WHATEVER meta.columns.add() RETURNS
                self.todo.add(self.meta.columns.add(abs_column))
        pass

    def query(self, _query):
        """
        RUN _query AGAINST THE COLUMN LIST; "from" AND "sort" ARE DEFAULTS

        :param _query: QUERY OBJECT (MUST HAVE __data__())
        :return: RESULT OF self.meta.columns.query()
        """
        return self.meta.columns.query(QueryOp(set_default(
            {
                "from": self.meta.columns,
                "sort": ["table", "name"]
            },
            _query.__data__()
        )))

    def _find_alias(self, name):
        """
        :param name: NAME OF AN ALIAS, OR OF AN INDEX
        :return: name WHEN IT IS A KNOWN ALIAS, OTHERWISE index_to_alias[name]
        """
        # NOTE(review): self.metadata_last_updated IS NOT ASSIGNED ANYWHERE IN THIS
        # CLASS AFTER __init__, SO THIS REFRESH MAY RUN ON EVERY CALL - CONFIRM
        if self.metadata_last_updated < self.es_cluster.metatdata_last_updated:
            for a in self.es_cluster.get_aliases():
                # AN INDEX WITHOUT ALIAS IS ITS OWN ALIAS
                self.index_to_alias[a.index] = coalesce(a.alias, a.index)
                self.alias_last_updated.setdefault(a.alias, Date.MIN)
        if name in self.alias_last_updated:
            return name
        else:
            return self.index_to_alias[name]

    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS

        :param table_name: NAME OF THE TABLE; ONLY THE FIRST STEP OF THE PATH IS
               USED TO FIND THE ALIAS
        :param column_name: PASSED TO meta.columns.find() TO LIMIT THE RESULT
        :param force: RELOAD THE COLUMNS EVEN IF THE TABLE TIMESTAMP IS NOT OLD
        :return: COLUMNS SORTED BY "names.\\."
        """
        table_path = split_field(table_name)
        root_table_name = table_path[0]

        alias = self._find_alias(root_table_name)
        if not alias:
            # TRY AGAIN AFTER FORCING THE CLUSTER TO GET ITS METADATA
            self.es_cluster.get_metadata(force=True)
            alias = self._find_alias(root_table_name)
            if not alias:
                Log.error("{{table|quote}} does not exist", table=table_name)

        try:
            last_update = MAX([
                self.es_cluster.index_last_updated[i]
                for i in self.index_to_alias.get_domain(alias)
            ])

            table = self.get_table(alias)[0]
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                # FIRST TIME THIS TABLE IS SEEN: REGISTER IT, THEN LOAD ITS COLUMNS
                table = TableDesc(
                    name=alias,
                    url=None,
                    query_path=['.'],
                    timestamp=Date.MIN
                )
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._reload_columns(table)
            elif force or table.timestamp < last_update:
                self._reload_columns(table)

            columns = self.meta.columns.find(alias, column_name)
            columns = jx.sort(columns, "names.\\.")
            # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
            # (CHECKED ONCE A SECOND, WHILE THE todo QUEUE STILL HAS WORK)
            while len(self.todo) and not all(columns.get("last_updated")):
                if DEBUG:
                    if len(columns) > 10:
                        Log.note("waiting for {{num}} columns to update", num=len([c for c in columns if not c.last_updated]))
                    else:
                        Log.note("waiting for columns to update {{columns|json}}", columns=[c.es_index+"."+c.es_column for c in columns if not c.last_updated])
                Till(seconds=1).wait()
            return columns
        except Exception as e:
            Log.error("Not expected", cause=e)

        # ONLY REACHED IF Log.error() RETURNS - PRESUMABLY IT RAISES, TODO CONFIRM
        return []

    def _update_cardinality(self, column):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN

        THE RESULT IS WRITTEN TO self.meta.columns (count, cardinality, multi,
        partitions, last_updated); NOTHING IS RETURNED

        :param column: THE COLUMN TO UPDATE; MUST NOT HAVE A jx_type IN STRUCT
        """
        if column.es_index in self.index_does_not_exist:
            return

        if column.jx_type in STRUCT:
            Log.error("not supported")
        try:
            # THE TWO META TABLES ARE IN MEMORY: NO NEED TO ASK ES
            if column.es_index == "meta.columns":
                partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.columns, column.es_column) if g[column.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.columns),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            if column.es_index == "meta.tables":
                partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.tables, column.es_column) if g[column.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.tables),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return

            # ONLY THE PART BEFORE THE FIRST DOT IS USED IN THE SEARCH URL
            es_index = column.es_index.split(".")[0]

            # NON-EMPTY WHEN ANY KNOWN COLUMN WITH THE SAME es_column HAS es_type "text"
            is_text = [cc for cc in self.meta.columns if cc.es_column == column.es_column and cc.es_type == "text"]
            if is_text:
                # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": {"filter": {"match_all": {}}}
                    },
                    "size": 0
                })
                count = result.hits.total
                # AT LEAST 1001, SO THE "cardinality > 1000" BRANCH BELOW IS TAKEN
                cardinality = max(1001, count)
                multi = 1001
            elif column.es_column == "_id":
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "query": {"match_all": {}},
                    "size": 0
                })
                count = cardinality = result.hits.total
                multi = 1
            elif column.es_type == BOOLEAN:
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": _counting_query(column)
                    },
                    "size": 0
                })
                count = result.hits.total
                cardinality = 2
                multi = 1
            else:
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": _counting_query(column),
                        "multi": {"max": {"script": "doc[" + quote(column.es_column) + "].values.size()"}}
                    },
                    "size": 0
                })
                agg_results = result.aggregations
                count = result.hits.total
                # THE FIRST OF THESE THAT HAS A VALUE
                cardinality = coalesce(agg_results.count.value, agg_results.count._nested.value, agg_results.count.doc_count)
                multi = int(coalesce(agg_results.multi.value, 1))
                if cardinality == None:
                    Log.error("logic error")

            query = Data(size=0)

            # THE FIRST THREE BRANCHES RECORD THE COUNTS WITHOUT PARTITIONS AND RETURN;
            # THE LAST THREE BUILD A terms AGGREGATION TO GET THE PARTITIONS
            if column.es_column == "_id":
                self.meta.columns.update({
                    "set": {
                        "count": cardinality,
                        "cardinality": cardinality,
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif len(column.nested_path) != 1:
                query.aggs["_"] = {
                    "nested": {"path": column.nested_path[0]},
                    "aggs": {"_nested": {"terms": {"field": column.es_column}}}
                }
            elif cardinality == 0:
                query.aggs["_"] = {"terms": {"field": column.es_column}}
            else:
                query.aggs["_"] = {"terms": {"field": column.es_column, "size": cardinality}}

            result = self.es_cluster.post("/" + es_index + "/_search", data=query)

            aggs = result.aggregations._
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "partitions": parts,
                    "last_updated": Date.now()
                },
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
        except Exception as e:
            # CAN NOT IMPORT: THE TEST MODULES SETS UP LOGGING
            # from tests.test_jx import TEST_TABLE
            e = Except.wrap(e)
            TEST_TABLE = "testdata"
            is_missing_index = any(w in e for w in ["IndexMissingException", "index_not_found_exception"])
            is_test_table = column.es_index.startswith((TEST_TABLE_PREFIX, TEST_TABLE))
            if is_missing_index and is_test_table:
                # WE EXPECT TEST TABLES TO DISAPPEAR
                self.meta.columns.update({
                    "clear": ".",
                    "where": {"eq": {"es_index": column.es_index}}
                })
                self.index_does_not_exist.add(column.es_index)
            else:
                # KEEP THE COLUMN, BUT FORGET ITS STATISTICS
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)

    def monitor(self, please_stop):
        """
        WORKER STARTED BY __init__ WHEN ENABLE_META_SCAN: UPDATE THE STATISTICS
        OF THE COLUMNS FOUND ON self.todo, AND REFILL self.todo WITH OLD COLUMNS
        WHEN IT IS EMPTY

        :param please_stop: SIGNAL TO STOP THE LOOP
        """
        # ADD THREAD_STOP TO THE QUEUE WHEN ASKED TO STOP
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    # NOTHING QUEUED: LOOK FOR COLUMNS NEVER UPDATED, OR UPDATED TOO LONG AGO
                    old_columns = [
                        c
                        for c in self.meta.columns
                        if (c.last_updated == None or c.last_updated < Date.now()-TOO_OLD) and c.jx_type not in STRUCT
                    ]
                    if old_columns:
                        DEBUG and Log.note(
                            "Old columns {{names|json}} last updated {{dates|json}}",
                            names=wrap(old_columns).es_column,
                            dates=[Date(t).format() for t in wrap(old_columns).last_updated]
                        )
                        self.todo.extend(old_columns)
                        # TEST CONSISTENCY
                        # (NO TWO DIFFERENT QUEUED COLUMNS WITH SAME es_index AND es_column)
                        for c, d in product(list(self.todo.queue), list(self.todo.queue)):
                            if c.es_column == d.es_column and c.es_index == d.es_index and c != d:
                                Log.error("")
                    else:
                        DEBUG and Log.note("no more metatdata to update")

                column = self.todo.pop(Till(seconds=(10*MINUTE).seconds))
                if column:
                    if column is THREAD_STOP:
                        # THE while CONDITION DECIDES IF WE ARE DONE
                        continue

                    with Timer("update {{table}}.{{column}}", param={"table": column.es_index, "column": column.es_column}, silent=not DEBUG):
                        if column.es_index in self.index_does_not_exist:
                            self.meta.columns.update({
                                "clear": ".",
                                "where": {"eq": {"es_index": column.es_index}}
                            })
                            continue
                        if column.jx_type in STRUCT or column.es_column.endswith("." + EXISTS_TYPE):
                            # NO STATISTICS FOR THESE, ONLY MARK AS UPDATED
                            column.last_updated = Date.now()
                            continue
                        elif column.last_updated >= Date.now()-TOO_OLD:
                            # RECENT ENOUGH
                            continue
                        try:
                            self._update_cardinality(column)
                            (DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX)) and Log.note("updated {{column.name}}", column=column)
                        except Exception as e:
                            if '"status":404' in e:
                                self.meta.columns.update({
                                    "clear": ".",
                                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                                })
                            else:
                                Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)

    def not_monitor(self, please_stop):
        """
        WORKER STARTED BY __init__ WHEN NOT ENABLE_META_SCAN: ES IS NOT QUERIED,
        QUEUED COLUMNS ONLY GET last_updated SET AND THEIR STATISTICS CLEARED

        :param please_stop: SIGNAL TO STOP THE LOOP
        """
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            c = self.todo.pop()
            if c == THREAD_STOP:
                break

            if c.last_updated >= Date.now()-TOO_OLD:
                continue

            with Timer("Update {{col.es_index}}.{{col.es_column}}", param={"col": c}, silent=not DEBUG, too_long=0.05):
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })

    def get_table(self, name):
        """
        :param name: NAME OF THE TABLE
        :return: self.meta.columns WHEN name IS "meta.columns", OTHERWISE THE
                 (WRAPPED) LIST OF TABLE DESCRIPTORS WITH THAT name
        """
        if name == "meta.columns":
            return self.meta.columns

        #     return self.meta.columns
        with self.meta.tables.locker:
            return wrap([t for t in self.meta.tables.data if t.name == name])

    def get_snowflake(self, fact_table_name):
        """
        :param fact_table_name: NAME OF THE FACT TABLE
        :return: A Snowflake FOR THE TABLE, USING THIS OBJECT AS NAMESPACE
        """
        return Snowflake(fact_table_name, self)

    def get_schema(self, name):
        """
        :param name: TABLE NAME, MAY INCLUDE A PATH AFTER THE ROOT TABLE
        :return: SCHEMA OF THE COLUMN LIST FOR "meta.columns", OTHERWISE THE
                 SCHEMA THE ROOT TABLE'S SNOWFLAKE HAS FOR THE REST OF THE PATH
        """
        if name == "meta.columns":
            return self.meta.columns.schema
        query_path = split_field(name)
        root, rest = query_path[0], join_field(query_path[1:])
        return self.get_snowflake(root).get_schema(rest)
class StructuredLogger_usingElasticSearch(StructuredLogger):
    """
    STRUCTURED LOGGER THAT SENDS MESSAGES TO AN ELASTICSEARCH INDEX

    write() PUTS MESSAGES ON A QUEUE; A WORKER THREAD (_insert_loop) POPS
    THEM AND ADDS THEM TO THE INDEX IN GROUPS OF batch_size
    """

    @override
    def __init__(
        self,
        host,
        index,
        port=9200,
        type="log",
        queue_size=1000,
        batch_size=100,
        kwargs=None,
    ):
        """
        settings ARE FOR THE ELASTICSEARCH INDEX

        :param host: ONE HOST, OR A LIST OF HOSTS (ONE IS PICKED AT RANDOM)
        :param index: NOT READ DIRECTLY (kwargs.index IS THE FALLBACK ALIAS)
        :param port: NOT READ DIRECTLY IN THIS METHOD
        :param type: KEY INTO schema.mappings WHOSE "~N~" PROPERTY IS MADE nested
        :param queue_size: max OF THE MESSAGE QUEUE
        :param batch_size: GROUP SIZE USED BY _insert_loop
        :param kwargs: ALL SETTINGS AS ONE OBJECT; MODIFIED HERE, THEN GIVEN TO Cluster
        """
        # FILL IN DEFAULTS; DURATIONS ARE STORED AS SECONDS
        kwargs.timeout = Duration(coalesce(kwargs.timeout, "30second")).seconds
        kwargs.retry.times = coalesce(kwargs.retry.times, 3)
        kwargs.retry.sleep = Duration(coalesce(kwargs.retry.sleep, MINUTE)).seconds
        candidate_hosts = listwrap(host)
        kwargs.host = Random.sample(candidate_hosts, 1)[0]

        # PRESUMABLY THE JSON ROUND TRIP GIVES A COPY, SO SCHEMA ITSELF IS NOT CHANGED
        schema = json2value(value2json(SCHEMA), leaves=True)
        schema.mappings[type].properties["~N~"].type = "nested"

        cluster = Cluster(kwargs)
        self.es = cluster.get_or_create_index(
            schema=schema,
            limit_replicas=True,
            typed=True,
            kwargs=kwargs,
        )
        self.batch_size = batch_size

        alias_name = coalesce(kwargs.alias, kwargs.index)
        self.es.add_alias(alias_name)

        self.queue = Queue("debug logs to es", max=queue_size, silent=True)
        self.worker = Thread.run("add debug logs to es", self._insert_loop)

    def write(self, template, params):
        """
        PUT ONE MESSAGE ON THE QUEUE

        :param template: NOT USED (params.template IS USED INSTEAD)
        :param params: THE MESSAGE; ITS template IS LIMITED TO 2000 AND ITS
               format IS SET TO None BEFORE IT IS CONVERTED
        :return: self
        """
        try:
            params.template = strings.limit(params.template, 2000)
            params.format = None
            entry = {"value": _deep_json_to_string(params, 3)}
            self.queue.add(entry, timeout=3 * 60)
        except Exception as cause:
            # THE LOGGER CAN NOT LOG ITS OWN FAILURE: USE stdout
            sys.stdout.write(text_type(Except.wrap(cause)))
        return self

    def _insert_loop(self, please_stop=None):
        """
        WORKER: MOVE MESSAGES FROM THE QUEUE TO THE INDEX UNTIL TOLD TO STOP

        :param please_stop: SIGNAL TO STOP; ALSO TRIGGERED HERE WHEN
               THREAD_STOP IS FOUND IN THE QUEUE
        """
        failures = 0
        while not please_stop:
            try:
                pending = wrap(self.queue.pop_all())
                if not pending:
                    Till(seconds=PAUSE_AFTER_GOOD_INSERT).wait()
                    continue

                for _, batch in jx.groupby(pending, size=self.batch_size):
                    scrubbed = []
                    for message in batch:
                        if message is THREAD_STOP:
                            # KEEP GOING, SO THE REST OF THE BATCH IS STILL SENT
                            please_stop.go()
                            continue
                        try:
                            chain = flatten_causal_chain(message.value)
                            values = [_deep_json_to_string(m, depth=3) for m in chain]
                            scrubbed.append({"value": values})
                        except Exception as cause:
                            Log.warning("Problem adding to scrubbed list", cause=cause)

                    self.es.extend(scrubbed)
                    failures = 0
            except Exception as cause:
                Log.warning("Problem inserting logs into ES", cause=cause)
                failures += 1
                if failures > MAX_BAD_COUNT:
                    Log.warning(
                        "Given up trying to write debug logs to ES index {{index}}",
                        index=self.es.settings.index,
                    )
                Till(seconds=PAUSE_AFTER_BAD_INSERT).wait()

        self.es.flush()

        # CONTINUE TO DRAIN THIS QUEUE
        # NOTE(review): THE LOOP ABOVE ONLY ENDS WHEN please_stop IS SET, SO THIS
        # LOOP LOOKS LIKE IT CAN NOT RUN - CONFIRM
        while not please_stop:
            try:
                Till(seconds=PAUSE_AFTER_GOOD_INSERT).wait()
                self.queue.pop_all()
            except Exception as cause:
                Log.warning("Should not happen", cause=cause)

    def stop(self):
        """
        TELL THE WORKER TO STOP, CLOSE THE QUEUE, AND WAIT FOR THE WORKER
        """
        with suppress_exception:
            # BE PATIENT, LET REST OF MESSAGE BE SENT
            self.queue.add(THREAD_STOP)

        with suppress_exception:
            self.queue.close()

        self.worker.join()