def main():
    """
    START THE ETL: READ SETTINGS, START settings.threads WORKER THREADS RUNNING
    copy2es AGAINST ONE SHARED WORK QUEUE, THEN BLOCK UNTIL A SHUTDOWN SIGNAL

    WHEN --id IS GIVEN, THE WORK QUEUE IS A LOCAL Queue FILLED FROM
    parse_id_argument(); OTHERWISE IT IS THE aws.Queue DESCRIBED BY
    settings.work_queue

    ANY EXCEPTION IS REPORTED THROUGH Log.error("Problem with etl", cause=e)
    """
    try:
        settings = startup.read_settings(defs=[{
            "name": ["--id"],
            "help": "id (prefix, really) to process",
            "type": str,
            "dest": "id",
            "required": False
        }])
        constants.set(settings.constants)
        Log.start(settings.debug)

        queries.config.default = {
            "type": "elasticsearch",
            "settings": settings.elasticsearch.copy()
        }

        if settings.args.id:
            # EXPLICIT IDS: PROCESS ONLY WHAT WAS ASKED FOR, FROM A LOCAL QUEUE
            work_queue = Queue("local work queue")
            work_queue.extend(parse_id_argument(settings.args.id))
        else:
            work_queue = aws.Queue(settings=settings.work_queue)

        Log.note(
            "Listen to queue {{queue}}, and read off of {{s3}}",
            queue=settings.work_queue.name,
            s3=settings.source.bucket
        )

        es = MultiDayIndex(settings.elasticsearch, queue_size=100000)

        # ALL WORKERS SHARE THE SAME QUEUE, DESTINATION AND STOP SIGNAL
        threads = []
        please_stop = Signal()
        for _ in range(settings.threads):
            p = Thread.run("copy to es", copy2es, es, settings, work_queue, please_stop=please_stop)
            threads.append(p)

        def monitor_progress(please_stop):
            # LOG THE QUEUE LENGTH EVERY 10 SECONDS UNTIL TOLD TO STOP
            while not please_stop:
                Log.note("Remaining: {{num}}", num=len(work_queue))
                Thread.sleep(seconds=10)

        Thread.run(name="monitor progress", target=monitor_progress, please_stop=please_stop)

        aws.capture_termination_signal(please_stop)
        Thread.wait_for_shutdown_signal(please_stop=please_stop, allow_exit=True)
        please_stop.go()
        Log.note("Shutdown started")
    except Exception as e:
        # PASS THE EXCEPTION AS cause=, LIKE THE OTHER Log.error/Log.warning CALLS IN THIS FILE
        Log.error("Problem with etl", cause=e)
def find_changeset(self, revision, please_stop=False):
    """
    PREPARE A SEARCH OF self.branches FOR THE BRANCHES THAT KNOW revision

    :param revision: APPENDED TO EACH BRANCH'S "json-info?node=" URL
    :param please_stop: NOT READ BY THE VISIBLE BODY (THE INNER _find HAS ITS
                        OWN please_stop PARAMETER)

    NOTE(review): THE VISIBLE BODY ONLY DEFINES _find; IT NEVER STARTS IT,
    NEVER READS problems, AND NEVER RETURNS output. THE REST OF THE FUNCTION
    MAY BE OUTSIDE THIS VIEW - CONFIRM AGAINST THE FULL FILE
    """
    locker = Lock()
    output = []
    # WORK QUEUE OF BRANCHES, TERMINATED WITH Thread.STOP
    queue = Queue("branches", max=2000)
    queue.extend(self.branches)
    queue.add(Thread.STOP)

    # EXCEPTIONS RAISED WHILE PROBING BRANCHES ARE COLLECTED HERE, NOT RAISED
    problems = []

    def _find(please_stop):
        # WORKER: PULL BRANCHES OFF THE QUEUE AND PROBE EACH ONE FOR revision
        for b in queue:
            if please_stop:
                return
            try:
                url = b.url + "json-info?node=" + revision
                response = http.get(url, timeout=30)
                # ONLY A 200 RESPONSE COUNTS AS "FOUND"
                if response.status_code == 200:
                    # output IS GUARDED BY locker - PRESUMABLY BECAUSE SEVERAL _find
                    # WORKERS ARE MEANT TO RUN AT ONCE; TODO CONFIRM
                    with locker:
                        output.append(b)
                    Log.note("{{revision}} found at {{url}}", url=url, revision=revision)
            except Exception, f:
                problems.append(f)
class FromESMetadata(Schema):
    """
    QUERY THE METADATA

    SINGLETON (SEE __new__) HOLDING TWO IN-MEMORY CONTAINERS, self.meta.tables
    AND self.meta.columns. COLUMN RECORDS ARE BUILT FROM THE ELASTICSEARCH
    MAPPINGS (SEE _parse_properties); EVERY NEW OR CHANGED COLUMN IS PUT ON
    self.todo, PRESUMABLY FOR THE "refresh metadata" THREAD TO PROCESS

    NOTE(review): self.monitor AND self.not_monitor ARE REFERENCED BY __init__
    BUT ARE NOT DEFINED IN THIS VIEW OF THE CLASS
    """

    def __new__(cls, *args, **kwargs):
        """
        RETURN THE ONE SHARED INSTANCE, CREATING IT ON FIRST USE

        RELIES ON A MODULE-LEVEL singlton VARIABLE (NOT VISIBLE HERE) BEING FALSY
        UNTIL THE FIRST INSTANCE IS MADE
        """
        global singlton
        if singlton:
            return singlton
        else:
            singlton = object.__new__(cls)
            return singlton

    @use_settings
    def __init__(self,
                 host,
                 index,
                 alias=None,
                 name=None,
                 port=9200,
                 settings=None):
        """
        CONNECT TO THE CLUSTER, BUILD THE EMPTY METADATA CONTAINERS AND START THE
        BACKGROUND "refresh metadata" THREAD

        :param host: NOT READ DIRECTLY HERE - PRESUMABLY FOLDED INTO settings BY @use_settings; TODO CONFIRM
        :param index: LAST CHOICE FOR self.default_name
        :param alias: SECOND CHOICE FOR self.default_name
        :param name: FIRST CHOICE FOR self.default_name
        :param port: NOT READ DIRECTLY HERE (SEE host)
        :param settings: PASSED TO _elasticsearch.Cluster AND KEPT AS self.settings
        """
        global _elasticsearch
        # __new__ ALWAYS HANDS BACK THE SAME OBJECT, SO __init__ RUNS ON EVERY
        # CONSTRUCTION; ONLY THE FIRST RUN MAY INITIALIZE
        if hasattr(self, "settings"):
            return

        # IMPORTED HERE, NOT AT MODULE LEVEL - PRESUMABLY TO AVOID A CIRCULAR IMPORT; TODO CONFIRM
        from pyLibrary.queries.containers.lists import ListContainer
        from pyLibrary.env import elasticsearch as _elasticsearch

        self.settings = settings
        self.default_name = coalesce(name, alias, index)
        self.default_es = _elasticsearch.Cluster(settings=settings)
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.meta = Dict()
        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.meta.tables = ListContainer(
            "meta.tables", [], wrap({c.name: c
                                     for c in table_columns}))
        self.meta.columns = ListContainer(
            "meta.columns", [], wrap({c.name: c
                                      for c in column_columns}))
        # meta.columns STARTS OUT HOLDING THE COLUMN RECORDS RETURNED BY
        # metadata_columns() AND metadata_tables()
        self.meta.columns.insert(column_columns)
        self.meta.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return

    @property
    def query_path(self):
        """ALWAYS None"""
        return None

    @property
    def url(self):
        """CLUSTER PATH + "/" + default_name, WITH EVERY "." IN THE NAME TURNED INTO "/" """
        return self.default_es.path + "/" + self.default_name.replace(".", "/")

    def get_table(self, table_name):
        """
        QUERY meta.tables (UNDER ITS LOCKER) FOR ROWS WITH name == table_name

        :param table_name: VALUE TO MATCH AGAINST THE name FIELD
        :return: WHATEVER self.meta.tables.query() RETURNS FOR THE "eq" FILTER
        """
        with self.meta.tables.locker:
            return self.meta.tables.query(
                {"where": {
                    "eq": {
                        "name": table_name
                    }
                }})

    def _upsert_column(self, c):
        """
        ADD c TO meta.columns, OR COPY ITS FIELDS ONTO THE EXISTING RECORD WITH
        THE SAME (table, name); EITHER WAY QUEUE THE RECORD ON self.todo

        :param c: COLUMN RECORD TO INSERT OR MERGE
        """
        # ASSUMING THE self.meta.columns.locker IS HAD
        existing_columns = [
            r for r in self.meta.columns.data
            if r.table == c.table and r.name == c.name
        ]
        if not existing_columns:
            self.meta.columns.add(c)
            Log.note("todo: {{table}}.{{column}}",
                     table=c.table,
                     column=c.es_column)
            self.todo.add(c)

            # MARK meta.columns AS DIRTY TOO
            cols = [
                r for r in self.meta.columns.data if r.table == "meta.columns"
            ]
            for cc in cols:
                cc.partitions = cc.cardinality = None
                cc.last_updated = Date.now()
            self.todo.extend(cols)
        else:
            # ONLY THE FIRST MATCH IS UPDATED
            canonical = existing_columns[0]
            if canonical.relative and not c.relative:
                return  # RELATIVE COLUMNS WILL SHADOW ABSOLUTE COLUMNS

            # OVERWRITE EVERY SLOT OF THE STORED RECORD WITH THE NEW VALUES
            for key in Column.__slots__:
                canonical[key] = c[key]
            Log.note("todo: {{table}}.{{column}}",
                     table=canonical.table,
                     column=canonical.es_column)
            self.todo.add(canonical)

    def _get_columns(self, table=None, metadata=None):
        """
        PARSE ES MAPPINGS INTO meta.columns

        :param table: WHEN GIVEN, PARSE ONLY THE FIRST INDEX MATCHING table, IN THE
                      CALLING THREAD. WHEN MISSING, PARSE EVERYTHING IN A NEW
                      "parse properties" THREAD (KEPT AS self.parser)
        :param metadata: CLUSTER METADATA TO USE; FETCHED WITH force=True WHEN NOT GIVEN
        """
        # TODO: HANDLE MORE THEN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
        if not metadata:
            metadata = self.default_es.get_metadata(force=True)

        def parse_all(please_stop):
            # PARSE EVERY INDEX WHOSE KEY EQUALS ITS OWN meta.index; OTHER ENTRIES ARE SKIPPED
            for abs_index, meta in jx.sort(metadata.indices.items(), {
                    "value": 0,
                    "sort": -1
            }):
                if meta.index != abs_index:
                    continue

                for _, properties in meta.mappings.items():
                    if please_stop:
                        return
                    self._parse_properties(abs_index, properties, meta)

        if table:
            # INDEXES ARE VISITED IN DESCENDING ORDER OF THEIR KEY (ITEM 0 OF EACH PAIR)
            for abs_index, meta in jx.sort(metadata.indices.items(), {
                    "value": 0,
                    "sort": -1
            }):
                if table == meta.index:
                    for _, properties in meta.mappings.items():
                        self._parse_properties(abs_index, properties, meta)
                    return
                if table == abs_index:
                    # table NAMES THE KEY, NOT meta.index: RETRY USING meta.index,
                    # REUSING THE METADATA ALREADY FETCHED
                    self._get_columns(table=meta.index, metadata=metadata)
                    return
        else:
            self.parser = Thread.run("parse properties", parse_all)

    def _parse_properties(self, abs_index, properties, meta):
        """
        TURN ONE MAPPING INTO COLUMN RECORDS AND UPSERT THEM: ONCE PER NESTED
        QUERY PATH (PLUS THE EMPTY, UN-NESTED PATH), FOR THE INDEX AND FOR EACH
        OF ITS ALIASES, IN BOTH ABSOLUTE AND RELATIVE NAMING

        :param abs_index: INDEX NAME HANDED TO _elasticsearch.parse_properties
        :param properties: ONE MAPPING; ITS .properties IS PARSED
        :param meta: INDEX METADATA; meta.aliases IS READ
        """
        abs_columns = _elasticsearch.parse_properties(abs_index, None,
                                                      properties.properties)
        abs_columns = abs_columns.filter(  # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED
            lambda r: not r.es_column.startswith("other.") and not r.es_column.
            startswith("previous_values.cf_") and not r.es_index.startswith(
                "debug"))
        with Timer("upserting {{num}} columns", {"num": len(abs_columns)},
                   debug=DEBUG):

            def add_column(c, query_path):
                # STAMP c, NAME ITS TABLE AFTER THE INDEX (PLUS THE LAST ELEMENT OF
                # query_path, IF ANY) AND UPSERT IT; THEN UPSERT A COPY PER ALIAS
                c.last_updated = Date.now()
                if query_path:
                    c.table = c.es_index + "." + query_path.last()
                else:
                    c.table = c.es_index

                with self.meta.columns.locker:
                    self._upsert_column(c)
                    for alias in meta.aliases:
                        # COPY, SO THE RECORD ALREADY STORED KEEPS ITS OWN table
                        c = copy(c)
                        if query_path:
                            c.table = alias + "." + query_path.last()
                        else:
                            c.table = alias
                        self._upsert_column(c)

            # EACH query_path IS A LIST OF EVER-INCREASING PATHS THROUGH EACH NESTED LEVEL
            query_paths = wrap([[c.es_column] for c in abs_columns
                                if c.type == "nested"])
            # FOR EVERY ORDERED PAIR (a, b) WHERE b's LAST PATH STARTS WITH a's LAST
            # PATH, PUT a's PATH AT THE FRONT OF b UNLESS b ALREADY HOLDS IT
            for a, b in itertools.product(query_paths, query_paths):
                aa = a.last()
                bb = b.last()
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) < len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(0, aa)
                        break
            # THE EMPTY PATH: add_column() THEN USES THE BARE INDEX/ALIAS AS TABLE NAME
            query_paths.append([])

            for c in abs_columns:
                # ADD RELATIVE COLUMNS
                full_path = listwrap(c.nested_path)
                abs_depth = len(full_path)
                abs_parent = coalesce(full_path.last(), "")
                for query_path in query_paths:
                    rel_depth = len(query_path)

                    # ABSOLUTE
                    add_column(copy(c), query_path)
                    cc = copy(c)
                    cc.relative = True

                    if not query_path:
                        # UN-NESTED TABLE: RELATIVE RECORD KEEPS THE NAME IT WAS COPIED WITH
                        add_column(cc, query_path)
                        continue

                    rel_parent = query_path.last()

                    if c.es_column.startswith(rel_parent + "."):
                        # COLUMN IS UNDER THE QUERY PATH: DROP THE "<rel_parent>." PREFIX
                        cc.name = c.es_column[len(rel_parent) + 1:]
                        add_column(cc, query_path)
                    elif c.es_column == rel_parent:
                        # COLUMN IS THE QUERY PATH ITSELF
                        cc.name = "."
                        add_column(cc, query_path)
                    elif not abs_parent:
                        # THIS RELATIVE NAME (..o) ALSO NEEDS A RELATIVE NAME (o)
                        # AND THEN REMOVE THE SHADOWED
                        cc.name = "." + ("." *
                                         (rel_depth - abs_depth)) + c.es_column
                        add_column(cc, query_path)
                    elif rel_parent.startswith(abs_parent + "."):
                        # COLUMN's NESTED PARENT ENCLOSES THE QUERY PATH: PREFIX THE
                        # NAME WITH ONE DOT PLUS ONE PER LEVEL OF DEPTH DIFFERENCE
                        cc.name = "." + ("." *
                                         (rel_depth - abs_depth)) + c.es_column
                        add_column(cc, query_path)
                    elif rel_parent != abs_parent:
                        # SIBLING NESTED PATHS ARE INVISIBLE
                        pass
                    else:
                        Log.error("logic error")

    def query(self, _query):
        """
        RUN _query AGAINST meta.columns

        "from" AND "sort" ARE GIVEN FIRST TO set_default, WITH _query.as_dict()
        AS THE SECOND ARGUMENT (PRECEDENCE IS DECIDED BY set_default - NOT VISIBLE HERE)
        """
        return self.meta.columns.query(
            QueryOp(
                set_default(
                    {
                        "from": self.meta.columns,
                        "sort": ["table", "name"]
                    }, _query.as_dict())))

    def get_columns(self,
                    table_name,
                    column_name=None,
                    fail_when_not_found=False):
        """
        RETURN METADATA COLUMNS

        :param table_name: TABLE TO MATCH EXACTLY
        :param column_name: OPTIONAL COLUMN NAME TO MATCH EXACTLY
        :param fail_when_not_found: WHEN True, WAIT FOR PENDING UPDATES AND CALL
               Log.error IF NOTHING MATCHES. WHEN False, COLUMNS ARE ONLY RETURNED
               IF ALL HAVE last_updated; OTHERWISE THE MAPPINGS ARE RELOADED AND
               THE LOOKUP IS RETRIED ONCE WITH fail_when_not_found=True
        :return: MATCHING COLUMNS SORTED BY name
        """
        try:
            with self.meta.columns.locker:
                columns = [
                    c for c in self.meta.columns.data
                    if c.table == table_name and (
                        column_name is None or c.name == column_name)
                ]
            if columns:
                columns = jx.sort(columns, "name")
                if fail_when_not_found:
                    # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                    while len(self.todo) and not all(
                            columns.get("last_updated")):
                        Log.note(
                            "waiting for columns to update {{columns|json}}",
                            columns=[
                                c.table + "." + c.es_column for c in columns
                                if not c.last_updated
                            ])
                        Thread.sleep(seconds=1)
                    return columns
                elif all(columns.get("last_updated")):
                    return columns
        except Exception, e:
            Log.error("Not expected", cause=e)

        if fail_when_not_found:
            # Log.error PRESUMABLY RAISES, SO NOTHING BELOW RUNS ON THIS BRANCH - TODO CONFIRM
            if column_name:
                Log.error("no columns matching {{table}}.{{column}}",
                          table=table_name,
                          column=column_name)
            else:
                self._get_columns(table=table_name)
                Log.error("no columns for {{table}}", table=table_name)

        # LOAD COLUMNS FOR THE FIRST ELEMENT OF split_field(table_name), THEN RETRY STRICTLY
        self._get_columns(table=join_field(split_field(table_name)[0:1]))
        return self.get_columns(table_name=table_name,
                                column_name=column_name,
                                fail_when_not_found=True)
class FromESMetadata(Schema):
    """
    QUERY THE METADATA

    SINGLETON (SEE __new__) HOLDING TWO IN-MEMORY CONTAINERS, self.meta.tables
    AND self.meta.columns. COLUMN RECORDS ARE BUILT FROM THE ELASTICSEARCH
    MAPPINGS (SEE _parse_properties); EVERY NEW OR CHANGED COLUMN IS PUT ON
    self.todo, PRESUMABLY FOR THE "refresh metadata" THREAD TO PROCESS

    NOTE(review): self.monitor AND self.not_monitor ARE REFERENCED BY __init__
    BUT ARE NOT DEFINED IN THIS VIEW OF THE CLASS
    """

    def __new__(cls, *args, **kwargs):
        """
        RETURN THE ONE SHARED INSTANCE, CREATING IT ON FIRST USE

        RELIES ON A MODULE-LEVEL singlton VARIABLE (NOT VISIBLE HERE) BEING FALSY
        UNTIL THE FIRST INSTANCE IS MADE
        """
        global singlton
        if singlton:
            return singlton
        else:
            singlton = object.__new__(cls)
            return singlton

    @use_settings
    def __init__(self, host, index, alias=None, name=None, port=9200, settings=None):
        """
        CONNECT TO THE CLUSTER, BUILD THE EMPTY METADATA CONTAINERS AND START THE
        BACKGROUND "refresh metadata" THREAD

        :param host: NOT READ DIRECTLY HERE - PRESUMABLY FOLDED INTO settings BY @use_settings; TODO CONFIRM
        :param index: LAST CHOICE FOR self.default_name
        :param alias: SECOND CHOICE FOR self.default_name
        :param name: FIRST CHOICE FOR self.default_name
        :param port: NOT READ DIRECTLY HERE (SEE host)
        :param settings: PASSED TO _elasticsearch.Cluster AND KEPT AS self.settings
        """
        global _elasticsearch
        # __new__ ALWAYS HANDS BACK THE SAME OBJECT, SO __init__ RUNS ON EVERY
        # CONSTRUCTION; ONLY THE FIRST RUN MAY INITIALIZE
        if hasattr(self, "settings"):
            return

        # IMPORTED HERE, NOT AT MODULE LEVEL - PRESUMABLY TO AVOID A CIRCULAR IMPORT; TODO CONFIRM
        from pyLibrary.queries.containers.lists import ListContainer
        from pyLibrary.env import elasticsearch as _elasticsearch

        self.settings = settings
        self.default_name = coalesce(name, alias, index)
        self.default_es = _elasticsearch.Cluster(settings=settings)
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.meta=Dict()
        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.meta.tables = ListContainer("meta.tables", [], wrap({c.name: c for c in table_columns}))
        self.meta.columns = ListContainer("meta.columns", [], wrap({c.name: c for c in column_columns}))
        # meta.columns STARTS OUT HOLDING THE COLUMN RECORDS RETURNED BY
        # metadata_columns() AND metadata_tables()
        self.meta.columns.insert(column_columns)
        self.meta.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return

    @property
    def query_path(self):
        """ALWAYS None"""
        return None

    @property
    def url(self):
        """CLUSTER PATH + "/" + default_name, WITH EVERY "." IN THE NAME TURNED INTO "/" """
        return self.default_es.path + "/" + self.default_name.replace(".", "/")

    def get_table(self, table_name):
        """
        QUERY meta.tables (UNDER ITS LOCKER) FOR ROWS WITH name == table_name

        :param table_name: VALUE TO MATCH AGAINST THE name FIELD
        :return: WHATEVER self.meta.tables.query() RETURNS FOR THE "eq" FILTER
        """
        with self.meta.tables.locker:
            return self.meta.tables.query({"where": {"eq": {"name": table_name}}})

    def _upsert_column(self, c):
        """
        ADD c TO meta.columns, OR COPY ITS FIELDS ONTO THE EXISTING RECORD WITH
        THE SAME (table, name); EITHER WAY QUEUE THE RECORD ON self.todo

        :param c: COLUMN RECORD TO INSERT OR MERGE
        """
        # ASSUMING THE self.meta.columns.locker IS HAD
        existing_columns = [r for r in self.meta.columns.data if r.table == c.table and r.name == c.name]
        if not existing_columns:
            self.meta.columns.add(c)
            Log.note("todo: {{table}}.{{column}}", table=c.table, column=c.es_column)
            self.todo.add(c)

            # MARK meta.columns AS DIRTY TOO
            cols = [r for r in self.meta.columns.data if r.table == "meta.columns"]
            for cc in cols:
                cc.partitions = cc.cardinality = None
                cc.last_updated = Date.now()
            self.todo.extend(cols)
        else:
            # ONLY THE FIRST MATCH IS UPDATED
            canonical = existing_columns[0]
            if canonical.relative and not c.relative:
                return  # RELATIVE COLUMNS WILL SHADOW ABSOLUTE COLUMNS

            # OVERWRITE EVERY SLOT OF THE STORED RECORD WITH THE NEW VALUES
            for key in Column.__slots__:
                canonical[key] = c[key]
            Log.note("todo: {{table}}.{{column}}", table=canonical.table, column=canonical.es_column)
            self.todo.add(canonical)

    def _get_columns(self, table=None, metadata=None):
        """
        PARSE ES MAPPINGS INTO meta.columns

        :param table: WHEN GIVEN, PARSE ONLY THE FIRST INDEX MATCHING table, IN THE
                      CALLING THREAD. WHEN MISSING, PARSE EVERYTHING IN A NEW
                      "parse properties" THREAD (KEPT AS self.parser)
        :param metadata: CLUSTER METADATA TO USE; FETCHED WITH force=True WHEN NOT GIVEN
        """
        # TODO: HANDLE MORE THEN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
        if not metadata:
            metadata = self.default_es.get_metadata(force=True)

        def parse_all(please_stop):
            # PARSE EVERY INDEX WHOSE KEY EQUALS ITS OWN meta.index; OTHER ENTRIES ARE SKIPPED
            for abs_index, meta in jx.sort(metadata.indices.items(), {"value": 0, "sort": -1}):
                if meta.index != abs_index:
                    continue

                for _, properties in meta.mappings.items():
                    if please_stop:
                        return
                    self._parse_properties(abs_index, properties, meta)

        if table:
            # INDEXES ARE VISITED IN DESCENDING ORDER OF THEIR KEY (ITEM 0 OF EACH PAIR)
            for abs_index, meta in jx.sort(metadata.indices.items(), {"value": 0, "sort": -1}):
                if table == meta.index:
                    for _, properties in meta.mappings.items():
                        self._parse_properties(abs_index, properties, meta)
                    return
                if table == abs_index:
                    # table NAMES THE KEY, NOT meta.index: RETRY USING meta.index,
                    # REUSING THE METADATA ALREADY FETCHED
                    self._get_columns(table=meta.index, metadata=metadata)
                    return
        else:
            self.parser = Thread.run("parse properties", parse_all)

    def _parse_properties(self, abs_index, properties, meta):
        """
        TURN ONE MAPPING INTO COLUMN RECORDS AND UPSERT THEM: ONCE PER NESTED
        QUERY PATH (PLUS THE EMPTY, UN-NESTED PATH), FOR THE INDEX AND FOR EACH
        OF ITS ALIASES, IN BOTH ABSOLUTE AND RELATIVE NAMING

        :param abs_index: INDEX NAME HANDED TO _elasticsearch.parse_properties
        :param properties: ONE MAPPING; ITS .properties IS PARSED
        :param meta: INDEX METADATA; meta.aliases IS READ
        """
        abs_columns = _elasticsearch.parse_properties(abs_index, None, properties.properties)
        abs_columns = abs_columns.filter(  # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED
            lambda r: not r.es_column.startswith("other.") and
                      not r.es_column.startswith("previous_values.cf_") and
                      not r.es_index.startswith("debug")
        )
        with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, debug=DEBUG):
            def add_column(c, query_path):
                # STAMP c, NAME ITS TABLE AFTER THE INDEX (PLUS THE LAST ELEMENT OF
                # query_path, IF ANY) AND UPSERT IT; THEN UPSERT A COPY PER ALIAS
                c.last_updated = Date.now()
                if query_path:
                    c.table = c.es_index + "." + query_path.last()
                else:
                    c.table = c.es_index

                with self.meta.columns.locker:
                    self._upsert_column(c)
                    for alias in meta.aliases:
                        # COPY, SO THE RECORD ALREADY STORED KEEPS ITS OWN table
                        c = copy(c)
                        if query_path:
                            c.table = alias + "." + query_path.last()
                        else:
                            c.table = alias
                        self._upsert_column(c)

            # EACH query_path IS A LIST OF EVER-INCREASING PATHS THROUGH EACH NESTED LEVEL
            query_paths = wrap([[c.es_column] for c in abs_columns if c.type == "nested"])
            # FOR EVERY ORDERED PAIR (a, b) WHERE b's LAST PATH STARTS WITH a's LAST
            # PATH, PUT a's PATH AT THE FRONT OF b UNLESS b ALREADY HOLDS IT
            for a, b in itertools.product(query_paths, query_paths):
                aa = a.last()
                bb = b.last()
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) < len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(0, aa)
                        break
            # THE EMPTY PATH: add_column() THEN USES THE BARE INDEX/ALIAS AS TABLE NAME
            query_paths.append([])

            for c in abs_columns:
                # ADD RELATIVE COLUMNS
                full_path = listwrap(c.nested_path)
                abs_depth = len(full_path)
                abs_parent = coalesce(full_path.last(), "")
                for query_path in query_paths:
                    rel_depth = len(query_path)

                    # ABSOLUTE
                    add_column(copy(c), query_path)
                    cc = copy(c)
                    cc.relative = True

                    if not query_path:
                        # UN-NESTED TABLE: RELATIVE RECORD KEEPS THE NAME IT WAS COPIED WITH
                        add_column(cc, query_path)
                        continue

                    rel_parent = query_path.last()

                    if c.es_column.startswith(rel_parent+"."):
                        # COLUMN IS UNDER THE QUERY PATH: DROP THE "<rel_parent>." PREFIX
                        cc.name = c.es_column[len(rel_parent)+1:]
                        add_column(cc, query_path)
                    elif c.es_column == rel_parent:
                        # COLUMN IS THE QUERY PATH ITSELF
                        cc.name = "."
                        add_column(cc, query_path)
                    elif not abs_parent:
                        # THIS RELATIVE NAME (..o) ALSO NEEDS A RELATIVE NAME (o)
                        # AND THEN REMOVE THE SHADOWED
                        cc.name = "." + ("." * (rel_depth - abs_depth)) + c.es_column
                        add_column(cc, query_path)
                    elif rel_parent.startswith(abs_parent+"."):
                        # COLUMN's NESTED PARENT ENCLOSES THE QUERY PATH: PREFIX THE
                        # NAME WITH ONE DOT PLUS ONE PER LEVEL OF DEPTH DIFFERENCE
                        cc.name = "." + ("." * (rel_depth - abs_depth)) + c.es_column
                        add_column(cc, query_path)
                    elif rel_parent != abs_parent:
                        # SIBLING NESTED PATHS ARE INVISIBLE
                        pass
                    else:
                        Log.error("logic error")

    def query(self, _query):
        """
        RUN _query AGAINST meta.columns

        "from" AND "sort" ARE GIVEN FIRST TO set_default, WITH _query.as_dict()
        AS THE SECOND ARGUMENT (PRECEDENCE IS DECIDED BY set_default - NOT VISIBLE HERE)
        """
        return self.meta.columns.query(QueryOp(set_default(
            {
                "from": self.meta.columns,
                "sort": ["table", "name"]
            },
            _query.as_dict()
        )))

    def get_columns(self, table_name, column_name=None, fail_when_not_found=False):
        """
        RETURN METADATA COLUMNS

        :param table_name: TABLE TO MATCH EXACTLY
        :param column_name: OPTIONAL COLUMN NAME TO MATCH EXACTLY
        :param fail_when_not_found: WHEN True, WAIT FOR PENDING UPDATES AND CALL
               Log.error IF NOTHING MATCHES. WHEN False, COLUMNS ARE ONLY RETURNED
               IF ALL HAVE last_updated; OTHERWISE THE MAPPINGS ARE RELOADED AND
               THE LOOKUP IS RETRIED ONCE WITH fail_when_not_found=True
        :return: MATCHING COLUMNS SORTED BY name
        """
        try:
            with self.meta.columns.locker:
                columns = [c for c in self.meta.columns.data if c.table == table_name and (column_name is None or c.name==column_name)]
            if columns:
                columns = jx.sort(columns, "name")
                if fail_when_not_found:
                    # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                    while len(self.todo) and not all(columns.get("last_updated")):
                        Log.note("waiting for columns to update {{columns|json}}", columns=[c.table+"."+c.es_column for c in columns if not c.last_updated])
                        Thread.sleep(seconds=1)
                    return columns
                elif all(columns.get("last_updated")):
                    return columns
        except Exception, e:
            Log.error("Not expected", cause=e)

        if fail_when_not_found:
            # Log.error PRESUMABLY RAISES, SO NOTHING BELOW RUNS ON THIS BRANCH - TODO CONFIRM
            if column_name:
                Log.error("no columns matching {{table}}.{{column}}", table=table_name, column=column_name)
            else:
                self._get_columns(table=table_name)
                Log.error("no columns for {{table}}", table=table_name)

        # LOAD COLUMNS FOR THE FIRST ELEMENT OF split_field(table_name), THEN RETRY STRICTLY
        self._get_columns(table=join_field(split_field(table_name)[0:1]))
        return self.get_columns(table_name=table_name, column_name=column_name, fail_when_not_found=True)
def loop(source, coverage_summary_index, settings, please_stop):
    """
    FOR EACH INDEX BEHIND THE source.index ALIAS (INDEX NAMES SORTED DESCENDING),
    KEEP ASKING settings.url FOR COVERAGE RECORDS THAT STILL NEED WORK AND HAND
    EACH BATCH TO NUM_THREAD process_batch THREADS, UNTIL NOTHING IS RETURNED
    OR please_stop IS SET

    :param source: ES SETTINGS; source.index IS THE ALIAS TO EXPAND
    :param coverage_summary_index: PASSED THROUGH TO process_batch
    :param settings: READS settings.url AND settings.batch_size (DEFAULT 100)
    :param please_stop: CHECKED BEFORE EACH BATCH; SET BY THIS FUNCTION ONCE ALL
                        INDEXES ARE DONE. NOT SET WHEN AN EXCEPTION IS CAUGHT
    """
    try:
        es_cluster = elasticsearch.Cluster(source)
        alias_pairs = es_cluster.get_aliases()
        candidates = jx.sort(
            [pairs.index for pairs in alias_pairs if pairs.alias == source.index],
            {".": "desc"}
        )

        for index_name in candidates:
            coverage_index = elasticsearch.Index(index=index_name, read_only=False, settings=source)
            # THE LAST 15 CHARACTERS OF THE INDEX NAME ARE PARSED AS A DATE
            index_suffix = coverage_index.settings.index[-15::]
            push_date_filter = unicode2Date(index_suffix, elasticsearch.INDEX_DATE_FORMAT)

            while not please_stop:
                # FIND THE NEXT BATCH OF WORK
                Log.note("Working on index {{index}}", index=index_name)
                coverage_index.refresh()

                pending_filter = {"and": [
                    {"missing": "source.method.name"},
                    {"missing": "source.file.min_line_siblings"},
                    {"gte": {"repo.push.date": push_date_filter}}
                ]}
                work_query = {
                    "from": "coverage",
                    "groupby": ["source.file.name", "build.revision12"],
                    "where": pending_filter,
                    "format": "list",
                    "limit": coalesce(settings.batch_size, 100)
                }
                todo = http.post_json(settings.url, json=work_query)

                if not todo.data:
                    # NOTHING LEFT FOR THIS INDEX
                    break

                queue = Queue("pending source files to review")
                queue.extend(todo.data[0:coalesce(settings.batch_size, 100):])

                # START THE WORKERS, ALL FEEDING FROM THE SAME QUEUE
                workers = []
                for i in range(NUM_THREAD):
                    worker = Thread.run(
                        "processor" + unicode(i),
                        process_batch,
                        queue,
                        coverage_index,
                        coverage_summary_index,
                        settings,
                        please_stop=please_stop
                    )
                    workers.append(worker)

                # STOP MARKER GOES ON AFTER THE WORK
                queue.add(Thread.STOP)

                # BLOCK UNTIL EVERY WORKER IS DONE
                for worker in workers:
                    worker.join()

        please_stop.go()
        return
    except Exception as e:
        Log.warning("Problem processing", cause=e)
class FromESMetadata(Schema):
    """
    QUERY THE METADATA

    SINGLETON (SEE __new__) HOLDING self.meta.tables (A ListContainer) AND
    self.meta.columns (A ColumnList). COLUMN RECORDS ARE BUILT FROM THE
    ELASTICSEARCH MAPPINGS (SEE _parse_properties); EVERY NEW OR CHANGED COLUMN
    IS PUT ON self.todo, PRESUMABLY FOR THE "refresh metadata" THREAD TO PROCESS

    NOTE(review): self.monitor AND self.not_monitor ARE REFERENCED BY __init__
    BUT ARE NOT DEFINED IN THIS VIEW OF THE CLASS
    """

    def __new__(cls, *args, **kwargs):
        """
        RETURN THE ONE SHARED INSTANCE, CREATING IT ON FIRST USE

        RELIES ON A MODULE-LEVEL singlton VARIABLE (NOT VISIBLE HERE) BEING FALSY
        UNTIL THE FIRST INSTANCE IS MADE
        """
        global singlton
        if singlton:
            return singlton
        else:
            singlton = object.__new__(cls)
            return singlton

    @use_settings
    def __init__(self, host, index, alias=None, name=None, port=9200, settings=None):
        """
        CONNECT TO THE CLUSTER, BUILD THE EMPTY METADATA CONTAINERS AND START THE
        BACKGROUND "refresh metadata" THREAD

        :param host: NOT READ DIRECTLY HERE - PRESUMABLY FOLDED INTO settings BY @use_settings; TODO CONFIRM
        :param index: LAST CHOICE FOR self.default_name
        :param alias: SECOND CHOICE FOR self.default_name
        :param name: FIRST CHOICE FOR self.default_name
        :param port: NOT READ DIRECTLY HERE (SEE host)
        :param settings: PASSED TO _elasticsearch.Cluster AND KEPT AS self.settings
        """
        global _elasticsearch
        # __new__ ALWAYS HANDS BACK THE SAME OBJECT, SO __init__ RUNS ON EVERY
        # CONSTRUCTION; ONLY THE FIRST RUN MAY INITIALIZE
        if hasattr(self, "settings"):
            return

        # IMPORTED HERE, NOT AT MODULE LEVEL - PRESUMABLY TO AVOID A CIRCULAR IMPORT; TODO CONFIRM
        from pyLibrary.queries.containers.list_usingPythonList import ListContainer
        from pyLibrary.env import elasticsearch as _elasticsearch

        self.settings = settings
        self.default_name = coalesce(name, alias, index)
        self.default_es = _elasticsearch.Cluster(settings=settings)
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        # CACHED CLUSTER METADATA; THE TIMESTAMP STARTS OUT ALREADY OLD_METADATA IN THE PAST
        self.es_metadata = Null
        self.last_es_metadata = Date.now()-OLD_METADATA

        self.meta=Dict()
        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.meta.tables = ListContainer("meta.tables", [], wrap({c.name: c for c in table_columns}))
        self.meta.columns = ColumnList()
        # meta.columns STARTS OUT HOLDING THE COLUMN RECORDS RETURNED BY
        # metadata_columns() AND metadata_tables()
        self.meta.columns.insert(column_columns)
        self.meta.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return

    @property
    def query_path(self):
        """ALWAYS None"""
        return None

    @property
    def url(self):
        """CLUSTER PATH + "/" + default_name, WITH EVERY "." IN THE NAME TURNED INTO "/" """
        return self.default_es.path + "/" + self.default_name.replace(".", "/")

    def get_table(self, table_name):
        """
        RETURN (WRAPPED) THE LIST OF meta.tables ROWS WHOSE name EQUALS table_name,
        READ WHILE HOLDING THE meta.tables LOCKER

        :param table_name: VALUE TO MATCH AGAINST EACH ROW's name
        """
        with self.meta.tables.locker:
            return wrap([t for t in self.meta.tables.data if t.name == table_name])

    def _upsert_column(self, c):
        """
        ADD c TO meta.columns, OR COPY ITS FIELDS ONTO THE EXISTING RECORD FOUND
        FOR (c.table, c.name); EITHER WAY QUEUE THE RECORD ON self.todo

        :param c: COLUMN RECORD TO INSERT OR MERGE
        """
        # ASSUMING THE self.meta.columns.locker IS HAD
        existing_columns = self.meta.columns.find(c.table, c.name)
        if not existing_columns:
            self.meta.columns.add(c)
            self.todo.add(c)

            if ENABLE_META_SCAN:
                Log.note("todo: {{table}}::{{column}}", table=c.table, column=c.es_column)
                # MARK meta.columns AS DIRTY TOO
                cols = self.meta.columns.find("meta.columns", None)
                for cc in cols:
                    cc.partitions = cc.cardinality = None
                    cc.last_updated = Date.now()
                self.todo.extend(cols)
        else:
            # ONLY THE FIRST MATCH IS UPDATED
            canonical = existing_columns[0]
            if canonical.relative and not c.relative:
                return  # RELATIVE COLUMNS WILL SHADOW ABSOLUTE COLUMNS

            # OVERWRITE EVERY SLOT OF THE STORED RECORD WITH THE NEW VALUES
            for key in Column.__slots__:
                canonical[key] = c[key]
            Log.note("todo: {{table}}::{{column}}", table=canonical.table, column=canonical.es_column)
            self.todo.add(canonical)

    def _get_columns(self, table=None):
        """
        PARSE EVERY MAPPING OF THE INDEX REGISTERED UNDER table INTO meta.columns

        THE CACHED self.es_metadata IS REPLACED (force=True) WHEN IT HAS NO ENTRY
        FOR table OR WHEN self.last_es_metadata IS OLDER THAN OLD_METADATA

        NOTE(review): NOTHING IN THIS VIEW ASSIGNS self.last_es_metadata AFTER
        __init__, WHERE IT STARTS ALREADY EXPIRED; IF IT IS NOT UPDATED ELSEWHERE
        THE METADATA IS RE-FETCHED ON EVERY CALL - CONFIRM

        :param table: KEY INTO self.es_metadata.indices
        """
        # TODO: HANDLE MORE THEN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
        meta = self.es_metadata.indices[table]
        if not meta or self.last_es_metadata < Date.now() - OLD_METADATA:
            self.es_metadata = self.default_es.get_metadata(force=True)
            meta = self.es_metadata.indices[table]

        for _, properties in meta.mappings.items():
            self._parse_properties(meta.index, properties, meta)

    def _parse_properties(self, abs_index, properties, meta):
        """
        TURN ONE MAPPING INTO COLUMN RECORDS AND UPSERT THEM: ONCE PER NESTED
        QUERY PATH (PLUS ROOT_PATH), FOR THE INDEX AND FOR EACH OF ITS ALIASES,
        IN BOTH ABSOLUTE AND RELATIVE NAMING

        :param abs_index: INDEX NAME HANDED TO _elasticsearch.parse_properties
        :param properties: ONE MAPPING; ITS .properties IS PARSED
        :param meta: INDEX METADATA; meta.aliases IS READ
        """
        abs_columns = _elasticsearch.parse_properties(abs_index, None, properties.properties)
        # ALSO DROPS COLUMNS WHOSE NAME CONTAINS "=" OR A SPACE
        abs_columns = abs_columns.filter(  # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED
            lambda r: not r.es_column.startswith("other.") and
                      not r.es_column.startswith("previous_values.cf_") and
                      not r.es_index.startswith("debug") and
                      r.es_column.find("=")==-1 and
                      r.es_column.find(" ")==-1
        )
        with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, debug=DEBUG):
            def add_column(c, query_path):
                # STAMP c, NAME ITS TABLE FROM THE INDEX PLUS THE FIRST ELEMENT OF
                # query_path, AND UPSERT IT; THEN UPSERT A COPY PER ALIAS
                c.last_updated = Date.now()
                c.table = join_field([c.es_index]+split_field(query_path[0]))

                with self.meta.columns.locker:
                    self._upsert_column(c)
                    for alias in meta.aliases:
                        # COPY, SO THE RECORD ALREADY STORED KEEPS ITS OWN table
                        c = copy(c)
                        c.table = join_field([alias]+split_field(query_path[0]))
                        self._upsert_column(c)

            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns if c.type == "nested"]
            # FOR EVERY ORDERED PAIR (a, b) WHERE b's FIRST PATH STARTS WITH a's FIRST
            # PATH, INSERT a's PATH INTO b AHEAD OF THE FIRST ENTRY THAT IS NOT LONGER
            # THAN IT, UNLESS THAT ENTRY IS ALREADY a's PATH
            # NOTE(review): startswith() IS A PLAIN STRING PREFIX TEST, SO "a.bc" WOULD
            # MATCH "a.b" - CONFIRM WHETHER SUCH SIBLING NAMES CAN OCCUR
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            # EVERY NESTED PATH LIST ENDS WITH "."; ROOT_PATH IS ADDED AS ONE MORE QUERY PATH
            for q in query_paths:
                q.append(".")
            query_paths.append(ROOT_PATH)

            # ADD RELATIVE COLUMNS
            for abs_column in abs_columns:
                full_path = abs_column.nested_path
                # DEPTH IS THE LIST LENGTH MINUS ONE
                abs_depth = len(full_path)-1
                abs_parent = full_path[1] if abs_depth else ""

                for query_path in query_paths:
                    rel_depth = len(query_path)-1
                    rel_parent = query_path[0]

                    rel_column = copy(abs_column)
                    rel_column.relative = True

                    # THE ABSOLUTE RECORD IS ALWAYS ADDED
                    add_column(copy(abs_column), query_path)

                    if rel_parent == ".":
                        # QUERY PATH STARTS WITH ".": RELATIVE RECORD KEEPS THE NAME IT WAS COPIED WITH
                        add_column(rel_column, query_path)
                    elif abs_column.es_column.startswith(rel_parent+"."):
                        # COLUMN IS UNDER THE QUERY PATH: DROP THE "<rel_parent>." PREFIX
                        rel_column.name = abs_column.es_column[len(rel_parent)+1:]
                        add_column(rel_column, query_path)
                    elif abs_column.es_column == rel_parent:
                        # COLUMN IS THE QUERY PATH ITSELF
                        rel_column.name = "."
                        add_column(rel_column, query_path)
                    elif not abs_parent:
                        # THIS RELATIVE NAME (..o) ALSO NEEDS A RELATIVE NAME (o)
                        # AND THEN REMOVE THE SHADOWED
                        rel_column.name = "." + ("." * (rel_depth - abs_depth)) + abs_column.es_column
                        add_column(rel_column, query_path)
                    elif rel_parent.startswith(abs_parent+"."):
                        # COLUMN's NESTED PARENT ENCLOSES THE QUERY PATH: PREFIX THE
                        # NAME WITH ONE DOT PLUS ONE PER LEVEL OF DEPTH DIFFERENCE
                        rel_column.name = "." + ("." * (rel_depth - abs_depth)) + abs_column.es_column
                        add_column(rel_column, query_path)
                    elif rel_parent != abs_parent:
                        # SIBLING NESTED PATHS ARE INVISIBLE
                        pass
                    else:
                        Log.error("logic error")

    def query(self, _query):
        """
        RUN _query AGAINST meta.columns

        "from" AND "sort" ARE GIVEN FIRST TO set_default, WITH _query.as_dict()
        AS THE SECOND ARGUMENT (PRECEDENCE IS DECIDED BY set_default - NOT VISIBLE HERE)
        """
        return self.meta.columns.query(QueryOp(set_default(
            {
                "from": self.meta.columns,
                "sort": ["table", "name"]
            },
            _query.as_dict()
        )))

    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS

        THE MAPPINGS ARE (RE)PARSED FIRST WHEN THE TABLE IS UNKNOWN, WHEN force IS
        SET, OR WHEN THE TABLE's timestamp IS MISSING OR OLDER THAN
        MAX_COLUMN_METADATA_AGE

        :param table_name: TABLE TO FIND COLUMNS FOR; THE FIRST ELEMENT OF
                           split_field(table_name) IS USED AS THE TABLE's SHORT NAME
        :param column_name: OPTIONAL COLUMN NAME, PASSED TO meta.columns.find()
        :param force: RE-PARSE THE MAPPINGS EVEN IF THE TABLE RECORD IS RECENT
        :return: MATCHING COLUMNS SORTED BY name; Log.error IS CALLED WHEN NONE ARE FOUND
        """
        try:
            # LAST TIME WE GOT INFO FOR THIS TABLE
            short_name = join_field(split_field(table_name)[0:1])
            table = self.get_table(short_name)[0]

            if not table:
                # FIRST SIGHTING: REGISTER THE TABLE, THEN LOAD ITS COLUMNS
                table = Table(
                    name=short_name,
                    url=None,
                    query_path=None,
                    timestamp=Date.now()
                )
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._get_columns(table=short_name)
            elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
                table.timestamp = Date.now()
                self._get_columns(table=short_name)

            with self.meta.columns.locker:
                columns = self.meta.columns.find(table_name, column_name)
            if columns:
                columns = jx.sort(columns, "name")
                # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                while len(self.todo) and not all(columns.get("last_updated")):
                    Log.note("waiting for columns to update {{columns|json}}", columns=[c.table+"."+c.es_column for c in columns if not c.last_updated])
                    Thread.sleep(seconds=1)
                return columns
        except Exception, e:
            Log.error("Not expected", cause=e)

        # ONLY REACHED WHEN NO COLUMNS WERE FOUND
        if column_name:
            Log.error("no columns matching {{table}}.{{column}}", table=table_name, column=column_name)
        else:
            self._get_columns(table=table_name)
            Log.error("no columns for {{table}}?!", table=table_name)
class FromESMetadata(Schema):
    """
    QUERY THE METADATA

    SINGLETON (SEE __new__) HOLDING self.meta.tables (A ListContainer) AND
    self.meta.columns (A ColumnList). COLUMN RECORDS ARE BUILT FROM THE
    ELASTICSEARCH MAPPINGS (SEE _parse_properties); EVERY NEW OR CHANGED COLUMN
    IS PUT ON self.todo, PRESUMABLY FOR THE "refresh metadata" THREAD TO PROCESS

    NOTE(review): self.monitor AND self.not_monitor ARE REFERENCED BY __init__
    BUT ARE NOT DEFINED IN THIS VIEW OF THE CLASS
    """

    def __new__(cls, *args, **kwargs):
        """
        RETURN THE ONE SHARED INSTANCE, CREATING IT ON FIRST USE

        RELIES ON A MODULE-LEVEL singlton VARIABLE (NOT VISIBLE HERE) BEING FALSY
        UNTIL THE FIRST INSTANCE IS MADE
        """
        global singlton
        if singlton:
            return singlton
        else:
            singlton = object.__new__(cls)
            return singlton

    @use_settings
    def __init__(self,
                 host,
                 index,
                 alias=None,
                 name=None,
                 port=9200,
                 settings=None):
        """
        CONNECT TO THE CLUSTER, BUILD THE EMPTY METADATA CONTAINERS AND START THE
        BACKGROUND "refresh metadata" THREAD

        :param host: NOT READ DIRECTLY HERE - PRESUMABLY FOLDED INTO settings BY @use_settings; TODO CONFIRM
        :param index: LAST CHOICE FOR self.default_name
        :param alias: SECOND CHOICE FOR self.default_name
        :param name: FIRST CHOICE FOR self.default_name
        :param port: NOT READ DIRECTLY HERE (SEE host)
        :param settings: PASSED TO _elasticsearch.Cluster AND KEPT AS self.settings
        """
        global _elasticsearch
        # __new__ ALWAYS HANDS BACK THE SAME OBJECT, SO __init__ RUNS ON EVERY
        # CONSTRUCTION; ONLY THE FIRST RUN MAY INITIALIZE
        if hasattr(self, "settings"):
            return

        # IMPORTED HERE, NOT AT MODULE LEVEL - PRESUMABLY TO AVOID A CIRCULAR IMPORT; TODO CONFIRM
        from pyLibrary.queries.containers.list_usingPythonList import ListContainer
        from pyLibrary.env import elasticsearch as _elasticsearch

        self.settings = settings
        self.default_name = coalesce(name, alias, index)
        self.default_es = _elasticsearch.Cluster(settings=settings)
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        # CACHED CLUSTER METADATA; THE TIMESTAMP STARTS OUT ALREADY OLD_METADATA IN THE PAST
        self.es_metadata = Null
        self.last_es_metadata = Date.now() - OLD_METADATA

        self.meta = Dict()
        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.meta.tables = ListContainer(
            "meta.tables", [], wrap({c.name: c
                                     for c in table_columns}))
        self.meta.columns = ColumnList()
        # meta.columns STARTS OUT HOLDING THE COLUMN RECORDS RETURNED BY
        # metadata_columns() AND metadata_tables()
        self.meta.columns.insert(column_columns)
        self.meta.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return

    @property
    def query_path(self):
        """ALWAYS None"""
        return None

    @property
    def url(self):
        """CLUSTER PATH + "/" + default_name, WITH EVERY "." IN THE NAME TURNED INTO "/" """
        return self.default_es.path + "/" + self.default_name.replace(".", "/")

    def get_table(self, table_name):
        """
        RETURN (WRAPPED) THE LIST OF meta.tables ROWS WHOSE name EQUALS table_name,
        READ WHILE HOLDING THE meta.tables LOCKER

        :param table_name: VALUE TO MATCH AGAINST EACH ROW's name
        """
        with self.meta.tables.locker:
            return wrap(
                [t for t in self.meta.tables.data if t.name == table_name])

    def _upsert_column(self, c):
        """
        ADD c TO meta.columns, OR COPY ITS FIELDS ONTO THE EXISTING RECORD FOUND
        FOR (c.table, c.name); EITHER WAY QUEUE THE RECORD ON self.todo

        :param c: COLUMN RECORD TO INSERT OR MERGE
        """
        # ASSUMING THE self.meta.columns.locker IS HAD
        existing_columns = self.meta.columns.find(c.table, c.name)
        if not existing_columns:
            self.meta.columns.add(c)
            self.todo.add(c)

            if ENABLE_META_SCAN:
                Log.note("todo: {{table}}::{{column}}",
                         table=c.table,
                         column=c.es_column)
                # MARK meta.columns AS DIRTY TOO
                cols = self.meta.columns.find("meta.columns", None)
                for cc in cols:
                    cc.partitions = cc.cardinality = None
                    cc.last_updated = Date.now()
                self.todo.extend(cols)
        else:
            # ONLY THE FIRST MATCH IS UPDATED
            canonical = existing_columns[0]
            if canonical.relative and not c.relative:
                return  # RELATIVE COLUMNS WILL SHADOW ABSOLUTE COLUMNS

            # OVERWRITE EVERY SLOT OF THE STORED RECORD WITH THE NEW VALUES
            for key in Column.__slots__:
                canonical[key] = c[key]
            Log.note("todo: {{table}}::{{column}}",
                     table=canonical.table,
                     column=canonical.es_column)
            self.todo.add(canonical)

    def _get_columns(self, table=None):
        """
        PARSE EVERY MAPPING OF THE INDEX REGISTERED UNDER table INTO meta.columns

        THE CACHED self.es_metadata IS REPLACED (force=True) WHEN IT HAS NO ENTRY
        FOR table OR WHEN self.last_es_metadata IS OLDER THAN OLD_METADATA

        NOTE(review): NOTHING IN THIS VIEW ASSIGNS self.last_es_metadata AFTER
        __init__, WHERE IT STARTS ALREADY EXPIRED; IF IT IS NOT UPDATED ELSEWHERE
        THE METADATA IS RE-FETCHED ON EVERY CALL - CONFIRM

        :param table: KEY INTO self.es_metadata.indices
        """
        # TODO: HANDLE MORE THEN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
        meta = self.es_metadata.indices[table]
        if not meta or self.last_es_metadata < Date.now() - OLD_METADATA:
            self.es_metadata = self.default_es.get_metadata(force=True)
            meta = self.es_metadata.indices[table]

        for _, properties in meta.mappings.items():
            self._parse_properties(meta.index, properties, meta)

    def _parse_properties(self, abs_index, properties, meta):
        """
        TURN ONE MAPPING INTO COLUMN RECORDS AND UPSERT THEM: ONCE PER NESTED
        QUERY PATH (PLUS ROOT_PATH), FOR THE INDEX AND FOR EACH OF ITS ALIASES,
        IN BOTH ABSOLUTE AND RELATIVE NAMING

        :param abs_index: INDEX NAME HANDED TO _elasticsearch.parse_properties
        :param properties: ONE MAPPING; ITS .properties IS PARSED
        :param meta: INDEX METADATA; meta.aliases IS READ
        """
        abs_columns = _elasticsearch.parse_properties(abs_index, None,
                                                      properties.properties)
        # ALSO DROPS COLUMNS WHOSE NAME CONTAINS "=" OR A SPACE
        abs_columns = abs_columns.filter(  # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED
            lambda r: not r.es_column.startswith("other.") and not r.es_column.
            startswith("previous_values.cf_") and not r.es_index.startswith(
                "debug") and r.es_column.find("=") == -1 and r.es_column.find(
                    " ") == -1)
        with Timer("upserting {{num}} columns", {"num": len(abs_columns)},
                   debug=DEBUG):

            def add_column(c, query_path):
                # STAMP c, NAME ITS TABLE FROM THE INDEX PLUS THE FIRST ELEMENT OF
                # query_path, AND UPSERT IT; THEN UPSERT A COPY PER ALIAS
                c.last_updated = Date.now()
                c.table = join_field([c.es_index] + split_field(query_path[0]))

                with self.meta.columns.locker:
                    self._upsert_column(c)
                    for alias in meta.aliases:
                        # COPY, SO THE RECORD ALREADY STORED KEEPS ITS OWN table
                        c = copy(c)
                        c.table = join_field([alias] +
                                             split_field(query_path[0]))
                        self._upsert_column(c)

            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns
                           if c.type == "nested"]
            # FOR EVERY ORDERED PAIR (a, b) WHERE b's FIRST PATH STARTS WITH a's FIRST
            # PATH, INSERT a's PATH INTO b AHEAD OF THE FIRST ENTRY THAT IS NOT LONGER
            # THAN IT, UNLESS THAT ENTRY IS ALREADY a's PATH
            # NOTE(review): startswith() IS A PLAIN STRING PREFIX TEST, SO "a.bc" WOULD
            # MATCH "a.b" - CONFIRM WHETHER SUCH SIBLING NAMES CAN OCCUR
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            # EVERY NESTED PATH LIST ENDS WITH "."; ROOT_PATH IS ADDED AS ONE MORE QUERY PATH
            for q in query_paths:
                q.append(".")
            query_paths.append(ROOT_PATH)

            # ADD RELATIVE COLUMNS
            for abs_column in abs_columns:
                full_path = abs_column.nested_path
                # DEPTH IS THE LIST LENGTH MINUS ONE
                abs_depth = len(full_path) - 1
                abs_parent = full_path[1] if abs_depth else ""

                for query_path in query_paths:
                    rel_depth = len(query_path) - 1
                    rel_parent = query_path[0]

                    rel_column = copy(abs_column)
                    rel_column.relative = True

                    # THE ABSOLUTE RECORD IS ALWAYS ADDED
                    add_column(copy(abs_column), query_path)

                    if rel_parent == ".":
                        # QUERY PATH STARTS WITH ".": RELATIVE RECORD KEEPS THE NAME IT WAS COPIED WITH
                        add_column(rel_column, query_path)
                    elif abs_column.es_column.startswith(rel_parent + "."):
                        # COLUMN IS UNDER THE QUERY PATH: DROP THE "<rel_parent>." PREFIX
                        rel_column.name = abs_column.es_column[len(rel_parent
                                                                   ) + 1:]
                        add_column(rel_column, query_path)
                    elif abs_column.es_column == rel_parent:
                        # COLUMN IS THE QUERY PATH ITSELF
                        rel_column.name = "."
                        add_column(rel_column, query_path)
                    elif not abs_parent:
                        # THIS RELATIVE NAME (..o) ALSO NEEDS A RELATIVE NAME (o)
                        # AND THEN REMOVE THE SHADOWED
                        rel_column.name = "." + (
                            "." *
                            (rel_depth - abs_depth)) + abs_column.es_column
                        add_column(rel_column, query_path)
                    elif rel_parent.startswith(abs_parent + "."):
                        # COLUMN's NESTED PARENT ENCLOSES THE QUERY PATH: PREFIX THE
                        # NAME WITH ONE DOT PLUS ONE PER LEVEL OF DEPTH DIFFERENCE
                        rel_column.name = "." + (
                            "." *
                            (rel_depth - abs_depth)) + abs_column.es_column
                        add_column(rel_column, query_path)
                    elif rel_parent != abs_parent:
                        # SIBLING NESTED PATHS ARE INVISIBLE
                        pass
                    else:
                        Log.error("logic error")

    def query(self, _query):
        """
        RUN _query AGAINST meta.columns

        "from" AND "sort" ARE GIVEN FIRST TO set_default, WITH _query.as_dict()
        AS THE SECOND ARGUMENT (PRECEDENCE IS DECIDED BY set_default - NOT VISIBLE HERE)
        """
        return self.meta.columns.query(
            QueryOp(
                set_default(
                    {
                        "from": self.meta.columns,
                        "sort": ["table", "name"]
                    }, _query.as_dict())))

    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS

        THE MAPPINGS ARE (RE)PARSED FIRST WHEN THE TABLE IS UNKNOWN, WHEN force IS
        SET, OR WHEN THE TABLE's timestamp IS MISSING OR OLDER THAN
        MAX_COLUMN_METADATA_AGE

        :param table_name: TABLE TO FIND COLUMNS FOR; THE FIRST ELEMENT OF
                           split_field(table_name) IS USED AS THE TABLE's SHORT NAME
        :param column_name: OPTIONAL COLUMN NAME, PASSED TO meta.columns.find()
        :param force: RE-PARSE THE MAPPINGS EVEN IF THE TABLE RECORD IS RECENT
        :return: MATCHING COLUMNS SORTED BY name; Log.error IS CALLED WHEN NONE ARE FOUND
        """
        try:
            # LAST TIME WE GOT INFO FOR THIS TABLE
            short_name = join_field(split_field(table_name)[0:1])
            table = self.get_table(short_name)[0]

            if not table:
                # FIRST SIGHTING: REGISTER THE TABLE, THEN LOAD ITS COLUMNS
                table = Table(name=short_name,
                              url=None,
                              query_path=None,
                              timestamp=Date.now())
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._get_columns(table=short_name)
            elif force or table.timestamp == None or table.timestamp < Date.now(
            ) - MAX_COLUMN_METADATA_AGE:
                table.timestamp = Date.now()
                self._get_columns(table=short_name)

            with self.meta.columns.locker:
                columns = self.meta.columns.find(table_name, column_name)
            if columns:
                columns = jx.sort(columns, "name")
                # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                while len(self.todo) and not all(columns.get("last_updated")):
                    Log.note("waiting for columns to update {{columns|json}}",
                             columns=[
                                 c.table + "." + c.es_column for c in columns
                                 if not c.last_updated
                             ])
                    Thread.sleep(seconds=1)
                return columns
        except Exception, e:
            Log.error("Not expected", cause=e)

        # ONLY REACHED WHEN NO COLUMNS WERE FOUND
        if column_name:
            Log.error("no columns matching {{table}}.{{column}}",
                      table=table_name,
                      column=column_name)
        else:
            self._get_columns(table=table_name)
            Log.error("no columns for {{table}}?!", table=table_name)
class FromESMetadata(object):
    """
    QUERY THE METADATA

    ONE SHARED INSTANCE (SEE __new__) HOLDING TWO IN-MEMORY CONTAINERS,
    self.tables ("meta.tables") AND self.columns ("meta.columns"), PLUS
    self.todo, A QUEUE OF COLUMNS WAITING FOR THEIR STATISTICS TO BE REFRESHED
    """

    def __new__(cls, *args, **kwargs):
        """
        RETURN THE SHARED INSTANCE, MAKING IT ON THE FIRST CALL
        """
        global singlton
        if not singlton:
            singlton = object.__new__(cls)
        return singlton

    @use_settings
    def __init__(self, host, index, alias=None, name=None, port=9200, settings=None):
        """
        BUILD THE CONTAINERS AND START THE BACKGROUND WORKER

        :param host: NOT READ DIRECTLY HERE (presumably folded into settings
                     by @use_settings - TODO confirm)
        :param index: FALLBACK FOR THE DEFAULT NAME
        :param alias: PREFERRED OVER index FOR THE DEFAULT NAME
        :param name: PREFERRED OVER alias AND index FOR THE DEFAULT NAME
        :param port: NOT READ DIRECTLY HERE (SEE host)
        :param settings: PASSED TO THE CLUSTER CONSTRUCTOR AND KEPT ON self
        """
        global _elasticsearch
        # __new__ HANDS BACK THE SAME OBJECT EVERY TIME, SO __init__ RUNS AGAIN
        # ON EACH CONSTRUCTION; ONLY THE FIRST RUN MAY DO ANY WORK
        if hasattr(self, "settings"):
            return

        # IMPORTED HERE, NOT AT FILE LEVEL (presumably to avoid an import cycle)
        from pyLibrary.queries.containers.lists import ListContainer
        from pyLibrary.env import elasticsearch as _elasticsearch

        self.settings = settings
        self.default_name = coalesce(name, alias, index)
        self.default_es = _elasticsearch.Cluster(settings=settings)
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.tables = ListContainer(
            "meta.tables",
            [],
            wrap({c.name: c for c in table_columns})
        )
        self.columns = ListContainer(
            "meta.columns",
            [],
            wrap({c.name: c for c in column_columns})
        )
        # THE METADATA TABLES DESCRIBE THEMSELVES
        self.columns.insert(column_columns)
        self.columns.insert(table_columns)

        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            worker_method = self.monitor
        else:
            worker_method = self.not_monitor
        self.worker = Thread.run("refresh metadata", worker_method)

    @property
    def query_path(self):
        """
        ALWAYS None
        """
        return None

    @property
    def url(self):
        """
        CLUSTER PATH FOLLOWED BY THE DEFAULT NAME, WITH DOTS TURNED TO SLASHES
        """
        return self.default_es.path + "/" + self.default_name.replace(".", "/")

    def get_table(self, table_name):
        """
        RETURN THE RESULT OF QUERYING self.tables FOR ROWS NAMED table_name
        """
        name_filter = {"where": {"eq": {"name": table_name}}}
        with self.tables.locker:
            return self.tables.query(name_filter)

    def upsert_column(self, c):
        """
        ADD COLUMN c TO self.columns, OR MERGE IT INTO THE EXISTING ENTRY WITH
        THE SAME table AND abs_name.  EITHER WAY THE ENTRY IS PUT ON self.todo

        NO LOCK IS TAKEN HERE; _get_columns() CALLS THIS WHILE HOLDING
        self.columns.locker
        """
        existing_columns = [
            r
            for r in self.columns.data
            if r.table == c.table and r.abs_name == c.abs_name
        ]
        if not existing_columns:
            self.columns.add(c)
            # A NEW ROW IN meta.columns MAKES THE STATISTICS ABOUT meta.columns
            # ITSELF STALE, SO RESET THEM AND QUEUE THEM TOO
            cols = [r for r in self.columns.data if r.table == "meta.columns"]
            for cc in cols:
                cc.partitions = cc.cardinality = cc.last_updated = None
            self.todo.add(c)
            self.todo.extend(cols)
        else:
            existing = existing_columns[0]
            set_default(existing, c)
            self.todo.add(existing)

            # TEST CONSISTENCY: NO TWO DIFFERENT QUEUED COLUMNS MAY SHARE THE
            # SAME (table, abs_name)
            queued = list(self.todo.queue)
            for a, b in product(queued, queued):
                if a.abs_name == b.abs_name and a.table == b.table and a != b:
                    Log.error(
                        "conflicting definitions of {{table}}.{{column}} are queued for refresh",
                        table=a.table,
                        column=a.abs_name
                    )

    def _get_columns(self, table=None):
        """
        READ THE MAPPINGS FROM ES AND UPSERT EVERY COLUMN FOUND, ONCE UNDER THE
        INDEX NAME AND ONCE UNDER EACH ALIAS THE INDEX IS THE FIRST TO CLAIM

        :param table: DOTTED NAME; THE FIRST PART IS THE INDEX ASKED OF ES,
                      THE REMAINDER IS APPENDED TO EVERY TABLE NAME RECORDED
        """
        # TODO: HANDLE MORE THEN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
        alias_done = set()
        path = split_field(table)
        index = path[0]
        query_path = path[1:]
        metadata = self.default_es.get_metadata(index=index)

        # SORTED ON ITEM 0 (THE INDEX NAME), DESCENDING (presumably so the
        # newest index is seen first - TODO confirm naming scheme)
        for index_name, meta in qb.sort(metadata.indices.items(), {"value": 0, "sort": -1}):
            # ONLY THE LATEST ALIAS IS CHOSEN TO GET COLUMNS
            # DECIDED ONCE PER INDEX: DECIDING INSIDE THE COLUMN LOOP MARKED THE
            # ALIAS DONE AFTER THE FIRST COLUMN, SO THE REST WERE NEVER
            # REGISTERED UNDER THE ALIAS
            fresh_aliases = []
            for alias in meta.aliases:
                if alias not in alias_done and alias not in fresh_aliases:
                    fresh_aliases.append(alias)

            described = False
            for _, properties in meta.mappings.items():
                columns = _elasticsearch.parse_properties(index_name, None, properties.properties)
                # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED
                columns = columns.filter(
                    lambda r: not r.abs_name.startswith("other.") and not r.abs_name.startswith("previous_values.cf_")
                )
                with Timer("upserting {{num}} columns", {"num": len(columns)}, debug=DEBUG):
                    with self.columns.locker:
                        for c in columns:
                            described = True
                            # ABSOLUTE
                            c.table = join_field([index_name] + query_path)
                            self.upsert_column(c)

                            for alias in fresh_aliases:
                                alias_column = copy(c)
                                alias_column.table = join_field([alias] + query_path)
                                self.upsert_column(alias_column)

            # AN INDEX THAT GAVE NO COLUMNS DOES NOT USE UP ITS ALIASES
            if described:
                alias_done.update(fresh_aliases)

    def query(self, _query):
        """
        RUN _query AGAINST self.columns; "from" AND "sort" ARE SUPPLIED THROUGH
        set_default() TOGETHER WITH _query.as_dict()
        """
        spec = set_default(
            {
                "from": self.columns,
                "sort": ["table", "name"]
            },
            _query.as_dict()
        )
        return self.columns.query(Query(spec))

    def get_columns(self, table):
        """
        RETURN METADATA COLUMNS

        LOOK IN MEMORY FIRST; IF NOTHING IS THERE, LOAD FROM ES AND LOOK AGAIN.
        Log.error() IS CALLED WHEN THE SECOND LOOK ALSO FINDS NOTHING
        """
        def known_columns():
            with self.columns.locker:
                return qb.sort(
                    [r for r in self.columns.data if r.table == table],
                    "name"
                )

        columns = known_columns()
        if columns:
            return columns

        # MUST NOT HOLD THE LOCK HERE: _get_columns() TAKES IT ITSELF
        self._get_columns(table=table)

        columns = known_columns()
        if columns:
            return columns

        Log.error("no columns for {{table}}", table=table)

    def _update_cardinality(self, c):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN

        COLUMNS OF meta.columns AND meta.tables ARE ANSWERED FROM MEMORY.
        FOR ALL OTHERS ES IS ASKED FOR A COUNT AND A CARDINALITY, AND THE
        PARTITIONS ARE FETCHED ONLY WHEN THE CARDINALITY IS SMALL ENOUGH.
        ON FAILURE THE STATISTICS ARE CLEARED AND last_updated IS STILL SET
        """
        if c.type in ["object", "nested"]:
            Log.error("not supported")

        try:
            if c.table == "meta.columns":
                with self.columns.locker:
                    # "!= None" (NOT "is not None") IS KEPT AS WRITTEN;
                    # presumably values can be null-like wrappers - TODO confirm
                    partitions = qb.sort([
                        g[c.abs_name]
                        for g, _ in qb.groupby(self.columns, c.abs_name)
                        if g[c.abs_name] != None
                    ])
                    self.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.columns),
                            "cardinality": len(partitions),
                            "last_updated": Date.now()
                        },
                        "where": {"eq": {"table": c.table, "abs_name": c.abs_name}}
                    })
                return

            if c.table == "meta.tables":
                with self.columns.locker:
                    partitions = qb.sort([
                        g[c.abs_name]
                        for g, _ in qb.groupby(self.tables, c.abs_name)
                        if g[c.abs_name] != None
                    ])
                    self.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.tables),
                            "cardinality": len(partitions),
                            "last_updated": Date.now()
                        },
                        "where": {"eq": {"table": c.table, "name": c.name}}
                    })
                return

            es_index = c.table.split(".")[0]
            search_path = "/" + es_index + "/_search"
            result = self.default_es.post(search_path, data={
                "aggs": {c.name: _counting_query(c)},
                "size": 0
            })
            r = result.aggregations.values()[0]
            count = result.hits.total
            cardinality = coalesce(r.value, r._nested.value)
            if cardinality == None:
                Log.error("logic error")

            # THESE CASES ALL RECORD THE COUNTS AND DROP THE PARTITIONS; THEY
            # ARE TESTED IN THE SAME ORDER AS THE SEPARATE BRANCHES THEY REPLACE
            too_many_parts = (
                c.type in ["object", "nested"]
                or cardinality > 1000
                or (count >= 30 and cardinality == count)
                or (count >= 1000 and cardinality / count > 0.99)
                or (c.type in _elasticsearch.ES_NUMERIC_TYPES and cardinality > 30)
            )
            if too_many_parts:
                Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
                with self.columns.locker:
                    self.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"table": c.table, "name": c.name}}
                    })
                return

            query = Dict(size=0)
            if c.nested_path:
                query.aggs[literal_field(c.name)] = {
                    "nested": {"path": listwrap(c.nested_path)[0]},
                    "aggs": {"_nested": {"terms": {"field": c.abs_name, "size": 0}}}
                }
            else:
                query.aggs[literal_field(c.name)] = {"terms": {"field": c.abs_name, "size": 0}}

            result = self.default_es.post(search_path, data=query)

            aggs = result.aggregations.values()[0]
            if aggs._nested:
                parts = qb.sort(aggs._nested.buckets.key)
            else:
                parts = qb.sort(aggs.buckets.key)

            Log.note("{{field}} has {{parts}}", field=c.name, parts=parts)
            with self.columns.locker:
                self.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "partitions": parts,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"table": c.table, "abs_name": c.abs_name}}
                })
        except Exception as e:
            if "IndexMissingException" in e and c.table.startswith("testing"):
                Log.alert("{{col.table}} does not exist", col=c)
            else:
                # EVERY OTHER UPDATE IN THIS METHOD HOLDS THE LOCK; SO MUST THIS
                # ONE.  ANY LOCK TAKEN IN THE try BODY IS RELEASED BY NOW
                with self.columns.locker:
                    self.columns.update({
                        "set": {
                            "last_updated": Date.now()
                        },
                        "clear": [
                            "count",
                            "cardinality",
                            "partitions",
                        ],
                        "where": {"eq": {"table": c.table, "abs_name": c.abs_name}}
                    })
                Log.warning("Could not get {{col.table}}.{{col.abs_name}} info", col=c, cause=e)