def not_monitor(self, please_stop):
    Log.alert("metadata scan has been disabled")
    please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
    while not please_stop:
        c = self.todo.pop()
        if c == THREAD_STOP:
            break

        if not c.last_updated or c.last_updated >= Date.now() - TOO_OLD:
            continue

        with self.meta.columns.locker:
            self.meta.columns.update({
                "set": {
                    "last_updated": Date.now()
                },
                "clear": [
                    "count",
                    "cardinality",
                    "partitions",
                ],
                "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
            })
        if DEBUG:
            Log.note("Could not get {{col.es_index}}.{{col.es_column}} info", col=c)
def _db_load(self):
    self.last_load = Date.now()

    result = self._query(
        SQL_SELECT
        + "name"
        + SQL_FROM
        + "sqlite_master"
        + SQL_WHERE
        + SQL_AND.join(["name=" + db_table_name, "type=" + quote_value("table")])
    )

    if not result.data:
        self._db_create()
        return

    result = self._query(
        SQL_SELECT
        + all_columns
        + SQL_FROM
        + db_table_name
        + SQL_ORDERBY
        + sql_list(map(quote_column, ["es_index", "name", "es_column"]))
    )

    with self.locker:
        for r in result.data:
            c = row_to_column(result.header, r)
            self._add(c)
def _upsert_column(self, c):
    # ASSUMING THE self.meta.columns.locker IS HAD
    existing_columns = self.meta.columns.find(c.es_index, c.names["."])

    if not existing_columns:
        self.meta.columns.add(c)
        self.todo.add(c)

        if ENABLE_META_SCAN:
            if DEBUG:
                Log.note("todo: {{table}}::{{column}}", table=c.es_index, column=c.es_column)
            # MARK meta.columns AS DIRTY TOO
            cols = self.meta.columns.find("meta.columns", None)
            for cc in cols:
                cc.partitions = cc.cardinality = None
                cc.last_updated = Date.now()
            self.todo.extend(cols)
    else:
        canonical = existing_columns[0]
        if canonical is not c:
            set_default(c.names, canonical.names)
            for key in Column.__slots__:
                canonical[key] = c[key]
        if DEBUG:
            Log.note("todo: {{table}}::{{column}}", table=canonical.es_index, column=canonical.es_column)
        self.todo.add(canonical)
def get_branches(hg, branches, kwargs=None):
    # TRY ES
    cluster = elasticsearch.Cluster(branches)
    try:
        es = cluster.get_index(kwargs=branches, read_only=False)
        esq = jx_elasticsearch.new_instance(branches)
        found_branches = esq.query({"from": "branches", "format": "list", "limit": 10000}).data

        # IF IT IS TOO OLD, THEN PULL FROM HG
        oldest = Date(MAX(found_branches.etl.timestamp))
        if oldest == None or Date.now() - oldest > OLD_BRANCH:
            found_branches = _get_branches_from_hg(hg)
            es.extend({"id": b.name + " " + b.locale, "value": b} for b in found_branches)
            es.flush()

        try:
            return UniqueIndex(["name", "locale"], data=found_branches, fail_on_dup=False)
        except Exception as e:
            Log.error("Bad branch in ES index", cause=e)
    except Exception as e:
        e = Except.wrap(e)
        if "Can not find index " in e:
            set_default(branches, {"schema": branches_schema})
            es = cluster.get_or_create_index(branches)
            es.add_alias()
            return get_branches(kwargs)
        Log.error("problem getting branches", cause=e)
def __init__(self, host, index, alias=None, name=None, port=9200, kwargs=None):
    global _elasticsearch
    if hasattr(self, "settings"):
        return

    from pyLibrary.queries.containers.list_usingPythonList import ListContainer
    from pyLibrary.env import elasticsearch as _elasticsearch

    self.settings = kwargs
    self.default_name = coalesce(name, alias, index)
    self.default_es = _elasticsearch.Cluster(kwargs=kwargs)
    self.todo = Queue("refresh metadata", max=100000, unique=True)

    self.es_metadata = Null
    self.last_es_metadata = Date.now() - OLD_METADATA

    self.meta = Data()
    table_columns = metadata_tables()
    column_columns = metadata_columns()
    self.meta.tables = ListContainer("meta.tables", [], wrap({c.names["."]: c for c in table_columns}))
    self.meta.columns = ColumnList()
    self.meta.columns.insert(column_columns)
    self.meta.columns.insert(table_columns)
    # TODO: fix monitor so it does not bring down ES
    if ENABLE_META_SCAN:
        self.worker = Thread.run("refresh metadata", self.monitor)
    else:
        self.worker = Thread.run("refresh metadata", self.not_monitor)
    return
def _update_meta(self):
    if not self.dirty:
        return

    for mcl in self.data.get("meta.columns").values():
        for mc in mcl:
            count = 0
            values = set()
            objects = 0
            multi = 1
            for column in self._all_columns():
                value = column[mc.name]
                if value == None:
                    pass
                else:
                    count += 1
                    if is_list(value):
                        multi = max(multi, len(value))
                        try:
                            values |= set(value)
                        except Exception:
                            objects += len(value)
                    elif is_data(value):
                        objects += 1
                    else:
                        values.add(value)
            mc.count = count
            mc.cardinality = len(values) + objects
            mc.partitions = jx.sort(values)
            mc.multi = multi
            mc.last_updated = Date.now()

    self.dirty = False
def get_columns(self, table_name, column_name=None, force=False):
    """
    RETURN METADATA COLUMNS
    """
    table_path = split_field(table_name)
    es_index_name = table_path[0]
    query_path = join_field(table_path[1:])
    table = self.get_table(es_index_name)[0]
    abs_column_name = None if column_name == None else concat_field(query_path, column_name)

    try:
        # LAST TIME WE GOT INFO FOR THIS TABLE
        if not table:
            table = Table(
                name=es_index_name,
                url=None,
                query_path=None,
                timestamp=Date.now()
            )
            with self.meta.tables.locker:
                self.meta.tables.add(table)
            self._get_columns(table=es_index_name)
        elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
            table.timestamp = Date.now()
            self._get_columns(table=es_index_name)

        with self.meta.columns.locker:
            columns = self.meta.columns.find(es_index_name, column_name)
        if columns:
            columns = jx.sort(columns, "names.\.")
            # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
            while len(self.todo) and not all(columns.get("last_updated")):
                if DEBUG:
                    Log.note(
                        "waiting for columns to update {{columns|json}}",
                        columns=[c.es_index + "." + c.es_column for c in columns if not c.last_updated]
                    )
                Till(seconds=1).wait()
            return columns
    except Exception as e:
        Log.error("Not expected", cause=e)

    if abs_column_name:
        Log.error("no columns matching {{table}}.{{column}}", table=table_name, column=abs_column_name)
    else:
        self._get_columns(table=table_name)  # TO TEST WHAT HAPPENED
        Log.error("no columns for {{table}}?!", table=table_name)
def add_column(c, query_path):
    c.last_updated = Date.now()
    if query_path[0] != ".":
        c.names[query_path[0]] = relative_field(c.names["."], query_path[0])

    with self.meta.columns.locker:
        self._upsert_column(c)
        for alias in meta.aliases:
            c = copy(c)
            c.es_index = alias
            self._upsert_column(c)
def monitor(self, please_stop):
    please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
    while not please_stop:
        try:
            if not self.todo:
                with self.meta.columns.locker:
                    old_columns = [
                        c
                        for c in self.meta.columns
                        if (c.last_updated == None or c.last_updated < Date.now() - TOO_OLD) and c.type not in STRUCT
                    ]
                    if old_columns:
                        if DEBUG:
                            Log.note("Old columns with dates {{dates|json}}", dates=wrap(old_columns).last_updated)
                        self.todo.extend(old_columns)
                        # TEST CONSISTENCY
                        for c, d in product(list(self.todo.queue), list(self.todo.queue)):
                            if c.es_column == d.es_column and c.es_index == d.es_index and c != d:
                                Log.error("")
                    else:
                        if DEBUG:
                            Log.note("no more metadata to update")

            column = self.todo.pop(Till(seconds=(10 * MINUTE).seconds))
            if column:
                if DEBUG:
                    Log.note("update {{table}}.{{column}}", table=column.es_index, column=column.es_column)
                if column.type in STRUCT:
                    with self.meta.columns.locker:
                        column.last_updated = Date.now()
                    continue
                elif column.last_updated >= Date.now() - TOO_OLD:
                    continue
                try:
                    self._update_cardinality(column)
                    if DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX):
                        Log.note("updated {{column.name}}", column=column)
                except Exception as e:
                    Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
        except Exception as e:
            Log.warning("problem in cardinality monitor", cause=e)
def _get_columns(self, table=None):
    # TODO: HANDLE MORE THAN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
    table_path = split_field(table)
    es_index = table_path[0]
    query_path = join_field(table_path[1:])
    meta = self.es_metadata.indices[es_index]
    if not meta or self.last_es_metadata < Date.now() - OLD_METADATA:
        self.es_metadata = self.default_es.get_metadata(force=True)
        meta = self.es_metadata.indices[es_index]

    for _, properties in meta.mappings.items():
        properties.properties["_id"] = {"type": "string", "index": "not_analyzed"}
        self._parse_properties(meta.index, properties, meta)
def test_mode_wait(query, please_stop): """ WAIT FOR METADATA TO ARRIVE ON INDEX :param query: dict() OF REQUEST BODY :return: nothing """ if not query["from"]: return try: if query["from"].startswith("meta."): return alias = split_field(query["from"])[0] after = Date.now() require_cardinality = meta.ENABLE_META_SCAN with Timer( "Get columns for {{table}} after {{after}}", { "table": alias, "after": after }, verbose=DEBUG, ): metadata_manager = find_container(alias, after=after).namespace timeout = Till(seconds=MINUTE.seconds) | please_stop while not timeout: # GET FRESH VERSIONS cols = metadata_manager.get_columns(table_name=alias, after=after, timeout=timeout) not_ready = [ c for c in cols if c.jx_type not in STRUCT and ( after >= c.last_updated or (require_cardinality and c.cardinality == None)) ] if not_ready: Log.note( "wait for column (table={{col.es_index}}, name={{col.es_column}}, cardinality={{col.cardinality|json}}, last_updated={{col.last_updated|datetime}}) metadata to arrive", col=first(not_ready), ) else: break Till(seconds=1).wait() except Exception as e: Log.warning("could not pickup columns", cause=e)
def _db_load(self): self.last_load = Date.now() try: self.es_index = self.es_cluster.get_index( id=ID, index=META_COLUMNS_NAME, type=META_COLUMNS_TYPE_NAME, read_only=False) result = self.es_index.search({ "query": { "bool": { "should": [ { "bool": { "must_not": { "exists": { "field": "cardinality.~n~" } } } }, { # ASSUME UNUSED COLUMNS DO NOT EXIST "range": { "cardinality.~n~": { "gt": 0 } } }, ] } }, "sort": ["es_index.~s~", "name.~s~", "es_column.~s~"], "size": 10000, }) Log.note("{{num}} columns loaded", num=result.hits.total) with self.locker: for r in result.hits.hits._source: self._add(doc_to_column(r)) except Exception as e: Log.warning("no {{index}} exists, making one", index=META_COLUMNS_NAME, cause=e) self._db_create()
def setUpClass(self):
    # REMOVE OLD INDEXES
    cluster = elasticsearch.Cluster(test_jx.global_settings.backend_es)
    aliases = cluster.get_aliases()
    for a in aliases:
        try:
            if a.index.startswith("testing_"):
                create_time = Date(a.index[-15:], "%Y%m%d_%H%M%S")  # EXAMPLE testing_0ef53e45b320160118_180420
                if create_time < Date.now() - 10 * MINUTE:
                    cluster.delete_index(a.index)
        except Exception as e:
            Log.warning("Problem removing {{index|quote}}", index=a.index, cause=e)
def _daemon(self, please_stop): while not please_stop: with Explanation("looking for work"): try: branch, revisions = self.todo.pop(till=please_stop) except Exception as e: if please_stop: break else: raise e if branch.name in DAEMON_DO_NO_SCAN: continue revisions = set(revisions) # FIND THE REVSIONS ON THIS BRANCH for r in list(revisions): try: rev = self.get_revision( Revision(branch=branch, changeset={"id": r})) if DAEMON_DEBUG: Log.note( "found revision with push date {{date|datetime}}", date=rev.push.date) revisions.discard(r) if rev.etl.timestamp > Date.now( ) - DAEMON_RECENT_HG_PULL: # SOME PUSHES ARE BIG, RUNNING THE RISK OTHER MACHINES ARE # ALSO INTERESTED AND PERFORMING THE SAME SCAN. THIS DELAY # WILL HAVE SMALL EFFECT ON THE MAJORITY OF SMALL PUSHES # https://bugzilla.mozilla.org/show_bug.cgi?id=1417720 Till(seconds=Random.float( DAEMON_HG_INTERVAL).seconds * 2).wait() except Exception as e: Log.warning( "Scanning {{branch}} {{revision|left(12)}}", branch=branch.name, revision=r, cause=e) if "Read timed out" in e: Till(seconds=DAEMON_WAIT_AFTER_TIMEOUT.seconds ).wait() # FIND ANY BRANCH THAT MAY HAVE THIS REVISION for r in list(revisions): self._find_revision(r)
def output(*args, **kwargs): if kwargs: Log.error("Sorry, caching only works with ordered parameter, not keyword arguments") with cache_store.locker: if using_self: self = args[0] args = args[1:] else: self = cache_store now = Date.now() try: _cache = getattr(self, attr_name) except Exception: _cache = {} setattr(self, attr_name, _cache) if Random.int(100) == 0: # REMOVE OLD CACHE _cache = {k: v for k, v in _cache.items() if v.timeout == None or v.timeout > now} setattr(self, attr_name, _cache) timeout, key, value, exception = _cache.get(args, (Null, Null, Null, Null)) if now >= timeout: value = func(self, *args) with cache_store.locker: _cache[args] = CacheElement(now + cache_store.timeout, args, value, None) return value if value == None: if exception == None: try: value = func(self, *args) with cache_store.locker: _cache[args] = CacheElement(now + cache_store.timeout, args, value, None) return value except Exception as e: e = Except.wrap(e) with cache_store.locker: _cache[args] = CacheElement(now + cache_store.timeout, args, None, e) raise e else: raise exception else: return value
def _load_from_database(self): # FIND ALL TABLES result = self.db.query(sql_query({ "from": "sqlite_master", "where": {"eq": {"type": "table"}}, "orderby": "name" })) tables = wrap([{k: d for k, d in zip(result.header, row)} for row in result.data]) last_nested_path = ["."] for table in tables: if table.name.startswith("__"): continue base_table, nested_path = tail_field(table.name) # FIND COMMON NESTED PATH SUFFIX if nested_path == ".": last_nested_path = [] else: for i, p in enumerate(last_nested_path): if startswith_field(nested_path, p): last_nested_path = last_nested_path[i:] break else: last_nested_path = [] full_nested_path = [nested_path] + last_nested_path self._snowflakes[literal_field(base_table)] += [full_nested_path] # LOAD THE COLUMNS details = self.db.about(table.name) for cid, name, dtype, notnull, dfft_value, pk in details: if name.startswith("__"): continue cname, ctype = untyped_column(name) self.add(Column( name=cname, jx_type=coalesce(sql_type_to_json_type.get(ctype), IS_NULL), nested_path=full_nested_path, es_type=dtype, es_column=name, es_index=table.name, last_updated=Date.now() )) last_nested_path = full_nested_path
def __init__(
    self,
    hg=None,          # CONNECT TO hg
    repo=None,        # CONNECTION INFO FOR ES CACHE
    branches=None,    # CONNECTION INFO FOR ES CACHE
    use_cache=False,  # True IF WE WILL USE THE ES FOR DOWNLOADING BRANCHES
    timeout=30 * SECOND,
    kwargs=None
):
    if not _hg_branches:
        _late_imports()

    self.es_locker = Lock()
    self.todo = mo_threads.Queue("todo for hg daemon", max=DAEMON_QUEUE_SIZE)

    self.settings = kwargs
    self.timeout = Duration(timeout)

    # VERIFY CONNECTIVITY
    with Explanation("Test connect with hg"):
        response = http.head(self.settings.hg.url)

    if branches == None:
        self.branches = _hg_branches.get_branches(kwargs=kwargs)
        self.es = None
        return

    self.last_cache_miss = Date.now()

    set_default(repo, {"schema": revision_schema})
    self.es = elasticsearch.Cluster(kwargs=repo).get_or_create_index(kwargs=repo)

    def setup_es(please_stop):
        with suppress_exception:
            self.es.add_alias()
        with suppress_exception:
            self.es.set_refresh_interval(seconds=1)

    Thread.run("setup_es", setup_es)
    self.branches = _hg_branches.get_branches(kwargs=kwargs)
    self.timeout = timeout
    Thread.run("hg daemon", self._daemon)
def _daemon(self, please_stop): while not please_stop: with Explanation("looking for work"): try: branch, revisions = self.todo.pop(till=please_stop) except Exception as e: if please_stop: break else: raise e if branch.name in DAEMON_DO_NO_SCAN: continue revisions = set(revisions) # FIND THE REVSIONS ON THIS BRANCH for r in list(revisions): try: rev = self.get_revision(Revision(branch=branch, changeset={"id": r})) if DAEMON_DEBUG: Log.note("found revision with push date {{date|datetime}}", date=rev.push.date) revisions.discard(r) if rev.etl.timestamp > Date.now() - (DAEMON_RECENT_HG_PULL * SECOND): # SOME PUSHES ARE BIG, RUNNING THE RISK OTHER MACHINES ARE # ALSO INTERESTED AND PERFORMING THE SAME SCAN. THIS DELAY # WILL HAVE SMALL EFFECT ON THE MAJORITY OF SMALL PUSHES # https://bugzilla.mozilla.org/show_bug.cgi?id=1417720 Till(seconds=Random.float(DAEMON_HG_INTERVAL*2)).wait() except Exception as e: Log.warning( "Scanning {{branch}} {{revision|left(12)}}", branch=branch.name, revision=r, cause=e ) if "Read timed out" in e: Till(seconds=DAEMON_WAIT_AFTER_TIMEOUT).wait() # FIND ANY BRANCH THAT MAY HAVE THIS REVISION for r in list(revisions): self._find_revision(r)
def _load_from_database(self): # FIND ALL TABLES result = self.db.query( "SELECT * FROM sqlite_master WHERE type='table' ORDER BY name") tables = wrap([{k: d for k, d in zip(result.header, row)} for row in result.data]) last_nested_path = [] for table in tables: if table.name.startswith("__"): continue base_table, nested_path = tail_field(table.name) # FIND COMMON NESTED PATH SUFFIX for i, p in enumerate(last_nested_path): if startswith_field(nested_path, p): last_nested_path = last_nested_path[i:] break else: last_nested_path = [] full_nested_path = [nested_path] + last_nested_path self._snowflakes[literal_field(base_table)] += [full_nested_path] # LOAD THE COLUMNS command = "PRAGMA table_info" + sql_iso(quote_column(table.name)) details = self.db.query(command) for cid, name, dtype, notnull, dfft_value, pk in details.data: if name.startswith("__"): continue cname, ctype = untyped_column(name) self.add( Column(name=cname, jx_type=coalesce(sql_type_to_json_type.get(ctype), IS_NULL), nested_path=full_nested_path, es_type=dtype, es_column=name, es_index=table.name, last_updated=Date.now())) last_nested_path = full_nested_path
def output(*args):
    with cache_store.locker:
        if using_self:
            self = args[0]
            args = args[1:]
        else:
            self = cache_store

        now = Date.now()
        try:
            _cache = getattr(self, attr_name)
        except Exception:
            _cache = {}
            setattr(self, attr_name, _cache)

        if Random.int(100) == 0:
            # REMOVE OLD CACHE
            _cache = {k: v for k, v in _cache.items() if v[0] == None or v[0] > now}
            setattr(self, attr_name, _cache)

        timeout, key, value, exception = _cache.get(args, (Null, Null, Null, Null))
def test_mode_wait(query): """ WAIT FOR METADATA TO ARRIVE ON INDEX :param query: dict() OF REQUEST BODY :return: nothing """ if not query["from"]: return try: if query["from"].startswith("meta."): return now = Date.now() alias = split_field(query["from"])[0] metadata_manager = find_container(alias).namespace metadata_manager.meta.tables[ alias].timestamp = now # TRIGGER A METADATA RELOAD AFTER THIS TIME timeout = Till(seconds=MINUTE.seconds) while not timeout: # GET FRESH VERSIONS cols = [ c for c in metadata_manager.get_columns( table_name=alias, after=now, timeout=timeout) if c.jx_type not in STRUCT ] for c in cols: if now >= c.last_updated: Log.note( "wait for column (table={{col.es_index}}, name={{col.es_column}}) metadata to arrive", col=c) break else: break Till(seconds=1).wait() except Exception as e: Log.warning("could not pickup columns", cause=e)
def __init__( self, service_url, # location of the ActiveData server we are testing backend_es, # the ElasticSearch settings for filling the backend sql_url=None, # location of the SQL service fast_testing=False, kwargs=None ): if backend_es.schema==None: Log.error("Expecting backed_es to have a schema defined") letters = unicode(ascii_lowercase) self.random_letter = letters[int(Date.now().unix / 30) % 26] self.service_url = service_url self.sql_url = sql_url self.backend_es = backend_es self.settings = kwargs self._es_test_settings = None self._es_cluster = None self._index = None if not containers.config.default: containers.config.default = { "type": "elasticsearch", "settings": backend_es } if not fast_testing: self.server = http else: Log.alert("TESTS WILL RUN FAST, BUT NOT ALL TESTS ARE RUN!\nEnsure the `file://tests/config/elasticsearch.json#fastTesting=true` to turn on the network response tests.") # WE WILL USE THE ActiveServer CODE, AND CONNECT TO ES DIRECTLY. # THIS MAKES FOR SLIGHTLY FASTER TEST TIMES BECAUSE THE PROXY IS # MISSING self.server = FakeHttp() containers.config.default = { "type": "elasticsearch", "settings": kwargs.backend_es.copy() }
def get_branches(hg, branches, kwargs=None): # TRY ES cluster = elasticsearch.Cluster(branches) try: es = cluster.get_index(kwargs=branches, read_only=False) esq = jx_elasticsearch.new_instance(branches) found_branches = esq.query({ "from": branches.index, "format": "list", "limit": 10000 }).data # IF IT IS TOO OLD, THEN PULL FROM HG oldest = Date(MAX(found_branches.etl.timestamp)) if oldest == None or Date.now() - oldest > OLD_BRANCH: found_branches = _get_branches_from_hg(hg) es.extend([{ "id": b.name + " " + b.locale, "value": b } for b in found_branches]) es.flush() try: return UniqueIndex(["name", "locale"], data=found_branches, fail_on_dup=False) except Exception as e: Log.error("Bad branch in ES index", cause=e) except Exception as e: e = Except.wrap(e) if "Can not find index " in e: branches.schema = branches_schema es = cluster.get_or_create_index(branches) es.add_alias() return get_branches(kwargs) Log.error("problem getting branches", cause=e)
def test_chunk_timing(self): if self.not_real_service(): return test = wrap({"query": { "from": { "type": "elasticsearch", "settings": { "host": ES_CLUSTER_LOCATION, "index": "unittest", "type": "test_result" } }, "select": {"value": "run.stats.duration", "aggregate": "average"}, "edges": [ {"name": "chunk", "value": ["run.suite", "run.chunk"]} ], "where": {"and": [ {"term": {"etl.id": 0}}, {"gte": {"timestamp": Date.floor(Date.now() - (Duration.DAY * 7), Duration.DAY).milli / 1000}} ]}, "format": "cube", "samples": { "limit": 30 } }}) query = convert.unicode2utf8(convert.value2json(test.query)) # EXECUTE QUERY with Timer("query"): response = self.utils.try_till_response(self.service_url, data=query) if response.status_code != 200: error(response) result = convert.json2value(convert.utf82unicode(response.all_content)) Log.note("result\n{{result|indent}}", {"result": result})
def __init__(self, host, index, alias=None, name=None, port=9200, kwargs=None): if hasattr(self, "settings"): return self.too_old = TOO_OLD self.settings = kwargs self.default_name = coalesce(name, alias, index) self.default_es = elasticsearch.Cluster(kwargs=kwargs) self.index_does_not_exist = set() self.todo = Queue("refresh metadata", max=100000, unique=True) self.es_metadata = Null self.abs_columns = set() self.last_es_metadata = Date.now() - OLD_METADATA self.meta = Data() table_columns = metadata_tables() column_columns = metadata_columns() self.meta.tables = ListContainer( "meta.tables", [], wrap({c.names["."]: c for c in table_columns})) self.meta.columns = ColumnList() self.meta.columns.insert(column_columns) self.meta.columns.insert(table_columns) # TODO: fix monitor so it does not bring down ES if ENABLE_META_SCAN: self.worker = Thread.run("refresh metadata", self.monitor) else: self.worker = Thread.run("refresh metadata", self.not_monitor) return
def work(please_stop):
    started.go()
    while not please_stop:
        acc.append(Date.now().unix)
        Till(seconds=0.1).wait()
        })
        try:
            query = wrap(result.data).query
            if len(query) == 0:
                return None
        except Exception as e:
            return None

        self.es.update({
            "update": {
                "type": "elasticsearch",
                "settings": self.es.settings
            },
            "set": {
                "last_used": Date.now()
            },
            "where": {
                "eq": {
                    "hash": hash
                }
            }
        })

        return query[0]

    def save(self, query):
        query.meta = None
        json = convert.value2json(query)
        hash = convert.unicode2utf8(json)
def _get_schema_from_list(
    frum,  # The list
    table_name,  # Name of the table this list holds records for
    parent,  # parent path
    nested_path,  # each nested array, in reverse order
    columns,  # map from full name to column definition
    native_type_to_json_type  # dict from storage type name to json type name
):
    for d in frum:
        row_type = python_type_to_json_type[d.__class__]
        if row_type != "object":
            # EXPECTING PRIMITIVE VALUE
            full_name = parent
            column = columns[full_name]
            if not column:
                column = Column(
                    name=concat_field(table_name, full_name),
                    es_column=full_name,
                    es_index=".",
                    es_type=d.__class__.__name__,
                    jx_type=None,  # WILL BE SET BELOW
                    last_updated=Date.now(),
                    nested_path=nested_path,
                )
                columns.add(column)
            column.es_type = _merge_python_type(column.es_type, d.__class__)
            column.jx_type = native_type_to_json_type[column.es_type]
        else:
            for name, value in d.items():
                full_name = concat_field(parent, name)
                column = columns[full_name]
                if not column:
                    column = Column(
                        name=concat_field(table_name, full_name),
                        es_column=full_name,
                        es_index=".",
                        es_type=value.__class__.__name__,
                        jx_type=None,  # WILL BE SET BELOW
                        last_updated=Date.now(),
                        nested_path=nested_path,
                    )
                    columns.add(column)
                if is_container(value):
                    # GET TYPE OF MULTIVALUE
                    v = list(value)
                    if len(v) == 0:
                        this_type = none_type.__name__
                    elif len(v) == 1:
                        this_type = v[0].__class__.__name__
                    else:
                        this_type = reduce(_merge_python_type, (vi.__class__.__name__ for vi in value))
                else:
                    this_type = value.__class__.__name__
                column.es_type = _merge_python_type(column.es_type, this_type)
                try:
                    column.jx_type = native_type_to_json_type[column.es_type]
                except Exception as e:
                    raise e

                if this_type in {"object", "dict", "Mapping", "Data"}:
                    _get_schema_from_list([value], table_name, full_name, nested_path, columns, native_type_to_json_type)
                elif this_type in {"list", "FlatList"}:
                    np = listwrap(nested_path)
                    newpath = unwraplist([join_field(split_field(np[0]) + [name])] + np)
                    _get_schema_from_list(value, table_name, full_name, newpath, columns, native_type_to_json_type)
_get_schema_from_list(value, table_name, full_name, newpath, columns) def get_id(column): """ :param column: :return: Elasticsearch id for column """ return column.es_index + "|" + column.es_column META_COLUMNS_DESC = TableDesc(name=META_COLUMNS_NAME, url=None, query_path=ROOT_PATH, last_updated=Date.now(), columns=wrap([ Column( name=c, es_index=META_COLUMNS_NAME, es_column=c, es_type="keyword", jx_type=STRING, last_updated=Date.now(), nested_path=ROOT_PATH, ) for c in [ "name", "es_type", "jx_type", "nested_path", "es_column",
def extract(self, settings, force, restart, start, merge): if not settings.extractor.app_name: Log.error("Expecting an extractor.app_name in config file") # SETUP DESTINATION destination = bigquery.Dataset( dataset=settings.extractor.app_name, kwargs=settings.destination).get_or_create_table( settings.destination) try: if merge: with Timer("merge shards"): destination.merge_shards() # RECOVER LAST SQL STATE redis = Redis.from_url(REDIS_URL) state = redis.get(settings.extractor.key) if start: state = start, 0 elif restart or not state: state = (0, 0) redis.set(settings.extractor.key, value2json(state).encode("utf8")) else: state = json2value(state.decode("utf8")) last_modified, job_id = state # SCAN SCHEMA, GENERATE EXTRACTION SQL extractor = MySqlSnowflakeExtractor(settings.source) canonical_sql = extractor.get_sql(SQL("SELECT 0")) # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN old_sql = redis.get(settings.extractor.sql) if old_sql and old_sql.decode("utf8") != canonical_sql.sql: if force: Log.warning("Schema has changed") else: Log.error("Schema has changed") redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8")) # SETUP SOURCE source = MySQL(settings.source.database) while True: Log.note( "Extracting jobs for last_modified={{last_modified|datetime|quote}}, job.id={{job_id}}", last_modified=last_modified, job_id=job_id, ) # Example: job.id ==283890114 # get_ids = ConcatSQL( # (SQL_SELECT, sql_alias(quote_value(283890114), "id")) # ) get_ids = sql_query({ "from": "job", "select": ["id"], "where": { "or": [ { "gt": { "last_modified": Date(last_modified) } }, { "and": [ { "eq": { "last_modified": Date(last_modified) } }, { "gt": { "id": job_id } }, ] }, ] }, "sort": ["last_modified", "id"], "limit": settings.extractor.chunk_size, }) sql = extractor.get_sql(get_ids) # PULL FROM source, AND PUSH TO destination acc = [] with source.transaction(): cursor = source.query(sql, stream=True, row_tuples=True) extractor.construct_docs(cursor, acc.append, False) if not acc: break # SOME LIMITS PLACES ON STRING SIZE for fl in jx.drill(acc, "job_log.failure_line"): fl.message = strings.limit(fl.message, 10000) for r in acc: r.etl.timestamp = Date.now() destination.extend(acc) # RECORD THE STATE last_doc = acc[-1] last_modified, job_id = last_doc.last_modified, last_doc.id redis.set( settings.extractor.key, value2json((last_modified, job_id)).encode("utf8"), ) if len(acc) < settings.extractor.chunk_size: break except Exception as e: Log.warning("problem with extraction", cause=e) Log.note("done job extraction") try: with Timer("merge shards"): destination.merge_shards() except Exception as e: Log.warning("problem with merge", cause=e) Log.note("done job merge")
def doc_to_column(doc): now = Date.now() try: doc = to_data(untyped(doc)) # I HAVE MANAGED TO MAKE MANY MISTAKES WRITING COLUMNS TO ES. HERE ARE THE FIXES # FIX if not doc.last_updated: doc.last_updated = Date.now() - YEAR # FIX if doc.es_type == None: if doc.jx_type == OBJECT: doc.es_type = "object" else: Log.warning("{{doc}} has no es_type", doc=doc) # FIX if doc.es_type == "nested": doc.multi = 1001 if doc.multi == None: doc.multi = 1 # FIX if doc.es_column.endswith("." + NESTED_TYPE): if doc.jx_type == OBJECT: doc.jx_type = NESTED doc.last_updated = now if doc.es_type == "nested": doc.es_type = "nested" doc.last_updated = now # FIX doc.nested_path = tuple(listwrap(doc.nested_path)) if last(split_field( doc.es_column)) == NESTED_TYPE and doc.es_type != "nested": doc.es_type = "nested" doc.jx_type = NESTED doc.multi = 1001 doc.last_updated = now # FIX expected_nested_path = get_nested_path(doc.es_column) if len(doc.nested_path) > 1 and doc.nested_path[-2] == '.': doc.nested_path = doc.nested_path[:-1] doc.last_updated = now # FIX if untype_path(doc.es_column) == doc.es_column: if doc.nested_path != (".", ): if doc.es_index in {"repo"}: pass else: Log.note("not expected") doc.nested_path = expected_nested_path doc.last_updated = now else: if doc.nested_path != expected_nested_path: doc.nested_path = expected_nested_path doc.last_updated = now # FIX if last(split_field(doc.es_column)) == EXISTS_TYPE: if doc.jx_type != EXISTS: doc.jx_type = EXISTS doc.last_updated = now if doc.cardinality == None: doc.cardinality = 1 doc.last_updated = now # FIX if doc.jx_type in STRUCT: if doc.cardinality not in [0, 1]: doc.cardinality = 1 # DO NOT KNOW IF EXISTS OR NOT doc.last_updated = now return Column(**doc) except Exception as e: try: mark_as_deleted(Column(**doc), now) except Exception: pass return None
def update(self, command): self.dirty = True try: command = to_data(command) DEBUG and Log.note( "Update {{timestamp}}: {{command|json}}", command=command, timestamp=Date(command["set"].last_updated), ) eq = command.where.eq if eq.es_index: if len(eq) == 1: if unwraplist(command.clear) == ".": d = self.data i = eq.es_index with self.locker: cols = d[i] del d[i] for c in cols: self.remove(c) return # FASTEST all_columns = self.data.get(eq.es_index, {}).values() with self.locker: columns = [c for cs in all_columns for c in cs] elif eq.es_column and len(eq) == 2: # FASTER all_columns = self.data.get(eq.es_index, {}).values() with self.locker: columns = [ c for cs in all_columns for c in cs if c.es_column == eq.es_column ] else: # SLOWER all_columns = self.data.get(eq.es_index, {}).values() with self.locker: columns = [ c for cs in all_columns for c in cs if all(c[k] == v for k, v in eq.items()) # THIS LINE IS VERY SLOW ] else: columns = list(self) columns = jx.filter(columns, command.where) with self.locker: for col in columns: DEBUG and Log.note( "update column {{table}}.{{column}}", table=col.es_index, column=col.es_column, ) for k in command["clear"]: if k == ".": mark_as_deleted(col, Date.now()) self.for_es_update.add(col) lst = self.data[col.es_index] cols = lst[col.name] cols.remove(col) if len(cols) == 0: del lst[col.name] if len(lst) == 0: del self.data[col.es_index] break else: col[k] = None else: # DID NOT DELETE COLUMNM ("."), CONTINUE TO SET PROPERTIES for k, v in command.set.items(): col[k] = v self.for_es_update.add(col) except Exception as e: Log.error("should not happen", cause=e)
np = listwrap(nested_path) newpath = unwraplist([join_field(split_field(np[0]) + [name])] + np) _get_schema_from_list( value, table_name, full_name, newpath, columns ) METADATA_COLUMNS = ( [ Column( name=c, es_index="meta.columns", es_column=c, es_type="keyword", jx_type=STRING, last_updated=Date.now(), nested_path=ROOT_PATH, ) for c in [ "name", "es_type", "jx_type", "nested_path", "es_column", "es_index", "partitions", ] ] + [ Column( name=c,
def _update_cardinality(self, c):
    """
    QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
    """
    if c.type in STRUCT:
        Log.error("not supported")
    try:
        if c.es_index == "meta.columns":
            with self.meta.columns.locker:
                partitions = jx.sort([g[c.es_column] for g, _ in jx.groupby(self.meta.columns, c.es_column) if g[c.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.columns),
                        "cardinality": len(partitions),
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
            return
        if c.es_index == "meta.tables":
            with self.meta.columns.locker:
                partitions = jx.sort([g[c.es_column] for g, _ in jx.groupby(self.meta.tables, c.es_column) if g[c.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.tables),
                        "cardinality": len(partitions),
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
            return

        es_index = c.es_index.split(".")[0]
        result = self.default_es.post("/" + es_index + "/_search", data={
            "aggs": {c.names["."]: _counting_query(c)},
            "size": 0
        })
        r = result.aggregations.values()[0]
        count = result.hits.total
        cardinality = coalesce(r.value, r._nested.value, 0 if r.doc_count == 0 else None)
        if cardinality == None:
            Log.error("logic error")

        query = Data(size=0)
        if cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
            if DEBUG:
                Log.note("{{table}}.{{field}} has {{num}} parts", table=c.es_index, field=c.es_column, num=cardinality)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
            return
        elif c.type in _elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
            if DEBUG:
                Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
            return
        elif len(c.nested_path) != 1:
            query.aggs[literal_field(c.names["."])] = {
                "nested": {"path": c.nested_path[0]},
                "aggs": {"_nested": {"terms": {"field": c.es_column, "size": 0}}}
            }
        else:
            query.aggs[literal_field(c.names["."])] = {"terms": {"field": c.es_column, "size": 0}}

        result = self.default_es.post("/" + es_index + "/_search", data=query)
        aggs = result.aggregations.values()[0]
        if aggs._nested:
            parts = jx.sort(aggs._nested.buckets.key)
        else:
            parts = jx.sort(aggs.buckets.key)

        if DEBUG:
            Log.note("{{field}} has {{parts}}", field=c.name, parts=parts)
        with self.meta.columns.locker:
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "partitions": parts,
                    "last_updated": Date.now()
                },
                "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
            })
    except Exception as e:
        if "IndexMissingException" in e and c.es_index.startswith(TEST_TABLE_PREFIX):
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": 0,
                        "cardinality": 0,
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "partitions"
                    ],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
        else:
            self.meta.columns.update({
                "set": {
                    "last_updated": Date.now()
                },
                "clear": [
                    "count",
                    "cardinality",
                    "partitions",
                ],
                "where": {"eq": {"names.\\.": ".", "es_index": c.es_index, "es_column": c.es_column}}
            })
            Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=c, cause=e)
def _normalize_revision(self, r, found_revision, push, get_diff, get_moves): new_names = set(r.keys()) - KNOWN_TAGS if new_names and not r.tags: Log.warning( "hg is returning new property names {{names|quote}} for {{changeset}} from {{url}}", names=new_names, changeset=r.node, url=found_revision.branch.url ) changeset = Changeset( id=r.node, id12=r.node[0:12], author=r.user, description=strings.limit(coalesce(r.description, r.desc), 2000), date=parse_hg_date(r.date), files=r.files, backedoutby=r.backedoutby if r.backedoutby else None, bug=self._extract_bug_id(r.description) ) rev = Revision( branch=found_revision.branch, index=r.rev, changeset=changeset, parents=unwraplist(list(set(r.parents))), children=unwraplist(list(set(r.children))), push=push, phase=r.phase, bookmarks=unwraplist(r.bookmarks), landingsystem=r.landingsystem, etl={"timestamp": Date.now().unix, "machine": machine_metadata} ) r.pushuser = None r.pushdate = None r.pushid = None r.node = None r.user = None r.desc = None r.description = None r.date = None r.files = None r.backedoutby = None r.parents = None r.children = None r.bookmarks = None r.landingsystem = None set_default(rev, r) # ADD THE DIFF if get_diff: rev.changeset.diff = self._get_json_diff_from_hg(rev) if get_moves: rev.changeset.moves = self._get_moves_from_hg(rev) try: _id = coalesce(rev.changeset.id12, "") + "-" + rev.branch.name + "-" + coalesce(rev.branch.locale, DEFAULT_LOCALE) with self.es_locker: self.es.add({"id": _id, "value": rev}) except Exception as e: e = Except.wrap(e) Log.warning("Did not save to ES, waiting {{duration}} seconds", duration=WAIT_AFTER_NODE_FAILURE, cause=e) Till(seconds=WAIT_AFTER_NODE_FAILURE).wait() if "FORBIDDEN/12/index read-only" in e: pass # KNOWN FAILURE MODE return rev
def doc_to_column(doc):
    kwargs = set_default(untyped(doc), {"last_updated": Date.now() - YEAR})
    return Column(**wrap(kwargs))
def get_revision(self, revision, locale=None, get_diff=False, get_moves=True): """ EXPECTING INCOMPLETE revision OBJECT RETURNS revision """ rev = revision.changeset.id if not rev: return Null elif rev == "None": return Null elif revision.branch.name == None: return Null locale = coalesce(locale, revision.branch.locale, DEFAULT_LOCALE) output = self._get_from_elasticsearch(revision, locale=locale, get_diff=get_diff) if output: if not get_diff: # DIFF IS BIG, DO NOT KEEP IT IF NOT NEEDED output.changeset.diff = None if not get_moves: output.changeset.moves = None DEBUG and Log.note("Got hg ({{branch}}, {{locale}}, {{revision}}) from ES", branch=output.branch.name, locale=locale, revision=output.changeset.id) if output.push.date >= Date.now()-MAX_TODO_AGE: self.todo.add((output.branch, listwrap(output.parents))) self.todo.add((output.branch, listwrap(output.children))) if output.push.date: return output # RATE LIMIT CALLS TO HG (CACHE MISSES) next_cache_miss = self.last_cache_miss + (Random.float(WAIT_AFTER_CACHE_MISS * 2) * SECOND) self.last_cache_miss = Date.now() if next_cache_miss > self.last_cache_miss: Log.note("delaying next hg call for {{seconds|round(decimal=1)}}", seconds=next_cache_miss - self.last_cache_miss) Till(till=next_cache_miss.unix).wait() found_revision = copy(revision) if isinstance(found_revision.branch, (text_type, binary_type)): lower_name = found_revision.branch.lower() else: lower_name = found_revision.branch.name.lower() if not lower_name: Log.error("Defective revision? {{rev|json}}", rev=found_revision.branch) b = found_revision.branch = self.branches[(lower_name, locale)] if not b: b = found_revision.branch = self.branches[(lower_name, DEFAULT_LOCALE)] if not b: Log.warning("can not find branch ({{branch}}, {{locale}})", branch=lower_name, locale=locale) return Null if Date.now() - Date(b.etl.timestamp) > _OLD_BRANCH: self.branches = _hg_branches.get_branches(kwargs=self.settings) push = self._get_push(found_revision.branch, found_revision.changeset.id) url1 = found_revision.branch.url.rstrip("/") + "/json-info?node=" + found_revision.changeset.id[0:12] url2 = found_revision.branch.url.rstrip("/") + "/json-rev/" + found_revision.changeset.id[0:12] with Explanation("get revision from {{url}}", url=url1, debug=DEBUG): raw_rev2 = Null try: raw_rev1 = self._get_raw_json_info(url1, found_revision.branch) raw_rev2 = self._get_raw_json_rev(url2, found_revision.branch) except Exception as e: if "Hg denies it exists" in e: raw_rev1 = Data(node=revision.changeset.id) else: raise e output = self._normalize_revision(set_default(raw_rev1, raw_rev2), found_revision, push, get_diff, get_moves) if output.push.date >= Date.now()-MAX_TODO_AGE: self.todo.add((output.branch, listwrap(output.parents))) self.todo.add((output.branch, listwrap(output.children))) if not get_diff: # DIFF IS BIG, DO NOT KEEP IT IF NOT NEEDED output.changeset.diff = None if not get_moves: output.changeset.moves = None return output
def proto_name(prefix, timestamp=None):
    if not timestamp:
        timestamp = Date.now()
    else:
        timestamp = Date(timestamp)
    return prefix + timestamp.format(INDEX_DATE_FORMAT)
def _get_schema_from_list(frum, table_name, parent, nested_path, columns): """ :param frum: The list :param table_name: Name of the table this list holds records for :param parent: parent path :param nested_path: each nested array, in reverse order :param columns: map from full name to column definition :return: """ for d in frum: row_type = python_type_to_json_type[d.__class__] if row_type != "object": # EXPECTING PRIMITIVE VALUE full_name = parent column = columns[full_name] if not column: column = Column( name=concat_field(table_name, full_name), es_column=full_name, es_index=".", es_type=d.__class__.__name__, jx_type=None, # WILL BE SET BELOW last_updated=Date.now(), nested_path=nested_path, ) columns.add(column) column.es_type = _merge_python_type(column.es_type, d.__class__) column.jx_type = python_type_to_json_type[column.es_type] else: for name, value in d.items(): full_name = concat_field(parent, name) column = columns[full_name] if not column: column = Column( name=concat_field(table_name, full_name), es_column=full_name, es_index=".", es_type=value.__class__.__name__, jx_type=None, # WILL BE SET BELOW last_updated=Date.now(), nested_path=nested_path, ) columns.add(column) if is_container(value): # GET TYPE OF MULTIVALUE v = list(value) if len(v) == 0: this_type = none_type.__name__ elif len(v) == 1: this_type = v[0].__class__.__name__ else: this_type = reduce( _merge_python_type, (vi.__class__.__name__ for vi in value) ) else: this_type = value.__class__.__name__ column.es_type = _merge_python_type(column.es_type, this_type) column.jx_type = python_type_to_json_type[column.es_type] if this_type in {"object", "dict", "Mapping", "Data"}: _get_schema_from_list( [value], table_name, full_name, nested_path, columns ) elif this_type in {"list", "FlatList"}: np = listwrap(nested_path) newpath = unwraplist([join_field(split_field(np[0]) + [name])] + np) _get_schema_from_list( value, table_name, full_name, newpath, columns )
def _get_single_branch_from_hg(settings, description, dir): if dir == "users": return [] response = http.get(settings.url + "/" + dir) doc = BeautifulSoup(response.all_content, "html.parser") output = [] try: all_branches = doc("table")[0] except Exception: return [] for i, b in enumerate(all_branches("tr")): if i == 0: continue # IGNORE HEADER columns = b("td") try: path = columns[0].a.get('href') if path == "/": continue name, desc, last_used = [c.text.strip() for c in columns][0:3] if last_used.startswith('at'): last_used = last_used[2:] detail = Data( name=name.lower(), locale=DEFAULT_LOCALE, parent_name=description, url=settings.url + path, description=desc, last_used=Date(last_used), etl={"timestamp": Date.now()} ) if detail.description == "unknown": detail.description = None # SOME BRANCHES HAVE NAME COLLISIONS, IGNORE LEAST POPULAR if path in [ "/projects/dxr/", # moved to webtools "/build/compare-locales/", # ?build team likes to clone? "/build/puppet/", # ?build team likes to clone? "/SeaMonkey/puppet/", # looses the popularity contest "/releases/gaia-l10n/v1_2/en-US/", # use default branch "/releases/gaia-l10n/v1_3/en-US/", # use default branch "/releases/gaia-l10n/v1_4/en-US/", # use default branch "/releases/gaia-l10n/v2_0/en-US/", # use default branch "/releases/gaia-l10n/v2_1/en-US/", # use default branch "/build/autoland/" ]: continue # MARKUP BRANCH IF LOCALE SPECIFIC if path.startswith("/l10n-central"): _path = path.strip("/").split("/") detail.locale = _path[-1] detail.name = "mozilla-central" elif path.startswith("/releases/l10n/"): _path = path.strip("/").split("/") detail.locale = _path[-1] detail.name = _path[-2].lower() elif path.startswith("/releases/gaia-l10n/"): _path = path.strip("/").split("/") detail.locale = _path[-1] detail.name = "gaia-" + _path[-2][1::] elif path.startswith("/weave-l10n"): _path = path.strip("/").split("/") detail.locale = _path[-1] detail.name = "weave" if BRANCH_WHITELIST is not None: found = False for br in BRANCH_WHITELIST: if br in str(detail.name): found = True break if not found: continue Log.note("Branch {{name}} {{locale}}", name=detail.name, locale=detail.locale) output.append(detail) except Exception as e: Log.warning("branch digestion problem", cause=e) return output
def get_revision(self, revision, locale=None, get_diff=False): """ EXPECTING INCOMPLETE revision OBJECT RETURNS revision """ rev = revision.changeset.id if not rev: return Null elif rev == "None": return Null elif revision.branch.name == None: return Null locale = coalesce(locale, revision.branch.locale, DEFAULT_LOCALE) output = self._get_from_elasticsearch(revision, locale=locale, get_diff=get_diff) if output: if not get_diff: # DIFF IS BIG, DO NOT KEEP IT IF NOT NEEDED output.changeset.diff = None if DEBUG: Log.note("Got hg ({{branch}}, {{locale}}, {{revision}}) from ES", branch=output.branch.name, locale=locale, revision=output.changeset.id) if output.push.date >= Date.now()-MAX_TODO_AGE: self.todo.add((output.branch, listwrap(output.parents))) self.todo.add((output.branch, listwrap(output.children))) if output.push.date: return output found_revision = copy(revision) if isinstance(found_revision.branch, (text_type, binary_type)): lower_name = found_revision.branch.lower() else: lower_name = found_revision.branch.name.lower() if not lower_name: Log.error("Defective revision? {{rev|json}}", rev=found_revision.branch) b = found_revision.branch = self.branches[(lower_name, locale)] if not b: b = found_revision.branch = self.branches[(lower_name, DEFAULT_LOCALE)] if not b: Log.error("can not find branch ({{branch}}, {{locale}})", branch=lower_name, locale=locale) if Date.now() - Date(b.etl.timestamp) > _OLD_BRANCH: self.branches = _hg_branches.get_branches(kwargs=self.settings) push = self._get_push(found_revision.branch, found_revision.changeset.id) url1 = found_revision.branch.url.rstrip("/") + "/json-info?node=" + found_revision.changeset.id[0:12] url2 = found_revision.branch.url.rstrip("/") + "/json-rev/" + found_revision.changeset.id[0:12] with Explanation("get revision from {{url}}", url=url1, debug=DEBUG): raw_rev2 = Null try: raw_rev1 = self._get_raw_json_info(url1, found_revision.branch) raw_rev2 = self._get_raw_json_rev(url2, found_revision.branch) except Exception as e: if "Hg denies it exists" in e: raw_rev1 = Data(node=revision.changeset.id) else: raise e output = self._normalize_revision(set_default(raw_rev1, raw_rev2), found_revision, push, get_diff) if output.push.date >= Date.now()-MAX_TODO_AGE: self.todo.add((output.branch, listwrap(output.parents))) self.todo.add((output.branch, listwrap(output.children))) if not get_diff: # DIFF IS BIG, DO NOT KEEP IT IF NOT NEEDED output.changeset.diff = None return output
def take_lock(please_stop):
    with locker:
        locker.wait(Till(seconds=1))
        locker.wait(Till(seconds=1))
        locker.wait(Till(till=(Date.now() + SECOND).unix))
def mark_as_deleted(col):
    col.count = 0
    col.cardinality = 0
    col.multi = 0
    col.partitions = None
    col.last_updated = Date.now()
def _update_cardinality(self, c):
    """
    QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
    """
    if c.type in STRUCT:
        Log.error("not supported")
    try:
        if c.es_index == "meta.columns":
            with self.meta.columns.locker:
                partitions = jx.sort([g[c.es_column] for g, _ in jx.groupby(self.meta.columns, c.es_column) if g[c.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.columns),
                        "cardinality": len(partitions),
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
            return
        if c.es_index == "meta.tables":
            with self.meta.columns.locker:
                partitions = jx.sort([g[c.es_column] for g, _ in jx.groupby(self.meta.tables, c.es_column) if g[c.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.tables),
                        "cardinality": len(partitions),
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
            return

        es_index = c.es_index.split(".")[0]
        result = self.default_es.post("/" + es_index + "/_search", data={
            "aggs": {c.names["."]: _counting_query(c)},
            "size": 0
        })
        r = result.aggregations.values()[0]
        count = result.hits.total
        cardinality = coalesce(r.value, r._nested.value, 0 if r.doc_count == 0 else None)
        if cardinality == None:
            Log.error("logic error")

        query = Data(size=0)
        if cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
            if DEBUG:
                Log.note("{{table}}.{{field}} has {{num}} parts", table=c.es_index, field=c.es_column, num=cardinality)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
            return
        elif c.type in _elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
            if DEBUG:
                Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
            return
        elif len(c.nested_path) != 1:
            query.aggs[literal_field(c.names["."])] = {
                "nested": {"path": c.nested_path[0]},
                "aggs": {"_nested": {"terms": {"field": c.es_column, "size": 0}}}
            }
        else:
            query.aggs[literal_field(c.names["."])] = {"terms": {"field": c.es_column, "size": 0}}

        result = self.default_es.post("/" + es_index + "/_search", data=query)
        aggs = result.aggregations.values()[0]
        if aggs._nested:
            parts = jx.sort(aggs._nested.buckets.key)
        else:
            parts = jx.sort(aggs.buckets.key)

        if DEBUG:
            Log.note("{{field}} has {{parts}}", field=c.names["."], parts=parts)
        with self.meta.columns.locker:
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "partitions": parts,
                    "last_updated": Date.now()
                },
                "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
            })
    except Exception as e:
        if "IndexMissingException" in e and c.es_index.startswith(TEST_TABLE_PREFIX):
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": 0,
                        "cardinality": 0,
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "partitions"
                    ],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
        else:
            self.meta.columns.update({
                "set": {
                    "last_updated": Date.now()
                },
                "clear": [
                    "count",
                    "cardinality",
                    "partitions",
                ],
                "where": {"eq": {"names.\\.": ".", "es_index": c.es_index, "es_column": c.es_column}}
            })
            Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=c, cause=e)
def _get_single_branch_from_hg(settings, description, dir): if dir == "users": return [] response = http.get(settings.url + "/" + dir) doc = BeautifulSoup(response.all_content, "html.parser") output = [] try: all_branches = doc("table")[0] except Exception: return [] for i, b in enumerate(all_branches("tr")): if i == 0: continue # IGNORE HEADER columns = b("td") try: path = columns[0].a.get('href') if path == "/": continue name, desc, last_used = [c.text.strip() for c in columns][0:3] if last_used.startswith('at'): last_used = last_used[2:] detail = Data( name=name.lower(), locale=DEFAULT_LOCALE, parent_name=description, url=settings.url + path, description=desc, last_used=Date(last_used), etl={"timestamp": Date.now()} ) if detail.description == "unknown": detail.description = None # SOME BRANCHES HAVE NAME COLLISIONS, IGNORE LEAST POPULAR if path in [ "/projects/dxr/", # moved to webtools "/build/compare-locales/", # ?build team likes to clone? "/build/puppet/", # ?build team likes to clone? "/SeaMonkey/puppet/", # looses the popularity contest "/releases/gaia-l10n/v1_2/en-US/", # use default branch "/releases/gaia-l10n/v1_3/en-US/", # use default branch "/releases/gaia-l10n/v1_4/en-US/", # use default branch "/releases/gaia-l10n/v2_0/en-US/", # use default branch "/releases/gaia-l10n/v2_1/en-US/", # use default branch "/build/autoland/" ]: continue # MARKUP BRANCH IF LOCALE SPECIFIC if path.startswith("/l10n-central"): _path = path.strip("/").split("/") detail.locale = _path[-1] detail.name = "mozilla-central" elif path.startswith("/releases/l10n/"): _path = path.strip("/").split("/") detail.locale = _path[-1] detail.name = _path[-2].lower() elif path.startswith("/releases/gaia-l10n/"): _path = path.strip("/").split("/") detail.locale = _path[-1] detail.name = "gaia-" + _path[-2][1::] elif path.startswith("/weave-l10n"): _path = path.strip("/").split("/") detail.locale = _path[-1] detail.name = "weave" Log.note("Branch {{name}} {{locale}}", name=detail.name, locale=detail.locale) output.append(detail) except Exception as e: Log.warning("branch digestion problem", cause=e) return output
def columns(self): if not self._columns: now = Date.now() columns = [] def parse_schema(schema, tops, es_type_info, jx_path, nested_path, es_path): if is_text(schema): json_type = schema expected_es_type = json_type_to_bq_type[json_type] if es_type_info and es_type_info != expected_es_type: Log.error( "expecting {{path}} to be of type {{expected_type}} not of type {{observed_type}}", path=jx_path, expected_type=expected_es_type, observed_type=es_type_info ) c = jx_base.Column( name=join_field(jx_path), es_column=coalesce(tops, text(es_path)), es_index=self.es_index, es_type=coalesce(es_type_info, expected_es_type), jx_type=json_type, nested_path=nested_path, last_updated=now, ) columns.append(c) else: c = jx_base.Column( name=join_field(jx_path), es_column=text(es_path), es_index=self.es_index, es_type="RECORD", jx_type=OBJECT, nested_path=nested_path, last_updated=now, ) columns.append(c) count = len(columns) for k, s in schema.items(): if k == NESTED_TYPE: c.jx_type = NESTED parse_schema( s, tops if is_text(tops) else tops[k], es_type_info if is_text(es_type_info) else es_type_info[k], jx_path + (k,), (jx_path,) + nested_path, es_path + escape_name(k), ) else: parse_schema( s, tops if is_text(tops) else tops[k], es_type_info if is_text(es_type_info) else es_type_info[k], jx_path + (k,), nested_path, es_path + escape_name(k), ) if is_text(tops) and len(columns) > count + 1: Log.error( "too many top level fields at {{field}}:", field=join_field(jx_path), ) parse_schema( self.schema, self.top_level_fields, self._es_type_info, (), (".",), ApiName(), ) self._columns = columns self._top_level_fields = {} for path, field in wrap(self.top_level_fields).leaves(): leaves = self.leaves(path) if not leaves: continue if len(leaves) > 1: Log.error( "expecting {{path}} to have just one primitive value", path=path ) specific_path = first(leaves).name self._top_level_fields[ ".".join(map(text, map(escape_name, split_field(specific_path)))) ] = field self._partition = Partition(kwargs=self.partition, flake=self) return self._columns