def _daemon(self, please_stop):
    while not please_stop:
        with Explanation("looking for work"):
            try:
                branch, revisions = self.todo.pop(till=please_stop)
            except Exception as e:
                if please_stop:
                    break
                else:
                    raise e
            if branch.name in DAEMON_DO_NO_SCAN:
                continue
            revisions = set(revisions)

            # FIND THE REVISIONS ON THIS BRANCH
            for r in list(revisions):
                with Explanation("Scanning {{branch}} {{revision|left(12)}}", branch=branch.name, revision=r, debug=DAEMON_DEBUG):
                    rev = self.get_revision(Revision(branch=branch, changeset={"id": r}))
                    if DAEMON_DEBUG:
                        Log.note("found revision with push date {{date|datetime}}", date=rev.push.date)
                    revisions.discard(r)

            # FIND ANY BRANCH THAT MAY HAVE THIS REVISION
            for r in list(revisions):
                self._find_revision(r)
def __init__(self, kwargs=None):
    self.settings = kwargs
    excludes = listwrap(self.settings.exclude)
    self.settings.exclude = set(e for e in excludes if len(split_field(e)) == 1)
    self.settings.exclude_columns = set(p for e in excludes for p in [tuple(split_field(e))] if len(p) > 1)
    self.settings.exclude_path = list(map(split_field, listwrap(self.settings.exclude_path)))
    self.settings.show_foreign_keys = coalesce(self.settings.show_foreign_keys, True)
    self.name_relations = unwrap(coalesce(self.settings.name_relations, {}))

    self.all_nested_paths = None
    self.nested_path_to_join = None
    self.columns = None

    with Explanation("scan database", debug=DEBUG):
        self.db = MySQL(**kwargs.database)
        self.settings.database.schema = self.db.settings.schema
        with self.db.transaction():
            self._scan_database()
    if not self.settings.database.schema:
        Log.error("you must provide a `database.schema`")
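# A minimal configuration sketch for the constructor above; only keys that
# __init__ actually reads are shown, and every value is illustrative rather
# than taken from a real deployment.
EXAMPLE_EXTRACT_SETTINGS = {
    "database": {                              # passed straight to MySQL(**kwargs.database)
        "host": "localhost",
        "port": 3306,
        "username": "extract_user",
        "password": "secret",
        "schema": "test_db",                   # constructor errors if database.schema ends up empty
    },
    "exclude": ["audit_log", "task.owner"],    # single-segment names become `exclude`, dotted paths become `exclude_columns`
    "exclude_path": ["task.subtasks"],         # paths kept out of the nested traversal
    "show_foreign_keys": False,                # defaults to True when omitted
    "name_relations": {},                      # optional relation-renaming map
}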
def _get_url(url, branch, **kwargs):
    with Explanation("get push from {{url}}", url=url, debug=DEBUG):
        response = http.get(url, **kwargs)
        data = json2value(response.content.decode("utf8"))
        if isinstance(data, (text_type, str)) and data.startswith("unknown revision"):
            Log.error("Unknown push {{revision}}", revision=strings.between(data, "'", "'"))
        branch.url = _trim(url)  # RECORD THIS SUCCESS IN THE BRANCH
        return data
def _daemon(self, please_stop):
    while not please_stop:
        with Explanation("looking for work"):
            try:
                branch, revisions, after = self.todo.pop(till=please_stop)
            except Exception as e:
                if please_stop:
                    break
                else:
                    raise e
            if branch.name in DAEMON_DO_NO_SCAN:
                continue
            revisions = set(revisions)

            # FIND THE REVISIONS ON THIS BRANCH
            for r in list(revisions):
                try:
                    rev = self.get_revision(
                        Revision(branch=branch, changeset={"id": r}),
                        None,   # locale
                        False,  # get_diff
                        True,   # get_moves
                    )
                    if after and after > rev.etl.timestamp:
                        rev = self._get_from_hg(revision=rev)
                    if DAEMON_DEBUG:
                        Log.note(
                            "found revision with push date {{date|datetime}}",
                            date=rev.push.date,
                        )
                    revisions.discard(r)

                    if rev.etl.timestamp > Date.now() - (DAEMON_RECENT_HG_PULL * SECOND):
                        # SOME PUSHES ARE BIG, RUNNING THE RISK OTHER MACHINES ARE
                        # ALSO INTERESTED AND PERFORMING THE SAME SCAN. THIS DELAY
                        # WILL HAVE SMALL EFFECT ON THE MAJORITY OF SMALL PUSHES
                        # https://bugzilla.mozilla.org/show_bug.cgi?id=1417720
                        Till(seconds=Random.float(DAEMON_HG_INTERVAL * 2)).wait()
                except Exception as e:
                    Log.warning(
                        "Scanning {{branch}} {{revision|left(12)}}",
                        branch=branch.name,
                        revision=r,
                        cause=e,
                    )
                    if "Read timed out" in e:
                        Till(seconds=DAEMON_WAIT_AFTER_TIMEOUT).wait()

            # FIND ANY BRANCH THAT MAY HAVE THIS REVISION
            for r in list(revisions):
                self._find_revision(r)
def _get_url(url, branch, **kwargs):
    with Explanation("get push from {{url}}", url=url, debug=DEBUG):
        response = http.get(url, **kwargs)
        data = json2value(response.content.decode("utf8"))
        if data.error.startswith("unknown revision"):
            Log.error(UNKNOWN_PUSH, revision=strings.between(data.error, "'", "'"))
        if is_text(data) and data.startswith("unknown revision"):
            Log.error(UNKNOWN_PUSH, revision=strings.between(data, "'", "'"))
        # branch.url = _trim(url)  # RECORD THIS SUCCESS IN THE BRANCH
        return data
def _get_push(self, branch, changeset_id):
    if self.es.cluster.version.startswith("1.7."):
        query = {
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": {"and": [
                    {"term": {"branch.name": branch.name}},
                    {"prefix": {"changeset.id": changeset_id[0:12]}}
                ]}
            }},
            "size": 1
        }
    else:
        query = {
            "query": {"bool": {"must": [
                {"term": {"branch.name": branch.name}},
                {"prefix": {"changeset.id": changeset_id[0:12]}}
            ]}},
            "size": 1
        }

    try:
        # ALWAYS TRY ES FIRST
        with self.es_locker:
            response = self.es.search(query)
            json_push = response.hits.hits[0]._source.push
        if json_push:
            return json_push
    except Exception:
        pass

    url = branch.url.rstrip("/") + "/json-pushes?full=1&changeset=" + changeset_id
    with Explanation("Pulling pushlog from {{url}}", url=url, debug=DEBUG):
        Log.note("Reading pushlog from {{url}}", url=url, changeset=changeset_id)
        data = self._get_and_retry(url, branch)
        # QUEUE UP THE OTHER CHANGESETS IN THE PUSH
        self.todo.add((branch, [c.node for cs in data.values().changesets for c in cs]))
        pushes = [
            Push(id=int(index), date=_push.date, user=_push.user)
            for index, _push in data.items()
        ]

    if len(pushes) == 0:
        return Null
    elif len(pushes) == 1:
        return pushes[0]
    else:
        Log.error("do not know what to do")
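# For reference, a sketch of the json-pushes?full=1 response parsed above; push ids
# are the top-level keys, and all values here are illustrative only.
EXAMPLE_PUSHLOG_RESPONSE = {
    "12345": {
        "date": 1510000000,                # unix timestamp of the push
        "user": "someone@example.com",     # who pushed
        "changesets": [
            {"node": "b2457f21aae68bb0d47e29efbbd38d199e9b6d3c"},  # full=1 returns changeset objects
        ],
    },
}
# data.items() therefore yields (push_id, push) pairs, and every changeset node in the
# push is queued onto self.todo so the rest of the push gets indexed as well.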
def __init__(self, kwargs=None):
    self.settings = kwargs
    self.settings.exclude = set(self.settings.exclude)
    self.settings.show_foreign_keys = coalesce(self.settings.show_foreign_keys, True)

    self.all_nested_paths = None
    self.nested_path_to_join = None
    self.columns = None

    with Explanation("scan database", debug=DEBUG):
        self.db = MySQL(**kwargs.database)
        with self.db:
            with self.db.transaction():
                self._scan_database()
def __init__(
    self,
    hg=None,          # CONNECT TO hg
    repo=None,        # CONNECTION INFO FOR ES CACHE
    branches=None,    # CONNECTION INFO FOR ES CACHE
    use_cache=False,  # True IF WE WILL USE THE ES FOR DOWNLOADING BRANCHES
    timeout=30 * SECOND,
    kwargs=None
):
    if not _hg_branches:
        _late_imports()

    self.es_locker = Lock()
    self.todo = mo_threads.Queue("todo for hg daemon", max=DAEMON_QUEUE_SIZE)

    self.settings = kwargs
    self.timeout = Duration(timeout)

    # VERIFY CONNECTIVITY
    with Explanation("Test connect with hg"):
        response = http.head(self.settings.hg.url)

    if branches == None:
        self.branches = _hg_branches.get_branches(kwargs=kwargs)
        self.es = None
        return

    self.last_cache_miss = Date.now()
    set_default(repo, {"schema": revision_schema})
    self.es = elasticsearch.Cluster(kwargs=repo).get_or_create_index(kwargs=repo)

    def setup_es(please_stop):
        with suppress_exception:
            self.es.add_alias()
        with suppress_exception:
            self.es.set_refresh_interval(seconds=1)

    Thread.run("setup_es", setup_es)
    self.branches = _hg_branches.get_branches(kwargs=kwargs)
    self.timeout = timeout

    Thread.run("hg daemon", self._daemon)
def get_revision(self, revision, locale=None, get_diff=False):
    """
    EXPECTING INCOMPLETE revision OBJECT
    RETURNS revision
    """
    rev = revision.changeset.id
    if not rev:
        return Null
    elif rev == "None":
        return Null
    elif revision.branch.name == None:
        return Null
    locale = coalesce(locale, revision.branch.locale, DEFAULT_LOCALE)
    output = self._get_from_elasticsearch(revision, locale=locale, get_diff=get_diff)
    if output:
        if not get_diff:  # DIFF IS BIG, DO NOT KEEP IT IF NOT NEEDED
            output.changeset.diff = None
        if DEBUG:
            Log.note("Got hg ({{branch}}, {{locale}}, {{revision}}) from ES", branch=output.branch.name, locale=locale, revision=output.changeset.id)
        if output.push.date >= Date.now() - MAX_TODO_AGE:
            self.todo.add((output.branch, listwrap(output.parents)))
            self.todo.add((output.branch, listwrap(output.children)))
        if output.push.date:
            return output

    found_revision = copy(revision)
    if isinstance(found_revision.branch, (text_type, binary_type)):
        lower_name = found_revision.branch.lower()
    else:
        lower_name = found_revision.branch.name.lower()

    if not lower_name:
        Log.error("Defective revision? {{rev|json}}", rev=found_revision.branch)

    b = found_revision.branch = self.branches[(lower_name, locale)]
    if not b:
        b = found_revision.branch = self.branches[(lower_name, DEFAULT_LOCALE)]
        if not b:
            Log.error("can not find branch ({{branch}}, {{locale}})", branch=lower_name, locale=locale)

    if Date.now() - Date(b.etl.timestamp) > _OLD_BRANCH:
        self.branches = _hg_branches.get_branches(kwargs=self.settings)

    push = self._get_push(found_revision.branch, found_revision.changeset.id)

    url1 = found_revision.branch.url.rstrip("/") + "/json-info?node=" + found_revision.changeset.id[0:12]
    url2 = found_revision.branch.url.rstrip("/") + "/json-rev/" + found_revision.changeset.id[0:12]
    with Explanation("get revision from {{url}}", url=url1, debug=DEBUG):
        raw_rev2 = Null
        try:
            raw_rev1 = self._get_raw_json_info(url1, found_revision.branch)
            raw_rev2 = self._get_raw_json_rev(url2, found_revision.branch)
        except Exception as e:
            if "Hg denies it exists" in e:
                raw_rev1 = Data(node=revision.changeset.id)
            else:
                raise e
        output = self._normalize_revision(set_default(raw_rev1, raw_rev2), found_revision, push, get_diff)
        if output.push.date >= Date.now() - MAX_TODO_AGE:
            self.todo.add((output.branch, listwrap(output.parents)))
            self.todo.add((output.branch, listwrap(output.children)))

    if not get_diff:  # DIFF IS BIG, DO NOT KEEP IT IF NOT NEEDED
        output.changeset.diff = None
    return output
def __init__(
    self,
    hg=None,          # CONNECT TO hg
    repo=None,        # CONNECTION INFO FOR ES CACHE
    use_cache=False,  # True IF WE WILL USE THE ES FOR DOWNLOADING BRANCHES
    timeout=30 * SECOND,
    kwargs=None,
):
    if not _hg_branches:
        _late_imports()

    if not is_text(repo.index):
        Log.error("Expecting 'index' parameter")
    self.repo_locker = Lock()
    self.moves_locker = Lock()
    self.todo = mo_threads.Queue("todo for hg daemon", max=DAEMON_QUEUE_SIZE)
    self.settings = kwargs
    self.timeout = Duration(timeout)
    self.last_cache_miss = Date.now()

    # VERIFY CONNECTIVITY
    with Explanation("Test connect with hg"):
        http.head(self.settings.hg.url)

    set_default(repo, {
        "type": "revision",
        "schema": revision_schema,
    })
    kwargs.branches = set_default(
        {
            "index": repo.index + "-branches",
            "type": "branch",
        },
        repo,
    )
    moves = set_default(
        {
            "index": repo.index + "-moves",
        },
        repo,
    )

    self.branches = _hg_branches.get_branches(kwargs=kwargs)
    cluster = elasticsearch.Cluster(kwargs=repo)
    self.repo = cluster.get_or_create_index(kwargs=repo)
    self.moves = cluster.get_or_create_index(kwargs=moves)

    def setup_es(please_stop):
        with suppress_exception:
            self.repo.add_alias()
        with suppress_exception:
            self.moves.add_alias()
        with suppress_exception:
            self.repo.set_refresh_interval(seconds=1)
        with suppress_exception:
            self.moves.set_refresh_interval(seconds=1)

    Thread.run("setup_es", setup_es)
    Thread.run("hg daemon", self._daemon)
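# A minimal usage sketch, assuming the constructor above belongs to a class exposed as
# HgMozillaOrg and that its @override-style constructor accepts these named settings;
# every value below is illustrative only.
def example_usage():
    cache = HgMozillaOrg(
        hg={"url": "https://hg.mozilla.org"},
        repo={"host": "http://localhost", "port": 9200, "index": "repo"},
    )
    # ask for a (possibly cached) revision; diffs and file moves are skipped here
    return cache.get_revision(
        Revision(branch={"name": "mozilla-central"}, changeset={"id": "b2457f21aae6"}),
        get_diff=False,
        get_moves=False,
    )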
def get_revision(self, revision, locale=None, get_diff=False, get_moves=True, after=None):
    """
    EXPECTING INCOMPLETE revision OBJECT
    RETURNS revision
    """
    rev = revision.changeset.id
    if not rev:
        return Null
    elif rev == "None":
        return Null
    elif revision.branch.name == None:
        return Null
    locale = coalesce(locale, revision.branch.locale, DEFAULT_LOCALE)
    output = self._get_from_elasticsearch(revision, locale=locale, get_diff=get_diff, get_moves=get_moves, after=after)
    if output:
        if not get_diff:  # DIFF IS BIG, DO NOT KEEP IT IF NOT NEEDED
            output.changeset.diff = None
        if not get_moves:
            output.changeset.moves = None
        DEBUG and Log.note(
            "Got hg ({{branch}}, {{locale}}, {{revision}}) from ES",
            branch=output.branch.name,
            locale=locale,
            revision=output.changeset.id,
        )
        if output.push.date >= Date.now() - MAX_TODO_AGE:
            self.todo.add((output.branch, listwrap(output.parents), None))
            self.todo.add((output.branch, listwrap(output.children), None))
        if output.push.date:
            return output

    # RATE LIMIT CALLS TO HG (CACHE MISSES)
    next_cache_miss = self.last_cache_miss + (Random.float(WAIT_AFTER_CACHE_MISS * 2) * SECOND)
    self.last_cache_miss = Date.now()
    if next_cache_miss > self.last_cache_miss:
        Log.note(
            "delaying next hg call for {{seconds|round(decimal=1)}} seconds",
            seconds=next_cache_miss - self.last_cache_miss,
        )
        Till(till=next_cache_miss.unix).wait()

    found_revision = copy(revision)
    if isinstance(found_revision.branch, (text, binary_type)):
        lower_name = found_revision.branch.lower()
    else:
        lower_name = found_revision.branch.name.lower()

    if not lower_name:
        Log.error("Defective revision? {{rev|json}}", rev=found_revision.branch)

    b = found_revision.branch = self.branches[(lower_name, locale)]
    if not b:
        b = found_revision.branch = self.branches[(lower_name, DEFAULT_LOCALE)]
        if not b:
            Log.warning(
                "can not find branch ({{branch}}, {{locale}})",
                branch=lower_name,
                locale=locale,
            )
            return Null

    if Date.now() - Date(b.etl.timestamp) > _hg_branches.OLD_BRANCH:
        self.branches = _hg_branches.get_branches(kwargs=self.settings)

    push = self._get_push(found_revision.branch, found_revision.changeset.id)
    id12 = found_revision.changeset.id[0:12]

    url1 = found_revision.branch.url.rstrip("/") + "/json-info?node=" + id12
    url2 = found_revision.branch.url.rstrip("/") + "/json-rev/" + id12
    url3 = found_revision.branch.url.rstrip("/") + "/json-automationrelevance/" + id12
    with Explanation("get revision from {{url}}", url=url1, debug=DEBUG):
        raw_rev2 = Null
        automation_details = Null
        try:
            raw_rev1 = self._get_raw_json_info(url1, found_revision.branch)
            raw_rev2 = self._get_raw_json_rev(url2, found_revision.branch)
            automation_details = self._get_raw_json_rev(url3, found_revision.branch)
        except Exception as e:
            if "Hg denies it exists" in e:
                raw_rev1 = Data(node=revision.changeset.id)
            else:
                raise e

        raw_rev3_changeset = first(r for r in automation_details.changesets if r.node[:12] == id12)
        if last(automation_details.changesets) != raw_rev3_changeset:
            Log.note("interesting")

        output = self._normalize_revision(
            set_default(raw_rev1, raw_rev2, raw_rev3_changeset),
            found_revision,
            push,
            get_diff,
            get_moves,
        )
        if output.push.date >= Date.now() - MAX_TODO_AGE:
            self.todo.add((output.branch, listwrap(output.parents), None))
            self.todo.add((output.branch, listwrap(output.children), None))
            self.todo.add((output.branch, listwrap(output.backsoutnodes), output.push.date))

    if not get_diff:  # DIFF IS BIG, DO NOT KEEP IT IF NOT NEEDED
        output.changeset.diff = None
    if not get_moves:
        output.changeset.moves = None
    return output
def _get_from_hg(self, revision, locale=None, get_diff=False, get_moves=True):
    # RATE LIMIT CALLS TO HG (CACHE MISSES)
    next_cache_miss = self.last_cache_miss + (Random.float(WAIT_AFTER_CACHE_MISS * 2) * SECOND)
    self.last_cache_miss = Date.now()
    if next_cache_miss > self.last_cache_miss:
        Log.note(
            "delaying next hg call for {{seconds|round(decimal=1)}} seconds",
            seconds=next_cache_miss - self.last_cache_miss,
        )
        Till(till=next_cache_miss.unix).wait()

    # CLEAN UP BRANCH NAME
    found_revision = copy(revision)
    if isinstance(found_revision.branch, (text, binary_type)):
        lower_name = found_revision.branch.lower()
    else:
        lower_name = found_revision.branch.name.lower()

    if not lower_name:
        Log.error("Defective revision? {{rev|json}}", rev=found_revision.branch)

    b = found_revision.branch = self.branches[(lower_name, locale)]
    if not b:
        b = found_revision.branch = self.branches[(lower_name, DEFAULT_LOCALE)]
        if not b:
            Log.warning(
                "can not find branch ({{branch}}, {{locale}})",
                branch=lower_name,
                locale=locale,
            )
            return Null

    # REFRESH BRANCHES, IF TOO OLD
    if Date.now() - Date(b.etl.timestamp) > _hg_branches.OLD_BRANCH:
        self.branches = _hg_branches.get_branches(kwargs=self.settings)

    # FIND THE PUSH
    push = self._get_push(found_revision.branch, found_revision.changeset.id)
    id12 = found_revision.changeset.id[0:12]
    base_url = URL(found_revision.branch.url)

    with Explanation("get revision from {{url}}", url=base_url, debug=DEBUG):
        raw_rev2 = Null
        automation_details = Null
        try:
            raw_rev1 = self._get_raw_json_info((base_url / "json-info") + {"node": id12})
            raw_rev2 = self._get_raw_json_rev(base_url / "json-rev" / id12)
            automation_details = self._get_raw_json_rev(base_url / "json-automationrelevance" / id12)
        except Exception as e:
            if "Hg denies it exists" in e:
                raw_rev1 = Data(node=revision.changeset.id)
            else:
                raise e

        raw_rev3_changeset = first(r for r in automation_details.changesets if r.node[:12] == id12)
        if last(automation_details.changesets) != raw_rev3_changeset:
            Log.note("interesting")

        output = self._normalize_revision(
            set_default(raw_rev1, raw_rev2, raw_rev3_changeset),
            found_revision,
            push,
            get_diff,
            get_moves,
        )
        if output.push.date >= Date.now() - MAX_TODO_AGE:
            self.todo.extend([
                (output.branch, listwrap(output.parents), None),
                (output.branch, listwrap(output.children), None),
                (output.branch, listwrap(output.backsoutnodes), output.push.date),
            ])

    if not get_diff:  # DIFF IS BIG, DO NOT KEEP IT IF NOT NEEDED
        output.changeset.diff = None
    if not get_moves:
        output.changeset.moves = None
    return output