def boot_worker():
    """Boot the HTTP worker: clone autoland, download the test scheduling
    support DBs and the commits DB, bring the commits DB up to date, and
    preload the models.

    Raises:
        AssertionError: when a required DB download fails and
            ALLOW_MISSING_MODELS is not set.
    """
    # Clone autoland
    logger.info(f"Cloning mozilla autoland in {REPO_DIR}...")
    repository.clone(REPO_DIR, "https://hg.mozilla.org/integration/autoland")

    # Download test scheduling DB support files.
    # Each assert tolerates a failed download only when ALLOW_MISSING_MODELS
    # is set (e.g. local development without the full data set).
    logger.info("Downloading test scheduling DB support files...")
    assert (db.download_support_file(
        test_scheduling.TEST_LABEL_SCHEDULING_DB,
        test_scheduling.PAST_FAILURES_LABEL_DB,
    ) or ALLOW_MISSING_MODELS)
    assert (db.download_support_file(
        test_scheduling.TEST_GROUP_SCHEDULING_DB,
        test_scheduling.PAST_FAILURES_GROUP_DB,
    ) or ALLOW_MISSING_MODELS)
    assert (db.download_support_file(
        test_scheduling.TEST_GROUP_SCHEDULING_DB,
        test_scheduling.TOUCHED_TOGETHER_DB,
    ) or ALLOW_MISSING_MODELS)

    # Download commits DB
    logger.info("Downloading commits DB...")
    commits_db_downloaded = db.download(repository.COMMITS_DB, support_files_too=True)
    if not ALLOW_MISSING_MODELS:
        assert commits_db_downloaded

    if commits_db_downloaded:
        # And update it
        logger.info("Browsing all commits...")
        # Exhaust the iterator only to get the LAST commit stored in the DB;
        # the loop body is intentionally empty.
        for commit in repository.get_commits():
            pass

        # NOTE(review): if the downloaded DB contains zero commits, `commit`
        # is unbound here and this raises NameError — TODO confirm the DB is
        # guaranteed non-empty when the download succeeds.
        rev_start = "children({})".format(commit["node"])

        logger.info("Updating commits DB...")
        commits = repository.download_commits(REPO_DIR, rev_start, use_single_process=True)

        if len(commits) > 0:
            # Update the touched together DB.
            update_touched_together_gen = test_scheduling.update_touched_together()
            next(update_touched_together_gen)  # advance the generator to its first yield

            update_touched_together_gen.send(commits[-1]["node"])

            # Sending None asks the generator to finalize; it is expected to
            # finish, so the resulting StopIteration is swallowed.
            try:
                update_touched_together_gen.send(None)
            except StopIteration:
                pass

    # Preload models
    bugbug_http.models.preload_models()

    logger.info("Worker boot done")
def boot_worker():
    """Boot the worker: clone autoland and fetch the schedulable-task list in
    background threads while the locally shipped DB archives are extracted,
    then bring the commits DB and touched-together DB up to date.

    Raises:
        AssertionError: when a DB archive is missing and ALLOW_MISSING_MODELS
            is not set.
    """
    # Clone autoland
    def clone_autoland():
        logger.info(f"Cloning autoland in {REPO_DIR}...")
        repository.clone(REPO_DIR, "https://hg.mozilla.org/integration/autoland")

    # Each extract_* helper tolerates a missing archive only when
    # ALLOW_MISSING_MODELS is set.
    def extract_past_failures_label():
        try:
            utils.extract_file(
                os.path.join("data", test_scheduling.PAST_FAILURES_LABEL_DB))
            logger.info("Label-level past failures DB extracted.")
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
            logger.info(
                "Label-level past failures DB not extracted, but missing models are allowed."
            )

    def extract_failing_together():
        try:
            utils.extract_file(
                os.path.join("data", test_scheduling.FAILING_TOGETHER_LABEL_DB))
            logger.info("Failing together DB extracted.")
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
            logger.info(
                "Failing together DB not extracted, but missing models are allowed."
            )

    def extract_past_failures_group():
        try:
            utils.extract_file(
                os.path.join("data", test_scheduling.PAST_FAILURES_GROUP_DB))
            logger.info("Group-level past failures DB extracted.")
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
            logger.info(
                "Group-level past failures DB not extracted, but missing models are allowed."
            )

    def extract_touched_together():
        try:
            utils.extract_file(
                os.path.join("data", test_scheduling.TOUCHED_TOGETHER_DB))
            logger.info("Touched together DB extracted.")
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
            logger.info(
                "Touched together DB not extracted, but missing models are allowed."
            )

    def extract_commits():
        # Returns True when the commits DB archive was extracted, False when
        # it is missing (only allowed with ALLOW_MISSING_MODELS).
        try:
            utils.extract_file(f"{repository.COMMITS_DB}.zst")
            logger.info("Commits DB extracted.")
            return True
        except FileNotFoundError:
            # NOTE(review): the "allowed" message is logged before the assert,
            # so it is emitted even when the assert is about to fail.
            logger.info(
                "Commits DB not extracted, but missing models are allowed.")
            assert ALLOW_MISSING_MODELS
            return False

    def extract_commit_experiences():
        try:
            utils.extract_file(
                os.path.join("data", repository.COMMIT_EXPERIENCES_DB))
            logger.info("Commit experiences DB extracted.")
        except FileNotFoundError:
            logger.info(
                "Commit experiences DB not extracted, but missing models are allowed."
            )
            assert ALLOW_MISSING_MODELS

    # Retried because the Taskcluster index occasionally fails transiently.
    @tenacity.retry(
        stop=tenacity.stop_after_attempt(7),
        wait=tenacity.wait_exponential(multiplier=1, min=1, max=8),
    )
    def retrieve_schedulable_tasks():
        # Store in a file the list of tasks in the latest autoland push.
        r = requests.get(
            "https://firefox-ci-tc.services.mozilla.com/api/index/v1/task/gecko.v2.autoland.latest.taskgraph.decision/artifacts/public/target-tasks.json"
        )
        r.raise_for_status()
        with open("known_tasks", "w") as f:
            f.write("\n".join(r.json()))

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Run the slow network operations concurrently with the extraction.
        clone_autoland_future = executor.submit(clone_autoland)
        retrieve_schedulable_tasks_future = executor.submit(
            retrieve_schedulable_tasks)

        commits_db_extracted = extract_commits()
        extract_commit_experiences()
        extract_touched_together()
        extract_past_failures_label()
        extract_past_failures_group()
        extract_failing_together()

        if commits_db_extracted:
            # Update the commits DB.
            logger.info("Browsing all commits...")
            # Exhaust the iterator only to get the LAST commit stored in the DB.
            for commit in repository.get_commits():
                pass
            logger.info("All commits browsed.")

            # Wait repository to be cloned, as it's required to call repository.download_commits.
            logger.info("Waiting autoland to be cloned...")
            clone_autoland_future.result()

            # NOTE(review): if the extracted commits DB is empty, `commit` is
            # unbound here and this raises NameError — TODO confirm the DB is
            # guaranteed non-empty when extraction succeeds.
            rev_start = "children({})".format(commit["node"])

            logger.info("Updating commits DB...")
            commits = repository.download_commits(REPO_DIR, rev_start,
                                                  use_single_process=True)
            logger.info("Commits DB updated.")

            logger.info("Updating touched together DB...")
            if len(commits) > 0:
                # Update the touched together DB.
                update_touched_together_gen = test_scheduling.update_touched_together()
                next(update_touched_together_gen)  # advance to the first yield

                update_touched_together_gen.send(commits[-1]["node"])

                # None asks the generator to finalize; StopIteration is expected.
                try:
                    update_touched_together_gen.send(None)
                except StopIteration:
                    pass
            logger.info("Touched together DB updated.")

        # Wait list of schedulable tasks to be downloaded and written to disk.
        retrieve_schedulable_tasks_future.result()

    logger.info("Worker boot done")
def test_touched_together(monkeypatch):
    """Touched-together counters accumulate across two incremental updates
    of the DB (first up to commit2, then up to commit4)."""
    test_scheduling.touched_together = None

    repository.path_to_component = {
        "dom/file1.cpp": "Core::DOM",
        "dom/file2.cpp": "Core::DOM",
        "layout/file.cpp": "Core::Layout",
        "dom/tests/manifest1.ini": "Core::DOM",
        "dom/tests/manifest2.ini": "Core::DOM",
    }

    # (node, author, backedoutby, reviewers, files); desc always equals node.
    specs = [
        ("commit1", "author1", "", ("reviewer1", "reviewer2"),
         ["dom/file1.cpp", "dom/tests/manifest1.ini"]),
        ("commitbackedout", "author1", "commitbackout", ("reviewer1", "reviewer2"),
         ["dom/file1.cpp", "dom/tests/manifest1.ini"]),
        ("commit2", "author2", "", ("reviewer1",),
         ["dom/file2.cpp", "layout/tests/manifest2.ini"]),
        ("commit3", "author1", "", ("reviewer2",),
         ["layout/file.cpp", "dom/tests/manifest1.ini"]),
        ("commit4", "author1", "", ("reviewer1", "reviewer2"),
         ["dom/file1.cpp", "dom/tests/manifest1.ini"]),
    ]
    commits = [
        repository.Commit(
            node=node,
            author=author,
            desc=node,
            date=datetime(2019, 1, 1),
            pushdate=datetime(2019, 1, 1),
            bug_id=123,
            backsout=[],
            backedoutby=backedoutby,
            author_email="*****@*****.**",
            reviewers=reviewers,
        )
        .set_files(files, {})
        .to_dict()
        for node, author, backedoutby, reviewers, files in specs
    ]

    monkeypatch.setattr(repository, "get_commits", lambda: commits)

    gen = test_scheduling.update_touched_together()
    next(gen)

    # Update the DB with everything up to (and including) commit2.
    gen.send("commit2")

    after_commit2 = {
        ("dom/file1.cpp", "dom/tests"): 1,
        ("dom/tests", "dom/file1.cpp"): 1,
        ("dom", "dom/tests/manifest1.ini"): 1,
        ("dom", "dom/tests"): 1,
        ("dom", "dom"): 0,
        ("dom/file2.cpp", "layout/tests"): 1,
        ("dom", "layout/tests/manifest2.ini"): 1,
        ("dom", "layout/tests"): 1,
        ("dom/file1.cpp", "dom/file2.cpp"): 0,
        ("layout/file.cpp", "dom/tests"): 0,
        ("layout", "dom/tests"): 0,
    }
    for (left, right), count in after_commit2.items():
        assert test_scheduling.get_touched_together(left, right) == count

    # Continue the same run up to commit4; commit1/commit4 pairs now count twice.
    gen.send("commit4")

    after_commit4 = {
        ("dom/file1.cpp", "dom/tests"): 2,
        ("dom/tests", "dom/file1.cpp"): 2,
        ("dom", "dom/tests/manifest1.ini"): 2,
        ("dom", "dom/tests"): 2,
        ("dom", "dom"): 0,
        ("dom/file2.cpp", "layout/tests"): 1,
        ("dom", "layout/tests/manifest2.ini"): 1,
        ("dom", "layout/tests"): 1,
        ("dom/file1.cpp", "dom/file2.cpp"): 0,
        ("layout/file.cpp", "dom/tests"): 1,
        ("layout", "dom/tests/manifest1.ini"): 1,
        ("layout", "dom/tests"): 1,
    }
    for (left, right), count in after_commit4.items():
        assert test_scheduling.get_touched_together(left, right) == count
def generate_all_data() -> Generator[Dict[str, Any], None, None]:
    """Yield one training-data dict ({"revs": ..., "data": ...}) per push.

    Walks every push from push_data_iter(), merges the push's commits,
    updates the past-failures shelve and (for group granularities) the
    touched-together DB, and yields the generated data for pushes newer
    than HISTORY_DATE_START.

    Relies on enclosing-scope names: granularity, all_runnables,
    push_data_iter, push_data_count, HISTORY_DATE_START.
    """
    past_failures = test_scheduling.get_past_failures(granularity, False)

    push_num = past_failures["push_num"] if "push_num" in past_failures else 0

    commit_map = {}
    for commit_data in tqdm(repository.get_commits()):
        commit_map[commit_data["node"]] = commit_data

    # Store all runnables in the past_failures DB so it can be used in the evaluation phase.
    past_failures["all_runnables"] = all_runnables
    # XXX: Should we recreate the DB from scratch if the previous all_runnables are not the
    # same as the current ones?

    saved_nodes = set()
    skipped_no_commits = 0
    skipped_too_big_commits = 0
    skipped_no_runnables = 0

    # The touched-together generator is needed for both group-based granularities.
    if granularity in ("group", "config_group"):
        update_touched_together_gen = test_scheduling.update_touched_together()
        next(update_touched_together_gen)

    for (
        i,
        (
            revisions,
            fix_revision,  # unused here; part of the push_data_iter tuple shape
            push_runnables,
            possible_regressions,
            likely_regressions,
        ),
    ) in enumerate(tqdm(push_data_iter(), total=push_data_count)):
        push_num += 1

        # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them.
        commits = tuple(
            commit_map.pop(revision)
            for revision in revisions
            if revision in commit_map
        )
        if len(commits) == 0:
            skipped_no_commits += 1
            continue

        # Skip wptsync commits, since they are not like normal pushes made by developers.
        if any(repository.is_wptsync(commit) for commit in commits):
            continue

        merged_commits = commit_features.merge_commits(commits)

        # XXX: For now, skip commits which are too large.
        # In the future we can either:
        # - Improve shelve perf and go back to consider all files;
        # - Consider only files which appear with a given frequency, like the "files" feature in commit_features;
        # - Keep a limit of number of files.
        if len(merged_commits["files"]) > 50:
            skipped_too_big_commits += 1
            continue

        # If we considered all_runnables, we'd generate a huge amount of data.
        # We consider only the runnables which run in this push, and the possible and likely regressions
        # from this push. We can't consider all runnables because we can't be sure that a task that didn't
        # run on a push would have been successful.
        runnables_to_consider = list(
            set(push_runnables + possible_regressions + likely_regressions)
        )
        if len(runnables_to_consider) == 0:
            skipped_no_runnables += 1
            continue

        # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!).
        if i % 250 == 0:
            past_failures.sync()

        pushdate = dateutil.parser.parse(merged_commits["pushdate"])

        if granularity in ("group", "config_group"):
            update_touched_together_gen.send(commits[0]["node"])

        result_data = []
        for data in test_scheduling.generate_data(
            granularity,
            past_failures,
            merged_commits,
            push_num,
            runnables_to_consider,
            possible_regressions,
            likely_regressions,
        ):
            if pushdate > HISTORY_DATE_START:
                result_data.append(data)

        if pushdate > HISTORY_DATE_START:
            saved_nodes.add(i)
            yield {
                "revs": revisions,
                "data": result_data,
            }

    # FIX: finalize the generator for every granularity that created it.
    # Previously this checked only `granularity == "group"`, so for
    # "config_group" the touched-together DB was never flushed/closed.
    if granularity in ("group", "config_group"):
        try:
            update_touched_together_gen.send(None)
        except StopIteration:
            pass

    logger.info(f"saved push data nodes: {len(saved_nodes)}")
    logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
    logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
    logger.info(f"skipped {skipped_no_runnables} (no interesting runnables)")

    past_failures["push_num"] = push_num
    past_failures.close()
def test_touched_together_restart(monkeypatch: MonkeyPatch) -> None:
    """Touched-together counters survive closing the DB: update up to
    commit2, finalize and close, then reopen and continue up to commit4."""
    test_scheduling.touched_together = None

    repository.path_to_component = {
        "dom/file1.cpp": "Core::DOM",
        "dom/file2.cpp": "Core::DOM",
        "layout/file.cpp": "Core::Layout",
        "dom/tests/manifest1.ini": "Core::DOM",
        "dom/tests/manifest2.ini": "Core::DOM",
    }

    # (node, author, backedoutby, reviewers, files); desc always equals node.
    specs = [
        ("commit1", "author1", "", ("reviewer1", "reviewer2"),
         ["dom/file1.cpp", "dom/tests/manifest1.ini"]),
        ("commitbackedout", "author1", "commitbackout", ("reviewer1", "reviewer2"),
         ["dom/file1.cpp", "dom/tests/manifest1.ini"]),
        ("commit2", "author2", "", ("reviewer1",),
         ["dom/file2.cpp", "layout/tests/manifest2.ini"]),
        ("commit3", "author1", "", ("reviewer2",),
         ["layout/file.cpp", "dom/tests/manifest1.ini"]),
        ("commit4", "author1", "", ("reviewer1", "reviewer2"),
         ["dom/file1.cpp", "dom/tests/manifest1.ini"]),
    ]
    commits = [
        repository.Commit(
            node=node,
            author=author,
            desc=node,
            date=datetime(2019, 1, 1),
            pushdate=datetime(2019, 1, 1),
            bug_id=123,
            backsout=[],
            backedoutby=backedoutby,
            author_email="*****@*****.**",
            reviewers=reviewers,
        )
        .set_files(files, {})
        .to_dict()
        for node, author, backedoutby, reviewers, files in specs
    ]

    def fake_get_commits() -> List[CommitDict]:
        return commits

    monkeypatch.setattr(repository, "get_commits", fake_get_commits)

    gen = test_scheduling.update_touched_together()
    next(gen)

    # First run: update the DB with everything up to (and including) commit2.
    gen.send(Revision("commit2"))

    after_commit2 = {
        ("dom/file1.cpp", "dom/tests"): 1,
        ("dom/tests", "dom/file1.cpp"): 1,
        ("dom", "dom/tests/manifest1.ini"): 1,
        ("dom", "dom/tests"): 1,
        ("dom", "dom"): 0,
        ("dom/file2.cpp", "layout/tests"): 1,
        ("dom", "layout/tests/manifest2.ini"): 1,
        ("dom", "layout/tests"): 1,
        ("dom/file1.cpp", "dom/file2.cpp"): 0,
        ("layout/file.cpp", "dom/tests"): 0,
        ("layout", "dom/tests"): 0,
    }
    for (left, right), count in after_commit2.items():
        assert test_scheduling.get_touched_together(left, right) == count

    # Finalize the generator (expected to finish, hence StopIteration).
    try:
        gen.send(None)
    except StopIteration:
        pass

    # Ensure we can still read the DB after closing.
    assert test_scheduling.get_touched_together("dom", "layout/tests") == 1

    test_scheduling.close_touched_together_db()

    # Second run: reopen and continue the update up to commit4.
    gen = test_scheduling.update_touched_together()
    next(gen)
    gen.send(Revision("commit4"))

    after_commit4 = {
        ("dom/file1.cpp", "dom/tests"): 2,
        ("dom/tests", "dom/file1.cpp"): 2,
        ("dom", "dom/tests/manifest1.ini"): 2,
        ("dom", "dom/tests"): 2,
        ("dom", "dom"): 0,
        ("dom/file2.cpp", "layout/tests"): 1,
        ("dom", "layout/tests/manifest2.ini"): 1,
        ("dom", "layout/tests"): 1,
        ("dom/file1.cpp", "dom/file2.cpp"): 0,
        ("layout/file.cpp", "dom/tests"): 1,
        ("layout", "dom/tests/manifest1.ini"): 1,
        ("layout", "dom/tests"): 1,
    }
    for (left, right), count in after_commit4.items():
        assert test_scheduling.get_touched_together(left, right) == count
def generate_all_data():
    """Yield one training-data dict ({"revs": ..., "data": ...}) per push,
    resuming after `last_node` when a previous run already processed part of
    the push data.

    Relies on enclosing-scope names: granularity, last_node, push_data_path,
    rename_tasks, filter_runnables, generate_failing_together_probabilities,
    HISTORY_DATE_START.
    """
    past_failures = test_scheduling.get_past_failures(granularity)

    push_num = past_failures["push_num"] if "push_num" in past_failures else 0

    # We can start once we get to the last revision we added in the previous run.
    can_start = True if last_node is None else False

    # Only keep commits that come AFTER last_node (resume support).
    commit_map = {}
    for commit_data in tqdm(repository.get_commits()):
        if not can_start:
            if last_node == commit_data["node"]:
                can_start = True
            continue
        commit_map[commit_data["node"]] = commit_data

    with open(push_data_path, "r") as f:
        push_data = json.load(f)

    logger.info(f"push data nodes: {len(push_data)}")

    if granularity == "label":
        push_data = [
            (
                revisions,
                rename_tasks(push_tasks),
                rename_tasks(possible_regressions),
                rename_tasks(likely_regressions),
            )
            for revisions, push_tasks, possible_regressions, likely_regressions in push_data
        ]

    # In the last 28 pushes, we definitely run all possible runnables.
    all_runnables_set = set(
        sum((push_runnables for _, push_runnables, _, _ in push_data[-28:]), [])
    )
    # Filter runnables we don't need.
    all_runnables = filter_runnables(
        list(all_runnables_set), all_runnables_set, granularity
    )
    # NOTE(review): `set(all_runnables_set)` is a no-op re-wrap; presumably
    # this was meant to be `set(all_runnables)` so the FILTERED list is used
    # for the push_data filtering below — verify against project history.
    all_runnables_set = set(all_runnables_set)
    logger.info(f"{len(all_runnables_set)} runnables run in the last 28 pushes")

    push_data = [
        (
            revisions,
            filter_runnables(push_tasks, all_runnables_set, granularity),
            filter_runnables(
                possible_regressions, all_runnables_set, granularity
            ),
            filter_runnables(
                likely_regressions, all_runnables_set, granularity
            ),
        )
        for revisions, push_tasks, possible_regressions, likely_regressions in push_data
    ]

    if granularity == "label":
        generate_failing_together_probabilities(push_data)

    # Store all runnables in the past_failures DB so it can be used in the evaluation phase.
    past_failures["all_runnables"] = all_runnables
    # XXX: Should we recreate the DB from scratch if the previous all_runnables are not the
    # same as the current ones?

    saved_nodes = set()
    skipped_no_commits = 0
    skipped_too_big_commits = 0
    skipped_no_runnables = 0

    # We can start once we get to the last revision we added in the previous run.
    # (Reset: the same resume check is applied again over the push list.)
    can_start = True if last_node is None else False

    if granularity == "group":
        update_touched_together_gen = test_scheduling.update_touched_together()
        next(update_touched_together_gen)  # advance to the first yield

    for i in tqdm(range(len(push_data))):
        # pop(0) releases each processed push so memory usage stays bounded.
        (
            revisions,
            push_runnables,
            possible_regressions,
            likely_regressions,
        ) = push_data.pop(0)

        if not can_start:
            if last_node == revisions[0]:
                can_start = True
            continue

        push_num += 1

        # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them.
        commits = tuple(
            commit_map.pop(revision)
            for revision in revisions
            if revision in commit_map
        )
        if len(commits) == 0:
            skipped_no_commits += 1
            continue

        merged_commits = commit_features.merge_commits(commits)

        # XXX: For now, skip commits which are too large.
        # In the future we can either:
        # - Improve shelve perf and go back to consider all files;
        # - Consider only files which appear with a given frequency, like the "files" feature in commit_features;
        # - Keep a limit of number of files.
        if len(merged_commits["files"]) > 50:
            skipped_too_big_commits += 1
            continue

        # If we considered all_runnables, we'd generate a huge amount of data.
        # We consider only the runnables which run in this push, and the possible and likely regressions
        # from this push. We can't consider all runnables because we can't be sure that a task that didn't
        # run on a push would have been successful.
        runnables_to_consider = list(
            set(push_runnables + possible_regressions + likely_regressions)
        )
        if len(runnables_to_consider) == 0:
            skipped_no_runnables += 1
            continue

        # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!).
        if i % 250 == 0:
            past_failures.sync()

        pushdate = dateutil.parser.parse(merged_commits["pushdate"])

        if granularity == "group":
            update_touched_together_gen.send(commits[0]["node"])

        result = {
            "revs": revisions,
            "data": [],
        }
        for data in test_scheduling.generate_data(
            past_failures,
            merged_commits,
            push_num,
            runnables_to_consider,
            possible_regressions,
            likely_regressions,
        ):
            if pushdate > HISTORY_DATE_START:
                result["data"].append(data)

        if pushdate > HISTORY_DATE_START:
            saved_nodes.add(i)
            yield result

    if granularity == "group":
        # None asks the generator to finalize; StopIteration is expected.
        try:
            update_touched_together_gen.send(None)
        except StopIteration:
            pass

    logger.info(f"saved push data nodes: {len(saved_nodes)}")
    logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
    logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
    logger.info(f"skipped {skipped_no_runnables} (no interesting runnables)")

    past_failures["push_num"] = push_num
    past_failures.close()
def boot_worker() -> None:
    """Boot the worker: clone autoland and collect schedulable tasks in
    background threads while the shipped DB archives are extracted, then
    bring the commits DB and touched-together DB up to date.

    Raises:
        AssertionError: when a DB archive is missing and ALLOW_MISSING_MODELS
            is not set, or when no known tasks could be retrieved.
    """
    # Clone autoland
    def clone_autoland() -> None:
        logger.info(f"Cloning autoland in {REPO_DIR}...")
        repository.clone(REPO_DIR, "https://hg.mozilla.org/integration/autoland")

    # Each extract_* helper tolerates a missing archive only when
    # ALLOW_MISSING_MODELS is set.
    def extract_past_failures_label() -> None:
        try:
            utils.extract_file(
                os.path.join("data", test_scheduling.PAST_FAILURES_LABEL_DB)
            )
            logger.info("Label-level past failures DB extracted.")
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
            logger.info(
                "Label-level past failures DB not extracted, but missing models are allowed."
            )

    def extract_failing_together_label() -> None:
        try:
            utils.extract_file(
                os.path.join("data", test_scheduling.FAILING_TOGETHER_LABEL_DB)
            )
            logger.info("Failing together label DB extracted.")
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
            logger.info(
                "Failing together label DB not extracted, but missing models are allowed."
            )

    def extract_failing_together_config_group() -> None:
        try:
            utils.extract_file(
                os.path.join("data", test_scheduling.FAILING_TOGETHER_CONFIG_GROUP_DB)
            )
            logger.info("Failing together config/group DB extracted.")
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
            logger.info(
                "Failing together config/group DB not extracted, but missing models are allowed."
            )

    def extract_past_failures_group() -> None:
        try:
            utils.extract_file(
                os.path.join("data", test_scheduling.PAST_FAILURES_GROUP_DB)
            )
            logger.info("Group-level past failures DB extracted.")
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
            logger.info(
                "Group-level past failures DB not extracted, but missing models are allowed."
            )

    def extract_touched_together() -> None:
        try:
            utils.extract_file(
                os.path.join("data", test_scheduling.TOUCHED_TOGETHER_DB)
            )
            logger.info("Touched together DB extracted.")
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
            logger.info(
                "Touched together DB not extracted, but missing models are allowed."
            )

    def extract_commits() -> bool:
        # Returns True when the commits DB archive was extracted, False when
        # it is missing (only allowed with ALLOW_MISSING_MODELS).
        try:
            utils.extract_file(f"{repository.COMMITS_DB}.zst")
            logger.info("Commits DB extracted.")
            return True
        except FileNotFoundError:
            logger.info("Commits DB not extracted, but missing models are allowed.")
            assert ALLOW_MISSING_MODELS
            return False

    def extract_commit_experiences() -> None:
        try:
            utils.extract_file(os.path.join("data", repository.COMMIT_EXPERIENCES_DB))
            logger.info("Commit experiences DB extracted.")
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
            logger.info(
                "Commit experiences DB not extracted, but missing models are allowed."
            )

    # Retried because the pushlog/Taskcluster endpoints occasionally fail
    # transiently.
    @tenacity.retry(
        stop=tenacity.stop_after_attempt(7),
        wait=tenacity.wait_exponential(multiplier=1, min=1, max=8),
    )
    def retrieve_schedulable_tasks() -> None:
        r = requests.get(
            "https://hg.mozilla.org/integration/autoland/json-pushes?version=2&tipsonly=1"
        )
        r.raise_for_status()
        revs = [
            push_obj["changesets"][0]
            for push_id, push_obj in r.json()["pushes"].items()
        ]

        logger.info(f"Retrieving known tasks from {revs}")

        # Store in a file the list of tasks in the latest autoland pushes.
        # We use more than one to protect ourselves from broken decision tasks.
        known_tasks = set()
        for rev in revs:
            r = requests.get(
                f"https://firefox-ci-tc.services.mozilla.com/api/index/v1/task/gecko.v2.autoland.revision.{rev}.taskgraph.decision/artifacts/public/target-tasks.json"
            )
            if r.ok:
                known_tasks.update(r.json())

        logger.info(f"Retrieved {len(known_tasks)} tasks")

        assert len(known_tasks) > 0

        with open("known_tasks", "w") as f:
            f.write("\n".join(known_tasks))

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Run the slow network operations concurrently with the extraction.
        clone_autoland_future = executor.submit(clone_autoland)
        retrieve_schedulable_tasks_future = executor.submit(retrieve_schedulable_tasks)

        commits_db_extracted = extract_commits()
        extract_commit_experiences()
        extract_touched_together()
        extract_past_failures_label()
        extract_past_failures_group()
        extract_failing_together_label()
        extract_failing_together_config_group()

        if commits_db_extracted:
            # Update the commits DB.
            logger.info("Browsing all commits...")
            # Keep only the most recent 4096 node ids (bounded memory).
            nodes = collections.deque(
                (commit["node"] for commit in repository.get_commits()), maxlen=4096
            )
            nodes.reverse()
            logger.info("All commits browsed.")

            # Wait repository to be cloned, as it's required to call repository.download_commits.
            logger.info("Waiting autoland to be cloned...")
            clone_autoland_future.result()

            with hglib.open(REPO_DIR) as hg:
                # Try using nodes backwards, in case we have some node that was on central at the time
                # we mined commits, but is not yet on autoland.
                for node in nodes:
                    try:
                        revs = repository.get_revs(hg, rev_start=f"children({node})")
                        break
                    except hglib.error.CommandError as e:
                        if b"abort: unknown revision" not in e.err:
                            raise

            # NOTE(review): if every node in the deque is unknown to autoland
            # (or the deque is empty), `revs` is unbound here and the call
            # below raises NameError — TODO confirm this cannot happen.
            logger.info("Updating commits DB...")
            commits = repository.download_commits(
                REPO_DIR, revs=revs, use_single_process=True
            )
            logger.info("Commits DB updated.")

            logger.info("Updating touched together DB...")
            if len(commits) > 0:
                # Update the touched together DB.
                update_touched_together_gen = test_scheduling.update_touched_together()
                next(update_touched_together_gen)  # advance to the first yield

                update_touched_together_gen.send(commits[-1]["node"])

                # None asks the generator to finalize; StopIteration is expected.
                try:
                    update_touched_together_gen.send(None)
                except StopIteration:
                    pass
            logger.info("Touched together DB updated.")

        # Wait list of schedulable tasks to be downloaded and written to disk.
        retrieve_schedulable_tasks_future.result()

    logger.info("Worker boot done")
def boot_worker():
    """Boot the worker: clone autoland in a background thread while the
    shipped DB archives are extracted, then bring the commits DB and the
    touched-together DB up to date.

    Fix: the extract_* helpers previously logged "... DB extracted." even
    when the archive was missing — the log line ran after the try/except on
    both paths. Success is now logged only when extraction actually happened
    (via try/else).

    Raises:
        AssertionError: when a DB archive is missing and ALLOW_MISSING_MODELS
            is not set.
    """
    # Clone autoland
    def clone_autoland():
        logger.info(f"Cloning autoland in {REPO_DIR}...")
        repository.clone(REPO_DIR, "https://hg.mozilla.org/integration/autoland")

    # Each extract_* helper tolerates a missing archive only when
    # ALLOW_MISSING_MODELS is set.
    def extract_past_failures_label():
        try:
            utils.extract_file(test_scheduling.PAST_FAILURES_LABEL_DB)
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
        else:
            logger.info("Label-level past failures DB extracted.")

    def extract_past_failures_group():
        try:
            utils.extract_file(test_scheduling.PAST_FAILURES_GROUP_DB)
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
        else:
            logger.info("Group-level past failures DB extracted.")

    def extract_touched_together():
        try:
            utils.extract_file(test_scheduling.TOUCHED_TOGETHER_DB)
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
        else:
            logger.info("Touched together DB extracted.")

    def extract_commits():
        # Returns True when the commits DB archive was extracted, False when
        # it is missing (only allowed with ALLOW_MISSING_MODELS).
        try:
            utils.extract_file(f"{repository.COMMITS_DB}.zst")
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
            return False
        logger.info("Commits DB extracted.")
        return True

    def extract_commit_experiences():
        try:
            utils.extract_file(repository.COMMIT_EXPERIENCES_DB)
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
        else:
            logger.info("Commit experiences DB extracted.")

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Clone in the background while the archives are extracted.
        clone_autoland_future = executor.submit(clone_autoland)

        commits_db_extracted = extract_commits()
        extract_commit_experiences()
        extract_touched_together()
        extract_past_failures_label()
        extract_past_failures_group()

        if commits_db_extracted:
            # Update the commits DB.
            logger.info("Browsing all commits...")
            # Exhaust the iterator only to get the LAST commit stored in the DB.
            for commit in repository.get_commits():
                pass
            logger.info("All commits browsed.")

            # Wait repository to be cloned, as it's required to call repository.download_commits.
            logger.info("Waiting autoland to be cloned...")
            clone_autoland_future.result()

            # NOTE(review): if the extracted commits DB is empty, `commit` is
            # unbound here and this raises NameError — TODO confirm the DB is
            # guaranteed non-empty when extraction succeeds.
            rev_start = "children({})".format(commit["node"])

            logger.info("Updating commits DB...")
            commits = repository.download_commits(
                REPO_DIR, rev_start, use_single_process=True
            )
            logger.info("Commits DB updated.")

            logger.info("Updating touched together DB...")
            if len(commits) > 0:
                # Update the touched together DB.
                update_touched_together_gen = test_scheduling.update_touched_together()
                next(update_touched_together_gen)  # advance to the first yield

                update_touched_together_gen.send(commits[-1]["node"])

                # None asks the generator to finalize; StopIteration is expected.
                try:
                    update_touched_together_gen.send(None)
                except StopIteration:
                    pass
            logger.info("Touched together DB updated.")

    logger.info("Worker boot done")