def find_bug_introducing_commits(self, bug_fixing_commits, commits_to_ignore, tokenized):
    if tokenized:
        db_path = TOKENIZED_BUG_INTRODUCING_COMMITS_DB
        repo_dir = self.tokenized_git_repo_dir
    else:
        db_path = BUG_INTRODUCING_COMMITS_DB
        repo_dir = self.git_repo_dir

    def git_to_mercurial(rev):
        if tokenized:
            return self.tokenized_git_to_mercurial[rev]
        else:
            return vcs_map.git_to_mercurial(rev)

    def mercurial_to_git(rev):
        if tokenized:
            return self.mercurial_to_tokenized_git[rev]
        else:
            return vcs_map.mercurial_to_git(rev)

    logger.info("Download previously found bug-introducing commits...")
    if db.is_old_version(db_path) or not db.exists(db_path):
        db.download(db_path, force=True)

    logger.info("Get previously found bug-introducing commits...")
    prev_bug_introducing_commits = list(db.read(db_path))
    prev_bug_introducing_commits_nodes = set(
        bug_introducing_commit["bug_fixing_rev"]
        for bug_introducing_commit in prev_bug_introducing_commits
    )
    logger.info(f"Already classified {len(prev_bug_introducing_commits)} commits...")

    hashes_to_ignore = set(commit["rev"] for commit in commits_to_ignore)

    with open("git_hashes_to_ignore", "w") as f:
        f.writelines(
            "{}\n".format(mercurial_to_git(commit["rev"]))
            for commit in commits_to_ignore
            if not tokenized or commit["rev"] in self.mercurial_to_tokenized_git
        )

    logger.info(f"{len(bug_fixing_commits)} commits to analyze")

    # Skip already found bug-introducing commits.
    bug_fixing_commits = [
        bug_fixing_commit
        for bug_fixing_commit in bug_fixing_commits
        if bug_fixing_commit["rev"] not in prev_bug_introducing_commits_nodes
    ]
    logger.info(
        f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones"
    )

    bug_fixing_commits = [
        bug_fixing_commit
        for bug_fixing_commit in bug_fixing_commits
        if bug_fixing_commit["rev"] not in hashes_to_ignore
    ]
    logger.info(
        f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list"
    )

    if tokenized:
        bug_fixing_commits = [
            bug_fixing_commit
            for bug_fixing_commit in bug_fixing_commits
            if bug_fixing_commit["rev"] in self.mercurial_to_tokenized_git
        ]
        logger.info(
            f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones with no git hash"
        )

    def _init(git_repo_dir):
        thread_local.git = GitRepository(git_repo_dir)

    def find_bic(bug_fixing_commit):
        logger.info("Analyzing {}...".format(bug_fixing_commit["rev"]))

        git_fix_revision = mercurial_to_git(bug_fixing_commit["rev"])
        commit = thread_local.git.get_commit(git_fix_revision)

        # Skip huge changes, we'll likely be wrong with them.
        if len(commit.modifications) > MAX_MODIFICATION_NUMBER:
            logger.info("Skipping {} as it is too big".format(bug_fixing_commit["rev"]))
            return None

        bug_introducing_modifications = thread_local.git.get_commits_last_modified_lines(
            commit, hashes_to_ignore_path=os.path.realpath("git_hashes_to_ignore")
        )
        logger.info(
            "Found {} for {}".format(bug_introducing_modifications, bug_fixing_commit["rev"])
        )

        bug_introducing_commits = []
        for bug_introducing_hashes in bug_introducing_modifications.values():
            for bug_introducing_hash in bug_introducing_hashes:
                try:
                    bug_introducing_commits.append(
                        {
                            "bug_fixing_rev": bug_fixing_commit["rev"],
                            "bug_introducing_rev": git_to_mercurial(bug_introducing_hash),
                        }
                    )
                except Exception as e:
                    # Skip commits that are in git but not in mercurial, as they are too old (older than "Free the lizard").
                    if not str(e).startswith("Missing git commit in the VCS map"):
                        raise

        # Add an empty result, just so that we don't reanalyze this again.
        if len(bug_introducing_commits) == 0:
            bug_introducing_commits.append(
                {
                    "bug_fixing_rev": bug_fixing_commit["rev"],
                    "bug_introducing_rev": "",
                }
            )

        return bug_introducing_commits

    bug_fixing_commits_queue = bug_fixing_commits.copy()

    with concurrent.futures.ThreadPoolExecutor(
        initializer=_init, initargs=(repo_dir,), max_workers=os.cpu_count() + 1
    ) as executor:

        def results():
            num_analyzed = 0

            # Analyze up to 500 commits at a time, to avoid the task running out of time.
            while len(bug_fixing_commits_queue) != 0 and num_analyzed != 500:
                bug_introducing_commit_futures = []
                for _ in range(min(500 - num_analyzed, len(bug_fixing_commits_queue))):
                    bug_introducing_commit_futures.append(
                        executor.submit(find_bic, bug_fixing_commits_queue.pop())
                    )

                logger.info(
                    f"Analyzing a chunk of {len(bug_introducing_commit_futures)} commits"
                )

                for future in tqdm(
                    concurrent.futures.as_completed(bug_introducing_commit_futures),
                    total=len(bug_introducing_commit_futures),
                ):
                    result = future.result()
                    if result is not None:
                        num_analyzed += 1
                        yield from result

        db.append(db_path, results())

    zstd_compress(db_path)

    return len(bug_fixing_commits_queue) == 0
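# The boolean returned above signals whether the queue of bug-fixing commits was fully
# drained within the 500-commit budget, so a caller is expected to invoke the method
# repeatedly across runs. Below is a minimal driver sketch under that assumption; the
# RegressorFinder name and its constructor are hypothetical, not the actual API.
finder = RegressorFinder(...)  # hypothetical constructor; arguments omitted

commits_to_ignore = list(db.read(IGNORED_COMMITS_DB))
bug_fixing_commits = list(db.read(BUG_FIXING_COMMITS_DB))

done = finder.find_bug_introducing_commits(
    bug_fixing_commits, commits_to_ignore, tokenized=False
)
if not done:
    # Up to 500 commits are analyzed per run; a later run picks up the remainder,
    # skipping the commits already present in the database.
    logger.info("Not done yet: more bug-fixing commits remain to analyze")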
def generate_test_scheduling_history(self, granularity):
    push_data_path = f"push_data_{granularity}.json"
    updated = download_check_etag(
        test_scheduling.PUSH_DATA_URL.format(granularity=granularity)
    )
    if updated:
        zstd_decompress(push_data_path)
    assert os.path.exists(push_data_path), "Decompressed push data file exists"

    # Get the commits DB.
    assert db.download(repository.COMMITS_DB)

    HISTORY_DATE_START = datetime.now() - relativedelta(months=TRAINING_MONTHS[granularity])

    if granularity == "label":
        test_scheduling_db = test_scheduling.TEST_LABEL_SCHEDULING_DB
        past_failures_db = os.path.join("data", test_scheduling.PAST_FAILURES_LABEL_DB)
    elif granularity == "group":
        test_scheduling_db = test_scheduling.TEST_GROUP_SCHEDULING_DB
        past_failures_db = os.path.join("data", test_scheduling.PAST_FAILURES_GROUP_DB)
        touched_together_db = os.path.join("data", test_scheduling.TOUCHED_TOGETHER_DB)

    db.download(test_scheduling_db, support_files_too=True)

    last_node = None
    for test_data in test_scheduling.get_test_scheduling_history(granularity):
        last_node = test_data["revs"][0]

    def generate_all_data():
        past_failures = test_scheduling.get_past_failures(granularity)

        push_num = past_failures["push_num"] if "push_num" in past_failures else 0

        # We can start once we get to the last revision we added in the previous run.
        can_start = True if last_node is None else False

        commit_map = {}
        for commit_data in tqdm(repository.get_commits()):
            if not can_start:
                if last_node == commit_data["node"]:
                    can_start = True
                continue

            commit_map[commit_data["node"]] = commit_data

        with open(push_data_path, "r") as f:
            push_data = json.load(f)

        logger.info(f"push data nodes: {len(push_data)}")

        if granularity == "label":
            push_data = [
                (
                    revisions,
                    rename_tasks(push_tasks),
                    rename_tasks(possible_regressions),
                    rename_tasks(likely_regressions),
                )
                for revisions, push_tasks, possible_regressions, likely_regressions in push_data
            ]

        # In the last 28 pushes, we definitely run all possible runnables.
        all_runnables_set = set(
            sum((push_runnables for _, push_runnables, _, _ in push_data[-28:]), [])
        )
        # Filter runnables we don't need.
        all_runnables = filter_runnables(list(all_runnables_set), all_runnables_set, granularity)
        all_runnables_set = set(all_runnables)
        logger.info(f"{len(all_runnables_set)} runnables run in the last 28 pushes")

        # Store all runnables in the past_failures DB so it can be used in the evaluation phase.
        past_failures["all_runnables"] = all_runnables
        # XXX: Should we recreate the DB from scratch if the previous all_runnables are not the
        # same as the current ones?

        saved_nodes = set()
        skipped_no_commits = 0
        skipped_too_big_commits = 0
        skipped_no_runnables = 0

        # We can start once we get to the last revision we added in the previous run.
        can_start = True if last_node is None else False

        if granularity == "group":
            update_touched_together_gen = test_scheduling.update_touched_together()
            next(update_touched_together_gen)

        for i in tqdm(range(len(push_data))):
            (
                revisions,
                push_runnables,
                possible_regressions,
                likely_regressions,
            ) = push_data.pop(0)

            if not can_start:
                if last_node == revisions[0]:
                    can_start = True
                continue

            push_num += 1

            # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them.
            commits = tuple(
                commit_map.pop(revision) for revision in revisions if revision in commit_map
            )
            if len(commits) == 0:
                skipped_no_commits += 1
                continue

            merged_commits = commit_features.merge_commits(commits)

            # XXX: For now, skip commits which are too large.
            # In the future we can either:
            # - Improve shelve perf and go back to consider all files;
            # - Consider only files which appear with a given frequency, like the "files" feature in commit_features;
            # - Keep a limit of number of files.
            if len(merged_commits["files"]) > 50:
                skipped_too_big_commits += 1
                continue

            # If we considered all_runnables, we'd generate a huge amount of data.
            # So we consider only the runnables which run in this push, and the possible and likely regressions
            # from this push.
            runnables_to_consider = list(
                set(push_runnables + possible_regressions + likely_regressions)
            )
            runnables_to_consider = filter_runnables(
                runnables_to_consider, all_runnables_set, granularity
            )

            if len(runnables_to_consider) == 0:
                skipped_no_runnables += 1
                continue

            # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!).
            if i % 250 == 0:
                past_failures.sync()

            pushdate = dateutil.parser.parse(merged_commits["pushdate"])

            if granularity == "group":
                update_touched_together_gen.send(commits[0]["node"])

            for data in test_scheduling.generate_data(
                past_failures,
                merged_commits,
                push_num,
                runnables_to_consider,
                possible_regressions,
                likely_regressions,
            ):
                if pushdate > HISTORY_DATE_START:
                    saved_nodes.add(i)
                    data["revs"] = revisions
                    yield data

        if granularity == "group":
            try:
                update_touched_together_gen.send(None)
            except StopIteration:
                pass

        logger.info(f"saved push data nodes: {len(saved_nodes)}")
        logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
        logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
        logger.info(f"skipped {skipped_no_runnables} (no interesting runnables)")

        past_failures["push_num"] = push_num
        past_failures.close()

    db.append(test_scheduling_db, generate_all_data())
    zstd_compress(test_scheduling_db)

    with open_tar_zst(past_failures_db) as tar:
        tar.add(past_failures_db[:-len(".tar.zst")])

    if granularity == "group":
        with open_tar_zst(touched_together_db) as tar:
            tar.add(touched_together_db[:-len(".tar.zst")])
def generate_push_data(self, runnable):
    # We keep in the cache the fact that we failed to analyze a push for 10
    # days, so if we re-run often we don't retry the same pushes many times.
    MISSING_CACHE_RETENTION = 10 * 24 * 60

    # We'll use the past TRAINING_MONTHS months only for training the model,
    # but we use half TRAINING_MONTHS months more than that to calculate the
    # failure statistics.
    from_months = TRAINING_MONTHS[runnable] + math.floor(TRAINING_MONTHS[runnable] / 2)

    pushes = mozci.push.make_push_objects(
        from_date=f"today-{from_months}month",
        to_date="today-3day",
        branch="autoland",
    )

    start_time = time.monotonic()

    num_cached = 0

    push_data = []

    def cache_key(push):
        return f"push_data.{runnable}.{push.rev}"

    # Regenerating a large amount of data when we update the mozci regression detection
    # algorithm is currently pretty slow, so we only regenerate 1000 pushes whenever we
    # run.
    to_regenerate = set()
    """for push in pushes[::-1]:
        cached = adr.config.cache.get(cache_key(push))
        if not cached:
            continue

        value, mozci_version = cached
        if mozci_version != MOZCI_VERSION and len(to_regenerate) < 1000:
            to_regenerate.add(value[0][0])"""

    for push in tqdm(pushes):
        key = cache_key(push)

        if adr.config.cache.has(key) and push.revs[0] not in to_regenerate:
            num_cached += 1
            cached = adr.config.cache.get(key)
            if cached:
                value, mozci_version = cached
                push_data.append(value)
        else:
            logger.info(f"Analyzing {push.rev} at the {runnable} level...")

            try:
                if runnable == "label":
                    runnables = push.task_labels
                elif runnable == "group":
                    runnables = push.group_summaries.keys()

                value = [
                    push.revs,
                    list(runnables),
                    list(push.get_possible_regressions(runnable)),
                    list(push.get_likely_regressions(runnable)),
                ]
                push_data.append(value)
                adr.config.cache.forever(key, (value, MOZCI_VERSION))
            except adr.errors.MissingDataError:
                logger.warning(f"Tasks for push {push.rev} can't be found on ActiveData")
                adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)
            except Exception:
                traceback.print_exc()
                adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)

        if time.monotonic() - start_time >= 10800:
            self.upload_adr_cache()
            start_time = time.monotonic()

    logger.info(f"{num_cached} pushes were already cached out of {len(pushes)}")

    with open(f"push_data_{runnable}.json", "w") as f:
        json.dump(push_data, f)

    zstd_compress(f"push_data_{runnable}.json")
def generate_push_data(self, granularity: str, training_months: int, reretrieve: int) -> None:
    # We'll use the past training_months months only for training the model,
    # but we use half training_months months more than that to calculate the
    # failure statistics.
    from_months = training_months + math.floor(training_months / 2)

    # We use the actual date instead of 'today-X' aliases to avoid adr caching
    # this query.
    from_date = datetime.utcnow() - relativedelta(months=from_months)
    to_date = datetime.utcnow() - relativedelta(days=3)

    pushes = mozci.push.make_push_objects(
        from_date=from_date.strftime("%Y-%m-%d"),
        to_date=to_date.strftime("%Y-%m-%d"),
        branch="autoland",
    )

    if granularity == "label":
        push_data_db = test_scheduling.PUSH_DATA_LABEL_DB
    elif granularity == "group":
        push_data_db = test_scheduling.PUSH_DATA_GROUP_DB
    elif granularity == "config_group":
        push_data_db = test_scheduling.PUSH_DATA_CONFIG_GROUP_DB

    def cache_key(push: mozci.push.Push) -> str:
        return f"push_data.{granularity}.{push.rev}"

    def generate(
        futures: List[concurrent.futures.Future],
    ) -> Generator[PushResult, None, None]:
        nonlocal reretrieve
        num_cached = 0
        num_pushes = len(pushes)

        for _ in tqdm(range(num_pushes)):
            push = pushes.pop(0)
            cached = futures.pop(0).result()

            semaphore.release()

            # Regenerating a large amount of data when we update the mozci regression detection
            # algorithm is currently pretty slow, so we only regenerate a subset of pushes whenever we
            # run.
            if cached:
                value, mozci_version = cached

                # Regenerate results which were generated with an older version of mozci.
                if reretrieve > 0 and mozci_version != MOZCI_VERSION:
                    cached = None
                    reretrieve -= 1
                # Regenerate results which don't contain the fix revision.
                elif len(value) != 5:
                    cached = None

            if cached:
                num_cached += 1
                value, mozci_version = cached
                assert len(value) == 5
                yield value
            else:
                logger.info(f"Analyzing {push.rev} at the {granularity} level...")

                key = cache_key(push)

                try:
                    if granularity == "label":
                        runnables = push.task_labels
                    elif granularity == "group":
                        runnables = push.group_summaries.keys()
                    elif granularity == "config_group":
                        runnables = push.config_group_summaries.keys()

                    value = (
                        tuple(push.revs),
                        push.backedoutby or push.bustage_fixed_by,
                        tuple(runnables),
                        tuple(push.get_possible_regressions(granularity)),
                        tuple(push.get_likely_regressions(granularity)),
                    )
                    adr.config.cache.put(
                        key,
                        (value, MOZCI_VERSION),
                        adr.config["cache"]["retention"],
                    )
                    assert len(value) == 5
                    yield value
                except adr.errors.MissingDataError:
                    logger.warning(f"Tasks for push {push.rev} can't be found on ActiveData")
                except Exception:
                    traceback.print_exc()

        logger.info(f"{num_cached} pushes were already cached out of {num_pushes}")

    semaphore = threading.BoundedSemaphore(256)

    def retrieve_from_cache(push):
        semaphore.acquire()
        return adr.config.cache.get(cache_key(push))

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(retrieve_from_cache, push) for push in pushes]

        try:
            db.write(push_data_db, generate(futures))
        except Exception:
            for f in futures:
                f.cancel()

                try:
                    semaphore.release()
                except ValueError:
                    continue

            raise

    zstd_compress(push_data_db)
def generate_push_data(self, runnable):
    # We keep in the cache the fact that we failed to analyze a push for 10
    # days, so if we re-run often we don't retry the same pushes many times.
    MISSING_CACHE_RETENTION = 10 * 24 * 60

    # We'll use the past TRAINING_MONTHS months only for training the model,
    # but we use half TRAINING_MONTHS months more than that to calculate the
    # failure statistics.
    from_months = TRAINING_MONTHS[runnable] + math.floor(TRAINING_MONTHS[runnable] / 2)

    # We use the actual date instead of 'today-X' aliases to avoid adr caching
    # this query.
    from_date = datetime.utcnow() - relativedelta(months=from_months)
    to_date = datetime.utcnow() - relativedelta(days=3)

    pushes = mozci.push.make_push_objects(
        from_date=from_date.strftime("%Y-%m-%d"),
        to_date=to_date.strftime("%Y-%m-%d"),
        branch="autoland",
    )

    num_cached = 0

    push_data = []

    def cache_key(push):
        return f"push_data.{runnable}.{push.rev}"

    # Regenerating a large amount of data when we update the mozci regression detection
    # algorithm is currently pretty slow, so we only regenerate 1000 pushes whenever we
    # run.
    to_regenerate = set()
    """for push in pushes[::-1]:
        cached = adr.config.cache.get(cache_key(push))
        if not cached:
            continue

        value, mozci_version = cached
        if mozci_version != MOZCI_VERSION and len(to_regenerate) < 1000:
            to_regenerate.add(value[0][0])"""

    def periodically_upload_adr_cache():
        start_time = time.monotonic()
        while not upload_thread_stop.isSet():
            if time.monotonic() - start_time >= 10800:
                self.upload_adr_cache()
                start_time = time.monotonic()

            upload_thread_stop.wait(timeout=7)

    upload_thread = threading.Thread(target=periodically_upload_adr_cache)
    upload_thread_stop = threading.Event()
    upload_thread.start()

    s3_store = adr.util.cache_stores.S3Store(
        {
            "bucket": "communitytc-bugbug",
            "prefix": "data/adr_cache/",
        }
    )

    s3_store.set_serializer(CompressedPickleSerializer())

    for push in tqdm(pushes):
        key = cache_key(push)

        if adr.config.cache.has(key) and push.revs[0] not in to_regenerate:
            num_cached += 1
            cached = adr.config.cache.get(key)
            if cached:
                s3_store.put(key, cached, adr.config["cache"]["retention"])
                value, mozci_version = cached
                push_data.append(value)
        else:
            logger.info(f"Analyzing {push.rev} at the {runnable} level...")

            try:
                if runnable == "label":
                    runnables = push.task_labels
                elif runnable == "group":
                    runnables = push.group_summaries.keys()

                value = [
                    push.revs,
                    list(runnables),
                    list(push.get_possible_regressions(runnable)),
                    list(push.get_likely_regressions(runnable)),
                ]
                push_data.append(value)
                adr.config.cache.put(key, (value, MOZCI_VERSION), adr.config["cache"]["retention"])
                s3_store.put(key, (value, MOZCI_VERSION), adr.config["cache"]["retention"])
            except adr.errors.MissingDataError:
                logger.warning(f"Tasks for push {push.rev} can't be found on ActiveData")
                adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)
            except Exception:
                traceback.print_exc()
                adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)

    upload_thread_stop.set()
    upload_thread.join()

    logger.info(f"{num_cached} pushes were already cached out of {len(pushes)}")

    with open(f"push_data_{runnable}.json", "w") as f:
        json.dump(push_data, f)

    zstd_compress(f"push_data_{runnable}.json")
def find_bug_introducing_commits(self, bug_fixing_commits, commits_to_ignore, tokenized):
    if tokenized:
        db_path = TOKENIZED_BUG_INTRODUCING_COMMITS_DB
        repo_dir = self.tokenized_git_repo_dir
    else:
        db_path = BUG_INTRODUCING_COMMITS_DB
        repo_dir = self.git_repo_dir

    def git_to_mercurial(rev):
        if tokenized:
            return self.tokenized_git_to_mercurial[rev]
        else:
            return vcs_map.git_to_mercurial(rev)

    def mercurial_to_git(rev):
        if tokenized:
            return self.mercurial_to_tokenized_git[rev]
        else:
            return vcs_map.mercurial_to_git(rev)

    logger.info("Download previously found bug-introducing commits...")
    if db.is_old_version(db_path) or not db.exists(db_path):
        db.download(db_path, force=True)

    logger.info("Get previously found bug-introducing commits...")
    prev_bug_introducing_commits = list(db.read(db_path))
    prev_bug_introducing_commits_nodes = set(
        bug_introducing_commit["bug_fixing_rev"]
        for bug_introducing_commit in prev_bug_introducing_commits
    )
    logger.info(f"Already classified {len(prev_bug_introducing_commits)} commits...")

    hashes_to_ignore = set(commit["rev"] for commit in commits_to_ignore)

    with open("git_hashes_to_ignore", "w") as f:
        f.writelines(
            "{}\n".format(mercurial_to_git(commit["rev"]))
            for commit in commits_to_ignore
            if not tokenized or commit["rev"] in self.mercurial_to_tokenized_git
        )

    logger.info(f"{len(bug_fixing_commits)} commits to analyze")

    # Skip already found bug-introducing commits.
    bug_fixing_commits = [
        bug_fixing_commit
        for bug_fixing_commit in bug_fixing_commits
        if bug_fixing_commit["rev"] not in prev_bug_introducing_commits_nodes
    ]
    logger.info(
        f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones"
    )

    bug_fixing_commits = [
        bug_fixing_commit
        for bug_fixing_commit in bug_fixing_commits
        if bug_fixing_commit["rev"] not in hashes_to_ignore
    ]
    logger.info(
        f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list"
    )

    if tokenized:
        bug_fixing_commits = [
            bug_fixing_commit
            for bug_fixing_commit in bug_fixing_commits
            if bug_fixing_commit["rev"] in self.mercurial_to_tokenized_git
        ]
        logger.info(
            f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones with no git hash"
        )

    # Analyze up to 500 commits at a time, to avoid the task running out of time.
    done = True
    if len(bug_fixing_commits) > 500:
        bug_fixing_commits = bug_fixing_commits[-500:]
        done = False

    with open("done", "w") as f:
        f.write(str(1 if done else 0))

    def _init(git_repo_dir):
        global GIT_REPO
        GIT_REPO = GitRepository(git_repo_dir)

    def find_bic(bug_fixing_commit):
        logger.info("Analyzing {}...".format(bug_fixing_commit["rev"]))

        git_fix_revision = mercurial_to_git(bug_fixing_commit["rev"])
        commit = GIT_REPO.get_commit(git_fix_revision)

        # Skip huge changes, we'll likely be wrong with them.
        if len(commit.modifications) > MAX_MODIFICATION_NUMBER:
            return [None]

        bug_introducing_modifications = GIT_REPO.get_commits_last_modified_lines(
            commit, hashes_to_ignore_path=os.path.realpath("git_hashes_to_ignore")
        )
        logger.info(
            "Found {} for {}".format(bug_introducing_modifications, bug_fixing_commit["rev"])
        )

        bug_introducing_commits = []
        for bug_introducing_hashes in bug_introducing_modifications.values():
            for bug_introducing_hash in bug_introducing_hashes:
                try:
                    bug_introducing_commits.append(
                        {
                            "bug_fixing_rev": bug_fixing_commit["rev"],
                            "bug_introducing_rev": git_to_mercurial(bug_introducing_hash),
                        }
                    )
                except Exception as e:
                    # Skip commits that are in git but not in mercurial, as they are too old (older than "Free the lizard").
                    if not str(e).startswith("Missing git commit in the VCS map"):
                        raise

        # Add an empty result, just so that we don't reanalyze this again.
        if len(bug_introducing_commits) == 0:
            bug_introducing_commits.append(
                {
                    "bug_fixing_rev": bug_fixing_commit["rev"],
                    "bug_introducing_rev": "",
                }
            )

        return bug_introducing_commits

    with concurrent.futures.ThreadPoolExecutor(
        initializer=_init, initargs=(repo_dir,), max_workers=os.cpu_count() + 1
    ) as executor:
        bug_introducing_commits = executor.map(find_bic, bug_fixing_commits)
        bug_introducing_commits = tqdm(bug_introducing_commits, total=len(bug_fixing_commits))
        bug_introducing_commits = list(itertools.chain.from_iterable(bug_introducing_commits))

    total_results_num = len(bug_introducing_commits)
    bug_introducing_commits = list(filter(None, bug_introducing_commits))
    logger.info(
        f"Skipped {total_results_num - len(bug_introducing_commits)} commits as they were too big"
    )

    db.append(db_path, bug_introducing_commits)
    zstd_compress(db_path)
def compress_and_upload():
    zstd_compress(db_path)
    db.upload(db_path)
def go(self):
    logger.info("Generate map of bug ID -> bug data for all bugs which were defects")
    bug_fixing_commits = list(db.read(BUG_FIXING_COMMITS_DB))

    bug_fixing_commits_nodes = set(
        bug_fixing_commit["rev"]
        for bug_fixing_commit in bug_fixing_commits
        if bug_fixing_commit["type"] in ("d", "r")
    )

    all_bug_ids = set(
        commit["bug_id"]
        for commit in repository.get_commits()
        if commit["node"] in bug_fixing_commits_nodes
    )

    bug_map = {}

    for bug in bugzilla.get_bugs():
        if bug["id"] not in all_bug_ids:
            continue

        bug_map[bug["id"]] = bug

    logger.info(
        "Generate a map from function to the three last bugs which were fixed by touching that function"
    )

    past_bugs_by_function = {}

    for commit in tqdm(repository.get_commits()):
        if commit["node"] not in bug_fixing_commits_nodes:
            continue

        if commit["bug_id"] not in bug_map:
            continue

        bug = bug_map[commit["bug_id"]]

        bug_str = "Bug {} - {}".format(bug["id"], bug["summary"])

        for path, f_group in commit["functions"].items():
            if path not in past_bugs_by_function:
                past_bugs_by_function[path] = {}

            for f in f_group:
                if f[0] not in past_bugs_by_function[path]:
                    bugs_deque = deque([bug_str], maxlen=3)
                else:
                    bugs_deque = past_bugs_by_function[path][f[0]]["bugs"]
                    bugs_deque.append(bug_str)

                past_bugs_by_function[path][f[0]] = {
                    "start": f[1],
                    "end": f[2],
                    "bugs": bugs_deque,
                }

    with open("data/past_bugs_by_function.pickle", "wb") as f:
        pickle.dump(past_bugs_by_function, f, protocol=pickle.HIGHEST_PROTOCOL)

    zstd_compress("data/past_bugs_by_function.pickle")
def generate_test_scheduling_history(self): updated = download_check_etag(PUSH_DATA_URL) if updated: zstd_decompress("push_data.json") assert os.path.exists( "push_data.json"), "Decompressed push data file exists" # Get the commits DB. assert db.download(repository.COMMITS_DB) HISTORY_DATE_START = datetime.now() - relativedelta( months=TRAINING_MONTHS) db.download(test_scheduling.TEST_SCHEDULING_DB, support_files_too=True) last_node = None for test_data in test_scheduling.get_test_scheduling_history(): last_node = test_data["revs"][0] def generate_all_data(): past_failures = test_scheduling.get_past_failures() push_num = past_failures[ "push_num"] if "push_num" in past_failures else 0 # We can start once we get to the last revision we added in the previous run. can_start = True if last_node is None else False commit_map = {} for commit_data in tqdm(repository.get_commits()): if not can_start: if last_node == commit_data["node"]: can_start = True continue commit_map[commit_data["node"]] = commit_data with open("push_data.json", "r") as f: push_data = json.load(f)[1:] logger.info(f"push data nodes: {len(push_data)}") # In the last 28 pushes, we definitely run all possible tasks. all_tasks_set = set( sum((push_tasks for _, push_tasks, _, _ in push_data[-28:]), [])) # Filter tasks we don't need. all_tasks = filter_tasks(list(all_tasks_set), all_tasks_set) all_tasks_set = set(all_tasks) logger.info( f"{len(all_tasks_set)} tasks run in the last 28 pushes") # Store all tasks in the past_failures DB so it can be used in the evaluation phase. past_failures["all_tasks"] = all_tasks # XXX: Should we recreate the DB from scratch if the previous all_tasks are not the # same as the current ones? saved_nodes = set() skipped_no_commits = 0 skipped_too_big_commits = 0 skipped_no_tasks = 0 # We can start once we get to the last revision we added in the previous run. can_start = True if last_node is None else False for i in tqdm(range(len(push_data))): ( revisions, push_tasks, possible_regressions, likely_regressions, ) = push_data.pop(0) if not can_start: if last_node == revisions[0]: can_start = True continue push_num += 1 # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them. commits = tuple( commit_map.pop(revision) for revision in revisions if revision in commit_map) if len(commits) == 0: skipped_no_commits += 1 continue merged_commits = commit_features.merge_commits(commits) # XXX: For now, skip commits which are too large. # In the future we can either: # - Improve shelve perf and go back to consider all files; # - Consider only files which appear with a given frequency, like the "files" feature in commit_features; # - Keep a limit of number of files. if len(merged_commits["files"]) > 50: skipped_too_big_commits += 1 continue # If we considered all_tasks, we'd generate a huge amount of data. # So we consider only the tasks which run in this push, and the possible and likely regressions # from this push. tasks_to_consider = list( set(push_tasks + possible_regressions + likely_regressions)) tasks_to_consider = filter_tasks(tasks_to_consider, all_tasks_set) if len(tasks_to_consider) == 0: skipped_no_tasks += 1 continue # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!). 
if i % 250 == 0: past_failures.sync() pushdate = dateutil.parser.parse(merged_commits["pushdate"]) for data in test_scheduling.generate_data( past_failures, merged_commits, push_num, tasks_to_consider, possible_regressions, likely_regressions, ): if pushdate > HISTORY_DATE_START: saved_nodes.add(i) data["revs"] = revisions yield data logger.info(f"saved push data nodes: {len(saved_nodes)}") logger.info(f"skipped {skipped_no_commits} (no commits in our DB)") logger.info(f"skipped {skipped_too_big_commits} (too big commits)") logger.info(f"skipped {skipped_no_tasks} (no interesting tasks)") past_failures["push_num"] = push_num past_failures.close() db.append(test_scheduling.TEST_SCHEDULING_DB, generate_all_data()) zstd_compress(test_scheduling.TEST_SCHEDULING_DB) with open_tar_zst("data/past_failures.lmdb.tar.zst") as tar: tar.add("data/past_failures.lmdb")
def generate_test_scheduling_history(self): if not os.path.exists("push_data.json"): download_check_etag(PUSH_DATA_URL, "push_data.json.zst") zstd_decompress("push_data.json") assert os.path.exists( "push_data.json" ), "Decompressed push data file exists" # Get the commits DB. if db.is_old_version(repository.COMMITS_DB) or not db.exists( repository.COMMITS_DB ): db.download(repository.COMMITS_DB, force=True) HISTORY_DATE_START = datetime.now() - relativedelta(months=TRAINING_MONTHS) with open("push_data.json", "r") as f: data = json.load(f) push_data = {} for row in data[1:]: # Revision -> (all tasks, possible regressions, likely regressions) push_data[row[0]] = (row[1], row[2], row[3]) logger.info(f"push data nodes: {len(push_data)}") HISTORICAL_TIMESPAN = 56 if not db.is_old_version(test_scheduling.TEST_SCHEDULING_DB): db.download(test_scheduling.TEST_SCHEDULING_DB, support_files_too=True) for test_data in test_scheduling.get_test_scheduling_history(): pass last_node = test_data["rev"] else: last_node = None past_failures = shelve.open( "data/past_failures.shelve", protocol=pickle.HIGHEST_PROTOCOL, writeback=True, ) push_num = past_failures["push_num"] if "push_num" in past_failures else 0 def get_and_update_past_failures(type_, task, items, push_num, is_regression): values_total = [] values_prev_7 = [] values_prev_14 = [] values_prev_28 = [] values_prev_56 = [] key = f"{type_}${task}$" for item in items: full_key = key + item if full_key not in past_failures: cur = past_failures[full_key] = ExpQueue( push_num, HISTORICAL_TIMESPAN + 1, 0 ) else: cur = past_failures[full_key] value = cur[push_num] values_total.append(value) values_prev_7.append(value - cur[push_num - 7]) values_prev_14.append(value - cur[push_num - 14]) values_prev_28.append(value - cur[push_num - 28]) values_prev_56.append(value - cur[push_num - 56]) if is_regression: cur[push_num] = value + 1 return ( sum(values_total), sum(values_prev_7), sum(values_prev_14), sum(values_prev_28), sum(values_prev_56), ) def generate_data(): nonlocal push_num commits_with_data = set() saved_nodes = set() # We can start once we get to the last revision we added in the previous run. can_start = True if last_node is None else False for commit_data in tqdm(repository.get_commits()): node = commit_data["node"] # Sync DB every 1000 commits, so we cleanup the shelve cache (we'd run OOM otherwise!). 
if len(commits_with_data) % 1000 == 0: past_failures.sync() if node == last_node: can_start = True continue if not can_start: continue if node not in push_data: continue commits_with_data.add(node) commit_push_data = push_data[node] for task in commit_push_data[0]: if not any(task.startswith(j) for j in JOBS_TO_CONSIDER): continue is_regression = ( task in commit_push_data[1] or task in commit_push_data[2] ) total_failures, past_7_pushes_failures, past_14_pushes_failures, past_28_pushes_failures, past_56_pushes_failures = get_and_update_past_failures( "all", task, ["all"], push_num, is_regression ) total_types_failures, past_7_pushes_types_failures, past_14_pushes_types_failures, past_28_pushes_types_failures, past_56_pushes_types_failures = get_and_update_past_failures( "type", task, commit_data["types"], push_num, is_regression ) total_files_failures, past_7_pushes_files_failures, past_14_pushes_files_failures, past_28_pushes_files_failures, past_56_pushes_files_failures = get_and_update_past_failures( "file", task, commit_data["files"], push_num, is_regression ) total_directories_failures, past_7_pushes_directories_failures, past_14_pushes_directories_failures, past_28_pushes_directories_failures, past_56_pushes_directories_failures = get_and_update_past_failures( "directory", task, commit_data["directories"], push_num, is_regression, ) total_components_failures, past_7_pushes_components_failures, past_14_pushes_components_failures, past_28_pushes_components_failures, past_56_pushes_components_failures = get_and_update_past_failures( "component", task, commit_data["components"], push_num, is_regression, ) pushdate = dateutil.parser.parse(commit_data["pushdate"]) if pushdate > HISTORY_DATE_START: saved_nodes.add(node) yield { "rev": node, "name": task, "failures": total_failures, "failures_past_7_pushes": past_7_pushes_failures, "failures_past_14_pushes": past_14_pushes_failures, "failures_past_28_pushes": past_28_pushes_failures, "failures_past_56_pushes": past_56_pushes_failures, "failures_in_types": total_types_failures, "failures_past_7_pushes_in_types": past_7_pushes_types_failures, "failures_past_14_pushes_in_types": past_14_pushes_types_failures, "failures_past_28_pushes_in_types": past_28_pushes_types_failures, "failures_past_56_pushes_in_types": past_56_pushes_types_failures, "failures_in_files": total_files_failures, "failures_past_7_pushes_in_files": past_7_pushes_files_failures, "failures_past_14_pushes_in_files": past_14_pushes_files_failures, "failures_past_28_pushes_in_files": past_28_pushes_files_failures, "failures_past_56_pushes_in_files": past_56_pushes_files_failures, "failures_in_directories": total_directories_failures, "failures_past_7_pushes_in_directories": past_7_pushes_directories_failures, "failures_past_14_pushes_in_directories": past_14_pushes_directories_failures, "failures_past_28_pushes_in_directories": past_28_pushes_directories_failures, "failures_past_56_pushes_in_directories": past_56_pushes_directories_failures, "failures_in_components": total_components_failures, "failures_past_7_pushes_in_components": past_7_pushes_components_failures, "failures_past_14_pushes_in_components": past_14_pushes_components_failures, "failures_past_28_pushes_in_components": past_28_pushes_components_failures, "failures_past_56_pushes_in_components": past_56_pushes_components_failures, "is_possible_regression": task in commit_push_data[1], "is_likely_regression": task in commit_push_data[2], } # We no longer need the push data for this node, we can free the memory. 
del push_data[node] push_num += 1 logger.info(f"commits linked to push data: {len(commits_with_data)}") logger.info(f"saved push data nodes: {len(saved_nodes)}") db.append(test_scheduling.TEST_SCHEDULING_DB, generate_data()) zstd_compress(test_scheduling.TEST_SCHEDULING_DB) past_failures["push_num"] = push_num past_failures.close() zstd_compress("data/past_failures.shelve")
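# The sliding failure counts above rely on an ExpQueue-style container indexed by push
# number: reading any past index returns the count recorded at or before that push, and
# writes only ever happen at the current push. Below is a minimal list-backed stand-in
# with those assumed semantics -- an illustrative sketch, not the project's actual
# ExpQueue implementation.
class SlidingFailureCounts:
    def __init__(self, start, maxlen, default):
        self.start = start
        self.maxlen = maxlen  # kept for interface parity; not enforced in this sketch
        self.values = [default]

    def _index(self, pos):
        # Clamp to the oldest/newest stored value, mirroring reads of very old or
        # not-yet-written push numbers.
        return min(max(pos - self.start, 0), len(self.values) - 1)

    def __getitem__(self, pos):
        return self.values[self._index(pos)]

    def __setitem__(self, pos, value):
        # Carry the latest count forward up to the requested push, then record the value.
        idx = pos - self.start
        while len(self.values) <= idx:
            self.values.append(self.values[-1])
        self.values[idx] = value


# Usage mirroring get_and_update_past_failures: cur[push_num] - cur[push_num - 7] gives
# the number of failures recorded in the last 7 pushes.
cur = SlidingFailureCounts(start=100, maxlen=57, default=0)
cur[100] = cur[100] + 1
failures_last_7 = cur[100] - cur[93]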
def generate_push_data(self, pushes: List[mozci.push.Push], granularity: str) -> None:
    # We keep in the cache the fact that we failed to analyze a push for 10
    # days, so if we re-run often we don't retry the same pushes many times.
    MISSING_CACHE_RETENTION = 10 * 24 * 60

    from_date = get_from_date(granularity)

    pushes = [push for push in pushes if datetime.utcfromtimestamp(push.date) >= from_date]

    if granularity == "label":
        push_data_db = test_scheduling.PUSH_DATA_LABEL_DB
    elif granularity == "group":
        push_data_db = test_scheduling.PUSH_DATA_GROUP_DB
    elif granularity == "config_group":
        push_data_db = test_scheduling.PUSH_DATA_CONFIG_GROUP_DB

    def cache_key(push: mozci.push.Push) -> str:
        return f"push_data.{granularity}.{push.rev}"

    def generate(executor) -> Generator[PushResult, None, None]:
        num_cached = 0
        num_pushes = len(pushes)

        # Regenerating a large amount of data when we update the mozci regression detection
        # algorithm is currently pretty slow, so we only regenerate 1000 pushes whenever we
        # run.
        to_regenerate = 1000

        semaphore = threading.BoundedSemaphore(256)

        def retrieve_from_cache(push):
            semaphore.acquire()
            return adr.config.cache.get(cache_key(push))

        futures = tuple(executor.submit(retrieve_from_cache, push) for push in pushes)

        for push, future in zip(tqdm(pushes), futures):
            exc = future.exception()
            if exc is not None:
                logger.info(f"Exception {exc} while getting {push.rev}")
                for f in futures:
                    f.cancel()

            cached = future.result()

            semaphore.release()

            if cached and to_regenerate > 0:
                value, mozci_version = cached

                # Regenerate results which were generated when we were not cleaning
                # up WPT groups.
                if any(runnable.startswith("/") for runnable in value[1]):
                    cached = None
                    to_regenerate -= 1

                """# Regenerate results which were generated with an older version of mozci.
                elif mozci_version != MOZCI_VERSION and to_regenerate > 0:
                    cached = None
                    to_regenerate -= 1"""

            if cached is not None:
                num_cached += 1

            if cached:
                value, mozci_version = cached
                yield value
            else:
                logger.info(f"Analyzing {push.rev} at the {granularity} level...")

                key = cache_key(push)

                try:
                    if granularity == "label":
                        runnables = push.task_labels
                    elif granularity == "group":
                        runnables = push.group_summaries.keys()
                    elif granularity == "config_group":
                        runnables = push.config_group_summaries.keys()

                    value = (
                        push.revs,
                        tuple(runnables),
                        tuple(push.get_possible_regressions(granularity)),
                        tuple(push.get_likely_regressions(granularity)),
                    )
                    adr.config.cache.put(
                        key,
                        (value, MOZCI_VERSION),
                        adr.config["cache"]["retention"],
                    )
                    yield value
                except adr.errors.MissingDataError:
                    logger.warning(f"Tasks for push {push.rev} can't be found on ActiveData")
                    adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)
                except Exception:
                    traceback.print_exc()
                    adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)

        logger.info(f"{num_cached} pushes were already cached out of {num_pushes}")

    with concurrent.futures.ThreadPoolExecutor() as executor:
        db.write(push_data_db, generate(executor))

    zstd_compress(push_data_db)
def generate_test_scheduling_history(self): if not os.path.exists("push_data.json"): download_check_etag(PUSH_DATA_URL, "push_data.json.zst") zstd_decompress("push_data.json") assert os.path.exists( "push_data.json"), "Decompressed push data file exists" # Get the commits DB. if db.is_old_version( repository.COMMITS_DB) or not db.exists(repository.COMMITS_DB): db.download(repository.COMMITS_DB, force=True) HISTORY_DATE_START = datetime.now() - relativedelta( months=TRAINING_MONTHS) HISTORICAL_TIMESPAN = 56 if not db.is_old_version(test_scheduling.TEST_SCHEDULING_DB): db.download(test_scheduling.TEST_SCHEDULING_DB, support_files_too=True) for test_data in test_scheduling.get_test_scheduling_history(): pass last_node = test_data["revs"][0] else: last_node = None past_failures = shelve.Shelf( LMDBDict("data/past_failures.lmdb"), protocol=pickle.HIGHEST_PROTOCOL, writeback=True, ) push_num = past_failures[ "push_num"] if "push_num" in past_failures else 0 def get_and_update_past_failures(type_, task, items, push_num, is_regression): values_total = [] values_prev_7 = [] values_prev_14 = [] values_prev_28 = [] values_prev_56 = [] key = f"{type_}${task}$" for item in items: full_key = key + item if full_key not in past_failures: cur = past_failures[full_key] = ExpQueue( push_num, HISTORICAL_TIMESPAN + 1, 0) else: cur = past_failures[full_key] value = cur[push_num] values_total.append(value) values_prev_7.append(value - cur[push_num - 7]) values_prev_14.append(value - cur[push_num - 14]) values_prev_28.append(value - cur[push_num - 28]) values_prev_56.append(value - cur[push_num - 56]) if is_regression: cur[push_num] = value + 1 return ( sum(values_total), sum(values_prev_7), sum(values_prev_14), sum(values_prev_28), sum(values_prev_56), ) def generate_data(): nonlocal push_num saved_nodes = set() skipped_no_commits = 0 skipped_too_big_commits = 0 skipped_no_tasks = 0 # We can start once we get to the last revision we added in the previous run. can_start = True if last_node is None else False commit_map = {} for commit_data in tqdm(repository.get_commits()): if not can_start: if last_node == commit_data["node"]: can_start = True continue commit_map[commit_data["node"]] = commit_data with open("push_data.json", "r") as f: push_data = json.load(f)[1:] logger.info(f"push data nodes: {len(push_data)}") # In the last 28 pushes, we definitely run all possible tasks. all_tasks_set = set( sum((push_tasks for _, push_tasks, _, _ in push_data[-28:]), [])) # Filter tasks we don't need. all_tasks = filter_tasks(list(all_tasks_set), all_tasks_set) all_tasks_set = set(all_tasks) logger.info( f"{len(all_tasks_set)} tasks run in the last 28 pushes") # We can start once we get to the last revision we added in the previous run. can_start = True if last_node is None else False for i in tqdm(range(len(push_data))): ( revisions, push_tasks, possible_regressions, likely_regressions, ) = push_data.pop(0) if not can_start: if last_node == revisions[0]: can_start = True continue push_num += 1 # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them. commits = tuple( commit_map.pop(revision) for revision in revisions if revision in commit_map) if len(commits) == 0: skipped_no_commits += 1 continue merged_commits = commit_features.merge_commits(commits) # XXX: For now, skip commits which are too large. 
# In the future we can either: # - Improve shelve perf and go back to consider all files; # - Consider only files which appear with a given frequency, like the "files" feature in commit_features; # - Keep a limit of number of files. if len(merged_commits["files"]) > 20: skipped_too_big_commits += 1 continue # If we considered all_tasks, we'd generate a huge amount of data. # So we consider only the tasks which run in this push, and the possible and likely regressions # from this push. tasks_to_consider = list( set(push_tasks + possible_regressions + likely_regressions)) tasks_to_consider = filter_tasks(tasks_to_consider, all_tasks_set) if len(tasks_to_consider) == 0: skipped_no_tasks += 1 continue # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!). if i % 250 == 0: past_failures.sync() pushdate = dateutil.parser.parse(merged_commits["pushdate"]) for task in tasks_to_consider: is_regression = (task in possible_regressions or task in likely_regressions) ( total_failures, past_7_pushes_failures, past_14_pushes_failures, past_28_pushes_failures, past_56_pushes_failures, ) = get_and_update_past_failures("all", task, ["all"], push_num, is_regression) ( total_types_failures, past_7_pushes_types_failures, past_14_pushes_types_failures, past_28_pushes_types_failures, past_56_pushes_types_failures, ) = get_and_update_past_failures("type", task, merged_commits["types"], push_num, is_regression) ( total_files_failures, past_7_pushes_files_failures, past_14_pushes_files_failures, past_28_pushes_files_failures, past_56_pushes_files_failures, ) = get_and_update_past_failures("file", task, merged_commits["files"], push_num, is_regression) ( total_directories_failures, past_7_pushes_directories_failures, past_14_pushes_directories_failures, past_28_pushes_directories_failures, past_56_pushes_directories_failures, ) = get_and_update_past_failures( "directory", task, merged_commits["directories"], push_num, is_regression, ) ( total_components_failures, past_7_pushes_components_failures, past_14_pushes_components_failures, past_28_pushes_components_failures, past_56_pushes_components_failures, ) = get_and_update_past_failures( "component", task, merged_commits["components"], push_num, is_regression, ) if pushdate > HISTORY_DATE_START: saved_nodes.add(i) yield { "revs": revisions, "name": task, "failures": total_failures, "failures_past_7_pushes": past_7_pushes_failures, "failures_past_14_pushes": past_14_pushes_failures, "failures_past_28_pushes": past_28_pushes_failures, "failures_past_56_pushes": past_56_pushes_failures, "failures_in_types": total_types_failures, "failures_past_7_pushes_in_types": past_7_pushes_types_failures, "failures_past_14_pushes_in_types": past_14_pushes_types_failures, "failures_past_28_pushes_in_types": past_28_pushes_types_failures, "failures_past_56_pushes_in_types": past_56_pushes_types_failures, "failures_in_files": total_files_failures, "failures_past_7_pushes_in_files": past_7_pushes_files_failures, "failures_past_14_pushes_in_files": past_14_pushes_files_failures, "failures_past_28_pushes_in_files": past_28_pushes_files_failures, "failures_past_56_pushes_in_files": past_56_pushes_files_failures, "failures_in_directories": total_directories_failures, "failures_past_7_pushes_in_directories": past_7_pushes_directories_failures, "failures_past_14_pushes_in_directories": past_14_pushes_directories_failures, "failures_past_28_pushes_in_directories": past_28_pushes_directories_failures, "failures_past_56_pushes_in_directories": 
past_56_pushes_directories_failures, "failures_in_components": total_components_failures, "failures_past_7_pushes_in_components": past_7_pushes_components_failures, "failures_past_14_pushes_in_components": past_14_pushes_components_failures, "failures_past_28_pushes_in_components": past_28_pushes_components_failures, "failures_past_56_pushes_in_components": past_56_pushes_components_failures, "is_possible_regression": task in possible_regressions, "is_likely_regression": task in likely_regressions, } logger.info(f"saved push data nodes: {len(saved_nodes)}") logger.info(f"skipped {skipped_no_commits} (no commits in our DB)") logger.info(f"skipped {skipped_too_big_commits} (too big commits)") logger.info(f"skipped {skipped_no_tasks} (no interesting tasks)") db.append(test_scheduling.TEST_SCHEDULING_DB, generate_data()) zstd_compress(test_scheduling.TEST_SCHEDULING_DB) past_failures["push_num"] = push_num past_failures.close() with open_tar_zst("data/past_failures.lmdb.tar.zst") as tar: tar.add("data/past_failures.lmdb")
def generate_push_data(self, granularity: str) -> None:
    # We keep in the cache the fact that we failed to analyze a push for 10
    # days, so if we re-run often we don't retry the same pushes many times.
    MISSING_CACHE_RETENTION = 10 * 24 * 60

    # We'll use the past TRAINING_MONTHS months only for training the model,
    # but we use half TRAINING_MONTHS months more than that to calculate the
    # failure statistics.
    from_months = TRAINING_MONTHS[granularity] + math.floor(TRAINING_MONTHS[granularity] / 2)

    # We use the actual date instead of 'today-X' aliases to avoid adr caching
    # this query.
    from_date = datetime.utcnow() - relativedelta(months=from_months)
    to_date = datetime.utcnow() - relativedelta(days=3)

    pushes = mozci.push.make_push_objects(
        from_date=from_date.strftime("%Y-%m-%d"),
        to_date=to_date.strftime("%Y-%m-%d"),
        branch="autoland",
    )

    if granularity == "label":
        push_data_db = test_scheduling.PUSH_DATA_LABEL_DB
    elif granularity == "group":
        push_data_db = test_scheduling.PUSH_DATA_GROUP_DB

    cache: Dict[mozci.push.Push, Tuple[PushResult, int]] = {}

    def cache_key(push: mozci.push.Push) -> str:
        return f"push_data.{granularity}.{push.rev}"

    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_push = {
            executor.submit(lambda push: adr.config.cache.get(cache_key(push)), push): push
            for push in pushes
        }

        for future in tqdm(
            concurrent.futures.as_completed(future_to_push),
            total=len(future_to_push),
        ):
            push = future_to_push[future]

            exc = future.exception()
            if exc is not None:
                logger.info(f"Exception {exc} while getting {push.rev}")
                for f in future_to_push.keys():
                    f.cancel()

            cache[push] = future.result()

    # Regenerating a large amount of data when we update the mozci regression detection
    # algorithm is currently pretty slow, so we only regenerate 1000 pushes whenever we
    # run.
    """to_regenerate = 0
    for push in pushes[::-1]:
        cached = cache[push]
        if not cached:
            continue

        value, mozci_version = cached
        if mozci_version != MOZCI_VERSION and to_regenerate < 1000:
            cache[push] = None
            to_regenerate += 1"""

    to_regenerate = 0
    for push in pushes[::-1]:
        cached = cache[push]
        if not cached:
            continue

        if to_regenerate < 1000:
            del cache[push]
            adr.config.cache.put(push.push_uuid, {}, 0)
            to_regenerate += 1

    def generate() -> Generator[PushResult, None, None]:
        num_cached = 0

        for push in tqdm(pushes):
            key = cache_key(push)

            if push in cache and cache[push] is not None:
                num_cached += 1
                cached = cache[push]
                if cached:
                    value, mozci_version = cached
                    yield value
            else:
                logger.info(f"Analyzing {push.rev} at the {granularity} level...")

                try:
                    if granularity == "label":
                        runnables = push.task_labels
                    elif granularity == "group":
                        runnables = push.group_summaries.keys()

                    value = (
                        push.revs,
                        list(runnables),
                        list(push.get_possible_regressions(granularity)),
                        list(push.get_likely_regressions(granularity)),
                    )
                    adr.config.cache.put(
                        key,
                        (value, MOZCI_VERSION),
                        adr.config["cache"]["retention"],
                    )
                    yield value
                except adr.errors.MissingDataError:
                    logger.warning(f"Tasks for push {push.rev} can't be found on ActiveData")
                    adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)
                except Exception:
                    traceback.print_exc()
                    adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)

        logger.info(f"{num_cached} pushes were already cached out of {len(pushes)}")

    db.write(push_data_db, generate())
    zstd_compress(push_data_db)
def get_commits_to_ignore(self):
    logger.info("Download previous commits to ignore...")
    db.download(IGNORED_COMMITS_DB)

    logger.info("Get previously classified commits...")
    prev_commits_to_ignore = list(db.read(IGNORED_COMMITS_DB))
    logger.info(f"Already found {len(prev_commits_to_ignore)} commits to ignore...")

    # When we already have some analyzed commits, re-analyze the last 3500 to make sure
    # we didn't miss back-outs that happened since the last analysis.
    if len(prev_commits_to_ignore) > 0:
        first_commit_to_reanalyze = -3500 if len(prev_commits_to_ignore) >= 3500 else 0
        rev_start = "children({})".format(
            prev_commits_to_ignore[first_commit_to_reanalyze]["rev"]
        )
    else:
        rev_start = 0

    with hglib.open(self.mercurial_repo_dir) as hg:
        revs = repository.get_revs(hg, rev_start)

    # Drop commits which are not yet present in the mercurial <-> git mapping.
    while len(revs) > 0:
        try:
            vcs_map.mercurial_to_git(revs[-1].decode("ascii"))
            break
        except Exception as e:
            if not str(e).startswith("Missing mercurial commit in the VCS map"):
                raise

            revs.pop()

    commits = repository.hg_log_multi(self.mercurial_repo_dir, revs)

    repository.set_commits_to_ignore(self.mercurial_repo_dir, commits)

    chosen_commits = set()
    commits_to_ignore = []
    for commit in commits:
        if commit.ignored or commit.backedoutby:
            commits_to_ignore.append(
                {
                    "rev": commit.node,
                    "type": "backedout" if commit.backedoutby else "",
                }
            )
            chosen_commits.add(commit.node)

    logger.info(f"{len(commits_to_ignore)} new commits to ignore...")

    for prev_commit in prev_commits_to_ignore[::-1]:
        if prev_commit["rev"] not in chosen_commits:
            commits_to_ignore.append(prev_commit)
            chosen_commits.add(prev_commit["rev"])

    logger.info(f"{len(commits_to_ignore)} commits to ignore...")
    logger.info(
        "...of which {} are backed-out".format(
            sum(1 for commit in commits_to_ignore if commit["type"] == "backedout")
        )
    )

    db.write(IGNORED_COMMITS_DB, commits_to_ignore)
    zstd_compress(IGNORED_COMMITS_DB)
    db.upload(IGNORED_COMMITS_DB)
def retrieve_issues(
    self, owner: str, repo: str, state: str, retrieve_events: bool
) -> None:
    db.download(github.GITHUB_ISSUES_DB)

    github.download_issues(owner, repo, state, retrieve_events)

    zstd_compress(github.GITHUB_ISSUES_DB)
def generate_push_data(self, granularity: str, training_months: int, reretrieve: int) -> None:
    # We'll use the past training_months months only for training the model,
    # but we use half training_months months more than that to calculate the
    # failure statistics.
    from_months = training_months + math.floor(training_months / 2)

    # We use the actual date instead of 'today-X' aliases to avoid mozci caching
    # this query.
    from_date = datetime.utcnow() - relativedelta(months=from_months)
    to_date = datetime.utcnow() - relativedelta(days=3)

    if granularity == "label":
        push_data_db = test_scheduling.PUSH_DATA_LABEL_DB
    elif granularity == "group":
        push_data_db = test_scheduling.PUSH_DATA_GROUP_DB
    elif granularity == "config_group":
        push_data_db = test_scheduling.PUSH_DATA_CONFIG_GROUP_DB

    def cache_key(push: mozci.push.Push) -> str:
        return f"push_data.{granularity}.{push.rev}"

    def generate(
        progress_bar: tqdm,
        pushes: list[mozci.push.Push],
        futures: list[concurrent.futures.Future],
    ) -> Generator[PushResult, None, None]:
        nonlocal reretrieve
        num_cached = 0
        num_pushes = len(pushes)
        num_errors = 0

        for push, future in zip(pushes, futures):
            cached = future.result()

            # Regenerating a large amount of data when we update the mozci regression detection
            # algorithm is currently pretty slow, so we only regenerate a subset of pushes whenever we
            # run.
            if cached:
                value, mozci_version = cached

                # Regenerate results which were generated with an older version of mozci.
                if reretrieve > 0 and mozci_version != MOZCI_VERSION:
                    cached = None
                    reretrieve -= 1

            if cached:
                num_cached += 1
                value, mozci_version = cached
                assert len(value) == 5
                if value != "ERROR":
                    yield value
                else:
                    num_errors += 1
            else:
                logger.info(f"Analyzing {push.rev} at the {granularity} level...")

                key = cache_key(push)

                try:
                    if granularity == "label":
                        runnables = push.label_summaries.keys()
                    elif granularity == "group":
                        runnables = push.group_summaries.keys()
                    elif granularity == "config_group":
                        runnables = push.config_group_summaries.keys()

                    value = (
                        tuple(push.revs),
                        push.backedoutby or push.bustage_fixed_by,
                        tuple(runnables),
                        tuple(push.get_possible_regressions(granularity)),
                        tuple(push.get_likely_regressions(granularity)),
                    )
                    mozci.config.cache.put(
                        key,
                        (value, MOZCI_VERSION),
                        mozci.config["cache"]["retention"],
                    )
                    assert len(value) == 5
                    yield value
                except mozci.errors.MissingDataError:
                    logger.warning(f"Tasks for push {push.rev} can't be found on ActiveData")
                except Exception:
                    num_errors += 1
                    traceback.print_exc()
                    mozci.config.cache.put(
                        key,
                        ("ERROR", MOZCI_VERSION),
                        mozci.config["cache"]["retention"],
                    )

            progress_bar.update(1)

        logger.info(f"{num_cached} pushes were already cached out of {num_pushes}")
        logger.info(f"There were errors in {num_errors} pushes")

    def retrieve_from_cache(push):
        return mozci.config.cache.get(cache_key(push))

    total_pushes = len(
        mozci.push.make_push_objects(
            from_date=from_date.strftime("%Y-%m-%d"),
            to_date=to_date.strftime("%Y-%m-%d"),
            branch="autoland",
        )
    )

    with concurrent.futures.ThreadPoolExecutor() as executor:
        with tqdm(total=total_pushes) as progress_bar:
            # Run in batches of 7 days to avoid running out of memory (given that mozci pushes
            # consume a lot of memory, and they all have references to each other through "parent"
            # and "child" links so they are basically never released while we run this).
            while from_date < to_date:
                next_from_date = from_date + relativedelta(days=7)
                if next_from_date > to_date:
                    next_from_date = to_date

                pushes = mozci.push.make_push_objects(
                    from_date=from_date.strftime("%Y-%m-%d"),
                    to_date=next_from_date.strftime("%Y-%m-%d"),
                    branch="autoland",
                )

                futures = [executor.submit(retrieve_from_cache, push) for push in pushes]

                try:
                    db.append(push_data_db, generate(progress_bar, pushes, futures))
                except Exception:
                    for f in futures:
                        f.cancel()

                    raise

                from_date = next_from_date

    zstd_compress(push_data_db)
def retrieve_bugs(self, limit=None): bugzilla.set_token(get_secret("BUGZILLA_TOKEN")) db.download(bugzilla.BUGS_DB) # Get IDs of bugs changed since last run. last_modified = db.last_modified(bugzilla.BUGS_DB) logger.info( f"Retrieving IDs of bugs modified since the last run on {last_modified}" ) changed_ids = bugzilla.get_ids({ "f1": "delta_ts", "o1": "greaterthaneq", "v1": last_modified.date() }) logger.info(f"Retrieved {len(changed_ids)} IDs.") # Get IDs of bugs between (two years and six months ago) and (six months ago). six_months_ago = datetime.utcnow() - relativedelta(months=6) two_years_and_six_months_ago = six_months_ago - relativedelta(years=2) logger.info( f"Retrieving bug IDs from {two_years_and_six_months_ago} to {six_months_ago}" ) timespan_ids = bugzilla.get_ids_between(two_years_and_six_months_ago, six_months_ago) if limit: timespan_ids = timespan_ids[:limit] logger.info(f"Retrieved {len(timespan_ids)} IDs.") # Get IDs of labelled bugs. labelled_bug_ids = labels.get_all_bug_ids() if limit: labelled_bug_ids = labelled_bug_ids[:limit] logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.") # Get the commits DB, as we need it to get the bug IDs linked to recent commits. # XXX: Temporarily avoid downloading the commits DB when a limit is set, to avoid the integration test fail when the commits DB is bumped. if limit is None: assert db.download(repository.COMMITS_DB) # Get IDs of bugs linked to commits (used for some commit-based models, e.g. backout and regressor). start_date = datetime.now() - relativedelta(years=2, months=6) commit_bug_ids = [ commit["bug_id"] for commit in repository.get_commits() if commit["bug_id"] and dateutil.parser.parse(commit["pushdate"]) >= start_date ] if limit: commit_bug_ids = commit_bug_ids[-limit:] logger.info( f"{len(commit_bug_ids)} bugs linked to commits to download.") # Get IDs of bugs which caused regressions fixed by commits (useful for the regressor model). regressed_by_bug_ids = sum( [ bug["regressed_by"] for bug in bugzilla.get_bugs() if bug["id"] in commit_bug_ids ], [], ) if limit: regressed_by_bug_ids = regressed_by_bug_ids[-limit:] logger.info( f"{len(regressed_by_bug_ids)} bugs which caused regressions fixed by commits." ) all_ids = (timespan_ids + labelled_bug_ids + commit_bug_ids + regressed_by_bug_ids) all_ids_set = set(all_ids) # We have to redownload bugs that were changed since the last download. # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled. bugzilla.delete_bugs(lambda bug: bug["id"] in changed_ids or bug["id"] not in all_ids_set) bugzilla.download_bugs(all_ids) # Get regressed_by_bug_ids again (the set could have changed after downloading new bugs). regressed_by_bug_ids = sum( [ bug["regressed_by"] for bug in bugzilla.get_bugs() if bug["id"] in commit_bug_ids ], [], ) logger.info( f"{len(regressed_by_bug_ids)} bugs which caused regressions fixed by commits." ) bugzilla.download_bugs(regressed_by_bug_ids) # Try to re-download inconsistent bugs, up to three times. 
        inconsistent_bugs = bugzilla.get_bugs(include_invalid=True)
        for i in range(3):
            # We look for inconsistencies in all bugs first, then, on following passes,
            # we only look for inconsistencies in bugs that were found to be inconsistent in the first pass
            inconsistent_bugs = bug_snapshot.get_inconsistencies(inconsistent_bugs)
            inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

            if len(inconsistent_bug_ids) == 0:
                break

            logger.info(
                f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
            )
            bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
            bugzilla.download_bugs(inconsistent_bug_ids)

        zstd_compress("data/bugs.json")
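# A rough sketch of the bounded "re-download until consistent" loop above. The
# find_inconsistent and redownload callables are hypothetical stand-ins for
# bug_snapshot.get_inconsistencies and bugzilla.download_bugs.
def redownload_inconsistent(records, find_inconsistent, redownload, max_passes=3):
    """Re-fetch records that look inconsistent, at most max_passes times."""
    suspects = records
    for _ in range(max_passes):
        suspects = find_inconsistent(suspects)
        if not suspects:
            break
        # Re-fetch only the inconsistent records and re-check them on the next pass.
        suspects = redownload({record["id"] for record in suspects})
    return suspects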
def go(self) -> None: logger.info( "Generate map of bug ID -> bug data for all bugs which were defects" ) bug_fixing_commits = list(db.read(BUG_FIXING_COMMITS_DB)) bug_fixing_commits_nodes = set( bug_fixing_commit["rev"] for bug_fixing_commit in bug_fixing_commits if bug_fixing_commit["type"] in ("d", "r")) logger.info( f"{len(bug_fixing_commits_nodes)} bug-fixing commits to analyze") all_bug_ids = set(commit["bug_id"] for commit in repository.get_commits()) bug_map = { bug["id"]: bug for bug in bugzilla.get_bugs() if bug["id"] in all_bug_ids } logger.info( "Generate a map from files/functions to the bugs which were fixed/introduced by touching them" ) # TODO: Support "moving" past bugs between files when they are renamed and between functions when they are # moved across files. by_dimensions = ["file", "directory", "component"] def dimension_to_field(dimension: str) -> str: return f"{dimension}s" if dimension != "directory" else "directories" past_regressions_by: dict[str, dict[str, list[int]]] = defaultdict( lambda: defaultdict(list)) past_fixed_bugs_by: dict[str, dict[str, list[int]]] = defaultdict( lambda: defaultdict(list)) past_regression_blocked_bugs_by: dict[str, dict[ str, list[int]]] = defaultdict(lambda: defaultdict(list)) past_fixed_bug_blocked_bugs_by: dict[str, dict[ str, list[int]]] = defaultdict(lambda: defaultdict(list)) past_regressions_by_function: dict[str, dict[ str, list[int]]] = defaultdict(lambda: defaultdict(list)) past_fixed_bugs_by_function: dict[str, dict[ str, list[int]]] = defaultdict(lambda: defaultdict(list)) past_regression_blocked_bugs_by_function: dict[str, dict[ str, list[int]]] = defaultdict(lambda: defaultdict(list)) past_fixed_bug_blocked_bugs_by_function: dict[str, dict[ str, list[int]]] = defaultdict(lambda: defaultdict(list)) for commit in tqdm(repository.get_commits()): if commit["bug_id"] not in bug_map: continue if commit["backedoutby"]: continue bug = bug_map[commit["bug_id"]] if len(bug["regressions"]) > 0: for dimension in by_dimensions: for path in commit[dimension_to_field(dimension)]: past_regressions_by[dimension][path].extend( bug_id for bug_id in bug["regressions"] if bug_id in bug_map) past_regression_blocked_bugs_by[dimension][ path].extend(bugzilla.find_blocked_by( bug_map, bug)) for path, f_group in commit["functions"].items(): for f in f_group: past_regressions_by_function[path][f["name"]].extend( bug_id for bug_id in bug["regressions"] if bug_id in bug_map) past_regression_blocked_bugs_by_function[path][ f["name"]].extend( bugzilla.find_blocked_by(bug_map, bug)) if commit["node"] in bug_fixing_commits_nodes: for dimension in by_dimensions: for path in commit[dimension_to_field(dimension)]: past_fixed_bugs_by[dimension][path].append(bug["id"]) past_fixed_bug_blocked_bugs_by[dimension][path].extend( bugzilla.find_blocked_by(bug_map, bug)) for path, f_group in commit["functions"].items(): for f in f_group: past_fixed_bugs_by_function[path][f["name"]].append( bug["id"]) past_fixed_bug_blocked_bugs_by_function[path][ f["name"]].extend( bugzilla.find_blocked_by(bug_map, bug)) def _transform(bug_ids: list[int]) -> list[dict]: seen = set() results = [] for bug_id in bug_ids: if bug_id in seen: continue seen.add(bug_id) bug = bug_map[bug_id] results.append({ "id": bug_id, "summary": bug["summary"], "component": "{}::{}".format(bug["product"], bug["component"]), }) return results def past_bug_ids_to_summaries( past_bugs_by: dict[str, list[int]]) -> dict[str, list[dict]]: return { path: _transform(bug_ids) for path, bug_ids in 
past_bugs_by.items() } for dimension in by_dimensions: with open(f"data/past_regressions_by_{dimension}.json", "w") as f: json.dump( past_bug_ids_to_summaries(past_regressions_by[dimension]), f) zstd_compress(f"data/past_regressions_by_{dimension}.json") with open(f"data/past_fixed_bugs_by_{dimension}.json", "w") as f: json.dump( past_bug_ids_to_summaries(past_fixed_bugs_by[dimension]), f) zstd_compress(f"data/past_fixed_bugs_by_{dimension}.json") with open(f"data/past_regression_blocked_bugs_by_{dimension}.json", "w") as f: json.dump( past_bug_ids_to_summaries( past_regression_blocked_bugs_by[dimension]), f, ) zstd_compress( f"data/past_regression_blocked_bugs_by_{dimension}.json") with open(f"data/past_fixed_bug_blocked_bugs_by_{dimension}.json", "w") as f: json.dump( past_bug_ids_to_summaries( past_fixed_bug_blocked_bugs_by[dimension]), f, ) zstd_compress( f"data/past_fixed_bug_blocked_bugs_by_{dimension}.json") def past_function_bug_ids_to_summaries( past_bugs: dict[str, dict[str, list[int]]] ) -> dict[str, dict[str, list[dict]]]: return { path: { func: _transform(bug_ids) for func, bug_ids in funcs_bugs.items() } for path, funcs_bugs in past_bugs.items() } with open("data/past_regressions_by_function.json", "w") as f: json.dump( past_function_bug_ids_to_summaries( past_regressions_by_function), f) zstd_compress("data/past_regressions_by_function.json") with open("data/past_fixed_bugs_by_function.json", "w") as f: json.dump( past_function_bug_ids_to_summaries( past_fixed_bugs_by_function), f) zstd_compress("data/past_fixed_bugs_by_function.json") with open("data/past_regression_blocked_bugs_by_function.json", "w") as f: json.dump( past_function_bug_ids_to_summaries( past_regression_blocked_bugs_by_function), f, ) zstd_compress("data/past_regression_blocked_bugs_by_function.json") with open("data/past_fixed_bug_blocked_bugs_by_function.json", "w") as f: json.dump( past_function_bug_ids_to_summaries( past_fixed_bug_blocked_bugs_by_function), f, ) zstd_compress("data/past_fixed_bug_blocked_bugs_by_function.json")
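# A toy end-to-end illustration of the per-dimension accumulation and dump above:
# bug IDs are collected into nested defaultdicts keyed by dimension and path, then
# deduplicated (preserving order), expanded into summaries, and written to one JSON
# file per dimension. File names and the bug_map contents are made up for the example.
import json
from collections import defaultdict

bug_map = {
    1: {"summary": "crash on startup", "product": "Core", "component": "DOM"},
    2: {"summary": "memory leak", "product": "Core", "component": "DOM"},
}

past_regressions_by = defaultdict(lambda: defaultdict(list))
past_regressions_by["file"]["dom/base/Element.cpp"].extend([1, 2, 1])


def to_summaries(bug_ids):
    seen = set()
    out = []
    for bug_id in bug_ids:
        if bug_id in seen:
            continue
        seen.add(bug_id)
        bug = bug_map[bug_id]
        out.append(
            {
                "id": bug_id,
                "summary": bug["summary"],
                "component": "{}::{}".format(bug["product"], bug["component"]),
            }
        )
    return out


for dimension, by_path in past_regressions_by.items():
    with open(f"example_past_regressions_by_{dimension}.json", "w") as f:
        json.dump({path: to_summaries(ids) for path, ids in by_path.items()}, f)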
def find_bug_fixing_commits(self): logger.info("Downloading commits database...") assert db.download(repository.COMMITS_DB) logger.info("Downloading bugs database...") assert db.download(bugzilla.BUGS_DB) logger.info("Download previous classifications...") db.download(BUG_FIXING_COMMITS_DB) logger.info("Get previously classified commits...") prev_bug_fixing_commits = list(db.read(BUG_FIXING_COMMITS_DB)) prev_bug_fixing_commits_nodes = set( bug_fixing_commit["rev"] for bug_fixing_commit in prev_bug_fixing_commits) logger.info( f"Already classified {len(prev_bug_fixing_commits)} commits...") # TODO: Switch to the pure Defect model, as it's better in this case. logger.info("Downloading defect/enhancement/task model...") defect_model = download_and_load_model("defectenhancementtask") logger.info("Downloading regression model...") regression_model = download_and_load_model("regression") start_date = datetime.now() - RELATIVE_START_DATE end_date = datetime.now() - RELATIVE_END_DATE logger.info( f"Gathering bug IDs associated to commits (since {start_date} and up to {end_date})..." ) commit_map = defaultdict(list) for commit in repository.get_commits(): if commit["node"] in prev_bug_fixing_commits_nodes: continue commit_date = dateutil.parser.parse(commit["pushdate"]) if commit_date < start_date or commit_date > end_date: continue commit_map[commit["bug_id"]].append(commit["node"]) logger.info( f"{sum(len(commit_list) for commit_list in commit_map.values())} commits found, {len(commit_map)} bugs linked to commits" ) assert len(commit_map) > 0 def get_relevant_bugs(): return (bug for bug in bugzilla.get_bugs() if bug["id"] in commit_map) bug_count = sum(1 for bug in get_relevant_bugs()) logger.info( f"{bug_count} bugs in total, {len(commit_map) - bug_count} bugs linked to commits missing" ) known_defect_labels = defect_model.get_labels() known_regression_labels = regression_model.get_labels() bug_fixing_commits = [] def append_bug_fixing_commits(bug_id, type_): for commit in commit_map[bug_id]: bug_fixing_commits.append({"rev": commit, "type": type_}) for bug in tqdm(get_relevant_bugs(), total=bug_count): # Ignore bugs which are not linked to the commits we care about. if bug["id"] not in commit_map: continue # If we know the label already, we don't need to apply the model. if (bug["id"] in known_regression_labels and known_regression_labels[bug["id"]] == 1): append_bug_fixing_commits(bug["id"], "r") continue if bug["id"] in known_defect_labels: if known_defect_labels[bug["id"]] == "defect": append_bug_fixing_commits(bug["id"], "d") else: append_bug_fixing_commits(bug["id"], "e") continue if defect_model.classify(bug)[0] == "defect": if regression_model.classify(bug)[0] == 1: append_bug_fixing_commits(bug["id"], "r") else: append_bug_fixing_commits(bug["id"], "d") else: append_bug_fixing_commits(bug["id"], "e") db.append(BUG_FIXING_COMMITS_DB, bug_fixing_commits) zstd_compress(BUG_FIXING_COMMITS_DB) db.upload(BUG_FIXING_COMMITS_DB)
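# A hedged sketch of the label-first, model-fallback decision above: known labels win,
# otherwise the defect and regression models are consulted. The model objects and the
# shape of classify()'s return value are assumptions mirroring the code, not a
# definitive bugbug API.
def bug_fix_type(
    bug, known_defect_labels, known_regression_labels, defect_model, regression_model
):
    """Return "r" (regression fix), "d" (defect fix) or "e" (enhancement/task)."""
    # Known labels win over the models.
    if known_regression_labels.get(bug["id"]) == 1:
        return "r"
    if bug["id"] in known_defect_labels:
        return "d" if known_defect_labels[bug["id"]] == "defect" else "e"
    # Otherwise fall back to the models.
    if defect_model.classify(bug)[0] == "defect":
        return "r" if regression_model.classify(bug)[0] == 1 else "d"
    return "e"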
def generate_push_data(self, runnable): def upload_adr_cache(): cache_path = os.path.splitext(ADR_CACHE_DB)[0] assert os.path.abspath( adr.config["cache"]["stores"]["file"]["path"] ) == os.path.abspath(cache_path) with open_tar_zst(f"{ADR_CACHE_DB}.zst") as tar: tar.add(cache_path) db.upload(ADR_CACHE_DB) # We'll use the past TRAINING_MONTHS months only for training the model, # but we use half TRAINING_MONTHS months more than that to calculate the # failure statistics. from_months = TRAINING_MONTHS[runnable] + math.floor( TRAINING_MONTHS[runnable] / 2 ) pushes = mozci.push.make_push_objects( from_date=f"today-{from_months}month", to_date="today-3day", branch="autoland", ) start_time = time.monotonic() num_cached = 0 push_data = [] for push in tqdm(pushes): key = f"push_data.{runnable}.{push.rev}" logger.info(f"Analyzing {push.rev} at the {runnable} level...") if adr.config.cache.has(key): num_cached += 1 push_data.append(adr.config.cache.get(key)) else: try: if runnable == "label": runnables = push.task_labels elif runnable == "group": runnables = push.group_summaries.keys() value = [ push.revs, list(runnables), list(push.get_possible_regressions(runnable)), list(push.get_likely_regressions(runnable)), ] push_data.append(value) adr.config.cache.forever(key, value) except adr.errors.MissingDataError: logger.warning( f"Tasks for push {push.rev} can't be found on ActiveData" ) except Exception: traceback.print_exc() if time.monotonic() - start_time >= 3600: upload_adr_cache() start_time = time.monotonic() logger.info(f"{num_cached} pushes were already cached out of {len(pushes)}") upload_adr_cache() with open(f"push_data_{runnable}.json", "w") as f: json.dump(push_data, f) zstd_compress(f"push_data_{runnable}.json")
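# A minimal sketch of the "do the work, but flush the cache at most once an hour"
# pattern above, using only the standard library. flush_cache is a hypothetical
# stand-in for upload_adr_cache().
import time


def process_with_periodic_flush(items, handle_item, flush_cache, interval=3600):
    start_time = time.monotonic()
    for item in items:
        handle_item(item)
        if time.monotonic() - start_time >= interval:
            flush_cache()
            start_time = time.monotonic()
    # Always flush once at the end, as the code above does.
    flush_cache()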
def generate_test_scheduling_history(self, granularity: str, training_months: int) -> None: if granularity != "config_group": # Get the commits DB. assert db.download(repository.COMMITS_DB) HISTORY_DATE_START = datetime.now() - relativedelta( months=training_months) if granularity == "label": test_scheduling_db = test_scheduling.TEST_LABEL_SCHEDULING_DB past_failures_db = os.path.join( "data", test_scheduling.PAST_FAILURES_LABEL_DB) failing_together_db = os.path.join( "data", test_scheduling.FAILING_TOGETHER_LABEL_DB) elif granularity == "group": test_scheduling_db = test_scheduling.TEST_GROUP_SCHEDULING_DB past_failures_db = os.path.join( "data", test_scheduling.PAST_FAILURES_GROUP_DB) touched_together_db = os.path.join( "data", test_scheduling.TOUCHED_TOGETHER_DB) elif granularity == "config_group": test_scheduling_db = test_scheduling.TEST_CONFIG_GROUP_SCHEDULING_DB past_failures_db = os.path.join( "data", test_scheduling.PAST_FAILURES_CONFIG_GROUP_DB) failing_together_db = os.path.join( "data", test_scheduling.FAILING_TOGETHER_CONFIG_GROUP_DB) push_data_iter, push_data_count, all_runnables = test_scheduling.get_push_data( granularity) if granularity in ("label", "config_group"): test_scheduling.generate_failing_together_probabilities( granularity, push_data_iter(), push_data_count) def generate_all_data() -> Generator[Dict[str, Any], None, None]: past_failures = test_scheduling.get_past_failures( granularity, False) push_num = past_failures[ "push_num"] if "push_num" in past_failures else 0 commit_map = {} for commit_data in tqdm(repository.get_commits()): commit_map[commit_data["node"]] = commit_data # Store all runnables in the past_failures DB so it can be used in the evaluation phase. past_failures["all_runnables"] = all_runnables # XXX: Should we recreate the DB from scratch if the previous all_runnables are not the # same as the current ones? saved_nodes = set() skipped_no_commits = 0 skipped_too_big_commits = 0 skipped_no_runnables = 0 if granularity in ("group", "config_group"): update_touched_together_gen = test_scheduling.update_touched_together( ) next(update_touched_together_gen) for ( i, ( revisions, fix_revision, push_runnables, possible_regressions, likely_regressions, ), ) in enumerate(tqdm(push_data_iter(), total=push_data_count)): push_num += 1 # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them. commits = tuple( commit_map.pop(revision) for revision in revisions if revision in commit_map) if len(commits) == 0: skipped_no_commits += 1 continue merged_commits = commit_features.merge_commits(commits) # XXX: For now, skip commits which are too large. # In the future we can either: # - Improve shelve perf and go back to consider all files; # - Consider only files which appear with a given frequency, like the "files" feature in commit_features; # - Keep a limit of number of files. if len(merged_commits["files"]) > 50: skipped_too_big_commits += 1 continue # If we considered all_runnables, we'd generate a huge amount of data. # We consider only the runnables which run in this push, and the possible and likely regressions # from this push. We can't consider all runnables because we can't be sure that a task that didn't # run on a push would have been successful. runnables_to_consider = list( set(push_runnables + possible_regressions + likely_regressions)) if len(runnables_to_consider) == 0: skipped_no_runnables += 1 continue # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!). 
if i % 250 == 0: past_failures.sync() pushdate = dateutil.parser.parse(merged_commits["pushdate"]) if granularity in ("group", "config_group"): update_touched_together_gen.send(commits[0]["node"]) result_data = [] for data in test_scheduling.generate_data( granularity, past_failures, merged_commits, push_num, runnables_to_consider, possible_regressions, likely_regressions, ): if pushdate > HISTORY_DATE_START: result_data.append(data) if pushdate > HISTORY_DATE_START: saved_nodes.add(i) yield { "revs": revisions, "data": result_data, } if granularity == "group": try: update_touched_together_gen.send(None) except StopIteration: pass logger.info(f"saved push data nodes: {len(saved_nodes)}") logger.info(f"skipped {skipped_no_commits} (no commits in our DB)") logger.info(f"skipped {skipped_too_big_commits} (too big commits)") logger.info( f"skipped {skipped_no_runnables} (no interesting runnables)") past_failures["push_num"] = push_num past_failures.close() # For the config/group granularity, we are only interested in the failing together DB. if granularity != "config_group": db.append(test_scheduling_db, generate_all_data()) zstd_compress(test_scheduling_db) create_tar_zst(past_failures_db) if granularity == "group": create_tar_zst(touched_together_db) if granularity in ("label", "config_group"): create_tar_zst(failing_together_db)
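# A toy sketch of the periodic shelve sync above: flushing every N writes keeps the
# writeback cache from growing without bound. The file name and payload are made up;
# the 250-item interval mirrors the code.
import shelve


def write_with_periodic_sync(path, items, sync_every=250):
    # writeback=True keeps written entries in an in-memory cache; sync() flushes
    # that cache to disk and empties it, which is what keeps memory bounded.
    db = shelve.open(path, writeback=True)
    try:
        for i, (key, value) in enumerate(items):
            db[key] = value
            if i % sync_every == 0:
                db.sync()
    finally:
        db.close()


if __name__ == "__main__":
    write_with_periodic_sync("past_failures_demo", ((str(i), i * i) for i in range(1000)))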
def compress_and_upload() -> None:
    utils.zstd_compress(SHADOW_SCHEDULER_STATS_DB)
    db.upload(SHADOW_SCHEDULER_STATS_DB)
def retrieve_bugs(self, limit: int = None) -> None: bugzilla.set_token(get_secret("BUGZILLA_TOKEN")) db.download(bugzilla.BUGS_DB) # Get IDs of bugs changed since last run. last_modified = db.last_modified(bugzilla.BUGS_DB) logger.info( f"Retrieving IDs of bugs modified since the last run on {last_modified}" ) changed_ids = set( bugzilla.get_ids({ "f1": "delta_ts", "o1": "greaterthaneq", "v1": last_modified.date() })) logger.info(f"Retrieved {len(changed_ids)} IDs.") all_components = bugzilla.get_product_component_count(9999) deleted_component_ids = set( bug["id"] for bug in bugzilla.get_bugs() if "{}::{}".format( bug["product"], bug["component"]) not in all_components) logger.info( f"{len(deleted_component_ids)} bugs belonging to deleted components" ) changed_ids |= deleted_component_ids # Get IDs of bugs between (two years and six months ago) and now. two_years_and_six_months_ago = datetime.utcnow() - relativedelta( years=2, months=6) logger.info(f"Retrieving bug IDs since {two_years_and_six_months_ago}") timespan_ids = bugzilla.get_ids_between(two_years_and_six_months_ago) if limit: timespan_ids = timespan_ids[-limit:] logger.info(f"Retrieved {len(timespan_ids)} IDs.") # Get IDs of labelled bugs. labelled_bug_ids = labels.get_all_bug_ids() if limit: labelled_bug_ids = labelled_bug_ids[-limit:] logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.") # Get the commits DB, as we need it to get the bug IDs linked to recent commits. # XXX: Temporarily avoid downloading the commits DB when a limit is set, to avoid the integration test fail when the commits DB is bumped. if limit is None: assert db.download(repository.COMMITS_DB) # Get IDs of bugs linked to commits (used for some commit-based models, e.g. backout and regressor). start_date = datetime.now() - relativedelta(years=3) commit_bug_ids = list( set(commit["bug_id"] for commit in repository.get_commits() if commit["bug_id"] and dateutil.parser.parse(commit["pushdate"]) >= start_date)) if limit: commit_bug_ids = commit_bug_ids[-limit:] logger.info( f"{len(commit_bug_ids)} bugs linked to commits to download.") # Get IDs of bugs which are regressions, bugs which caused regressions (useful for the regressor model), # and blocked bugs. regression_related_ids: List[int] = list( set( sum( (bug["regressed_by"] + bug["regressions"] + bug["blocks"] for bug in bugzilla.get_bugs()), [], ))) if limit: regression_related_ids = regression_related_ids[-limit:] logger.info( f"{len(regression_related_ids)} bugs which caused regressions fixed by commits." ) # Get IDs of bugs linked to intermittent failures. test_failure_bug_ids = [ item["bug_id"] for item in test_scheduling.get_failure_bugs( two_years_and_six_months_ago, datetime.utcnow()) ] if limit: test_failure_bug_ids = test_failure_bug_ids[-limit:] logger.info(f"{len(test_failure_bug_ids)} bugs about test failures.") all_ids = (timespan_ids + labelled_bug_ids + commit_bug_ids + regression_related_ids + test_failure_bug_ids) all_ids_set = set(all_ids) # We have to redownload bugs that were changed since the last download. # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled. bugzilla.delete_bugs(lambda bug: bug["id"] in changed_ids or bug["id"] not in all_ids_set) new_bugs = bugzilla.download_bugs(all_ids) # Get regression_related_ids again (the set could have changed after downloading new bugs). 
        for i in range(7):
            regression_related_ids = list(
                set(
                    sum(
                        (
                            bug["regressed_by"] + bug["regressions"] + bug["blocks"]
                            for bug in new_bugs
                        ),
                        [],
                    )
                )
            )
            logger.info(
                f"{len(regression_related_ids)} bugs which caused regressions fixed by commits."
            )
            if limit:
                regression_related_ids = regression_related_ids[-limit:]

            # If we got all bugs we needed, break.
            if set(regression_related_ids).issubset(all_ids):
                break

            new_bugs = bugzilla.download_bugs(regression_related_ids)

        # Try to re-download inconsistent bugs, up to twice.
        inconsistent_bugs = bugzilla.get_bugs(include_invalid=True)
        for i in range(2):
            # We look for inconsistencies in all bugs first, then, on following passes,
            # we only look for inconsistencies in bugs that were found to be inconsistent in the first pass
            inconsistent_bugs = bug_snapshot.get_inconsistencies(inconsistent_bugs)
            inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

            if len(inconsistent_bug_ids) == 0:
                break

            logger.info(
                f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
            )
            bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
            bugzilla.download_bugs(inconsistent_bug_ids)

        # TODO: Figure out why.
        missing_history_bug_ids = {
            bug["id"] for bug in bugzilla.get_bugs() if "history" not in bug
        }

        bugzilla.delete_bugs(lambda bug: bug["id"] in missing_history_bug_ids)

        logger.info(
            f"Deleted {len(missing_history_bug_ids)} bugs as we couldn't retrieve their history"
        )

        zstd_compress(bugzilla.BUGS_DB)
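# A condensed sketch of the bounded expansion above: keep fetching bugs referenced by
# the bugs already downloaded (regressed_by / regressions / blocks) until no new IDs
# appear or the pass limit is reached. download is a hypothetical callable returning
# the newly fetched bug dicts; unlike the code above, this version also folds the new
# IDs into the accumulated set.
def expand_related_bugs(new_bugs, all_ids, download, max_passes=7):
    """Grow all_ids with bug IDs referenced by freshly downloaded bugs."""
    for _ in range(max_passes):
        related_ids = set(
            sum(
                (
                    bug["regressed_by"] + bug["regressions"] + bug["blocks"]
                    for bug in new_bugs
                ),
                [],
            )
        )
        if related_ids.issubset(all_ids):
            break
        all_ids |= related_ids
        new_bugs = download(related_ids)
    return all_ids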
def retrieve_test_scheduling_history(self): os.makedirs("data", exist_ok=True) # Download previous cache. cache_path = os.path.abspath("data/adr_cache") if not os.path.exists(cache_path): try: download_check_etag(URL, "adr_cache.tar.xz") with tarfile.open("adr_cache.tar.xz", "r:xz") as tar: tar.extractall() assert os.path.exists( "data/adr_cache"), "Decompressed adr cache exists" except requests.exceptions.HTTPError: logger.info("The adr cache is not available yet") # Setup adr cache configuration. os.makedirs(os.path.expanduser("~/.config/adr"), exist_ok=True) with open(os.path.expanduser("~/.config/adr/config.toml"), "w") as f: f.write(f"""[adr.cache.stores] file = {{ driver = "file", path = "{cache_path}" }} """) # Get the commits DB. if db.is_old_version( repository.COMMITS_DB) or not db.exists(repository.COMMITS_DB): db.download(repository.COMMITS_DB, force=True) # We'll use the past TRAINING_MONTHS months only for training the model, # but we use 3 months more than that to calculate the failure statistics. subprocess.run( [ "run-adr", "ahal/ci-recipes", "recipe", "-o", os.path.abspath("push_data.json"), "-f", "json", "push_data", "--", "--from", f"today-{TRAINING_MONTHS + 3}month", "--to", "today-2day", "--branch", "autoland", ], check=True, stdout=subprocess. DEVNULL, # Redirect to /dev/null, as the logs are too big otherwise. ) HISTORY_DATE_START = datetime.now() - relativedelta( months=TRAINING_MONTHS) with open("push_data.json", "r") as f: data = json.load(f) push_data = {} for row in data[1:]: # Revision -> (all tasks, possible regressions, likely regressions) push_data[row[0]] = (row[1], row[2], row[3]) HISTORICAL_TIMESPAN = 56 past_failures = {} def get_past_failures(task, push_num): if task not in past_failures: past_failures[task] = repository.exp_queue( push_num, HISTORICAL_TIMESPAN + 1, 0) return past_failures[task][push_num] def generate_data(): commits_with_data = set() saved_nodes = set() push_num = 0 for commit_data in tqdm(repository.get_commits()): node = commit_data["node"] if node not in push_data: continue commits_with_data.add(node) commit_push_data = push_data[node] for task in commit_push_data[0]: if not any(task.startswith(j) for j in JOBS_TO_CONSIDER): continue total_failures = get_past_failures(task, push_num) past_7_pushes_failures = total_failures - get_past_failures( task, push_num - 7) past_14_pushes_failures = total_failures - get_past_failures( task, push_num - 14) past_28_pushes_failures = total_failures - get_past_failures( task, push_num - 28) past_56_pushes_failures = total_failures - get_past_failures( task, push_num - 56) pushdate = dateutil.parser.parse(commit_data["pushdate"]) if pushdate > HISTORY_DATE_START: saved_nodes.add(node) yield { "rev": node, "name": task, "failures": total_failures, "failures_past_7_pushes": past_7_pushes_failures, "failures_past_14_pushes": past_14_pushes_failures, "failures_past_28_pushes": past_28_pushes_failures, "failures_past_56_pushes": past_56_pushes_failures, "is_possible_regression": task in commit_push_data[1], "is_likely_regression": task in commit_push_data[2], } if task in commit_push_data[1] or task in commit_push_data[ 2]: past_failures[task][push_num] = total_failures + 1 push_num += 1 logger.info(f"push data nodes: {len(push_data)}") logger.info( f"commits linked to push data: {len(commits_with_data)}") logger.info(f"saved push data nodes: {len(saved_nodes)}") db.write(test_scheduling.TEST_SCHEDULING_DB, generate_data()) zstd_compress(test_scheduling.TEST_SCHEDULING_DB) with 
tarfile.open("data/adr_cache.tar.xz", "w:xz") as tar:
            tar.add("data/adr_cache")
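# A self-contained illustration of the windowed failure counts above: keep a running
# total of failures per task indexed by push number, and derive "failures in the past
# N pushes" as a difference of totals. The real code uses an ExpQueue; a plain list of
# made-up cumulative counts stands in here.
def windowed_failures(totals, push_num, window):
    """totals[i] = cumulative failure count of a task up to push i."""
    current = totals[push_num]
    past = totals[push_num - window] if push_num - window >= 0 else 0
    return current - past


if __name__ == "__main__":
    totals = [0, 0, 1, 1, 2, 3, 3, 3, 4]  # made-up cumulative failure counts
    print(windowed_failures(totals, 8, 7))  # failures in the last 7 pushes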
def generate_test_scheduling_history(self, granularity): push_data_path = f"push_data_{granularity}.json" updated = download_check_etag( test_scheduling.PUSH_DATA_URL.format(granularity=granularity)) if updated: zstd_decompress(push_data_path) os.remove(f"{push_data_path}.zst") assert os.path.exists( push_data_path), "Decompressed push data file exists" # Get the commits DB. assert db.download(repository.COMMITS_DB) HISTORY_DATE_START = datetime.now() - relativedelta( months=TRAINING_MONTHS[granularity]) if granularity == "label": test_scheduling_db = test_scheduling.TEST_LABEL_SCHEDULING_DB past_failures_db = os.path.join( "data", test_scheduling.PAST_FAILURES_LABEL_DB) failing_together_db = os.path.join( "data", test_scheduling.FAILING_TOGETHER_LABEL_DB) elif granularity == "group": test_scheduling_db = test_scheduling.TEST_GROUP_SCHEDULING_DB past_failures_db = os.path.join( "data", test_scheduling.PAST_FAILURES_GROUP_DB) touched_together_db = os.path.join( "data", test_scheduling.TOUCHED_TOGETHER_DB) db.download(test_scheduling_db, support_files_too=True) last_node = None for revs, _ in test_scheduling.get_test_scheduling_history( granularity): last_node = revs[0] def generate_failing_together_probabilities(push_data): # TODO: we should consider the probabilities of `task1 failure -> task2 failure` and # `task2 failure -> task1 failure` separately, as they could be different. count_runs = collections.Counter() count_single_failures = collections.Counter() count_both_failures = collections.Counter() for revisions, tasks, likely_regressions, candidate_regressions in tqdm( push_data): failures = set(likely_regressions + candidate_regressions) all_tasks = list(set(tasks) | failures) for task1, task2 in itertools.combinations( sorted(all_tasks), 2): count_runs[(task1, task2)] += 1 if task1 in failures: if task2 in failures: count_both_failures[(task1, task2)] += 1 else: count_single_failures[(task1, task2)] += 1 elif task2 in failures: count_single_failures[(task1, task2)] += 1 stats = {} skipped = 0 for couple, run_count in count_runs.most_common(): failure_count = count_both_failures[couple] support = failure_count / run_count if support < 1 / 700: skipped += 1 continue if failure_count != 0: confidence = failure_count / ( count_single_failures[couple] + failure_count) else: confidence = 0.0 stats[couple] = (support, confidence) logger.info( f"{skipped} couples skipped because their support was too low") logger.info( "Redundancies with the highest support and confidence:") for couple, (support, confidence) in sorted(stats.items(), key=lambda k: (-k[1][1], -k[1][0]))[:7]: failure_count = count_both_failures[couple] run_count = count_runs[couple] logger.info( f"{couple[0]} - {couple[1]} redundancy confidence {confidence}, support {support} ({failure_count} over {run_count})." ) logger.info( "Redundancies with the highest confidence and lowest support:") for couple, (support, confidence) in sorted(stats.items(), key=lambda k: (-k[1][1], k[1][0]))[:7]: failure_count = count_both_failures[couple] run_count = count_runs[couple] logger.info( f"{couple[0]} - {couple[1]} redundancy confidence {confidence}, support {support} ({failure_count} over {run_count})." 
) failing_together = test_scheduling.get_failing_together_db() count_redundancies = collections.Counter() for couple, (support, confidence) in stats.items(): if confidence == 1.0: count_redundancies["==100%"] += 1 if confidence > 0.9: count_redundancies[">=90%"] += 1 if confidence > 0.8: count_redundancies[">=80%"] += 1 if confidence > 0.7: count_redundancies[">=70%"] += 1 if confidence < 0.7: continue failing_together[f"{couple[0]}${couple[1]}".encode( "utf-8")] = struct.pack("ff", support, confidence) for percentage, count in count_redundancies.most_common(): logger.info(f"{count} with {percentage} confidence") test_scheduling.close_failing_together_db() def generate_all_data(): past_failures = test_scheduling.get_past_failures(granularity) push_num = past_failures[ "push_num"] if "push_num" in past_failures else 0 # We can start once we get to the last revision we added in the previous run. can_start = True if last_node is None else False commit_map = {} for commit_data in tqdm(repository.get_commits()): if not can_start: if last_node == commit_data["node"]: can_start = True continue commit_map[commit_data["node"]] = commit_data with open(push_data_path, "r") as f: push_data = json.load(f) logger.info(f"push data nodes: {len(push_data)}") if granularity == "label": push_data = [( revisions, rename_tasks(push_tasks), rename_tasks(possible_regressions), rename_tasks(likely_regressions), ) for revisions, push_tasks, possible_regressions, likely_regressions in push_data] # In the last 14 pushes, we definitely run all possible runnables. all_runnables_set = set( sum((push_runnables for _, push_runnables, _, _ in push_data[-14:]), [])) # Filter runnables we don't need. all_runnables = filter_runnables(list(all_runnables_set), all_runnables_set, granularity) all_runnables_set = set(all_runnables_set) logger.info( f"{len(all_runnables_set)} runnables run in the last 14 pushes" ) push_data = [( revisions, filter_runnables(push_tasks, all_runnables_set, granularity), filter_runnables(possible_regressions, all_runnables_set, granularity), filter_runnables(likely_regressions, all_runnables_set, granularity), ) for revisions, push_tasks, possible_regressions, likely_regressions in push_data] if granularity == "label": generate_failing_together_probabilities(push_data) # Store all runnables in the past_failures DB so it can be used in the evaluation phase. past_failures["all_runnables"] = all_runnables # XXX: Should we recreate the DB from scratch if the previous all_runnables are not the # same as the current ones? saved_nodes = set() skipped_no_commits = 0 skipped_too_big_commits = 0 skipped_no_runnables = 0 # We can start once we get to the last revision we added in the previous run. can_start = True if last_node is None else False if granularity == "group": update_touched_together_gen = test_scheduling.update_touched_together( ) next(update_touched_together_gen) for i in tqdm(range(len(push_data))): ( revisions, push_runnables, possible_regressions, likely_regressions, ) = push_data.pop(0) if not can_start: if last_node == revisions[0]: can_start = True continue push_num += 1 # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them. commits = tuple( commit_map.pop(revision) for revision in revisions if revision in commit_map) if len(commits) == 0: skipped_no_commits += 1 continue merged_commits = commit_features.merge_commits(commits) # XXX: For now, skip commits which are too large. 
# In the future we can either: # - Improve shelve perf and go back to consider all files; # - Consider only files which appear with a given frequency, like the "files" feature in commit_features; # - Keep a limit of number of files. if len(merged_commits["files"]) > 50: skipped_too_big_commits += 1 continue # If we considered all_runnables, we'd generate a huge amount of data. # We consider only the runnables which run in this push, and the possible and likely regressions # from this push. We can't consider all runnables because we can't be sure that a task that didn't # run on a push would have been successful. runnables_to_consider = list( set(push_runnables + possible_regressions + likely_regressions)) if len(runnables_to_consider) == 0: skipped_no_runnables += 1 continue # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!). if i % 250 == 0: past_failures.sync() pushdate = dateutil.parser.parse(merged_commits["pushdate"]) if granularity == "group": update_touched_together_gen.send(commits[0]["node"]) result = { "revs": revisions, "data": [], } for data in test_scheduling.generate_data( past_failures, merged_commits, push_num, runnables_to_consider, possible_regressions, likely_regressions, ): if pushdate > HISTORY_DATE_START: result["data"].append(data) if pushdate > HISTORY_DATE_START: saved_nodes.add(i) yield result if granularity == "group": try: update_touched_together_gen.send(None) except StopIteration: pass logger.info(f"saved push data nodes: {len(saved_nodes)}") logger.info(f"skipped {skipped_no_commits} (no commits in our DB)") logger.info(f"skipped {skipped_too_big_commits} (too big commits)") logger.info( f"skipped {skipped_no_runnables} (no interesting runnables)") past_failures["push_num"] = push_num past_failures.close() db.append(test_scheduling_db, generate_all_data()) zstd_compress(test_scheduling_db) create_tar_zst(past_failures_db) if granularity == "group": create_tar_zst(touched_together_db) if granularity == "label": create_tar_zst(failing_together_db)
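# A compact, standalone sketch of the failing-together statistics above: for every
# pair of tasks that ran in the same push, support is how often the pair failed
# together over how often it ran together, and confidence is the probability that
# both failed given that at least one did. The 1/700 support threshold mirrors the
# code; the push_data rows are assumed to be (revisions, tasks, likely_regressions,
# candidate_regressions) tuples as above.
import collections
import itertools


def failing_together_stats(push_data):
    count_runs = collections.Counter()
    count_single_failures = collections.Counter()
    count_both_failures = collections.Counter()

    for _revisions, tasks, likely_regressions, candidate_regressions in push_data:
        failures = set(likely_regressions + candidate_regressions)
        all_tasks = list(set(tasks) | failures)
        for task1, task2 in itertools.combinations(sorted(all_tasks), 2):
            count_runs[(task1, task2)] += 1
            if task1 in failures and task2 in failures:
                count_both_failures[(task1, task2)] += 1
            elif task1 in failures or task2 in failures:
                count_single_failures[(task1, task2)] += 1

    stats = {}
    for couple, run_count in count_runs.items():
        failure_count = count_both_failures[couple]
        support = failure_count / run_count
        if support < 1 / 700:
            continue
        confidence = (
            failure_count / (count_single_failures[couple] + failure_count)
            if failure_count
            else 0.0
        )
        stats[couple] = (support, confidence)
    return stats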
def generate_push_data( self, pushes: Tuple[mozci.push.Push, ...], granularity: str ) -> None: from_date = get_from_date(granularity) pushes = tuple( push for push in pushes if datetime.utcfromtimestamp(push.date) >= from_date ) if granularity == "label": push_data_db = test_scheduling.PUSH_DATA_LABEL_DB elif granularity == "group": push_data_db = test_scheduling.PUSH_DATA_GROUP_DB elif granularity == "config_group": push_data_db = test_scheduling.PUSH_DATA_CONFIG_GROUP_DB def cache_key(push: mozci.push.Push) -> str: return f"push_data.{granularity}.{push.rev}" def generate( futures: List[concurrent.futures.Future], ) -> Generator[PushResult, None, None]: num_cached = 0 num_pushes = len(pushes) # Regenerating a large amount of data when we update the mozci regression detection # algorithm is currently pretty slow, so we only regenerate 1000 pushes whenever we # run. to_regenerate = 1000 for push in tqdm(pushes): cached = futures.pop(0).result() semaphore.release() if cached and to_regenerate > 0: value, mozci_version = cached # Regenerate results which were generated when we were not cleaning # up WPT groups. if granularity == "group" and any( runnable.startswith("/") for runnable in value[1] ): cached = None to_regenerate -= 1 # Regenerate results which were generated when we didn't get a correct # configuration for test-verify tasks. elif granularity == "config_group" and any( "test-verify" in runnable[0] for runnable in value[1] ): cached = None to_regenerate -= 1 """# Regenerate results which were generated with an older version of mozci. elif mozci_version != MOZCI_VERSION: cached = None to_regenerate -= 1""" if cached: num_cached += 1 value, mozci_version = cached yield value else: logger.info(f"Analyzing {push.rev} at the {granularity} level...") key = cache_key(push) try: if granularity == "label": runnables = push.task_labels elif granularity == "group": runnables = push.group_summaries.keys() elif granularity == "config_group": runnables = push.config_group_summaries.keys() value = ( push.revs, tuple(runnables), tuple(push.get_possible_regressions(granularity)), tuple(push.get_likely_regressions(granularity)), ) adr.config.cache.put( key, (value, MOZCI_VERSION), adr.config["cache"]["retention"], ) yield value except adr.errors.MissingDataError: logger.warning( f"Tasks for push {push.rev} can't be found on ActiveData" ) except Exception: traceback.print_exc() logger.info(f"{num_cached} pushes were already cached out of {num_pushes}") semaphore = threading.BoundedSemaphore(256) def retrieve_from_cache(push): semaphore.acquire() return adr.config.cache.get(cache_key(push)) with concurrent.futures.ThreadPoolExecutor() as executor: futures = [executor.submit(retrieve_from_cache, push) for push in pushes] try: db.write(push_data_db, generate(futures)) except Exception: for f in futures: f.cancel() try: semaphore.release() except ValueError: continue raise zstd_compress(push_data_db)
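# A minimal sketch of the bounded prefetch above: cache lookups are submitted to a
# thread pool, but a BoundedSemaphore caps how many results can pile up before the
# consumer drains them. fetch is a hypothetical stand-in for adr.config.cache.get.
import concurrent.futures
import threading


def bounded_prefetch(keys, fetch, limit=256):
    semaphore = threading.BoundedSemaphore(limit)

    def task(key):
        # Block worker threads once `limit` results are waiting to be consumed.
        semaphore.acquire()
        return fetch(key)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(task, key) for key in keys]
        for future in futures:
            value = future.result()
            semaphore.release()
            yield value


if __name__ == "__main__":
    for squared in bounded_prefetch(range(10), lambda key: key * key, limit=4):
        print(squared)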
def generate_push_data(self, runnable): def upload_adr_cache(): cache_path = os.path.splitext(ADR_CACHE_DB)[0] assert os.path.abspath(adr.config["cache"]["stores"]["file"] ["path"]) == os.path.abspath(cache_path) with open_tar_zst(f"{ADR_CACHE_DB}.zst") as tar: tar.add(cache_path) db.upload(ADR_CACHE_DB) # We keep in the cache the fact that we failed to analyze a push for 10 # days, so if we re-run often we don't retry the same pushes many times. MISSING_CACHE_RETENTION = 10 * 24 * 60 # We'll use the past TRAINING_MONTHS months only for training the model, # but we use half TRAINING_MONTHS months more than that to calculate the # failure statistics. from_months = TRAINING_MONTHS[runnable] + math.floor( TRAINING_MONTHS[runnable] / 2) pushes = mozci.push.make_push_objects( from_date=f"today-{from_months}month", to_date="today-3day", branch="autoland", ) start_time = time.monotonic() num_cached = 0 push_data = [] for push in tqdm(pushes): key = f"push_data.{runnable}.{push.rev}" logger.info(f"Analyzing {push.rev} at the {runnable} level...") if adr.config.cache.has(key): num_cached += 1 cached = adr.config.cache.get(key) if cached: # XXX: We have to support items in the cache that were added # before the mozci version was stored. We can drop the if # when all items have been switched over. value = cached[0] if isinstance(cached, tuple) else cached push_data.append(value) else: try: if runnable == "label": runnables = push.task_labels elif runnable == "group": runnables = push.group_summaries.keys() value = [ push.revs, list(runnables), list(push.get_possible_regressions(runnable)), list(push.get_likely_regressions(runnable)), ] push_data.append(value) adr.config.cache.forever(key, (value, MOZCI_VERSION)) except adr.errors.MissingDataError: logger.warning( f"Tasks for push {push.rev} can't be found on ActiveData" ) adr.config.cache.put(key, (), MISSING_CACHE_RETENTION) except Exception: traceback.print_exc() adr.config.cache.put(key, (), MISSING_CACHE_RETENTION) if time.monotonic() - start_time >= 3600: upload_adr_cache() start_time = time.monotonic() logger.info( f"{num_cached} pushes were already cached out of {len(pushes)}") upload_adr_cache() with open(f"push_data_{runnable}.json", "w") as f: json.dump(push_data, f) zstd_compress(f"push_data_{runnable}.json")
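# A toy illustration of the negative-caching idea above: failed pushes are cached as
# an empty marker with a short retention so repeated runs don't retry them forever,
# while successful results are kept indefinitely. This TTL bookkeeping uses a plain
# dict and time.time(); the real code relies on adr's cache put()/forever().
import time

# Retention for failed lookups, in minutes, as in the code above.
MISSING_CACHE_RETENTION = 10 * 24 * 60


class TTLCache:
    def __init__(self):
        self._store = {}

    def put(self, key, value, retention_minutes):
        self._store[key] = (value, time.time() + retention_minutes * 60)

    def forever(self, key, value):
        self._store[key] = (value, None)

    def get(self, key):
        value, expires = self._store.get(key, (None, 0))
        if expires is not None and time.time() > expires:
            return None
        return value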
def generate_test_scheduling_history(self): if not os.path.exists("push_data.json"): download_check_etag(PUSH_DATA_URL, "push_data.json.zst") zstd_decompress("push_data.json") assert os.path.exists( "push_data.json" ), "Decompressed push data file exists" # Get the commits DB. if db.is_old_version(repository.COMMITS_DB) or not db.exists( repository.COMMITS_DB ): db.download(repository.COMMITS_DB, force=True) HISTORY_DATE_START = datetime.now() - relativedelta(months=TRAINING_MONTHS) with open("push_data.json", "r") as f: data = json.load(f) push_data = {} for row in data[1:]: # Revision -> (all tasks, possible regressions, likely regressions) push_data[row[0]] = (row[1], row[2], row[3]) HISTORICAL_TIMESPAN = 56 if not db.is_old_version(test_scheduling.TEST_SCHEDULING_DB): db.download(test_scheduling.TEST_SCHEDULING_DB, support_files_too=True) for test_data in test_scheduling.get_test_scheduling_history(): pass last_node = test_data["rev"] else: last_node = None try: with open("data/past_failures.pickle", "rb") as f: past_failures, push_num = pickle.load(f) except FileNotFoundError: past_failures = {} push_num = 0 def get_and_update_past_failures(type_, task, items, push_num, is_regression): if type_ not in past_failures: past_failures[type_] = {} if task not in past_failures[type_]: past_failures[type_][task] = {} values_total = [] values_prev_7 = [] values_prev_14 = [] values_prev_28 = [] values_prev_56 = [] for item in items: if item not in past_failures[type_][task]: past_failures[type_][task][item] = ExpQueue( push_num, HISTORICAL_TIMESPAN + 1, 0 ) value = past_failures[type_][task][item][push_num] values_total.append(value) values_prev_7.append( value - past_failures[type_][task][item][push_num - 7] ) values_prev_14.append( value - past_failures[type_][task][item][push_num - 14] ) values_prev_28.append( value - past_failures[type_][task][item][push_num - 28] ) values_prev_56.append( value - past_failures[type_][task][item][push_num - 56] ) if is_regression: past_failures[type_][task][item][push_num] = value + 1 return ( sum(values_total), sum(values_prev_7), sum(values_prev_14), sum(values_prev_28), sum(values_prev_56), ) def generate_data(): nonlocal push_num commits_with_data = set() saved_nodes = set() # We can start once we get to the last revision we added in the previous run. 
can_start = True if last_node is None else False for commit_data in tqdm(repository.get_commits()): node = commit_data["node"] if node == last_node: can_start = True continue if not can_start: continue if node not in push_data: continue commits_with_data.add(node) commit_push_data = push_data[node] for task in commit_push_data[0]: if not any(task.startswith(j) for j in JOBS_TO_CONSIDER): continue is_regression = ( task in commit_push_data[1] or task in commit_push_data[2] ) total_failures, past_7_pushes_failures, past_14_pushes_failures, past_28_pushes_failures, past_56_pushes_failures = get_and_update_past_failures( "all", task, ["all"], push_num, is_regression ) total_types_failures, past_7_pushes_types_failures, past_14_pushes_types_failures, past_28_pushes_types_failures, past_56_pushes_types_failures = get_and_update_past_failures( "type", task, commit_data["types"], push_num, is_regression ) total_files_failures, past_7_pushes_files_failures, past_14_pushes_files_failures, past_28_pushes_files_failures, past_56_pushes_files_failures = get_and_update_past_failures( "file", task, commit_data["files"], push_num, is_regression ) total_directories_failures, past_7_pushes_directories_failures, past_14_pushes_directories_failures, past_28_pushes_directories_failures, past_56_pushes_directories_failures = get_and_update_past_failures( "directory", task, commit_data["directories"], push_num, is_regression, ) total_components_failures, past_7_pushes_components_failures, past_14_pushes_components_failures, past_28_pushes_components_failures, past_56_pushes_components_failures = get_and_update_past_failures( "component", task, commit_data["components"], push_num, is_regression, ) pushdate = dateutil.parser.parse(commit_data["pushdate"]) if pushdate > HISTORY_DATE_START: saved_nodes.add(node) yield { "rev": node, "name": task, "failures": total_failures, "failures_past_7_pushes": past_7_pushes_failures, "failures_past_14_pushes": past_14_pushes_failures, "failures_past_28_pushes": past_28_pushes_failures, "failures_past_56_pushes": past_56_pushes_failures, "failures_in_types": total_types_failures, "failures_past_7_pushes_in_types": past_7_pushes_types_failures, "failures_past_14_pushes_in_types": past_14_pushes_types_failures, "failures_past_28_pushes_in_types": past_28_pushes_types_failures, "failures_past_56_pushes_in_types": past_56_pushes_types_failures, "failures_in_files": total_files_failures, "failures_past_7_pushes_in_files": past_7_pushes_files_failures, "failures_past_14_pushes_in_files": past_14_pushes_files_failures, "failures_past_28_pushes_in_files": past_28_pushes_files_failures, "failures_past_56_pushes_in_files": past_56_pushes_files_failures, "failures_in_directories": total_directories_failures, "failures_past_7_pushes_in_directories": past_7_pushes_directories_failures, "failures_past_14_pushes_in_directories": past_14_pushes_directories_failures, "failures_past_28_pushes_in_directories": past_28_pushes_directories_failures, "failures_past_56_pushes_in_directories": past_56_pushes_directories_failures, "failures_in_components": total_components_failures, "failures_past_7_pushes_in_components": past_7_pushes_components_failures, "failures_past_14_pushes_in_components": past_14_pushes_components_failures, "failures_past_28_pushes_in_components": past_28_pushes_components_failures, "failures_past_56_pushes_in_components": past_56_pushes_components_failures, "is_possible_regression": task in commit_push_data[1], "is_likely_regression": task in commit_push_data[2], } push_num += 
1

            logger.info(f"push data nodes: {len(push_data)}")
            logger.info(f"commits linked to push data: {len(commits_with_data)}")
            logger.info(f"saved push data nodes: {len(saved_nodes)}")

        db.append(test_scheduling.TEST_SCHEDULING_DB, generate_data())
        zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

        with open("data/past_failures.pickle", "wb") as f:
            pickle.dump(
                (past_failures, push_num), f, protocol=pickle.HIGHEST_PROTOCOL
            )

        zstd_compress("data/past_failures.pickle")
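# A small sketch of the checkpointing above: the per-task failure history and the
# current push number are pickled together so an incremental run can resume where the
# previous one stopped. The path is illustrative.
import pickle


def save_state(path, past_failures, push_num):
    with open(path, "wb") as f:
        pickle.dump((past_failures, push_num), f, protocol=pickle.HIGHEST_PROTOCOL)


def load_state(path):
    try:
        with open(path, "rb") as f:
            return pickle.load(f)
    except FileNotFoundError:
        return {}, 0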