def retrieve_push_data(self):
    """Restore the previous adr cache, then generate push data and upload the cache.

    The archived cache is downloaded without being extracted, and its members
    are unpacked one by one at the paths recorded in the archive. Members
    whose leading expiry timestamp is already in the past are not written.
    Afterwards ``self.generate_push_data`` is called for "label" and "group",
    followed by ``self.upload_adr_cache``.
    """
    # Download previous cache.
    db.download(ADR_CACHE_DB, extract=False)

    # Extract files from the cache.
    with open_tar_zst(f"{ADR_CACHE_DB}.zst", "r") as tar:
        now = round(time.time())

        for member in tar:
            if member.isdir():
                # Tolerate directories left over from an earlier run, and
                # members whose parent directory has not been created yet.
                os.makedirs(member.name, exist_ok=True)
                continue

            fin = tar.extractfile(member)
            if fin is None:
                # Neither a regular file nor a link: there is no content to
                # inspect or write.
                continue

            with fin:
                # If the element expired (the format of the file can be inferred
                # from cachy's FileStore source code), no need to extract it.
                # The first 10 bytes are parsed as the expiry time in seconds.
                content = fin.read(10)
                expire = int(content)
                if now >= expire:
                    continue

                content += fin.read()

            with open(member.name, "wb") as fout:
                fout.write(content)

    self.generate_push_data("label")
    self.generate_push_data("group")

    self.upload_adr_cache()
def retrieve_push_data(self):
    """Download the adr cache, run the CI recipes and archive the results.

    adr is pointed at the local cache directory through
    ``~/.config/adr/config.toml``. ``self.run_ci_recipes`` is then invoked for
    the "label" and "group" granularities, the cache directory is archived
    again, and both ``push_data_*.json`` files are zstd-compressed.
    """
    # The cache directory is the DB path without its extension.
    adr_cache_dir = os.path.splitext(ADR_CACHE_DB)[0]

    # Download previous cache.
    db.download(ADR_CACHE_DB)

    # Setup adr cache configuration.
    os.makedirs(os.path.expanduser("~/.config/adr"), exist_ok=True)
    with open(os.path.expanduser("~/.config/adr/config.toml"), "w") as config_file:
        absolute_cache_dir = os.path.abspath(adr_cache_dir)
        config_file.write(
            "[adr.cache.stores]\n"
            f'file = {{ driver = "file", path = "{absolute_cache_dir}" }}\n'
        )

    # Only the past TRAINING_MONTHS months are used to train the model, but
    # 3 more months than that are needed to calculate the failure statistics.
    label_months = TRAINING_MONTHS + 3
    self.run_ci_recipes("label", label_months)

    # For groups, we only have 12 weeks in ActiveData. Getting previous data
    # from task artifacts is slow, so for now we only get what we can get from
    # ActiveData and we'll see if it's enough to train a satisfying model.
    group_months = 3
    self.run_ci_recipes("group", group_months)

    archive_path = f"{ADR_CACHE_DB}.zst"
    with open_tar_zst(archive_path) as archive:
        archive.add(adr_cache_dir)

    for push_data_file in ("push_data_label.json", "push_data_group.json"):
        zstd_compress(push_data_file)
def upload_adr_cache():
    """Archive the local adr cache directory and upload it.

    The cache directory is the ADR_CACHE_DB path without its extension. adr's
    configured "file" store must resolve to that same directory, otherwise an
    AssertionError is raised before anything is archived.
    """
    local_cache_dir = os.path.splitext(ADR_CACHE_DB)[0]

    # Make sure we are about to archive the directory adr actually wrote to.
    configured_dir = adr.config["cache"]["stores"]["file"]["path"]
    assert os.path.abspath(configured_dir) == os.path.abspath(local_cache_dir)

    archive_path = f"{ADR_CACHE_DB}.zst"
    with open_tar_zst(archive_path) as archive:
        archive.add(local_cache_dir)

    db.upload(ADR_CACHE_DB)
def retrieve_push_data(self):
    """Restore the adr cache, run the push_data recipe and archive the results.

    When the stored cache is not an old version it is downloaded and, if the
    file is present afterwards, extracted into the current directory. adr is
    then configured to use that cache directory, the ``push_data`` recipe is
    run through ``run-adr`` into ``push_data.json``, and finally the cache
    directory is archived and ``push_data.json`` is zstd-compressed.
    """
    # The cache directory is the DB path without its extension.
    adr_cache_dir = os.path.splitext(ADR_CACHE_DB)[0]

    # Download previous cache.
    if not db.is_old_version(ADR_CACHE_DB):
        db.download(ADR_CACHE_DB)
        if os.path.exists(ADR_CACHE_DB):
            with tarfile.open(ADR_CACHE_DB, "r") as cache_archive:
                cache_archive.extractall()
            assert os.path.exists(adr_cache_dir), "Decompressed adr cache exists"

    # Setup adr cache configuration.
    os.makedirs(os.path.expanduser("~/.config/adr"), exist_ok=True)
    with open(os.path.expanduser("~/.config/adr/config.toml"), "w") as config_file:
        absolute_cache_dir = os.path.abspath(adr_cache_dir)
        config_file.write(
            "[adr.cache.stores]\n"
            f'file = {{ driver = "file", path = "{absolute_cache_dir}" }}\n'
        )

    # Only the past TRAINING_MONTHS months are used to train the model, but
    # 3 more months than that are needed to calculate the failure statistics.
    recipe_command = [
        "run-adr",
        "ahal/ci-recipes",
        "recipe",
        "-o",
        os.path.abspath("push_data.json"),
        "-f",
        "json",
        "push_data",
        "--",
        "--from",
        f"today-{TRAINING_MONTHS + 3}month",
        "--to",
        "today-3day",
        "--branch",
        "autoland",
    ]
    subprocess.run(
        recipe_command,
        check=True,
        # Redirect to /dev/null, as the logs are too big otherwise.
        stdout=subprocess.DEVNULL,
    )

    archive_path = f"{ADR_CACHE_DB}.zst"
    with open_tar_zst(archive_path) as archive:
        archive.add(adr_cache_dir)

    zstd_compress("push_data.json")
def retrieve_commits(self, limit):
    """Clone the repository, mine its commits and compress the resulting DBs.

    Args:
        limit: when truthy, only the last ``limit`` revisions are mined and the
            existing commits DB is not downloaded. Otherwise the commits DB is
            downloaded and mining resumes from the children of the last commit
            it contains (or from revision 0 when it contains none).
    """
    repository.clone(self.repo_dir)

    if not limit:
        db.download(repository.COMMITS_DB, support_files_too=True)

        # Resume right after the last commit already stored in the DB.
        rev_start = 0
        for known_commit in repository.get_commits():
            node = known_commit["node"]
            rev_start = f"children({node})"
    else:
        # Mercurial revset supports negative integers starting from tip
        rev_start = -limit

    repository.download_commits(self.repo_dir, rev_start=rev_start)

    logger.info("commit data extracted from repository")

    zstd_compress("data/commits.json")

    experiences_dir = "data/commit_experiences.lmdb"
    with open_tar_zst("data/commit_experiences.lmdb.tar.zst") as archive:
        archive.add(experiences_dir)
def generate_test_scheduling_history(self, granularity):
    """Extend the test scheduling DB for *granularity* with newly seen pushes.

    Reads ``push_data_{granularity}.json`` (decompressing a fresh copy when
    ``download_check_etag`` reports an update), walks the pushes that follow
    the last revision already stored in the scheduling DB, and appends one
    entry per retained push to that DB. The scheduling DB and its support DBs
    are compressed/archived at the end.

    Args:
        granularity: "label" or "group"; selects which DBs are used. For
            "label" the failing-together DB is also filled, for "group" the
            touched-together data is also updated.

    Raises:
        AssertionError: if the push data file is missing or the commits DB
            could not be downloaded.
    """
    push_data_path = f"push_data_{granularity}.json"
    updated = download_check_etag(
        test_scheduling.PUSH_DATA_URL.format(granularity=granularity)
    )
    if updated:
        zstd_decompress(push_data_path)
        os.remove(f"{push_data_path}.zst")
    assert os.path.exists(push_data_path), "Decompressed push data file exists"

    # Get the commits DB. The download is done outside of the assert so that it
    # still happens when Python runs with assertions disabled (-O).
    commits_db_downloaded = db.download(repository.COMMITS_DB)
    assert commits_db_downloaded, "Commits DB could not be downloaded"

    HISTORY_DATE_START = datetime.now() - relativedelta(
        months=TRAINING_MONTHS[granularity]
    )

    if granularity == "label":
        test_scheduling_db = test_scheduling.TEST_LABEL_SCHEDULING_DB
        past_failures_db = os.path.join(
            "data", test_scheduling.PAST_FAILURES_LABEL_DB
        )
        failing_together_db = os.path.join(
            "data", test_scheduling.FAILING_TOGETHER_LABEL_DB
        )
    elif granularity == "group":
        test_scheduling_db = test_scheduling.TEST_GROUP_SCHEDULING_DB
        past_failures_db = os.path.join(
            "data", test_scheduling.PAST_FAILURES_GROUP_DB
        )
        touched_together_db = os.path.join(
            "data", test_scheduling.TOUCHED_TOGETHER_DB
        )

    db.download(test_scheduling_db, support_files_too=True)

    # First revision of the last push already stored; None when the DB is empty.
    last_node = None
    for revs, _ in test_scheduling.get_test_scheduling_history(granularity):
        last_node = revs[0]

    def generate_failing_together_probabilities(push_data):
        """Compute pairwise failing-together stats and store the strong ones.

        For each pair of tasks seen in the same push, "support" is the share
        of common runs where both failed and "confidence" is the share of runs
        with at least one failure where both failed.
        """
        # TODO: we should consider the probabilities of `task1 failure -> task2 failure` and
        # `task2 failure -> task1 failure` separately, as they could be different.

        count_runs = collections.Counter()
        count_single_failures = collections.Counter()
        count_both_failures = collections.Counter()

        for revisions, tasks, likely_regressions, candidate_regressions in tqdm(
            push_data
        ):
            failures = set(likely_regressions + candidate_regressions)
            all_tasks = list(set(tasks) | failures)

            # Sorting makes (task1, task2) a canonical key for the pair.
            for task1, task2 in itertools.combinations(sorted(all_tasks), 2):
                count_runs[(task1, task2)] += 1

                if task1 in failures:
                    if task2 in failures:
                        count_both_failures[(task1, task2)] += 1
                    else:
                        count_single_failures[(task1, task2)] += 1
                elif task2 in failures:
                    count_single_failures[(task1, task2)] += 1

        stats = {}

        skipped = 0

        for couple, run_count in count_runs.most_common():
            failure_count = count_both_failures[couple]
            support = failure_count / run_count

            if support < 1 / 700:
                skipped += 1
                continue

            if failure_count != 0:
                confidence = failure_count / (
                    count_single_failures[couple] + failure_count
                )
            else:
                confidence = 0.0

            stats[couple] = (support, confidence)

        logger.info(f"{skipped} couples skipped because their support was too low")

        logger.info("Redundancies with the highest support and confidence:")
        for couple, (support, confidence) in sorted(
            stats.items(), key=lambda k: (-k[1][1], -k[1][0])
        )[:7]:
            failure_count = count_both_failures[couple]
            run_count = count_runs[couple]
            logger.info(
                f"{couple[0]} - {couple[1]} redundancy confidence {confidence}, support {support} ({failure_count} over {run_count})."
            )

        logger.info("Redundancies with the highest confidence and lowest support:")
        for couple, (support, confidence) in sorted(
            stats.items(), key=lambda k: (-k[1][1], k[1][0])
        )[:7]:
            failure_count = count_both_failures[couple]
            run_count = count_runs[couple]
            logger.info(
                f"{couple[0]} - {couple[1]} redundancy confidence {confidence}, support {support} ({failure_count} over {run_count})."
            )

        failing_together = test_scheduling.get_failing_together_db()
        count_redundancies = collections.Counter()
        for couple, (support, confidence) in stats.items():
            # The comparisons are inclusive so that they match the ">=" labels
            # and the 0.7 storage threshold below.
            if confidence == 1.0:
                count_redundancies["==100%"] += 1
            if confidence >= 0.9:
                count_redundancies[">=90%"] += 1
            if confidence >= 0.8:
                count_redundancies[">=80%"] += 1
            if confidence >= 0.7:
                count_redundancies[">=70%"] += 1

            if confidence < 0.7:
                continue

            failing_together[
                f"{couple[0]}${couple[1]}".encode("utf-8")
            ] = struct.pack("ff", support, confidence)

        for percentage, count in count_redundancies.most_common():
            logger.info(f"{count} with {percentage} confidence")

        test_scheduling.close_failing_together_db()

    def generate_all_data():
        """Yield one ``{"revs", "data"}`` entry per recent push worth keeping.

        Every push after ``last_node`` updates the past-failures DB; only those
        pushed after HISTORY_DATE_START are yielded.
        """
        past_failures = test_scheduling.get_past_failures(granularity)

        push_num = past_failures["push_num"] if "push_num" in past_failures else 0

        # We can start once we get to the last revision we added in the previous run.
        can_start = last_node is None

        commit_map = {}
        for commit_data in tqdm(repository.get_commits()):
            if not can_start:
                if last_node == commit_data["node"]:
                    can_start = True

                continue

            commit_map[commit_data["node"]] = commit_data

        with open(push_data_path, "r") as f:
            push_data = json.load(f)

        logger.info(f"push data nodes: {len(push_data)}")

        if granularity == "label":
            push_data = [
                (
                    revisions,
                    rename_tasks(push_tasks),
                    rename_tasks(possible_regressions),
                    rename_tasks(likely_regressions),
                )
                for revisions, push_tasks, possible_regressions, likely_regressions in push_data
            ]

        # In the last 28 pushes, we definitely run all possible runnables.
        all_runnables_set = set(
            sum((push_runnables for _, push_runnables, _, _ in push_data[-28:]), [])
        )
        # Filter runnables we don't need.
        all_runnables = filter_runnables(
            list(all_runnables_set), all_runnables_set, granularity
        )
        # Narrow the set down to the runnables that survived the filtering.
        all_runnables_set = set(all_runnables)
        logger.info(f"{len(all_runnables_set)} runnables run in the last 28 pushes")

        push_data = [
            (
                revisions,
                filter_runnables(push_tasks, all_runnables_set, granularity),
                filter_runnables(
                    possible_regressions, all_runnables_set, granularity
                ),
                filter_runnables(
                    likely_regressions, all_runnables_set, granularity
                ),
            )
            for revisions, push_tasks, possible_regressions, likely_regressions in push_data
        ]

        if granularity == "label":
            generate_failing_together_probabilities(push_data)

        # Store all runnables in the past_failures DB so it can be used in the evaluation phase.
        past_failures["all_runnables"] = all_runnables
        # XXX: Should we recreate the DB from scratch if the previous all_runnables are not the
        # same as the current ones?

        saved_nodes = set()
        skipped_no_commits = 0
        skipped_too_big_commits = 0
        skipped_no_runnables = 0

        # We can start once we get to the last revision we added in the previous run.
        can_start = last_node is None

        if granularity == "group":
            update_touched_together_gen = test_scheduling.update_touched_together()
            # Prime the generator so that it can receive values through send().
            next(update_touched_together_gen)

        for i in tqdm(range(len(push_data))):
            # Pushes are popped as we go, so the list shrinks while we iterate.
            (
                revisions,
                push_runnables,
                possible_regressions,
                likely_regressions,
            ) = push_data.pop(0)

            if not can_start:
                if last_node == revisions[0]:
                    can_start = True

                continue

            push_num += 1

            # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them.
            commits = tuple(
                commit_map.pop(revision)
                for revision in revisions
                if revision in commit_map
            )
            if not commits:
                skipped_no_commits += 1
                continue

            merged_commits = commit_features.merge_commits(commits)

            # XXX: For now, skip commits which are too large.
            # In the future we can either:
            #  - Improve shelve perf and go back to consider all files;
            #  - Consider only files which appear with a given frequency, like the "files" feature in commit_features;
            #  - Keep a limit of number of files.
            if len(merged_commits["files"]) > 50:
                skipped_too_big_commits += 1
                continue

            # If we considered all_runnables, we'd generate a huge amount of data.
            # We consider only the runnables which run in this push, and the possible and likely regressions
            # from this push. We can't consider all runnables because we can't be sure that a task that didn't
            # run on a push would have been successful.
            runnables_to_consider = list(
                set(push_runnables + possible_regressions + likely_regressions)
            )

            if not runnables_to_consider:
                skipped_no_runnables += 1
                continue

            # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!).
            if i % 250 == 0:
                past_failures.sync()

            pushdate = dateutil.parser.parse(merged_commits["pushdate"])

            if granularity == "group":
                update_touched_together_gen.send(commits[0]["node"])

            result = {
                "revs": revisions,
                "data": [],
            }
            # generate_data is always fully consumed, even for pushes that are
            # too old to be saved, because it receives the past_failures DB.
            for data in test_scheduling.generate_data(
                past_failures,
                merged_commits,
                push_num,
                runnables_to_consider,
                possible_regressions,
                likely_regressions,
            ):
                if pushdate > HISTORY_DATE_START:
                    result["data"].append(data)

            if pushdate > HISTORY_DATE_START:
                saved_nodes.add(i)
                yield result

        if granularity == "group":
            # Signal the end of the input; the generator is expected to finish.
            try:
                update_touched_together_gen.send(None)
            except StopIteration:
                pass

        logger.info(f"saved push data nodes: {len(saved_nodes)}")
        logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
        logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
        logger.info(f"skipped {skipped_no_runnables} (no interesting runnables)")

        past_failures["push_num"] = push_num
        past_failures.close()

    db.append(test_scheduling_db, generate_all_data())

    zstd_compress(test_scheduling_db)

    # The archive names end in ".tar.zst"; stripping it gives the DB directory.
    with open_tar_zst(past_failures_db) as tar:
        tar.add(past_failures_db[: -len(".tar.zst")])

    if granularity == "group":
        with open_tar_zst(touched_together_db) as tar:
            tar.add(touched_together_db[: -len(".tar.zst")])

    if granularity == "label":
        with open_tar_zst(failing_together_db) as tar:
            tar.add(failing_together_db[: -len(".tar.zst")])
def generate_test_scheduling_history(self):
    """Extend the test scheduling DB with the pushes not processed yet.

    Reads ``push_data.json`` (downloading and decompressing it when it is
    missing), walks the pushes that follow the last revision already stored in
    the scheduling DB, and appends the generated per-task entries to that DB.
    The scheduling DB is compressed and the past-failures DB archived at the
    end.

    Raises:
        AssertionError: if the push data file is missing after decompression
            or the commits DB could not be downloaded.
    """
    if not os.path.exists("push_data.json"):
        download_check_etag(PUSH_DATA_URL)
        zstd_decompress("push_data.json")
        assert os.path.exists("push_data.json"), "Decompressed push data file exists"

    # Get the commits DB. The download is done outside of the assert so that it
    # still happens when Python runs with assertions disabled (-O).
    commits_db_downloaded = db.download(repository.COMMITS_DB)
    assert commits_db_downloaded, "Commits DB could not be downloaded"

    HISTORY_DATE_START = datetime.now() - relativedelta(months=TRAINING_MONTHS)

    db.download(test_scheduling.TEST_SCHEDULING_DB, support_files_too=True)

    # First revision of the last entry already stored; None when the DB is empty.
    last_node = None
    for test_data in test_scheduling.get_test_scheduling_history():
        last_node = test_data["revs"][0]

    def generate_all_data():
        """Yield the entries of recent pushes, updating past failures for all."""
        past_failures = test_scheduling.get_past_failures()

        push_num = past_failures["push_num"] if "push_num" in past_failures else 0

        # We can start once we get to the last revision we added in the previous run.
        can_start = last_node is None

        commit_map = {}
        for commit_data in tqdm(repository.get_commits()):
            if not can_start:
                if last_node == commit_data["node"]:
                    can_start = True

                continue

            commit_map[commit_data["node"]] = commit_data

        with open("push_data.json", "r") as f:
            # The first element is dropped (presumably a header - TODO confirm).
            push_data = json.load(f)[1:]

        logger.info(f"push data nodes: {len(push_data)}")

        # In the last 28 pushes, we definitely run all possible tasks.
        all_tasks_set = set(
            sum((push_tasks for _, push_tasks, _, _ in push_data[-28:]), [])
        )
        # Filter tasks we don't need.
        all_tasks = filter_tasks(list(all_tasks_set), all_tasks_set)
        all_tasks_set = set(all_tasks)
        logger.info(f"{len(all_tasks_set)} tasks run in the last 28 pushes")

        # Store all tasks in the past_failures DB so it can be used in the evaluation phase.
        past_failures["all_tasks"] = all_tasks
        # XXX: Should we recreate the DB from scratch if the previous all_tasks are not the
        # same as the current ones?

        saved_nodes = set()
        skipped_no_commits = 0
        skipped_too_big_commits = 0
        skipped_no_tasks = 0

        # We can start once we get to the last revision we added in the previous run.
        can_start = last_node is None

        for i in tqdm(range(len(push_data))):
            # Pushes are popped as we go, so the list shrinks while we iterate.
            (
                revisions,
                push_tasks,
                possible_regressions,
                likely_regressions,
            ) = push_data.pop(0)

            if not can_start:
                if last_node == revisions[0]:
                    can_start = True

                continue

            push_num += 1

            # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them.
            commits = tuple(
                commit_map.pop(revision)
                for revision in revisions
                if revision in commit_map
            )
            if not commits:
                skipped_no_commits += 1
                continue

            merged_commits = commit_features.merge_commits(commits)

            # XXX: For now, skip commits which are too large.
            # In the future we can either:
            #  - Improve shelve perf and go back to consider all files;
            #  - Consider only files which appear with a given frequency, like the "files" feature in commit_features;
            #  - Keep a limit of number of files.
            if len(merged_commits["files"]) > 50:
                skipped_too_big_commits += 1
                continue

            # If we considered all_tasks, we'd generate a huge amount of data.
            # So we consider only the tasks which run in this push, and the possible and likely regressions
            # from this push.
            tasks_to_consider = list(
                set(push_tasks + possible_regressions + likely_regressions)
            )
            tasks_to_consider = filter_tasks(tasks_to_consider, all_tasks_set)

            if not tasks_to_consider:
                skipped_no_tasks += 1
                continue

            # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!).
            if i % 250 == 0:
                past_failures.sync()

            pushdate = dateutil.parser.parse(merged_commits["pushdate"])

            # generate_data is always fully consumed, even for pushes that are
            # too old to be saved, because it receives the past_failures DB.
            for data in test_scheduling.generate_data(
                past_failures,
                merged_commits,
                push_num,
                tasks_to_consider,
                possible_regressions,
                likely_regressions,
            ):
                if pushdate > HISTORY_DATE_START:
                    saved_nodes.add(i)
                    data["revs"] = revisions
                    yield data

        logger.info(f"saved push data nodes: {len(saved_nodes)}")
        logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
        logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
        logger.info(f"skipped {skipped_no_tasks} (no interesting tasks)")

        past_failures["push_num"] = push_num
        past_failures.close()

    db.append(test_scheduling.TEST_SCHEDULING_DB, generate_all_data())

    zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

    with open_tar_zst("data/past_failures.lmdb.tar.zst") as tar:
        tar.add("data/past_failures.lmdb")
def generate_test_scheduling_history(self, granularity):
    """Extend the test scheduling DB for *granularity* with newly seen pushes.

    Reads ``push_data_{granularity}.json`` (decompressing a fresh copy when
    ``download_check_etag`` reports an update), walks the pushes that follow
    the last revision already stored in the scheduling DB, and appends the
    generated per-runnable entries to that DB. The scheduling DB and its
    support DBs are compressed/archived at the end.

    Args:
        granularity: "label" or "group"; selects which DBs are used. For
            "group" the touched-together data is also updated.

    Raises:
        AssertionError: if the push data file is missing or the commits DB
            could not be downloaded.
    """
    push_data_path = f"push_data_{granularity}.json"
    updated = download_check_etag(
        test_scheduling.PUSH_DATA_URL.format(granularity=granularity)
    )
    if updated:
        zstd_decompress(push_data_path)
    assert os.path.exists(push_data_path), "Decompressed push data file exists"

    # Get the commits DB. The download is done outside of the assert so that it
    # still happens when Python runs with assertions disabled (-O).
    commits_db_downloaded = db.download(repository.COMMITS_DB)
    assert commits_db_downloaded, "Commits DB could not be downloaded"

    HISTORY_DATE_START = datetime.now() - relativedelta(
        months=TRAINING_MONTHS[granularity]
    )

    if granularity == "label":
        test_scheduling_db = test_scheduling.TEST_LABEL_SCHEDULING_DB
        past_failures_db = os.path.join(
            "data", test_scheduling.PAST_FAILURES_LABEL_DB
        )
    elif granularity == "group":
        test_scheduling_db = test_scheduling.TEST_GROUP_SCHEDULING_DB
        past_failures_db = os.path.join(
            "data", test_scheduling.PAST_FAILURES_GROUP_DB
        )
        touched_together_db = os.path.join(
            "data", test_scheduling.TOUCHED_TOGETHER_DB
        )

    db.download(test_scheduling_db, support_files_too=True)

    # First revision of the last entry already stored; None when the DB is empty.
    last_node = None
    for test_data in test_scheduling.get_test_scheduling_history(granularity):
        last_node = test_data["revs"][0]

    def generate_all_data():
        """Yield the entries of recent pushes, updating past failures for all."""
        past_failures = test_scheduling.get_past_failures(granularity)

        push_num = past_failures["push_num"] if "push_num" in past_failures else 0

        # We can start once we get to the last revision we added in the previous run.
        can_start = last_node is None

        commit_map = {}
        for commit_data in tqdm(repository.get_commits()):
            if not can_start:
                if last_node == commit_data["node"]:
                    can_start = True

                continue

            commit_map[commit_data["node"]] = commit_data

        with open(push_data_path, "r") as f:
            push_data = json.load(f)

        logger.info(f"push data nodes: {len(push_data)}")

        if granularity == "label":
            push_data = [
                (
                    revisions,
                    rename_tasks(push_tasks),
                    rename_tasks(possible_regressions),
                    rename_tasks(likely_regressions),
                )
                for revisions, push_tasks, possible_regressions, likely_regressions in push_data
            ]

        # In the last 28 pushes, we definitely run all possible runnables.
        all_runnables_set = set(
            sum((push_runnables for _, push_runnables, _, _ in push_data[-28:]), [])
        )
        # Filter runnables we don't need.
        all_runnables = filter_runnables(
            list(all_runnables_set), all_runnables_set, granularity
        )
        # Narrow the set down to the runnables that survived the filtering.
        all_runnables_set = set(all_runnables)
        logger.info(f"{len(all_runnables_set)} runnables run in the last 28 pushes")

        # Store all runnables in the past_failures DB so it can be used in the evaluation phase.
        past_failures["all_runnables"] = all_runnables
        # XXX: Should we recreate the DB from scratch if the previous all_runnables are not the
        # same as the current ones?

        saved_nodes = set()
        skipped_no_commits = 0
        skipped_too_big_commits = 0
        skipped_no_runnables = 0

        # We can start once we get to the last revision we added in the previous run.
        can_start = last_node is None

        if granularity == "group":
            update_touched_together_gen = test_scheduling.update_touched_together()
            # Prime the generator so that it can receive values through send().
            next(update_touched_together_gen)

        for i in tqdm(range(len(push_data))):
            # Pushes are popped as we go, so the list shrinks while we iterate.
            (
                revisions,
                push_runnables,
                possible_regressions,
                likely_regressions,
            ) = push_data.pop(0)

            if not can_start:
                if last_node == revisions[0]:
                    can_start = True

                continue

            push_num += 1

            # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them.
            commits = tuple(
                commit_map.pop(revision)
                for revision in revisions
                if revision in commit_map
            )
            if not commits:
                skipped_no_commits += 1
                continue

            merged_commits = commit_features.merge_commits(commits)

            # XXX: For now, skip commits which are too large.
            # In the future we can either:
            #  - Improve shelve perf and go back to consider all files;
            #  - Consider only files which appear with a given frequency, like the "files" feature in commit_features;
            #  - Keep a limit of number of files.
            if len(merged_commits["files"]) > 50:
                skipped_too_big_commits += 1
                continue

            # If we considered all_runnables, we'd generate a huge amount of data.
            # So we consider only the runnables which run in this push, and the possible and likely regressions
            # from this push.
            runnables_to_consider = list(
                set(push_runnables + possible_regressions + likely_regressions)
            )
            runnables_to_consider = filter_runnables(
                runnables_to_consider, all_runnables_set, granularity
            )

            if not runnables_to_consider:
                skipped_no_runnables += 1
                continue

            # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!).
            if i % 250 == 0:
                past_failures.sync()

            pushdate = dateutil.parser.parse(merged_commits["pushdate"])

            if granularity == "group":
                update_touched_together_gen.send(commits[0]["node"])

            # generate_data is always fully consumed, even for pushes that are
            # too old to be saved, because it receives the past_failures DB.
            for data in test_scheduling.generate_data(
                past_failures,
                merged_commits,
                push_num,
                runnables_to_consider,
                possible_regressions,
                likely_regressions,
            ):
                if pushdate > HISTORY_DATE_START:
                    saved_nodes.add(i)
                    data["revs"] = revisions
                    yield data

        if granularity == "group":
            # Signal the end of the input; the generator is expected to finish.
            try:
                update_touched_together_gen.send(None)
            except StopIteration:
                pass

        logger.info(f"saved push data nodes: {len(saved_nodes)}")
        logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
        logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
        logger.info(f"skipped {skipped_no_runnables} (no interesting runnables)")

        past_failures["push_num"] = push_num
        past_failures.close()

    db.append(test_scheduling_db, generate_all_data())

    zstd_compress(test_scheduling_db)

    # The archive names end in ".tar.zst"; stripping it gives the DB directory.
    with open_tar_zst(past_failures_db) as tar:
        tar.add(past_failures_db[: -len(".tar.zst")])

    if granularity == "group":
        with open_tar_zst(touched_together_db) as tar:
            tar.add(touched_together_db[: -len(".tar.zst")])
def generate_test_scheduling_history(self):
    """Extend the test scheduling DB with the pushes not processed yet.

    Reads ``push_data.json`` (downloading and decompressing it when it is
    missing), walks the pushes that follow the last revision already stored in
    the scheduling DB, and appends one entry per considered task to that DB.
    Per-task failure counters are kept in the ``data/past_failures.lmdb``
    shelf, which is archived at the end together with the compressed
    scheduling DB.

    Raises:
        AssertionError: if the push data file is missing after decompression.
    """
    if not os.path.exists("push_data.json"):
        download_check_etag(PUSH_DATA_URL, "push_data.json.zst")
        zstd_decompress("push_data.json")
        assert os.path.exists("push_data.json"), "Decompressed push data file exists"

    # Get the commits DB.
    if db.is_old_version(repository.COMMITS_DB) or not db.exists(
        repository.COMMITS_DB
    ):
        db.download(repository.COMMITS_DB, force=True)

    HISTORY_DATE_START = datetime.now() - relativedelta(months=TRAINING_MONTHS)

    # Largest look-back window (in pushes) used for the failure statistics.
    HISTORICAL_TIMESPAN = 56

    # First revision of the last entry already stored. It stays None when the
    # stored DB is an old version or when it contains no entry at all.
    last_node = None
    if not db.is_old_version(test_scheduling.TEST_SCHEDULING_DB):
        db.download(test_scheduling.TEST_SCHEDULING_DB, support_files_too=True)

        for test_data in test_scheduling.get_test_scheduling_history():
            last_node = test_data["revs"][0]

    past_failures = shelve.Shelf(
        LMDBDict("data/past_failures.lmdb"),
        protocol=pickle.HIGHEST_PROTOCOL,
        writeback=True,
    )

    push_num = past_failures["push_num"] if "push_num" in past_failures else 0

    def get_and_update_past_failures(type_, task, items, push_num, is_regression):
        """Sum the failure counters of *task* over *items* and record this push.

        Returns a 5-tuple: the total count, then the counts accumulated over
        the previous 7, 14, 28 and 56 pushes. When *is_regression* is true the
        counter of every item is incremented for *push_num*.
        """
        values_total = []
        values_prev_7 = []
        values_prev_14 = []
        values_prev_28 = []
        values_prev_56 = []

        key = f"{type_}${task}$"

        for item in items:
            full_key = key + item

            if full_key not in past_failures:
                cur = past_failures[full_key] = ExpQueue(
                    push_num, HISTORICAL_TIMESPAN + 1, 0
                )
            else:
                cur = past_failures[full_key]

            value = cur[push_num]

            values_total.append(value)
            values_prev_7.append(value - cur[push_num - 7])
            values_prev_14.append(value - cur[push_num - 14])
            values_prev_28.append(value - cur[push_num - 28])
            values_prev_56.append(value - cur[push_num - 56])

            if is_regression:
                cur[push_num] = value + 1

        return (
            sum(values_total),
            sum(values_prev_7),
            sum(values_prev_14),
            sum(values_prev_28),
            sum(values_prev_56),
        )

    def generate_data():
        """Yield one entry per task of each recent push, updating all counters."""
        nonlocal push_num
        saved_nodes = set()
        skipped_no_commits = 0
        skipped_too_big_commits = 0
        skipped_no_tasks = 0

        # We can start once we get to the last revision we added in the previous run.
        can_start = last_node is None

        commit_map = {}
        for commit_data in tqdm(repository.get_commits()):
            if not can_start:
                if last_node == commit_data["node"]:
                    can_start = True

                continue

            commit_map[commit_data["node"]] = commit_data

        with open("push_data.json", "r") as f:
            # The first element is dropped (presumably a header - TODO confirm).
            push_data = json.load(f)[1:]

        logger.info(f"push data nodes: {len(push_data)}")

        # In the last 28 pushes, we definitely run all possible tasks.
        all_tasks_set = set(
            sum((push_tasks for _, push_tasks, _, _ in push_data[-28:]), [])
        )
        # Filter tasks we don't need.
        all_tasks = filter_tasks(list(all_tasks_set), all_tasks_set)
        all_tasks_set = set(all_tasks)
        logger.info(f"{len(all_tasks_set)} tasks run in the last 28 pushes")

        # We can start once we get to the last revision we added in the previous run.
        can_start = last_node is None

        for i in tqdm(range(len(push_data))):
            # Pushes are popped as we go, so the list shrinks while we iterate.
            (
                revisions,
                push_tasks,
                possible_regressions,
                likely_regressions,
            ) = push_data.pop(0)

            if not can_start:
                if last_node == revisions[0]:
                    can_start = True

                continue

            push_num += 1

            # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them.
            commits = tuple(
                commit_map.pop(revision)
                for revision in revisions
                if revision in commit_map
            )
            if not commits:
                skipped_no_commits += 1
                continue

            merged_commits = commit_features.merge_commits(commits)

            # XXX: For now, skip commits which are too large.
            # In the future we can either:
            #  - Improve shelve perf and go back to consider all files;
            #  - Consider only files which appear with a given frequency, like the "files" feature in commit_features;
            #  - Keep a limit of number of files.
            if len(merged_commits["files"]) > 20:
                skipped_too_big_commits += 1
                continue

            # If we considered all_tasks, we'd generate a huge amount of data.
            # So we consider only the tasks which run in this push, and the possible and likely regressions
            # from this push.
            tasks_to_consider = list(
                set(push_tasks + possible_regressions + likely_regressions)
            )
            tasks_to_consider = filter_tasks(tasks_to_consider, all_tasks_set)

            if not tasks_to_consider:
                skipped_no_tasks += 1
                continue

            # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!).
            if i % 250 == 0:
                past_failures.sync()

            pushdate = dateutil.parser.parse(merged_commits["pushdate"])

            # One row per family of counters:
            # (key prefix in past_failures, items of this push, output key suffix).
            # The order of the rows fixes the order of the keys in the output.
            feature_families = (
                ("all", ["all"], ""),
                ("type", merged_commits["types"], "_in_types"),
                ("file", merged_commits["files"], "_in_files"),
                ("directory", merged_commits["directories"], "_in_directories"),
                ("component", merged_commits["components"], "_in_components"),
            )

            for task in tasks_to_consider:
                is_regression = (
                    task in possible_regressions or task in likely_regressions
                )

                entry = {
                    "revs": revisions,
                    "name": task,
                }

                # The counters are updated for every push, including those too
                # old to be saved, so that the statistics stay complete.
                for type_, items, suffix in feature_families:
                    (
                        total,
                        past_7,
                        past_14,
                        past_28,
                        past_56,
                    ) = get_and_update_past_failures(
                        type_, task, items, push_num, is_regression
                    )

                    entry[f"failures{suffix}"] = total
                    entry[f"failures_past_7_pushes{suffix}"] = past_7
                    entry[f"failures_past_14_pushes{suffix}"] = past_14
                    entry[f"failures_past_28_pushes{suffix}"] = past_28
                    entry[f"failures_past_56_pushes{suffix}"] = past_56

                entry["is_possible_regression"] = task in possible_regressions
                entry["is_likely_regression"] = task in likely_regressions

                if pushdate > HISTORY_DATE_START:
                    saved_nodes.add(i)
                    yield entry

        logger.info(f"saved push data nodes: {len(saved_nodes)}")
        logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
        logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
        logger.info(f"skipped {skipped_no_tasks} (no interesting tasks)")

    db.append(test_scheduling.TEST_SCHEDULING_DB, generate_data())

    zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

    # push_num was advanced by generate_data() while db.append consumed it.
    past_failures["push_num"] = push_num
    past_failures.close()
    with open_tar_zst("data/past_failures.lmdb.tar.zst") as tar:
        tar.add("data/past_failures.lmdb")