def download_commits(repo_dir):
    hg = hglib.open(repo_dir)

    commits = hg.log()

    bug_pattern = re.compile('[\t ]*[Bb][Uu][Gg][\t ]*([0-9]+)')

    def transform(commit):
        desc = commit[5].decode('utf-8')

        bug_id = None
        bug_id_match = re.search(bug_pattern, desc)
        if bug_id_match:
            bug_id = int(bug_id_match.group(1))

        return {
            # 'rev': commit[0].decode('utf-8'),
            # 'node': commit[1].decode('utf-8'),
            # 'tags': commit[2].decode('utf-8'),
            # 'branch': commit[3].decode('utf-8'),
            # 'author': commit[4].decode('utf-8'),
            'desc': desc,
            # 'date': str(commit[6]),
            'bug_id': bug_id,
        }

    commits = [transform(commit) for commit in reversed(commits)]

    db.write(COMMITS_DB, commits)
def get_commits_to_ignore(self):
    logger.info("Download previous commits to ignore...")
    db.download(IGNORED_COMMITS_DB)

    logger.info("Get previously classified commits...")
    prev_commits_to_ignore = list(db.read(IGNORED_COMMITS_DB))
    logger.info(f"Already found {len(prev_commits_to_ignore)} commits to ignore...")

    # When we already have some analyzed commits, re-analyze the last 3500 to make sure
    # we didn't miss back-outs that happened since the last analysis.
    if len(prev_commits_to_ignore) > 0:
        first_commit_to_reanalyze = (
            -3500 if len(prev_commits_to_ignore) >= 3500 else 0
        )
        rev_start = "children({})".format(
            prev_commits_to_ignore[first_commit_to_reanalyze]["rev"]
        )
    else:
        rev_start = 0

    with hglib.open(self.mercurial_repo_dir) as hg:
        revs = repository.get_revs(hg, rev_start)

    commits = repository.hg_log_multi(self.mercurial_repo_dir, revs)

    with hglib.open(self.mercurial_repo_dir) as hg:
        repository.set_commits_to_ignore(hg, self.mercurial_repo_dir, commits)

    for commit in commits:
        commit.ignored |= commit.author_email == "*****@*****.**"

    chosen_commits = set()
    commits_to_ignore = []
    for commit in commits:
        if commit.ignored or commit.backedoutby:
            commits_to_ignore.append(
                {
                    "rev": commit.node,
                    "type": "backedout" if commit.backedoutby else "",
                }
            )
            chosen_commits.add(commit.node)

    logger.info(f"{len(commits_to_ignore)} new commits to ignore...")

    for prev_commit in prev_commits_to_ignore[::-1]:
        if prev_commit["rev"] not in chosen_commits:
            commits_to_ignore.append(prev_commit)
            chosen_commits.add(prev_commit["rev"])

    logger.info(f"{len(commits_to_ignore)} commits to ignore...")
    logger.info(
        "...of which {} are backed-out".format(
            sum(1 for commit in commits_to_ignore if commit["type"] == "backedout")
        )
    )

    db.write(IGNORED_COMMITS_DB, commits_to_ignore)
    zstd_compress(IGNORED_COMMITS_DB)
    db.upload(IGNORED_COMMITS_DB)
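# The merge step above (keep newly analyzed entries, then append previous entries whose
# rev was not re-chosen) can be illustrated in isolation. This is a minimal sketch with
# hypothetical names, not part of the original module; it only assumes each entry is a
# dict carrying a unique "rev" key.
def merge_keeping_newest(new_entries, prev_entries, key="rev"):
    """Append previous entries after the new ones, skipping revs already present."""
    seen = {entry[key] for entry in new_entries}
    merged = list(new_entries)
    # Walk previous results from newest to oldest, mirroring prev_commits_to_ignore[::-1].
    for entry in reversed(prev_entries):
        if entry[key] not in seen:
            merged.append(entry)
            seen.add(entry[key])
    return merged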
def test_write_read(tmp_path):
    db_path = tmp_path / 'prova.json'

    db.register(db_path, 'https://alink', 1)

    db.write(db_path, range(1, 8))
    assert list(db.read(db_path)) == [1, 2, 3, 4, 5, 6, 7]
def get_commits_to_ignore(self) -> None:
    assert db.download(repository.COMMITS_DB)

    ignored = set()
    commits_to_ignore = []
    all_commits = set()

    annotate_ignore_nodes = {
        node
        for node, label in labels.get_labels("annotateignore")
        if label == "1"
    }

    for commit in repository.get_commits(
        include_no_bug=True, include_backouts=True, include_ignored=True
    ):
        all_commits.add(commit["node"][:12])

        if (
            commit["ignored"]
            or commit["backedoutby"]
            or not commit["bug_id"]
            or len(commit["backsout"]) > 0
            or repository.is_wptsync(commit)
            or commit["node"] in annotate_ignore_nodes
        ):
            commits_to_ignore.append(
                {
                    "rev": commit["node"],
                    "type": "backedout" if commit["backedoutby"] else "",
                }
            )
            ignored.add(commit["node"][:12])

        if len(commit["backsout"]) > 0:
            for backedout in commit["backsout"]:
                if backedout[:12] in ignored:
                    continue
                ignored.add(backedout[:12])

                commits_to_ignore.append({"rev": backedout, "type": "backedout"})

    logger.info(f"{len(commits_to_ignore)} commits to ignore...")

    # Skip backed-out commits which aren't in the repository (commits which landed *before* the Mercurial history
    # started, and backouts which mentioned a bad hash in their message).
    commits_to_ignore = [c for c in commits_to_ignore if c["rev"][:12] in all_commits]

    logger.info(f"{len(commits_to_ignore)} commits to ignore...")

    logger.info(
        "...of which {} are backed-out".format(
            sum(1 for commit in commits_to_ignore if commit["type"] == "backedout")
        )
    )

    db.write(IGNORED_COMMITS_DB, commits_to_ignore)
    zstd_compress(IGNORED_COMMITS_DB)
    db.upload(IGNORED_COMMITS_DB)
def download_commits(repo_dir, date_from):
    hg = hglib.open(repo_dir)

    first_rev = get_rev(hg, date_from)

    commits = hg_log(hg, first_rev)
    commits_num = len(commits)

    hg.close()

    # Total previous number of commits by the author.
    total_commits_by_author = defaultdict(int)
    # Previous commits by the author, in a 90 days window.
    commits_by_author = defaultdict(list)

    global author_experience
    global author_experience_90_days
    for commit in commits:
        author_experience[commit] = total_commits_by_author[commit.author]
        total_commits_by_author[commit.author] += 1

        # Keep only the previous commits from a window of 90 days in the commits_by_author map.
        cut = None

        for i, prev_commit in enumerate(commits_by_author[commit.author]):
            if (commit.date - prev_commit.date).days <= 90:
                break

            cut = i

        if cut is not None:
            commits_by_author[commit.author] = commits_by_author[commit.author][cut + 1:]

        author_experience_90_days[commit] = len(commits_by_author[commit.author])

        commits_by_author[commit.author].append(commit)

    subprocess.run(
        [
            os.path.join(repo_dir, 'mach'),
            'file-info',
            'bugzilla-automation',
            'component_data',
        ],
        cwd=repo_dir,
        check=True,
    )

    global COMPONENTS
    with open(os.path.join(repo_dir, 'component_data', 'components.json')) as cf:
        COMPONENTS = json.load(cf)

    print(f'Mining commits using {multiprocessing.cpu_count()} processes...')

    with concurrent.futures.ProcessPoolExecutor(
        initializer=_init, initargs=(repo_dir,)
    ) as executor:
        commits = executor.map(_transform, commits, chunksize=64)
        commits = tqdm(commits, total=commits_num)
        db.write(COMMITS_DB, commits)
def compress_and_upload() -> None:
    db.write(
        SHADOW_SCHEDULER_STATS_DB,
        (
            scheduler_stats[push.rev]
            for push in pushes
            if push.rev in scheduler_stats
        ),
    )
    utils.zstd_compress(SHADOW_SCHEDULER_STATS_DB)
    db.upload(SHADOW_SCHEDULER_STATS_DB)
def test_bad_format_compression(tmp_path, db_name):
    db_path = tmp_path / db_name
    db.register(db_path, "https://alink")

    with pytest.raises(AssertionError):
        db.write(db_path, range(7))

    with pytest.raises(AssertionError):
        db.append(db_path, range(7))
def retrieve_test_info(self, days: int) -> Dict[str, Any]:
    logger.info("Download previous test info...")
    db.download(TEST_INFOS_DB)

    dates = [
        datetime.utcnow() - timedelta(days=day) for day in reversed(range(days))
    ]

    logger.info("Get previously gathered test info...")
    test_infos = {
        test_info["date"]: test_info for test_info in db.read(TEST_INFOS_DB)
    }

    prev_skips = None
    for date in tqdm(dates):
        date_str = date.strftime("%Y-%m-%d")

        # Gather the latest three days again, as the data might have changed.
        if date_str in test_infos and date < datetime.utcnow() - timedelta(days=3):
            prev_skips = test_infos[date_str]["skips"]
            continue

        test_infos[date_str] = {
            "date": date_str,
            "bugs": [
                {"id": item["bug_id"], "count": item["bug_count"]}
                for item in test_scheduling.get_failure_bugs(date, date)
            ],
            "skips": {},
        }

        try:
            test_info = test_scheduling.get_test_info(date)

            for component in test_info["tests"].keys():
                test_infos[date_str]["skips"][component] = sum(
                    1 for test in test_info["tests"][component] if "skip-if" in test
                )
        except requests.exceptions.HTTPError:
            # If we couldn't find a test info artifact for the given date, assume the number of skip-ifs didn't change from the previous day.
            assert prev_skips is not None
            test_infos[date_str]["skips"] = prev_skips

        prev_skips = test_infos[date_str]["skips"]

    db.write(
        TEST_INFOS_DB,
        (
            test_infos[date.strftime("%Y-%m-%d")]
            for date in dates
            if date.strftime("%Y-%m-%d") in test_infos
        ),
    )
    zstd_compress(TEST_INFOS_DB)

    return test_infos
def download_commits(repo_dir):
    commits = hg_log(repo_dir)
    commits_num = len(commits)

    with concurrent.futures.ProcessPoolExecutor(
        initializer=_init, initargs=(repo_dir,)
    ) as executor:
        commits = executor.map(_transform, commits, chunksize=256)
        commits = tqdm(commits, total=commits_num)
        db.write(COMMITS_DB, commits)
def test_bad_format_compression(tmp_path, db_name):
    db_path = tmp_path / db_name
    db.register(db_path, "https://alink", 1)

    with pytest.raises(AssertionError):
        db.write(db_path, range(7))

    with pytest.raises(AssertionError):
        db.append(db_path, range(7))
def download_commits(repo_dir, date_from):
    hg = hglib.open(repo_dir)

    first_rev = get_rev(hg, date_from)

    commits = hg_log(hg, first_rev)
    commits_num = len(commits)

    hg.close()

    # Total previous number of commits by the author.
    total_commits_by_author = defaultdict(int)
    # Previous commits by the author, in a 90 days window.
    commits_by_author = defaultdict(list)

    global author_experience
    global author_experience_90_days
    for commit in commits:
        author_experience[commit] = total_commits_by_author[commit.author]
        # We don't want to consider backed out commits when calculating author/reviewer experience.
        if not commit.ever_backedout:
            total_commits_by_author[commit.author] += 1

        # Keep only the previous commits from a window of 90 days in the commits_by_author map.
        cut = None

        for i, prev_commit in enumerate(commits_by_author[commit.author]):
            if (commit.date - prev_commit.date).days <= 90:
                break

            cut = i

        if cut is not None:
            commits_by_author[commit.author] = commits_by_author[commit.author][cut + 1:]

        author_experience_90_days[commit] = len(commits_by_author[commit.author])

        if not commit.ever_backedout:
            commits_by_author[commit.author].append(commit)

    global COMPONENTS
    r = requests.get(
        'https://index.taskcluster.net/v1/task/gecko.v2.mozilla-central.latest.source.source-bugzilla-info/artifacts/public/components.json'
    )
    r.raise_for_status()
    COMPONENTS = r.json()

    print(f'Mining commits using {multiprocessing.cpu_count()} processes...')

    with concurrent.futures.ProcessPoolExecutor(
        initializer=_init, initargs=(repo_dir,)
    ) as executor:
        commits = executor.map(_transform, commits, chunksize=64)
        commits = tqdm(commits, total=commits_num)
        db.write(COMMITS_DB, commits)
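# The 90-days bookkeeping above interleaves several dictionaries; the sliding-window idea
# itself is easier to see in isolation. This is a simplified, self-contained sketch with
# illustrative names (not taken from the snippet): for each commit date, in chronological
# order, count how many of the author's previous commits fall inside the last 90 days.
from datetime import datetime


def experience_in_window(commit_dates, window_days=90):
    window = []  # previous commit dates still inside the window
    counts = []
    for date in commit_dates:
        # Drop commits that fell out of the window, then record the remaining count.
        window = [d for d in window if (date - d).days <= window_days]
        counts.append(len(window))
        window.append(date)
    return counts


# Three commits close together, then one much later: the window resets for the last one.
dates = [datetime(2019, 1, 1), datetime(2019, 1, 10), datetime(2019, 2, 1), datetime(2019, 8, 1)]
assert experience_in_window(dates) == [0, 1, 2, 0]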
def test_append(mock_db, db_format, db_compression):
    db_path = mock_db(db_format, db_compression)

    db.write(db_path, range(1, 4))
    assert list(db.read(db_path)) == [1, 2, 3]

    db.append(db_path, range(4, 8))
    assert list(db.read(db_path)) == [1, 2, 3, 4, 5, 6, 7]
def test_exists_db(tmp_path):
    db_path = tmp_path / "prova.json"

    db.register(db_path, "https://alink", 1)

    assert not db.exists(db_path)

    db.write(db_path, range(7))

    assert db.exists(db_path)
def test_delete(mock_db, db_format, db_compression):
    db_path = mock_db(db_format, db_compression)

    db.write(db_path, range(1, 9))
    assert list(db.read(db_path)) == [1, 2, 3, 4, 5, 6, 7, 8]

    db.delete(db_path, lambda x: x == 4)
    assert list(db.read(db_path)) == [1, 2, 3, 5, 6, 7, 8]
def update_commits() -> None:
    commits = list(
        get_commits(include_no_bug=True, include_backouts=True, include_ignored=True)
    )

    # Add coverage information for previous commits too.
    # We need to do this because coverage information is sometimes slow to come in.
    set_commit_coverage(commits)

    db.write(COMMITS_DB, commits)
def test_unregistered_db(tmp_path):
    db_path = tmp_path / "prova.json"

    with pytest.raises(AssertionError):
        list(db.read(db_path))

    with pytest.raises(AssertionError):
        db.write(db_path, range(7))

    with pytest.raises(AssertionError):
        db.append(db_path, range(7))
def download_commits(repo_dir):
    commits = hg_log(repo_dir)
    commits_num = len(commits)

    print(f'Mining commits using {multiprocessing.cpu_count()} processes...')

    with concurrent.futures.ProcessPoolExecutor(
        initializer=_init, initargs=(repo_dir,)
    ) as executor:
        commits = executor.map(_transform, commits, chunksize=64)
        commits = tqdm(commits, total=commits_num)
        db.write(COMMITS_DB, commits)
def test_append_compressed(tmp_path):
    db_path = tmp_path / 'prova.json.gz'

    db.register(db_path, 'https://alink', 1)

    db.write(db_path, range(1, 4))
    assert list(db.read(db_path)) == [1, 2, 3]

    db.append(db_path, range(4, 8))
    assert list(db.read(db_path)) == [1, 2, 3, 4, 5, 6, 7]
def download_commits(repo_dir):
    hg = hglib.open(repo_dir)

    commits = hg.log()

    hg.close()

    commits = (tuple(commit) for commit in commits)

    with concurrent.futures.ProcessPoolExecutor(
        initializer=_init, initargs=(repo_dir,)
    ) as executor:
        commits = executor.map(_transform, commits, chunksize=256)
        db.write(COMMITS_DB, commits)
def test_delete_compressed(tmp_path):
    db_path = tmp_path / 'prova.json.gz'
    print(db_path)

    db.register(db_path, 'https://alink', 1)

    db.write(db_path, range(1, 9))
    assert list(db.read(db_path)) == [1, 2, 3, 4, 5, 6, 7, 8]

    db.delete(db_path, lambda x: x == 4)
    assert list(db.read(db_path)) == [1, 2, 3, 5, 6, 7, 8]
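# The tests above exercise a small line-oriented JSON store. As a rough mental model of the
# contract they assume (a toy sketch, not the actual db module): write replaces the stored
# items, append adds to them, read yields them in order, and delete rewrites the store
# without the matching items.
import json
from pathlib import Path


class SimpleJsonLinesDB:
    def __init__(self, path):
        self.path = Path(path)

    def write(self, items):
        with open(self.path, "w") as f:
            f.writelines(json.dumps(item) + "\n" for item in items)

    def append(self, items):
        with open(self.path, "a") as f:
            f.writelines(json.dumps(item) + "\n" for item in items)

    def read(self):
        with open(self.path) as f:
            for line in f:
                yield json.loads(line)

    def delete(self, match):
        kept = [item for item in self.read() if not match(item)]
        self.write(kept)


# Under this model, write(range(1, 9)) followed by delete(lambda x: x == 4) leaves
# [1, 2, 3, 5, 6, 7, 8], matching the assertions in test_delete/test_delete_compressed.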
def update_commits() -> None:
    if not os.path.exists("data/coverage_mapping.lmdb"):
        logger.info("Downloading commit->coverage mapping...")
        download_coverage_mapping()

    commits = list(
        get_commits(include_no_bug=True, include_backouts=True, include_ignored=True)
    )

    # Add coverage information for previous commits too.
    # We need to do this because coverage information is sometimes slow to come in.
    set_commit_coverage(commits)

    db.write(COMMITS_DB, commits)
def download_commits(repo_dir, date_from):
    hg = hglib.open(repo_dir)

    revs = get_revs(hg, date_from)

    commits_num = len(revs)

    assert commits_num > 0, 'There should definitely be more than 0 commits, something is wrong'

    hg.close()

    processes = multiprocessing.cpu_count()

    print(f'Mining {commits_num} commits using {processes} processes...')

    CHUNK_SIZE = 256
    revs_groups = [revs[i:(i + CHUNK_SIZE)] for i in range(0, len(revs), CHUNK_SIZE)]

    with concurrent.futures.ProcessPoolExecutor(
        initializer=_init, initargs=(repo_dir,)
    ) as executor:
        commits = executor.map(_hg_log, revs_groups, chunksize=20)
        commits = tqdm(commits, total=len(revs_groups))
        commits = list(itertools.chain.from_iterable(commits))

    # Don't analyze backouts.
    backouts = set(commit.backedoutby for commit in commits if commit.backedoutby != b'')
    commits = [commit for commit in commits if commit.node not in backouts]

    # Don't analyze commits that are not linked to a bug.
    commits = [commit for commit in commits if commit.bug != b'']

    # Skip commits which are in .hg-annotate-ignore-revs (mostly consisting of very
    # large and not meaningful formatting changes).
    with open(os.path.join(repo_dir, '.hg-annotate-ignore-revs'), 'r') as f:
        ignore_revs = set(l[:40].encode('utf-8') for l in f)

    commits = [commit for commit in commits if commit.node not in ignore_revs]

    commits_num = len(commits)

    print(f'Analyzing {commits_num} patches...')

    # Total previous number of commits by the author.
    total_commits_by_author = defaultdict(int)
    # Previous commits by the author, in a 90 days window.
    commits_by_author = defaultdict(list)

    global author_experience
    global author_experience_90_days
    for commit in commits:
        author_experience[commit] = total_commits_by_author[commit.author]
        # We don't want to consider backed out commits when calculating author/reviewer experience.
        if not commit.backedoutby:
            total_commits_by_author[commit.author] += 1

        # Keep only the previous commits from a window of 90 days in the commits_by_author map.
        cut = None

        for i, prev_commit in enumerate(commits_by_author[commit.author]):
            if (commit.date - prev_commit.date).days <= 90:
                break

            cut = i

        if cut is not None:
            commits_by_author[commit.author] = commits_by_author[commit.author][cut + 1:]

        author_experience_90_days[commit] = len(commits_by_author[commit.author])

        if not commit.backedoutby:
            commits_by_author[commit.author].append(commit)

    global COMPONENTS
    r = requests.get(
        'https://index.taskcluster.net/v1/task/gecko.v2.mozilla-central.latest.source.source-bugzilla-info/artifacts/public/components.json'
    )
    r.raise_for_status()
    COMPONENTS = r.json()

    print(f'Mining commits using {multiprocessing.cpu_count()} processes...')

    with concurrent.futures.ProcessPoolExecutor(
        initializer=_init, initargs=(repo_dir,)
    ) as executor:
        commits = executor.map(_transform, commits, chunksize=64)
        commits = tqdm(commits, total=commits_num)
        db.write(COMMITS_DB, commits)
def download_commits(repo_dir, date_from):
    hg = hglib.open(repo_dir)

    revs = get_revs(hg)

    assert (
        len(revs) > 0
    ), "There should definitely be more than 0 commits, something is wrong"

    hg.close()

    # Skip commits which are in .hg-annotate-ignore-revs (mostly consisting of very
    # large and not meaningful formatting changes).
    with open(os.path.join(repo_dir, ".hg-annotate-ignore-revs"), "rb") as f:
        ignore_revs = set(l[:40] for l in f)

    revs = [rev for rev in revs if rev not in ignore_revs]

    processes = multiprocessing.cpu_count()

    print(f"Mining {len(revs)} commits using {processes} processes...")

    CHUNK_SIZE = 256
    revs_groups = [revs[i : (i + CHUNK_SIZE)] for i in range(0, len(revs), CHUNK_SIZE)]

    with concurrent.futures.ProcessPoolExecutor(
        initializer=_init, initargs=(repo_dir,)
    ) as executor:
        commits = executor.map(_hg_log, revs_groups, chunksize=20)
        commits = tqdm(commits, total=len(revs_groups))
        commits = list(itertools.chain.from_iterable(commits))

    # Don't analyze backouts.
    backouts = set(commit.backedoutby for commit in commits if commit.backedoutby != "")
    commits = [commit for commit in commits if commit.node not in backouts]

    # Don't analyze commits that are not linked to a bug.
    commits = [commit for commit in commits if commit.bug != b""]

    print("Downloading file->component mapping...")

    global path_to_component
    r = requests.get(
        "https://index.taskcluster.net/v1/task/gecko.v2.mozilla-central.latest.source.source-bugzilla-info/artifacts/public/components.json"
    )
    r.raise_for_status()
    path_to_component = r.json()
    path_to_component = {
        path: "::".join(component) for path, component in path_to_component.items()
    }

    calculate_experiences(commits)

    # Exclude commits outside the range we care about.
    commits = [commit for commit in commits if commit.pushdate > date_from]

    commits_num = len(commits)

    print(f"Mining {commits_num} commits using {processes} processes...")

    global rs_parsepatch
    import rs_parsepatch

    with concurrent.futures.ProcessPoolExecutor(
        initializer=_init, initargs=(repo_dir,)
    ) as executor:
        commits = executor.map(_transform, commits, chunksize=64)
        commits = tqdm(commits, total=commits_num)
        db.write(COMMITS_DB, commits)
def download_commits(repo_dir, date_from):
    hg = hglib.open(repo_dir)

    revs = get_revs(hg, date_from)

    commits_num = len(revs)

    assert (
        commits_num > 0
    ), "There should definitely be more than 0 commits, something is wrong"

    hg.close()

    processes = multiprocessing.cpu_count()

    print(f"Mining {commits_num} commits using {processes} processes...")

    CHUNK_SIZE = 256
    revs_groups = [revs[i:(i + CHUNK_SIZE)] for i in range(0, len(revs), CHUNK_SIZE)]

    with concurrent.futures.ProcessPoolExecutor(
        initializer=_init, initargs=(repo_dir,)
    ) as executor:
        commits = executor.map(_hg_log, revs_groups, chunksize=20)
        commits = tqdm(commits, total=len(revs_groups))
        commits = list(itertools.chain.from_iterable(commits))

    # Don't analyze backouts.
    backouts = set(commit.backedoutby for commit in commits if commit.backedoutby != b"")
    commits = [commit for commit in commits if commit.node not in backouts]

    # Don't analyze commits that are not linked to a bug.
    commits = [commit for commit in commits if commit.bug != b""]

    # Skip commits which are in .hg-annotate-ignore-revs (mostly consisting of very
    # large and not meaningful formatting changes).
    with open(os.path.join(repo_dir, ".hg-annotate-ignore-revs"), "r") as f:
        ignore_revs = set(l[:40].encode("utf-8") for l in f)

    commits = [commit for commit in commits if commit.node not in ignore_revs]

    commits_num = len(commits)

    print(f"Analyzing {commits_num} patches...")

    # Total previous number of commits by the author.
    total_commits_by_author = defaultdict(int)
    # Previous commits by the author, in a 90 days window.
    commits_by_author = defaultdict(list)

    global author_experience
    global author_experience_90_days
    for commit in commits:
        author_experience[commit.node] = total_commits_by_author[commit.author]
        # We don't want to consider backed out commits when calculating author/reviewer experience.
        if not commit.backedoutby:
            total_commits_by_author[commit.author] += 1

        # Keep only the previous commits from a window of 90 days in the commits_by_author map.
        cut = None

        for i, prev_commit in enumerate(commits_by_author[commit.author]):
            if (commit.date - prev_commit.date).days <= 90:
                break

            cut = i

        if cut is not None:
            commits_by_author[commit.author] = commits_by_author[commit.author][cut + 1:]

        author_experience_90_days[commit.node] = len(commits_by_author[commit.author])

        if not commit.backedoutby:
            commits_by_author[commit.author].append(commit)

    global path_to_component
    r = requests.get(
        "https://index.taskcluster.net/v1/task/gecko.v2.mozilla-central.latest.source.source-bugzilla-info/artifacts/public/components.json"
    )
    r.raise_for_status()
    path_to_component = r.json()
    path_to_component = {
        path: "::".join(component) for path, component in path_to_component.items()
    }

    global components_touched_prev
    global components_touched_prev_90_days

    global files_touched_prev
    global files_touched_prev_90_days

    components_touched = defaultdict(int)
    files_touched = defaultdict(int)
    prev_commits_90_days = []
    for commit in commits:
        components = set(
            path_to_component[path]
            for path in commit.files
            if path in path_to_component
        )

        for component in components:
            components_touched_prev[commit.node] += components_touched[component]

            components_touched[component] += 1

        for path in commit.files:
            files_touched_prev[commit.node] += files_touched[path]

            files_touched[path] += 1

        if len(commit.file_copies) > 0:
            for orig, copied in commit.file_copies.items():
                if orig in path_to_component and copied in path_to_component:
                    components_touched[path_to_component[copied]] = components_touched[path_to_component[orig]]

                files_touched[copied] = files_touched[orig]

        # Keep only the previous commits from a window of 90 days in prev_commits_90_days.
        cut = None

        for i, prev_commit in enumerate(prev_commits_90_days):
            if (commit.date - prev_commit.date).days <= 90:
                break

            cut = i

        if cut is not None:
            prev_commits_90_days = prev_commits_90_days[cut + 1:]

        components_touched_90_days = defaultdict(int)
        files_touched_90_days = defaultdict(int)
        for prev_commit in prev_commits_90_days:
            components_prev = set(
                path_to_component[path]
                for path in prev_commit.files
                if path in path_to_component
            )

            for component_prev in components_prev:
                components_touched_90_days[component_prev] += 1

            for path_prev in prev_commit.files:
                files_touched_90_days[path_prev] += 1

            if len(prev_commit.file_copies) > 0:
                for orig, copied in prev_commit.file_copies.items():
                    if orig in path_to_component and copied in path_to_component:
                        components_touched_90_days[path_to_component[copied]] = components_touched_90_days[path_to_component[orig]]

                    files_touched_90_days[copied] = files_touched_90_days[orig]

        components_touched_prev_90_days[commit.node] = sum(
            components_touched_90_days[component] for component in components
        )
        files_touched_prev_90_days[commit.node] = sum(
            files_touched_90_days[path] for path in commit.files
        )

        prev_commits_90_days.append(commit)

    print(f"Mining commits using {multiprocessing.cpu_count()} processes...")

    with concurrent.futures.ProcessPoolExecutor(
        initializer=_init, initargs=(repo_dir,)
    ) as executor:
        commits = executor.map(_transform, commits, chunksize=64)
        commits = tqdm(commits, total=commits_num)
        db.write(COMMITS_DB, commits)
def generate_push_data(
    self, granularity: str, training_months: int, reretrieve: int
) -> None:
    # We'll use the past training_months months only for training the model,
    # but we use half training_months months more than that to calculate the
    # failure statistics.
    from_months = training_months + math.floor(training_months / 2)

    # We use the actual date instead of 'today-X' aliases to avoid mozci caching
    # this query.
    from_date = datetime.utcnow() - relativedelta(months=from_months)
    to_date = datetime.utcnow() - relativedelta(days=3)

    pushes = mozci.push.make_push_objects(
        from_date=from_date.strftime("%Y-%m-%d"),
        to_date=to_date.strftime("%Y-%m-%d"),
        branch="autoland",
    )

    if granularity == "label":
        push_data_db = test_scheduling.PUSH_DATA_LABEL_DB
    elif granularity == "group":
        push_data_db = test_scheduling.PUSH_DATA_GROUP_DB
    elif granularity == "config_group":
        push_data_db = test_scheduling.PUSH_DATA_CONFIG_GROUP_DB

    def cache_key(push: mozci.push.Push) -> str:
        return f"push_data.{granularity}.{push.rev}"

    def generate(
        futures: List[concurrent.futures.Future],
    ) -> Generator[PushResult, None, None]:
        nonlocal reretrieve
        num_cached = 0
        num_pushes = len(pushes)

        for _ in tqdm(range(num_pushes)):
            push = pushes.pop(0)
            cached = futures.pop(0).result()

            semaphore.release()

            # Regenerating a large amount of data when we update the mozci regression detection
            # algorithm is currently pretty slow, so we only regenerate a subset of pushes whenever we
            # run.
            if cached:
                value, mozci_version = cached

                # Regenerate results which were generated with an older version of mozci.
                if reretrieve > 0 and mozci_version != MOZCI_VERSION:
                    cached = None
                    reretrieve -= 1
                # Regenerate results which don't contain the fix revision.
                elif len(value) != 5:
                    cached = None

            if cached:
                num_cached += 1
                value, mozci_version = cached
                assert len(value) == 5
                yield value
            else:
                logger.info(f"Analyzing {push.rev} at the {granularity} level...")

                key = cache_key(push)

                try:
                    if granularity == "label":
                        runnables = push.task_labels
                    elif granularity == "group":
                        runnables = push.group_summaries.keys()
                    elif granularity == "config_group":
                        runnables = push.config_group_summaries.keys()

                    value = (
                        tuple(push.revs),
                        push.backedoutby or push.bustage_fixed_by,
                        tuple(runnables),
                        tuple(push.get_possible_regressions(granularity)),
                        tuple(push.get_likely_regressions(granularity)),
                    )
                    mozci.config.cache.put(
                        key,
                        (value, MOZCI_VERSION),
                        mozci.config["cache"]["retention"],
                    )
                    assert len(value) == 5
                    yield value
                except mozci.errors.MissingDataError:
                    logger.warning(
                        f"Tasks for push {push.rev} can't be found on ActiveData"
                    )
                except Exception:
                    traceback.print_exc()

        logger.info(f"{num_cached} pushes were already cached out of {num_pushes}")

    semaphore = threading.BoundedSemaphore(256)

    def retrieve_from_cache(push):
        semaphore.acquire()
        return mozci.config.cache.get(cache_key(push))

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(retrieve_from_cache, push) for push in pushes]

        try:
            db.write(push_data_db, generate(futures))
        except Exception:
            for f in futures:
                f.cancel()

                try:
                    semaphore.release()
                except ValueError:
                    continue

            raise

    zstd_compress(push_data_db)
def generate_push_data(
    self, pushes: List[mozci.push.Push], granularity: str
) -> None:
    # We keep in the cache the fact that we failed to analyze a push for 10
    # days, so if we re-run often we don't retry the same pushes many times.
    MISSING_CACHE_RETENTION = 10 * 24 * 60

    from_date = get_from_date(granularity)

    pushes = [
        push for push in pushes if datetime.utcfromtimestamp(push.date) >= from_date
    ]

    if granularity == "label":
        push_data_db = test_scheduling.PUSH_DATA_LABEL_DB
    elif granularity == "group":
        push_data_db = test_scheduling.PUSH_DATA_GROUP_DB
    elif granularity == "config_group":
        push_data_db = test_scheduling.PUSH_DATA_CONFIG_GROUP_DB

    def cache_key(push: mozci.push.Push) -> str:
        return f"push_data.{granularity}.{push.rev}"

    def generate(executor) -> Generator[PushResult, None, None]:
        num_cached = 0
        num_pushes = len(pushes)

        # Regenerating a large amount of data when we update the mozci regression detection
        # algorithm is currently pretty slow, so we only regenerate 1000 pushes whenever we
        # run.
        to_regenerate = 1000

        semaphore = threading.BoundedSemaphore(256)

        def retrieve_from_cache(push):
            semaphore.acquire()
            return adr.config.cache.get(cache_key(push))

        futures = tuple(executor.submit(retrieve_from_cache, push) for push in pushes)

        for push, future in zip(tqdm(pushes), futures):
            exc = future.exception()
            if exc is not None:
                logger.info(f"Exception {exc} while getting {push.rev}")
                for f in futures:
                    f.cancel()

            cached = future.result()

            semaphore.release()

            if cached and to_regenerate > 0:
                value, mozci_version = cached

                # Regenerate results which were generated when we were not cleaning
                # up WPT groups.
                if any(runnable.startswith("/") for runnable in value[1]):
                    cached = None
                    to_regenerate -= 1

                """# Regenerate results which were generated with an older version of mozci.
                elif mozci_version != MOZCI_VERSION and to_regenerate > 0:
                    cached = None
                    to_regenerate -= 1"""

            if cached is not None:
                num_cached += 1

                if cached:
                    value, mozci_version = cached
                    yield value
            else:
                logger.info(f"Analyzing {push.rev} at the {granularity} level...")

                key = cache_key(push)

                try:
                    if granularity == "label":
                        runnables = push.task_labels
                    elif granularity == "group":
                        runnables = push.group_summaries.keys()
                    elif granularity == "config_group":
                        runnables = push.config_group_summaries.keys()

                    value = (
                        push.revs,
                        tuple(runnables),
                        tuple(push.get_possible_regressions(granularity)),
                        tuple(push.get_likely_regressions(granularity)),
                    )
                    adr.config.cache.put(
                        key,
                        (value, MOZCI_VERSION),
                        adr.config["cache"]["retention"],
                    )
                    yield value
                except adr.errors.MissingDataError:
                    logger.warning(
                        f"Tasks for push {push.rev} can't be found on ActiveData"
                    )
                    adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)
                except Exception:
                    traceback.print_exc()
                    adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)

        logger.info(f"{num_cached} pushes were already cached out of {num_pushes}")

    with concurrent.futures.ThreadPoolExecutor() as executor:
        db.write(push_data_db, generate(executor))

    zstd_compress(push_data_db)
def generate_push_data(self, granularity: str) -> None:
    # We'll use the past TRAINING_MONTHS months only for training the model,
    # but we use half TRAINING_MONTHS months more than that to calculate the
    # failure statistics.
    from_months = TRAINING_MONTHS[granularity] + math.floor(
        TRAINING_MONTHS[granularity] / 2
    )

    # We use the actual date instead of 'today-X' aliases to avoid adr caching
    # this query.
    from_date = datetime.utcnow() - relativedelta(months=from_months)
    to_date = datetime.utcnow() - relativedelta(days=3)

    pushes = mozci.push.make_push_objects(
        from_date=from_date.strftime("%Y-%m-%d"),
        to_date=to_date.strftime("%Y-%m-%d"),
        branch="autoland",
    )

    if granularity == "label":
        push_data_db = test_scheduling.PUSH_DATA_LABEL_DB
    elif granularity == "group":
        push_data_db = test_scheduling.PUSH_DATA_GROUP_DB
    elif granularity == "config_group":
        push_data_db = test_scheduling.PUSH_DATA_CONFIG_GROUP_DB

    def cache_key(push: mozci.push.Push) -> str:
        return f"push_data.{granularity}.{push.rev}"

    def generate(
        futures: List[concurrent.futures.Future],
    ) -> Generator[PushResult, None, None]:
        num_cached = 0
        num_pushes = len(pushes)

        # Regenerating a large amount of data when we update the mozci regression detection
        # algorithm is currently pretty slow, so we only regenerate a subset of pushes whenever we
        # run.
        to_regenerate = int(os.environ.get("OLD_RESULTS_TO_REGENERATE", 0))

        for _ in tqdm(range(num_pushes)):
            push = pushes.pop(0)
            cached = futures.pop(0).result()

            semaphore.release()

            if cached and to_regenerate > 0:
                value, mozci_version = cached

                # Regenerate results which were generated when we were not cleaning
                # up WPT groups.
                if granularity == "group" and any(
                    runnable.startswith("/") for runnable in value[1]
                ):
                    cached = None
                    to_regenerate -= 1
                # Regenerate results which were generated when we didn't get a correct
                # configuration for test-verify tasks.
                elif granularity == "config_group" and any(
                    "test-verify" in runnable[0] for runnable in value[1]
                ):
                    cached = None
                    to_regenerate -= 1
                # Regenerate results which were generated with an older version of mozci.
                elif mozci_version != MOZCI_VERSION:
                    cached = None
                    to_regenerate -= 1

            if cached:
                num_cached += 1
                value, mozci_version = cached
                yield value
            else:
                logger.info(f"Analyzing {push.rev} at the {granularity} level...")

                key = cache_key(push)

                try:
                    if granularity == "label":
                        runnables = push.task_labels
                    elif granularity == "group":
                        runnables = push.group_summaries.keys()
                    elif granularity == "config_group":
                        runnables = push.config_group_summaries.keys()

                    value = (
                        push.revs,
                        tuple(runnables),
                        tuple(push.get_possible_regressions(granularity)),
                        tuple(push.get_likely_regressions(granularity)),
                    )
                    adr.config.cache.put(
                        key,
                        (value, MOZCI_VERSION),
                        adr.config["cache"]["retention"],
                    )
                    yield value
                except adr.errors.MissingDataError:
                    logger.warning(
                        f"Tasks for push {push.rev} can't be found on ActiveData"
                    )
                except Exception:
                    traceback.print_exc()

        logger.info(f"{num_cached} pushes were already cached out of {num_pushes}")

    semaphore = threading.BoundedSemaphore(256)

    def retrieve_from_cache(push):
        semaphore.acquire()
        return adr.config.cache.get(cache_key(push))

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(retrieve_from_cache, push) for push in pushes]

        try:
            db.write(push_data_db, generate(futures))
        except Exception:
            for f in futures:
                f.cancel()

                try:
                    semaphore.release()
                except ValueError:
                    continue

            raise

    zstd_compress(push_data_db)
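# The cache retrieval above submits one lookup per push to a thread pool and uses a bounded
# semaphore so that only a limited number of fetched-but-unconsumed results sit in memory
# while the consumer processes futures in submission order. A simplified sketch of that
# pattern follows, with a made-up fetch function. Like the snippet (256 permits vs. the
# executor's default worker count), it assumes the permit count exceeds the worker count,
# which guarantees the earliest unconsumed future always makes progress.
import concurrent.futures
import threading


def bounded_prefetch(items, fetch, max_in_flight=4, max_workers=2):
    semaphore = threading.BoundedSemaphore(max_in_flight)

    def task(item):
        semaphore.acquire()  # blocks once max_in_flight results are pending consumption
        return fetch(item)

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(task, item) for item in items]
        for future in futures:
            result = future.result()
            semaphore.release()  # the consumer took one result, allow another fetch
            yield result


# Example usage with a trivial "fetch".
print(list(bounded_prefetch(range(10), lambda x: x * x)))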
def retrieve_test_scheduling_history(self):
    os.makedirs("data", exist_ok=True)

    # Download previous cache.
    cache_path = os.path.abspath("data/adr_cache")
    if not os.path.exists(cache_path):
        try:
            download_check_etag(URL, "adr_cache.tar.xz")
            with tarfile.open("adr_cache.tar.xz", "r:xz") as tar:
                tar.extractall()
            assert os.path.exists("data/adr_cache"), "Decompressed adr cache exists"
        except requests.exceptions.HTTPError:
            logger.info("The adr cache is not available yet")

    # Setup adr cache configuration.
    os.makedirs(os.path.expanduser("~/.config/adr"), exist_ok=True)
    with open(os.path.expanduser("~/.config/adr/config.toml"), "w") as f:
        f.write(
            f"""[adr.cache.stores]
file = {{ driver = "file", path = "{cache_path}" }}
"""
        )

    # Get the commits DB.
    if db.is_old_version(repository.COMMITS_DB) or not db.exists(repository.COMMITS_DB):
        db.download(repository.COMMITS_DB, force=True)

    # We'll use the past TRAINING_MONTHS months only for training the model,
    # but we use 3 months more than that to calculate the failure statistics.
    subprocess.run(
        [
            "run-adr",
            "ahal/ci-recipes",
            "recipe",
            "-o",
            os.path.abspath("push_data.json"),
            "-f",
            "json",
            "push_data",
            "--",
            "--from",
            f"today-{TRAINING_MONTHS + 3}month",
            "--to",
            "today-2day",
            "--branch",
            "autoland",
        ],
        check=True,
        stdout=subprocess.DEVNULL,  # Redirect to /dev/null, as the logs are too big otherwise.
    )

    HISTORY_DATE_START = datetime.now() - relativedelta(months=TRAINING_MONTHS)

    with open("push_data.json", "r") as f:
        data = json.load(f)

    push_data = {}
    for row in data[1:]:
        # Revision -> (all tasks, possible regressions, likely regressions)
        push_data[row[0]] = (row[1], row[2], row[3])

    HISTORICAL_TIMESPAN = 56

    past_failures = {}

    def get_past_failures(task, push_num):
        if task not in past_failures:
            past_failures[task] = repository.exp_queue(
                push_num, HISTORICAL_TIMESPAN + 1, 0
            )

        return past_failures[task][push_num]

    def generate_data():
        commits_with_data = set()
        saved_nodes = set()

        push_num = 0
        for commit_data in tqdm(repository.get_commits()):
            node = commit_data["node"]

            if node not in push_data:
                continue

            commits_with_data.add(node)

            commit_push_data = push_data[node]

            for task in commit_push_data[0]:
                if not any(task.startswith(j) for j in JOBS_TO_CONSIDER):
                    continue

                total_failures = get_past_failures(task, push_num)
                past_7_pushes_failures = total_failures - get_past_failures(
                    task, push_num - 7
                )
                past_14_pushes_failures = total_failures - get_past_failures(
                    task, push_num - 14
                )
                past_28_pushes_failures = total_failures - get_past_failures(
                    task, push_num - 28
                )
                past_56_pushes_failures = total_failures - get_past_failures(
                    task, push_num - 56
                )

                pushdate = dateutil.parser.parse(commit_data["pushdate"])
                if pushdate > HISTORY_DATE_START:
                    saved_nodes.add(node)

                    yield {
                        "rev": node,
                        "name": task,
                        "failures": total_failures,
                        "failures_past_7_pushes": past_7_pushes_failures,
                        "failures_past_14_pushes": past_14_pushes_failures,
                        "failures_past_28_pushes": past_28_pushes_failures,
                        "failures_past_56_pushes": past_56_pushes_failures,
                        "is_possible_regression": task in commit_push_data[1],
                        "is_likely_regression": task in commit_push_data[2],
                    }

                if task in commit_push_data[1] or task in commit_push_data[2]:
                    past_failures[task][push_num] = total_failures + 1

            push_num += 1

        logger.info(f"push data nodes: {len(push_data)}")
        logger.info(f"commits linked to push data: {len(commits_with_data)}")
        logger.info(f"saved push data nodes: {len(saved_nodes)}")

    db.write(test_scheduling.TEST_SCHEDULING_DB, generate_data())
    zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

    with tarfile.open("data/adr_cache.tar.xz", "w:xz") as tar:
        tar.add("data/adr_cache")
def download_commits(repo_dir, date_from):
    hg = hglib.open(repo_dir)

    revs = get_revs(hg)

    assert (
        len(revs) > 0
    ), "There should definitely be more than 0 commits, something is wrong"

    hg.close()

    processes = multiprocessing.cpu_count()

    print(f"Mining {len(revs)} commits using {processes} processes...")

    CHUNK_SIZE = 256
    revs_groups = [revs[i : (i + CHUNK_SIZE)] for i in range(0, len(revs), CHUNK_SIZE)]

    with concurrent.futures.ProcessPoolExecutor(
        initializer=_init, initargs=(repo_dir,)
    ) as executor:
        commits = executor.map(_hg_log, revs_groups, chunksize=20)
        commits = tqdm(commits, total=len(revs_groups))
        commits = list(itertools.chain.from_iterable(commits))

    print("Downloading file->component mapping...")

    global path_to_component
    r = requests.get(
        "https://index.taskcluster.net/v1/task/gecko.v2.mozilla-central.latest.source.source-bugzilla-info/artifacts/public/components.json"
    )
    r.raise_for_status()
    path_to_component = r.json()
    path_to_component = {
        path: "::".join(component) for path, component in path_to_component.items()
    }

    commits_to_ignore = get_commits_to_ignore(repo_dir, commits)
    print(f"{len(commits_to_ignore)} commits to ignore")

    calculate_experiences(commits, commits_to_ignore)

    # Exclude commits to ignore.
    commits = [commit for commit in commits if commit not in commits_to_ignore]

    # Exclude commits outside the range we care about.
    commits = [commit for commit in commits if commit.pushdate > date_from]

    commits_num = len(commits)

    print(f"Mining {commits_num} commits using {processes} processes...")

    global rs_parsepatch
    import rs_parsepatch

    with concurrent.futures.ProcessPoolExecutor(
        initializer=_init, initargs=(repo_dir,)
    ) as executor:
        commits = executor.map(_transform, commits, chunksize=64)
        commits = tqdm(commits, total=commits_num)
        db.write(COMMITS_DB, commits)
def generate_push_data(self, granularity: str) -> None:
    # We keep in the cache the fact that we failed to analyze a push for 10
    # days, so if we re-run often we don't retry the same pushes many times.
    MISSING_CACHE_RETENTION = 10 * 24 * 60

    # We'll use the past TRAINING_MONTHS months only for training the model,
    # but we use half TRAINING_MONTHS months more than that to calculate the
    # failure statistics.
    from_months = TRAINING_MONTHS[granularity] + math.floor(
        TRAINING_MONTHS[granularity] / 2
    )

    # We use the actual date instead of 'today-X' aliases to avoid adr caching
    # this query.
    from_date = datetime.utcnow() - relativedelta(months=from_months)
    to_date = datetime.utcnow() - relativedelta(days=3)

    pushes = mozci.push.make_push_objects(
        from_date=from_date.strftime("%Y-%m-%d"),
        to_date=to_date.strftime("%Y-%m-%d"),
        branch="autoland",
    )

    if granularity == "label":
        push_data_db = test_scheduling.PUSH_DATA_LABEL_DB
    elif granularity == "group":
        push_data_db = test_scheduling.PUSH_DATA_GROUP_DB

    cache: Dict[mozci.push.Push, Tuple[PushResult, int]] = {}

    def cache_key(push: mozci.push.Push) -> str:
        return f"push_data.{granularity}.{push.rev}"

    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_push = {
            executor.submit(
                lambda push: adr.config.cache.get(cache_key(push)), push
            ): push
            for push in pushes
        }

        for future in tqdm(
            concurrent.futures.as_completed(future_to_push),
            total=len(future_to_push),
        ):
            push = future_to_push[future]

            exc = future.exception()
            if exc is not None:
                logger.info(f"Exception {exc} while getting {push.rev}")
                for f in future_to_push.keys():
                    f.cancel()

            cache[push] = future.result()

    # Regenerating a large amount of data when we update the mozci regression detection
    # algorithm is currently pretty slow, so we only regenerate 1000 pushes whenever we
    # run.
    """to_regenerate = 0
    for push in pushes[::-1]:
        cached = cache[push]
        if not cached:
            continue

        value, mozci_version = cached
        if mozci_version != MOZCI_VERSION and to_regenerate < 1000:
            cache[push] = None
            to_regenerate += 1"""

    to_regenerate = 0
    for push in pushes[::-1]:
        cached = cache[push]
        if not cached:
            continue

        if to_regenerate < 1000:
            del cache[push]
            adr.config.cache.put(push.push_uuid, {}, 0)
            to_regenerate += 1

    def generate() -> Generator[PushResult, None, None]:
        num_cached = 0

        for push in tqdm(pushes):
            key = cache_key(push)

            if push in cache and cache[push] is not None:
                num_cached += 1
                cached = cache[push]
                if cached:
                    value, mozci_version = cached
                    yield value
            else:
                logger.info(f"Analyzing {push.rev} at the {granularity} level...")

                try:
                    if granularity == "label":
                        runnables = push.task_labels
                    elif granularity == "group":
                        runnables = push.group_summaries.keys()

                    value = (
                        push.revs,
                        list(runnables),
                        list(push.get_possible_regressions(granularity)),
                        list(push.get_likely_regressions(granularity)),
                    )
                    adr.config.cache.put(
                        key,
                        (value, MOZCI_VERSION),
                        adr.config["cache"]["retention"],
                    )
                    yield value
                except adr.errors.MissingDataError:
                    logger.warning(
                        f"Tasks for push {push.rev} can't be found on ActiveData"
                    )
                    adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)
                except Exception:
                    traceback.print_exc()
                    adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)

        logger.info(f"{num_cached} pushes were already cached out of {len(pushes)}")

    db.write(push_data_db, generate())
    zstd_compress(push_data_db)
def generate_push_data(
    self, pushes: Tuple[mozci.push.Push, ...], granularity: str
) -> None:
    from_date = get_from_date(granularity)

    pushes = tuple(
        push for push in pushes if datetime.utcfromtimestamp(push.date) >= from_date
    )

    if granularity == "label":
        push_data_db = test_scheduling.PUSH_DATA_LABEL_DB
    elif granularity == "group":
        push_data_db = test_scheduling.PUSH_DATA_GROUP_DB
    elif granularity == "config_group":
        push_data_db = test_scheduling.PUSH_DATA_CONFIG_GROUP_DB

    def cache_key(push: mozci.push.Push) -> str:
        return f"push_data.{granularity}.{push.rev}"

    def generate(
        futures: List[concurrent.futures.Future],
    ) -> Generator[PushResult, None, None]:
        num_cached = 0
        num_pushes = len(pushes)

        # Regenerating a large amount of data when we update the mozci regression detection
        # algorithm is currently pretty slow, so we only regenerate 1000 pushes whenever we
        # run.
        to_regenerate = 1000

        for push in tqdm(pushes):
            cached = futures.pop(0).result()

            semaphore.release()

            if cached and to_regenerate > 0:
                value, mozci_version = cached

                # Regenerate results which were generated when we were not cleaning
                # up WPT groups.
                if granularity == "group" and any(
                    runnable.startswith("/") for runnable in value[1]
                ):
                    cached = None
                    to_regenerate -= 1
                # Regenerate results which were generated when we didn't get a correct
                # configuration for test-verify tasks.
                elif granularity == "config_group" and any(
                    "test-verify" in runnable[0] for runnable in value[1]
                ):
                    cached = None
                    to_regenerate -= 1

                """# Regenerate results which were generated with an older version of mozci.
                elif mozci_version != MOZCI_VERSION:
                    cached = None
                    to_regenerate -= 1"""

            if cached:
                num_cached += 1
                value, mozci_version = cached
                yield value
            else:
                logger.info(f"Analyzing {push.rev} at the {granularity} level...")

                key = cache_key(push)

                try:
                    if granularity == "label":
                        runnables = push.task_labels
                    elif granularity == "group":
                        runnables = push.group_summaries.keys()
                    elif granularity == "config_group":
                        runnables = push.config_group_summaries.keys()

                    value = (
                        push.revs,
                        tuple(runnables),
                        tuple(push.get_possible_regressions(granularity)),
                        tuple(push.get_likely_regressions(granularity)),
                    )
                    adr.config.cache.put(
                        key,
                        (value, MOZCI_VERSION),
                        adr.config["cache"]["retention"],
                    )
                    yield value
                except adr.errors.MissingDataError:
                    logger.warning(
                        f"Tasks for push {push.rev} can't be found on ActiveData"
                    )
                except Exception:
                    traceback.print_exc()

        logger.info(f"{num_cached} pushes were already cached out of {num_pushes}")

    semaphore = threading.BoundedSemaphore(256)

    def retrieve_from_cache(push):
        semaphore.acquire()
        return adr.config.cache.get(cache_key(push))

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(retrieve_from_cache, push) for push in pushes]

        try:
            db.write(push_data_db, generate(futures))
        except Exception:
            for f in futures:
                f.cancel()

                try:
                    semaphore.release()
                except ValueError:
                    continue

            raise

    zstd_compress(push_data_db)