Example #1
def download_commits(repo_dir):
    hg = hglib.open(repo_dir)

    commits = hg.log()

    bug_pattern = re.compile('[\t ]*[Bb][Uu][Gg][\t ]*([0-9]+)')

    def transform(commit):
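        # hglib returns each revision as a tuple: (rev, node, tags, branch, author, desc, date).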
        desc = commit[5].decode('utf-8')

        bug_id = None
        bug_id_match = re.search(bug_pattern, desc)
        if bug_id_match:
            bug_id = int(bug_id_match.group(1))

        return {
            # 'rev': commit[0].decode('utf-8'),
            # 'node': commit[1].decode('utf-8'),
            # 'tags': commit[2].decode('utf-8'),
            # 'branch': commit[3].decode('utf-8'),
            # 'author': commit[4].decode('utf-8'),
            'desc': desc,
            # 'date': str(commit[6]),
            'bug_id': bug_id,
        }

    commits = [transform(commit) for commit in reversed(commits)]

    db.write(COMMITS_DB, commits)
Example #2
    def get_commits_to_ignore(self):
        logger.info("Download previous commits to ignore...")
        db.download(IGNORED_COMMITS_DB)

        logger.info("Get previously classified commits...")
        prev_commits_to_ignore = list(db.read(IGNORED_COMMITS_DB))
        logger.info(
            f"Already found {len(prev_commits_to_ignore)} commits to ignore..."
        )

        # When we already have some analyzed commits, re-analyze the last 3500 to make sure
        # we didn't miss back-outs that happened since the last analysis.
        if len(prev_commits_to_ignore) > 0:
            first_commit_to_reanalyze = (
                -3500 if len(prev_commits_to_ignore) >= 3500 else 0)
            rev_start = "children({})".format(
                prev_commits_to_ignore[first_commit_to_reanalyze]["rev"])
        else:
            rev_start = 0

        with hglib.open(self.mercurial_repo_dir) as hg:
            revs = repository.get_revs(hg, rev_start)

        commits = repository.hg_log_multi(self.mercurial_repo_dir, revs)

        with hglib.open(self.mercurial_repo_dir) as hg:
            repository.set_commits_to_ignore(hg, self.mercurial_repo_dir,
                                             commits)

        for commit in commits:
            commit.ignored |= commit.author_email == "*****@*****.**"

        chosen_commits = set()
        commits_to_ignore = []
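        # chosen_commits tracks revisions already recorded, so the previously classified
        # commits merged in below are not duplicated.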
        for commit in commits:
            if commit.ignored or commit.backedoutby:
                commits_to_ignore.append({
                    "rev": commit.node,
                    "type": "backedout" if commit.backedoutby else "",
                })
                chosen_commits.add(commit.node)

        logger.info(f"{len(commits_to_ignore)} new commits to ignore...")

        for prev_commit in prev_commits_to_ignore[::-1]:
            if prev_commit["rev"] not in chosen_commits:
                commits_to_ignore.append(prev_commit)
                chosen_commits.add(prev_commit["rev"])

        logger.info(f"{len(commits_to_ignore)} commits to ignore...")

        logger.info("...of which {} are backed-out".format(
            sum(1 for commit in commits_to_ignore
                if commit["type"] == "backedout")))

        db.write(IGNORED_COMMITS_DB, commits_to_ignore)
        zstd_compress(IGNORED_COMMITS_DB)
        db.upload(IGNORED_COMMITS_DB)
Example #3
def test_write_read(tmp_path):
    db_path = tmp_path / 'prova.json'

    db.register(db_path, 'https://alink', 1)

    db.write(db_path, range(1, 8))

    assert list(db.read(db_path)) == [1, 2, 3, 4, 5, 6, 7]
Example #4
    def get_commits_to_ignore(self) -> None:
        assert db.download(repository.COMMITS_DB)

        ignored = set()
        commits_to_ignore = []
        all_commits = set()
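        # Commits manually labeled with "annotateignore" are ignored as well.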

        annotate_ignore_nodes = {
            node
            for node, label in labels.get_labels("annotateignore")
            if label == "1"
        }

        for commit in repository.get_commits(include_no_bug=True,
                                             include_backouts=True,
                                             include_ignored=True):
            all_commits.add(commit["node"][:12])

            if (commit["ignored"] or commit["backedoutby"]
                    or not commit["bug_id"] or len(commit["backsout"]) > 0
                    or repository.is_wptsync(commit)
                    or commit["node"] in annotate_ignore_nodes):
                commits_to_ignore.append({
                    "rev": commit["node"],
                    "type": "backedout" if commit["backedoutby"] else "",
                })
                ignored.add(commit["node"][:12])

            if len(commit["backsout"]) > 0:
                for backedout in commit["backsout"]:
                    if backedout[:12] in ignored:
                        continue
                    ignored.add(backedout[:12])

                    commits_to_ignore.append({
                        "rev": backedout,
                        "type": "backedout"
                    })

        logger.info(f"{len(commits_to_ignore)} commits to ignore...")

        # Skip backed-out commits which aren't in the repository (commits which landed *before* the Mercurial history
        # started, and backouts which mentioned a bad hash in their message).
        commits_to_ignore = [
            c for c in commits_to_ignore if c["rev"][:12] in all_commits
        ]

        logger.info(f"{len(commits_to_ignore)} commits to ignore...")

        logger.info("...of which {} are backed-out".format(
            sum(1 for commit in commits_to_ignore
                if commit["type"] == "backedout")))

        db.write(IGNORED_COMMITS_DB, commits_to_ignore)
        zstd_compress(IGNORED_COMMITS_DB)
        db.upload(IGNORED_COMMITS_DB)
Example #5
def download_commits(repo_dir, date_from):
    hg = hglib.open(repo_dir)

    first_rev = get_rev(hg, date_from)

    commits = hg_log(hg, first_rev)
    commits_num = len(commits)

    hg.close()

    # Total previous number of commits by the author.
    total_commits_by_author = defaultdict(int)
    # Previous commits by the author, in a 90 days window.
    commits_by_author = defaultdict(list)

    global author_experience
    global author_experience_90_days
    for commit in commits:
        author_experience[commit] = total_commits_by_author[commit.author]
        total_commits_by_author[commit.author] += 1

        # Keep only the previous commits from a window of 90 days in the commits_by_author map.
        cut = None
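        # "cut" ends up as the index of the last previous commit older than 90 days; the
        # slice below drops it and everything before it.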

        for i, prev_commit in enumerate(commits_by_author[commit.author]):
            if (commit.date - prev_commit.date).days <= 90:
                break

            cut = i

        if cut is not None:
            commits_by_author[commit.author] = commits_by_author[
                commit.author][cut + 1:]

        author_experience_90_days[commit] = len(
            commits_by_author[commit.author])

        commits_by_author[commit.author].append(commit)

    subprocess.run([
        os.path.join(repo_dir, 'mach'), 'file-info', 'bugzilla-automation',
        'component_data'
    ],
                   cwd=repo_dir,
                   check=True)

    global COMPONENTS
    with open(os.path.join(repo_dir, 'component_data',
                           'components.json')) as cf:
        COMPONENTS = json.load(cf)

    print(f'Mining commits using {multiprocessing.cpu_count()} processes...')

    with concurrent.futures.ProcessPoolExecutor(
            initializer=_init, initargs=(repo_dir, )) as executor:
        commits = executor.map(_transform, commits, chunksize=64)
        commits = tqdm(commits, total=commits_num)
        db.write(COMMITS_DB, commits)
Example #6
    def compress_and_upload() -> None:
        db.write(
            SHADOW_SCHEDULER_STATS_DB,
            (scheduler_stats[push.rev]
             for push in pushes if push.rev in scheduler_stats),
        )

        utils.zstd_compress(SHADOW_SCHEDULER_STATS_DB)
        db.upload(SHADOW_SCHEDULER_STATS_DB)
Example #7
def test_bad_format_compression(tmp_path, db_name):
    db_path = tmp_path / db_name
    db.register(db_path, "https://alink")
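    # Writing or appending to a DB with an unsupported format/compression should raise an AssertionError.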

    with pytest.raises(AssertionError):
        db.write(db_path, range(7))

    with pytest.raises(AssertionError):
        db.append(db_path, range(7))
Example #8
    def retrieve_test_info(self, days: int) -> Dict[str, Any]:
        logger.info("Download previous test info...")
        db.download(TEST_INFOS_DB)

        dates = [
            datetime.utcnow() - timedelta(days=day)
            for day in reversed(range(days))
        ]

        logger.info("Get previously gathered test info...")
        test_infos = {
            test_info["date"]: test_info
            for test_info in db.read(TEST_INFOS_DB)
        }

        prev_skips = None
        for date in tqdm(dates):
            date_str = date.strftime("%Y-%m-%d")

            # Gather the latest three days again, as the data might have changed.
            if date_str in test_infos and date < datetime.utcnow() - timedelta(
                    days=3):
                prev_skips = test_infos[date_str]["skips"]
                continue

            test_infos[date_str] = {
                "date": date_str,
                "bugs": [{
                    "id": item["bug_id"],
                    "count": item["bug_count"]
                } for item in test_scheduling.get_failure_bugs(date, date)],
                "skips": {},
            }

            try:
                test_info = test_scheduling.get_test_info(date)

                for component in test_info["tests"].keys():
                    test_infos[date_str]["skips"][component] = sum(
                        1 for test in test_info["tests"][component]
                        if "skip-if" in test)
            except requests.exceptions.HTTPError:
                # If we couldn't find a test info artifact for the given date, assume the number of skip-ifs didn't change from the previous day.
                assert prev_skips is not None
                test_infos[date_str]["skips"] = prev_skips

            prev_skips = test_infos[date_str]["skips"]

        db.write(
            TEST_INFOS_DB,
            (test_infos[date.strftime("%Y-%m-%d")]
             for date in dates if date.strftime("%Y-%m-%d") in test_infos),
        )
        zstd_compress(TEST_INFOS_DB)

        return test_infos
Example #9
def download_commits(repo_dir):
    commits = hg_log(repo_dir)
    commits_num = len(commits)

    with concurrent.futures.ProcessPoolExecutor(
            initializer=_init, initargs=(repo_dir, )) as executor:
        commits = executor.map(_transform, commits, chunksize=256)
        commits = tqdm(commits, total=commits_num)
        db.write(COMMITS_DB, commits)
Example #10
def test_bad_format_compression(tmp_path, db_name):
    db_path = tmp_path / db_name
    db.register(db_path, "https://alink", 1)

    with pytest.raises(AssertionError):
        db.write(db_path, range(7))

    with pytest.raises(AssertionError):
        db.append(db_path, range(7))
Example #11
def download_commits(repo_dir, date_from):
    hg = hglib.open(repo_dir)

    first_rev = get_rev(hg, date_from)

    commits = hg_log(hg, first_rev)
    commits_num = len(commits)

    hg.close()

    # Total previous number of commits by the author.
    total_commits_by_author = defaultdict(int)
    # Previous commits by the author, in a 90 days window.
    commits_by_author = defaultdict(list)

    global author_experience
    global author_experience_90_days
    for commit in commits:
        author_experience[commit] = total_commits_by_author[commit.author]
        # We don't want to consider backed out commits when calculating author/reviewer experience.
        if not commit.ever_backedout:
            total_commits_by_author[commit.author] += 1

        # Keep only the previous commits from a window of 90 days in the commits_by_author map.
        cut = None

        for i, prev_commit in enumerate(commits_by_author[commit.author]):
            if (commit.date - prev_commit.date).days <= 90:
                break

            cut = i

        if cut is not None:
            commits_by_author[commit.author] = commits_by_author[
                commit.author][cut + 1:]

        author_experience_90_days[commit] = len(
            commits_by_author[commit.author])

        if not commit.ever_backedout:
            commits_by_author[commit.author].append(commit)

    global COMPONENTS
    r = requests.get(
        'https://index.taskcluster.net/v1/task/gecko.v2.mozilla-central.latest.source.source-bugzilla-info/artifacts/public/components.json'
    )
    r.raise_for_status()
    COMPONENTS = r.json()

    print(f'Mining commits using {multiprocessing.cpu_count()} processes...')

    with concurrent.futures.ProcessPoolExecutor(
            initializer=_init, initargs=(repo_dir, )) as executor:
        commits = executor.map(_transform, commits, chunksize=64)
        commits = tqdm(commits, total=commits_num)
        db.write(COMMITS_DB, commits)
Example #12
def test_append(mock_db, db_format, db_compression):
    db_path = mock_db(db_format, db_compression)

    db.write(db_path, range(1, 4))

    assert list(db.read(db_path)) == [1, 2, 3]

    db.append(db_path, range(4, 8))

    assert list(db.read(db_path)) == [1, 2, 3, 4, 5, 6, 7]
Example #13
def test_append(mock_db, db_format, db_compression):
    db_path = mock_db(db_format, db_compression)

    db.write(db_path, range(1, 4))

    assert list(db.read(db_path)) == [1, 2, 3]

    db.append(db_path, range(4, 8))

    assert list(db.read(db_path)) == [1, 2, 3, 4, 5, 6, 7]
Example #14
def test_exists_db(tmp_path):
    db_path = tmp_path / "prova.json"

    db.register(db_path, "https://alink", 1)

    assert not db.exists(db_path)

    db.write(db_path, range(7))

    assert db.exists(db_path)
Example #15
def test_delete(mock_db, db_format, db_compression):
    db_path = mock_db(db_format, db_compression)

    db.write(db_path, range(1, 9))

    assert list(db.read(db_path)) == [1, 2, 3, 4, 5, 6, 7, 8]
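    # db.delete drops every element for which the predicate returns True (here, the value 4).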

    db.delete(db_path, lambda x: x == 4)

    assert list(db.read(db_path)) == [1, 2, 3, 5, 6, 7, 8]
Example #16
def test_delete(mock_db, db_format, db_compression):
    db_path = mock_db(db_format, db_compression)

    db.write(db_path, range(1, 9))

    assert list(db.read(db_path)) == [1, 2, 3, 4, 5, 6, 7, 8]

    db.delete(db_path, lambda x: x == 4)

    assert list(db.read(db_path)) == [1, 2, 3, 5, 6, 7, 8]
Example #17
def update_commits() -> None:
    commits = list(
        get_commits(include_no_bug=True,
                    include_backouts=True,
                    include_ignored=True))

    # Add coverage information for previous commits too.
    # We need to do this because coverage information is sometimes slow to come in.
    set_commit_coverage(commits)

    db.write(COMMITS_DB, commits)
Example #18
def test_unregistered_db(tmp_path):
    db_path = tmp_path / "prova.json"

    with pytest.raises(AssertionError):
        list(db.read(db_path))

    with pytest.raises(AssertionError):
        db.write(db_path, range(7))

    with pytest.raises(AssertionError):
        db.append(db_path, range(7))
Example #19
def download_commits(repo_dir):
    commits = hg_log(repo_dir)
    commits_num = len(commits)

    print(f'Mining commits using {multiprocessing.cpu_count()} processes...')

    with concurrent.futures.ProcessPoolExecutor(
            initializer=_init, initargs=(repo_dir, )) as executor:
        commits = executor.map(_transform, commits, chunksize=64)
        commits = tqdm(commits, total=commits_num)
        db.write(COMMITS_DB, commits)
Example #20
def test_unregistered_db(tmp_path):
    db_path = tmp_path / "prova.json"

    with pytest.raises(AssertionError):
        list(db.read(db_path))

    with pytest.raises(AssertionError):
        db.write(db_path, range(7))

    with pytest.raises(AssertionError):
        db.append(db_path, range(7))
Example #21
def test_append_compressed(tmp_path):
    db_path = tmp_path / 'prova.json.gz'

    db.register(db_path, 'https://alink', 1)

    db.write(db_path, range(1, 4))

    assert list(db.read(db_path)) == [1, 2, 3]

    db.append(db_path, range(4, 8))

    assert list(db.read(db_path)) == [1, 2, 3, 4, 5, 6, 7]
Example #22
def download_commits(repo_dir):
    hg = hglib.open(repo_dir)

    commits = hg.log()

    hg.close()

    commits = (tuple(commit) for commit in commits)

    with concurrent.futures.ProcessPoolExecutor(
            initializer=_init, initargs=(repo_dir, )) as executor:
        commits = executor.map(_transform, commits, chunksize=256)
        db.write(COMMITS_DB, commits)
Example #23
def test_delete_compressed(tmp_path):
    db_path = tmp_path / 'prova.json.gz'

    print(db_path)

    db.register(db_path, 'https://alink', 1)

    db.write(db_path, range(1, 9))

    assert list(db.read(db_path)) == [1, 2, 3, 4, 5, 6, 7, 8]

    db.delete(db_path, lambda x: x == 4)

    assert list(db.read(db_path)) == [1, 2, 3, 5, 6, 7, 8]
Example #24
def update_commits() -> None:
    if not os.path.exists("data/coverage_mapping.lmdb"):
        logger.info("Downloading commit->coverage mapping...")
        download_coverage_mapping()

    commits = list(
        get_commits(include_no_bug=True,
                    include_backouts=True,
                    include_ignored=True))

    # Add coverage information for previous commits too.
    # We need to do this because coverage information is sometimes slow to come in.
    set_commit_coverage(commits)

    db.write(COMMITS_DB, commits)
Example #25
def download_commits(repo_dir, date_from):
    hg = hglib.open(repo_dir)

    revs = get_revs(hg, date_from)

    commits_num = len(revs)

    assert commits_num > 0, 'There should definitely be more than 0 commits, something is wrong'

    hg.close()

    processes = multiprocessing.cpu_count()

    print(f'Mining {commits_num} commits using {processes} processes...')

    CHUNK_SIZE = 256
    revs_groups = [
        revs[i:(i + CHUNK_SIZE)] for i in range(0, len(revs), CHUNK_SIZE)
    ]

    with concurrent.futures.ProcessPoolExecutor(
            initializer=_init, initargs=(repo_dir, )) as executor:
        commits = executor.map(_hg_log, revs_groups, chunksize=20)
        commits = tqdm(commits, total=len(revs_groups))
        commits = list(itertools.chain.from_iterable(commits))

    # Don't analyze backouts.
    backouts = set(commit.backedoutby for commit in commits
                   if commit.backedoutby != b'')
    commits = [commit for commit in commits if commit.node not in backouts]

    # Don't analyze commits that are not linked to a bug.
    commits = [commit for commit in commits if commit.bug != b'']

    # Skip commits which are in .hg-annotate-ignore-revs (mostly consisting of very
    # large and not meaningful formatting changes).
    with open(os.path.join(repo_dir, '.hg-annotate-ignore-revs'), 'r') as f:
        ignore_revs = set(line[:40].encode('utf-8') for line in f)

    commits = [commit for commit in commits if commit.node not in ignore_revs]

    commits_num = len(commits)

    print(f'Analyzing {commits_num} patches...')

    # Total previous number of commits by the author.
    total_commits_by_author = defaultdict(int)
    # Previous commits by the author, in a 90 days window.
    commits_by_author = defaultdict(list)

    global author_experience
    global author_experience_90_days
    for commit in commits:
        author_experience[commit] = total_commits_by_author[commit.author]
        # We don't want to consider backed out commits when calculating author/reviewer experience.
        if not commit.backedoutby:
            total_commits_by_author[commit.author] += 1

        # Keep only the previous commits from a window of 90 days in the commits_by_author map.
        cut = None

        for i, prev_commit in enumerate(commits_by_author[commit.author]):
            if (commit.date - prev_commit.date).days <= 90:
                break

            cut = i

        if cut is not None:
            commits_by_author[commit.author] = commits_by_author[
                commit.author][cut + 1:]

        author_experience_90_days[commit] = len(
            commits_by_author[commit.author])

        if not commit.backedoutby:
            commits_by_author[commit.author].append(commit)

    global COMPONENTS
    r = requests.get(
        'https://index.taskcluster.net/v1/task/gecko.v2.mozilla-central.latest.source.source-bugzilla-info/artifacts/public/components.json'
    )
    r.raise_for_status()
    COMPONENTS = r.json()

    print(f'Mining commits using {multiprocessing.cpu_count()} processes...')

    with concurrent.futures.ProcessPoolExecutor(
            initializer=_init, initargs=(repo_dir, )) as executor:
        commits = executor.map(_transform, commits, chunksize=64)
        commits = tqdm(commits, total=commits_num)
        db.write(COMMITS_DB, commits)
Example #26
def download_commits(repo_dir, date_from):
    hg = hglib.open(repo_dir)

    revs = get_revs(hg)

    assert (
        len(revs) > 0
    ), "There should definitely be more than 0 commits, something is wrong"

    hg.close()

    # Skip commits which are in .hg-annotate-ignore-revs (mostly consisting of very
    # large and not meaningful formatting changes).
    with open(os.path.join(repo_dir, ".hg-annotate-ignore-revs"), "rb") as f:
        ignore_revs = set(line[:40] for line in f)

    revs = [rev for rev in revs if rev not in ignore_revs]

    processes = multiprocessing.cpu_count()

    print(f"Mining {len(revs)} commits using {processes} processes...")

    CHUNK_SIZE = 256
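    # Split the revisions into chunks of 256 so each worker process handles one hg log call per chunk.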
    revs_groups = [revs[i : (i + CHUNK_SIZE)] for i in range(0, len(revs), CHUNK_SIZE)]

    with concurrent.futures.ProcessPoolExecutor(
        initializer=_init, initargs=(repo_dir,)
    ) as executor:
        commits = executor.map(_hg_log, revs_groups, chunksize=20)
        commits = tqdm(commits, total=len(revs_groups))
        commits = list(itertools.chain.from_iterable(commits))

    # Don't analyze backouts.
    backouts = set(commit.backedoutby for commit in commits if commit.backedoutby != "")
    commits = [commit for commit in commits if commit.node not in backouts]

    # Don't analyze commits that are not linked to a bug.
    commits = [commit for commit in commits if commit.bug != b""]

    print("Downloading file->component mapping...")

    global path_to_component
    r = requests.get(
        "https://index.taskcluster.net/v1/task/gecko.v2.mozilla-central.latest.source.source-bugzilla-info/artifacts/public/components.json"
    )
    r.raise_for_status()
    path_to_component = r.json()
    path_to_component = {
        path: "::".join(component) for path, component in path_to_component.items()
    }

    calculate_experiences(commits)

    # Exclude commits outside the range we care about.
    commits = [commit for commit in commits if commit.pushdate > date_from]

    commits_num = len(commits)

    print(f"Mining {commits_num} commits using {processes} processes...")

    global rs_parsepatch
    import rs_parsepatch

    with concurrent.futures.ProcessPoolExecutor(
        initializer=_init, initargs=(repo_dir,)
    ) as executor:
        commits = executor.map(_transform, commits, chunksize=64)
        commits = tqdm(commits, total=commits_num)
        db.write(COMMITS_DB, commits)
Example #27
def download_commits(repo_dir, date_from):
    hg = hglib.open(repo_dir)

    revs = get_revs(hg, date_from)

    commits_num = len(revs)

    assert (
        commits_num > 0
    ), "There should definitely be more than 0 commits, something is wrong"

    hg.close()

    processes = multiprocessing.cpu_count()

    print(f"Mining {commits_num} commits using {processes} processes...")

    CHUNK_SIZE = 256
    revs_groups = [
        revs[i:(i + CHUNK_SIZE)] for i in range(0, len(revs), CHUNK_SIZE)
    ]

    with concurrent.futures.ProcessPoolExecutor(
            initializer=_init, initargs=(repo_dir, )) as executor:
        commits = executor.map(_hg_log, revs_groups, chunksize=20)
        commits = tqdm(commits, total=len(revs_groups))
        commits = list(itertools.chain.from_iterable(commits))

    # Don't analyze backouts.
    backouts = set(commit.backedoutby for commit in commits
                   if commit.backedoutby != b"")
    commits = [commit for commit in commits if commit.node not in backouts]

    # Don't analyze commits that are not linked to a bug.
    commits = [commit for commit in commits if commit.bug != b""]

    # Skip commits which are in .hg-annotate-ignore-revs (mostly consisting of very
    # large and not meaningful formatting changes).
    with open(os.path.join(repo_dir, ".hg-annotate-ignore-revs"), "r") as f:
        ignore_revs = set(l[:40].encode("utf-8") for l in f)

    commits = [commit for commit in commits if commit.node not in ignore_revs]

    commits_num = len(commits)

    print(f"Analyzing {commits_num} patches...")

    # Total previous number of commits by the author.
    total_commits_by_author = defaultdict(int)
    # Previous commits by the author, in a 90 days window.
    commits_by_author = defaultdict(list)

    global author_experience
    global author_experience_90_days
    for commit in commits:
        author_experience[commit.node] = total_commits_by_author[commit.author]
        # We don't want to consider backed out commits when calculating author/reviewer experience.
        if not commit.backedoutby:
            total_commits_by_author[commit.author] += 1

        # Keep only the previous commits from a window of 90 days in the commits_by_author map.
        cut = None

        for i, prev_commit in enumerate(commits_by_author[commit.author]):
            if (commit.date - prev_commit.date).days <= 90:
                break

            cut = i

        if cut is not None:
            commits_by_author[commit.author] = commits_by_author[
                commit.author][cut + 1:]

        author_experience_90_days[commit.node] = len(
            commits_by_author[commit.author])

        if not commit.backedoutby:
            commits_by_author[commit.author].append(commit)

    global path_to_component
    r = requests.get(
        "https://index.taskcluster.net/v1/task/gecko.v2.mozilla-central.latest.source.source-bugzilla-info/artifacts/public/components.json"
    )
    r.raise_for_status()
    path_to_component = r.json()
    path_to_component = {
        path: "::".join(component)
        for path, component in path_to_component.items()
    }

    global components_touched_prev
    global components_touched_prev_90_days

    global files_touched_prev
    global files_touched_prev_90_days

    components_touched = defaultdict(int)
    files_touched = defaultdict(int)
    prev_commits_90_days = []
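    # For each commit, count how many times its files and components were touched by earlier
    # commits, both overall and within the previous 90 days.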
    for commit in commits:
        components = set(path_to_component[path] for path in commit.files
                         if path in path_to_component)

        for component in components:
            components_touched_prev[
                commit.node] += components_touched[component]

            components_touched[component] += 1

        for path in commit.files:
            files_touched_prev[commit.node] += files_touched[path]

            files_touched[path] += 1

        if len(commit.file_copies) > 0:
            for orig, copied in commit.file_copies.items():
                if orig in path_to_component and copied in path_to_component:
                    components_touched[path_to_component[
                        copied]] = components_touched[path_to_component[orig]]

                files_touched[copied] = files_touched[orig]

        cut = None

        for i, prev_commit in enumerate(prev_commits_90_days):
            if (commit.date - prev_commit.date).days <= 90:
                break

            cut = i

        if cut is not None:
            prev_commits_90_days = prev_commits_90_days[cut + 1:]

        components_touched_90_days = defaultdict(int)
        files_touched_90_days = defaultdict(int)
        for prev_commit in prev_commits_90_days:
            components_prev = set(path_to_component[path]
                                  for path in prev_commit.files
                                  if path in path_to_component)

            for component_prev in components_prev:
                components_touched_90_days[component_prev] += 1

            for path_prev in prev_commit.files:
                files_touched_90_days[path_prev] += 1

            if len(prev_commit.file_copies) > 0:
                for orig, copied in prev_commit.file_copies.items():
                    if orig in path_to_component and copied in path_to_component:
                        components_touched_90_days[path_to_component[
                            copied]] = components_touched_90_days[
                                path_to_component[orig]]

                    files_touched_90_days[copied] = files_touched_90_days[orig]

        components_touched_prev_90_days[commit.node] = sum(
            components_touched_90_days[component] for component in components)
        files_touched_prev_90_days[commit.node] = sum(
            files_touched_90_days[path] for path in commit.files)
        prev_commits_90_days.append(commit)

    print(f"Mining commits using {multiprocessing.cpu_count()} processes...")

    with concurrent.futures.ProcessPoolExecutor(
            initializer=_init, initargs=(repo_dir, )) as executor:
        commits = executor.map(_transform, commits, chunksize=64)
        commits = tqdm(commits, total=commits_num)
        db.write(COMMITS_DB, commits)
Example #28
    def generate_push_data(self, granularity: str, training_months: int,
                           reretrieve: int) -> None:
        # We'll use the past training_months months only for training the model,
        # but we use half training_months months more than that to calculate the
        # failure statistics.
        from_months = training_months + math.floor(training_months / 2)

        # We use the actual date instead of 'today-X' aliases to avoid mozci caching
        # this query.
        from_date = datetime.utcnow() - relativedelta(months=from_months)
        to_date = datetime.utcnow() - relativedelta(days=3)

        pushes = mozci.push.make_push_objects(
            from_date=from_date.strftime("%Y-%m-%d"),
            to_date=to_date.strftime("%Y-%m-%d"),
            branch="autoland",
        )

        if granularity == "label":
            push_data_db = test_scheduling.PUSH_DATA_LABEL_DB
        elif granularity == "group":
            push_data_db = test_scheduling.PUSH_DATA_GROUP_DB
        elif granularity == "config_group":
            push_data_db = test_scheduling.PUSH_DATA_CONFIG_GROUP_DB

        def cache_key(push: mozci.push.Push) -> str:
            return f"push_data.{granularity}.{push.rev}"

        def generate(
            futures: List[concurrent.futures.Future],
        ) -> Generator[PushResult, None, None]:
            nonlocal reretrieve
            num_cached = 0
            num_pushes = len(pushes)

            for _ in tqdm(range(num_pushes)):
                push = pushes.pop(0)
                cached = futures.pop(0).result()

                semaphore.release()

                # Regenerating a large amount of data when we update the mozci regression detection
                # algorithm is currently pretty slow, so we only regenerate a subset of pushes whenever we
                # run.
                if cached:
                    value, mozci_version = cached

                    # Regenerate results which were generated with an older version of mozci.
                    if reretrieve > 0 and mozci_version != MOZCI_VERSION:
                        cached = None
                        reretrieve -= 1

                    # Regenerate results which don't contain the fix revision.
                    elif len(value) != 5:
                        cached = None

                if cached:
                    num_cached += 1
                    value, mozci_version = cached
                    assert len(value) == 5
                    yield value
                else:
                    logger.info(
                        f"Analyzing {push.rev} at the {granularity} level...")

                    key = cache_key(push)

                    try:
                        if granularity == "label":
                            runnables = push.task_labels
                        elif granularity == "group":
                            runnables = push.group_summaries.keys()
                        elif granularity == "config_group":
                            runnables = push.config_group_summaries.keys()

                        value = (
                            tuple(push.revs),
                            push.backedoutby or push.bustage_fixed_by,
                            tuple(runnables),
                            tuple(push.get_possible_regressions(granularity)),
                            tuple(push.get_likely_regressions(granularity)),
                        )
                        mozci.config.cache.put(
                            key,
                            (value, MOZCI_VERSION),
                            mozci.config["cache"]["retention"],
                        )
                        assert len(value) == 5
                        yield value
                    except mozci.errors.MissingDataError:
                        logger.warning(
                            f"Tasks for push {push.rev} can't be found on ActiveData"
                        )
                    except Exception:
                        traceback.print_exc()

            logger.info(
                f"{num_cached} pushes were already cached out of {num_pushes}")

        semaphore = threading.BoundedSemaphore(256)
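        # Bound how far the cache-lookup threads can get ahead of the consuming generator.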

        def retrieve_from_cache(push):
            semaphore.acquire()
            return mozci.config.cache.get(cache_key(push))

        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(retrieve_from_cache, push) for push in pushes
            ]

            try:
                db.write(push_data_db, generate(futures))
            except Exception:
                for f in futures:
                    f.cancel()

                    try:
                        semaphore.release()
                    except ValueError:
                        continue

                raise

        zstd_compress(push_data_db)
Example #29
    def generate_push_data(self, pushes: List[mozci.push.Push],
                           granularity: str) -> None:
        # We keep in the cache the fact that we failed to analyze a push for 10
        # days, so if we re-run often we don't retry the same pushes many times.
        MISSING_CACHE_RETENTION = 10 * 24 * 60
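        # The adr cache retention is expressed in minutes: 10 * 24 * 60 = 10 days.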

        from_date = get_from_date(granularity)

        pushes = [
            push for push in pushes
            if datetime.utcfromtimestamp(push.date) >= from_date
        ]

        if granularity == "label":
            push_data_db = test_scheduling.PUSH_DATA_LABEL_DB
        elif granularity == "group":
            push_data_db = test_scheduling.PUSH_DATA_GROUP_DB
        elif granularity == "config_group":
            push_data_db = test_scheduling.PUSH_DATA_CONFIG_GROUP_DB

        def cache_key(push: mozci.push.Push) -> str:
            return f"push_data.{granularity}.{push.rev}"

        def generate(executor) -> Generator[PushResult, None, None]:
            num_cached = 0
            num_pushes = len(pushes)

            # Regenerating a large amount of data when we update the mozci regression detection
            # algorithm is currently pretty slow, so we only regenerate 1000 pushes whenever we
            # run.
            to_regenerate = 1000

            semaphore = threading.BoundedSemaphore(256)

            def retrieve_from_cache(push):
                semaphore.acquire()
                return adr.config.cache.get(cache_key(push))

            futures = tuple(
                executor.submit(retrieve_from_cache, push) for push in pushes)

            for push, future in zip(tqdm(pushes), futures):
                exc = future.exception()
                if exc is not None:
                    logger.info(f"Exception {exc} while getting {push.rev}")
                    for f in futures:
                        f.cancel()

                cached = future.result()

                semaphore.release()

                if cached and to_regenerate > 0:
                    value, mozci_version = cached

                    # Regenerate results which were generated when we were not cleaning
                    # up WPT groups.
                    if any(runnable.startswith("/") for runnable in value[1]):
                        cached = None
                        to_regenerate -= 1
                    """# Regenerate results which were generated with an older version of mozci.
                    elif mozci_version != MOZCI_VERSION and to_regenerate > 0:
                        cached = None
                        to_regenerate -= 1"""

                if cached is not None:
                    num_cached += 1
                    if cached:
                        value, mozci_version = cached
                        yield value
                else:
                    logger.info(
                        f"Analyzing {push.rev} at the {granularity} level...")

                    key = cache_key(push)

                    try:
                        if granularity == "label":
                            runnables = push.task_labels
                        elif granularity == "group":
                            runnables = push.group_summaries.keys()
                        elif granularity == "config_group":
                            runnables = push.config_group_summaries.keys()

                        value = (
                            push.revs,
                            tuple(runnables),
                            tuple(push.get_possible_regressions(granularity)),
                            tuple(push.get_likely_regressions(granularity)),
                        )
                        adr.config.cache.put(
                            key,
                            (value, MOZCI_VERSION),
                            adr.config["cache"]["retention"],
                        )
                        yield value
                    except adr.errors.MissingDataError:
                        logger.warning(
                            f"Tasks for push {push.rev} can't be found on ActiveData"
                        )
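                        # Cache an empty tuple so this push is not retried on every run until the retention expires.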
                        adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)
                    except Exception:
                        traceback.print_exc()
                        adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)

            logger.info(
                f"{num_cached} pushes were already cached out of {num_pushes}")

        with concurrent.futures.ThreadPoolExecutor() as executor:
            db.write(push_data_db, generate(executor))
        zstd_compress(push_data_db)
Example #30
    def generate_push_data(self, granularity: str) -> None:
        # We'll use the past TRAINING_MONTHS months only for training the model,
        # but we use half TRAINING_MONTHS months more than that to calculate the
        # failure statistics.
        from_months = TRAINING_MONTHS[granularity] + math.floor(
            TRAINING_MONTHS[granularity] / 2
        )

        # We use the actual date instead of 'today-X' aliases to avoid adr caching
        # this query.
        from_date = datetime.utcnow() - relativedelta(months=from_months)
        to_date = datetime.utcnow() - relativedelta(days=3)

        pushes = mozci.push.make_push_objects(
            from_date=from_date.strftime("%Y-%m-%d"),
            to_date=to_date.strftime("%Y-%m-%d"),
            branch="autoland",
        )

        if granularity == "label":
            push_data_db = test_scheduling.PUSH_DATA_LABEL_DB
        elif granularity == "group":
            push_data_db = test_scheduling.PUSH_DATA_GROUP_DB
        elif granularity == "config_group":
            push_data_db = test_scheduling.PUSH_DATA_CONFIG_GROUP_DB

        def cache_key(push: mozci.push.Push) -> str:
            return f"push_data.{granularity}.{push.rev}"

        def generate(
            futures: List[concurrent.futures.Future],
        ) -> Generator[PushResult, None, None]:
            num_cached = 0
            num_pushes = len(pushes)

            # Regenerating a large amount of data when we update the mozci regression detection
            # algorithm is currently pretty slow, so we only regenerate a subset of pushes whenever we
            # run.
            to_regenerate = int(os.environ.get("OLD_RESULTS_TO_REGENERATE", 0))

            for _ in tqdm(range(num_pushes)):
                push = pushes.pop(0)
                cached = futures.pop(0).result()

                semaphore.release()

                if cached and to_regenerate > 0:
                    value, mozci_version = cached

                    # Regenerate results which were generated when we were not cleaning
                    # up WPT groups.
                    if granularity == "group" and any(
                        runnable.startswith("/") for runnable in value[1]
                    ):
                        cached = None
                        to_regenerate -= 1

                    # Regenerate results which were generated when we didn't get a correct
                    # configuration for test-verify tasks.
                    elif granularity == "config_group" and any(
                        "test-verify" in runnable[0] for runnable in value[1]
                    ):
                        cached = None
                        to_regenerate -= 1

                    # Regenerate results which were generated with an older version of mozci.
                    elif mozci_version != MOZCI_VERSION:
                        cached = None
                        to_regenerate -= 1

                if cached:
                    num_cached += 1
                    value, mozci_version = cached
                    yield value
                else:
                    logger.info(f"Analyzing {push.rev} at the {granularity} level...")

                    key = cache_key(push)

                    try:
                        if granularity == "label":
                            runnables = push.task_labels
                        elif granularity == "group":
                            runnables = push.group_summaries.keys()
                        elif granularity == "config_group":
                            runnables = push.config_group_summaries.keys()

                        value = (
                            push.revs,
                            tuple(runnables),
                            tuple(push.get_possible_regressions(granularity)),
                            tuple(push.get_likely_regressions(granularity)),
                        )
                        adr.config.cache.put(
                            key,
                            (value, MOZCI_VERSION),
                            adr.config["cache"]["retention"],
                        )
                        yield value
                    except adr.errors.MissingDataError:
                        logger.warning(
                            f"Tasks for push {push.rev} can't be found on ActiveData"
                        )
                    except Exception:
                        traceback.print_exc()

            logger.info(f"{num_cached} pushes were already cached out of {num_pushes}")

        semaphore = threading.BoundedSemaphore(256)

        def retrieve_from_cache(push):
            semaphore.acquire()
            return adr.config.cache.get(cache_key(push))

        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [executor.submit(retrieve_from_cache, push) for push in pushes]

            try:
                db.write(push_data_db, generate(futures))
            except Exception:
                for f in futures:
                    f.cancel()

                    try:
                        semaphore.release()
                    except ValueError:
                        continue

                raise

        zstd_compress(push_data_db)
Example #31
    def retrieve_test_scheduling_history(self):
        os.makedirs("data", exist_ok=True)

        # Download previous cache.
        cache_path = os.path.abspath("data/adr_cache")
        if not os.path.exists(cache_path):
            try:
                download_check_etag(URL, "adr_cache.tar.xz")
                with tarfile.open("adr_cache.tar.xz", "r:xz") as tar:
                    tar.extractall()
                assert os.path.exists(
                    "data/adr_cache"), "Decompressed adr cache exists"
            except requests.exceptions.HTTPError:
                logger.info("The adr cache is not available yet")

        # Setup adr cache configuration.
        os.makedirs(os.path.expanduser("~/.config/adr"), exist_ok=True)
        with open(os.path.expanduser("~/.config/adr/config.toml"), "w") as f:
            f.write(f"""[adr.cache.stores]
file = {{ driver = "file", path = "{cache_path}" }}
            """)

        # Get the commits DB.
        if db.is_old_version(
                repository.COMMITS_DB) or not db.exists(repository.COMMITS_DB):
            db.download(repository.COMMITS_DB, force=True)

        # We'll use the past TRAINING_MONTHS months only for training the model,
        # but we use 3 months more than that to calculate the failure statistics.
        subprocess.run(
            [
                "run-adr",
                "ahal/ci-recipes",
                "recipe",
                "-o",
                os.path.abspath("push_data.json"),
                "-f",
                "json",
                "push_data",
                "--",
                "--from",
                f"today-{TRAINING_MONTHS + 3}month",
                "--to",
                "today-2day",
                "--branch",
                "autoland",
            ],
            check=True,
            # Redirect to /dev/null, as the logs are too big otherwise.
            stdout=subprocess.DEVNULL,
        )

        HISTORY_DATE_START = datetime.now() - relativedelta(
            months=TRAINING_MONTHS)

        with open("push_data.json", "r") as f:
            data = json.load(f)

        push_data = {}
        for row in data[1:]:
            # Revision -> (all tasks, possible regressions, likely regressions)
            push_data[row[0]] = (row[1], row[2], row[3])

        HISTORICAL_TIMESPAN = 56
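        # Failure counts are tracked over the last 56 pushes, matching the largest "failures_past_N_pushes" window below.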

        past_failures = {}

        def get_past_failures(task, push_num):
            if task not in past_failures:
                past_failures[task] = repository.exp_queue(
                    push_num, HISTORICAL_TIMESPAN + 1, 0)

            return past_failures[task][push_num]

        def generate_data():
            commits_with_data = set()
            saved_nodes = set()

            push_num = 0
            for commit_data in tqdm(repository.get_commits()):
                node = commit_data["node"]

                if node not in push_data:
                    continue

                commits_with_data.add(node)

                commit_push_data = push_data[node]

                for task in commit_push_data[0]:
                    if not any(task.startswith(j) for j in JOBS_TO_CONSIDER):
                        continue

                    total_failures = get_past_failures(task, push_num)
                    past_7_pushes_failures = total_failures - get_past_failures(
                        task, push_num - 7)
                    past_14_pushes_failures = total_failures - get_past_failures(
                        task, push_num - 14)
                    past_28_pushes_failures = total_failures - get_past_failures(
                        task, push_num - 28)
                    past_56_pushes_failures = total_failures - get_past_failures(
                        task, push_num - 56)

                    pushdate = dateutil.parser.parse(commit_data["pushdate"])
                    if pushdate > HISTORY_DATE_START:
                        saved_nodes.add(node)

                        yield {
                            "rev": node,
                            "name": task,
                            "failures": total_failures,
                            "failures_past_7_pushes": past_7_pushes_failures,
                            "failures_past_14_pushes": past_14_pushes_failures,
                            "failures_past_28_pushes": past_28_pushes_failures,
                            "failures_past_56_pushes": past_56_pushes_failures,
                            "is_possible_regression": task
                            in commit_push_data[1],
                            "is_likely_regression": task
                            in commit_push_data[2],
                        }

                    if task in commit_push_data[1] or task in commit_push_data[2]:
                        past_failures[task][push_num] = total_failures + 1

                push_num += 1

            logger.info(f"push data nodes: {len(push_data)}")

            logger.info(
                f"commits linked to push data: {len(commits_with_data)}")

            logger.info(f"saved push data nodes: {len(saved_nodes)}")

        db.write(test_scheduling.TEST_SCHEDULING_DB, generate_data())

        zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

        with tarfile.open("data/adr_cache.tar.xz", "w:xz") as tar:
            tar.add("data/adr_cache")
Example #32
def download_commits(repo_dir, date_from):
    hg = hglib.open(repo_dir)

    revs = get_revs(hg)

    assert (
        len(revs) > 0
    ), "There should definitely be more than 0 commits, something is wrong"

    hg.close()

    processes = multiprocessing.cpu_count()

    print(f"Mining {len(revs)} commits using {processes} processes...")

    CHUNK_SIZE = 256
    revs_groups = [revs[i : (i + CHUNK_SIZE)] for i in range(0, len(revs), CHUNK_SIZE)]

    with concurrent.futures.ProcessPoolExecutor(
        initializer=_init, initargs=(repo_dir,)
    ) as executor:
        commits = executor.map(_hg_log, revs_groups, chunksize=20)
        commits = tqdm(commits, total=len(revs_groups))
        commits = list(itertools.chain.from_iterable(commits))

    print("Downloading file->component mapping...")

    global path_to_component
    r = requests.get(
        "https://index.taskcluster.net/v1/task/gecko.v2.mozilla-central.latest.source.source-bugzilla-info/artifacts/public/components.json"
    )
    r.raise_for_status()
    path_to_component = r.json()
    path_to_component = {
        path: "::".join(component) for path, component in path_to_component.items()
    }

    commits_to_ignore = get_commits_to_ignore(repo_dir, commits)
    print(f"{len(commits_to_ignore)} commits to ignore")

    calculate_experiences(commits, commits_to_ignore)

    # Exclude commits to ignore.
    commits = [commit for commit in commits if commit not in commits_to_ignore]

    # Exclude commits outside the range we care about.
    commits = [commit for commit in commits if commit.pushdate > date_from]

    commits_num = len(commits)

    print(f"Mining {commits_num} commits using {processes} processes...")

    global rs_parsepatch
    import rs_parsepatch

    with concurrent.futures.ProcessPoolExecutor(
        initializer=_init, initargs=(repo_dir,)
    ) as executor:
        commits = executor.map(_transform, commits, chunksize=64)
        commits = tqdm(commits, total=commits_num)
        db.write(COMMITS_DB, commits)
Example #33
    def generate_push_data(self, granularity: str) -> None:
        # We keep in the cache the fact that we failed to analyze a push for 10
        # days, so if we re-run often we don't retry the same pushes many times.
        MISSING_CACHE_RETENTION = 10 * 24 * 60

        # We'll use the past TRAINING_MONTHS months only for training the model,
        # but we use half TRAINING_MONTHS months more than that to calculate the
        # failure statistics.
        from_months = TRAINING_MONTHS[granularity] + math.floor(
            TRAINING_MONTHS[granularity] / 2
        )

        # We use the actual date instead of 'today-X' aliases to avoid adr caching
        # this query.
        from_date = datetime.utcnow() - relativedelta(months=from_months)
        to_date = datetime.utcnow() - relativedelta(days=3)

        pushes = mozci.push.make_push_objects(
            from_date=from_date.strftime("%Y-%m-%d"),
            to_date=to_date.strftime("%Y-%m-%d"),
            branch="autoland",
        )

        if granularity == "label":
            push_data_db = test_scheduling.PUSH_DATA_LABEL_DB
        elif granularity == "group":
            push_data_db = test_scheduling.PUSH_DATA_GROUP_DB

        cache: Dict[mozci.push.Push, Tuple[PushResult, int]] = {}

        def cache_key(push: mozci.push.Push) -> str:
            return f"push_data.{granularity}.{push.rev}"

        with concurrent.futures.ThreadPoolExecutor() as executor:
            future_to_push = {
                executor.submit(
                    lambda push: adr.config.cache.get(cache_key(push)), push
                ): push
                for push in pushes
            }

            for future in tqdm(
                concurrent.futures.as_completed(future_to_push),
                total=len(future_to_push),
            ):
                push = future_to_push[future]

                exc = future.exception()
                if exc is not None:
                    logger.info(f"Exception {exc} while getting {push.rev}")
                    for f in future_to_push.keys():
                        f.cancel()

                cache[push] = future.result()

        # Regenerating a large amount of data when we update the mozci regression detection
        # algorithm is currently pretty slow, so we only regenerate 1000 pushes whenever we
        # run.
        """to_regenerate = 0
        for push in pushes[::-1]:
            cached = cache[push]
            if not cached:
                continue

            value, mozci_version = cached
            if mozci_version != MOZCI_VERSION and to_regenerate < 1000:
                cache[push] = None
                to_regenerate += 1"""

        to_regenerate = 0
        for push in pushes[::-1]:
            cached = cache[push]
            if not cached:
                continue

            if to_regenerate < 1000:
                del cache[push]
                adr.config.cache.put(push.push_uuid, {}, 0)
                to_regenerate += 1

        def generate() -> Generator[PushResult, None, None]:
            num_cached = 0

            for push in tqdm(pushes):
                key = cache_key(push)

                if push in cache and cache[push] is not None:
                    num_cached += 1
                    cached = cache[push]
                    if cached:
                        value, mozci_version = cached
                        yield value
                else:
                    logger.info(f"Analyzing {push.rev} at the {granularity} level...")

                    try:
                        if granularity == "label":
                            runnables = push.task_labels
                        elif granularity == "group":
                            runnables = push.group_summaries.keys()

                        value = (
                            push.revs,
                            list(runnables),
                            list(push.get_possible_regressions(granularity)),
                            list(push.get_likely_regressions(granularity)),
                        )
                        adr.config.cache.put(
                            key,
                            (value, MOZCI_VERSION),
                            adr.config["cache"]["retention"],
                        )
                        yield value
                    except adr.errors.MissingDataError:
                        logger.warning(
                            f"Tasks for push {push.rev} can't be found on ActiveData"
                        )
                        adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)
                    except Exception:
                        traceback.print_exc()
                        adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)

            logger.info(f"{num_cached} pushes were already cached out of {len(pushes)}")

        db.write(push_data_db, generate())
        zstd_compress(push_data_db)
Example #34
    def generate_push_data(
        self, pushes: Tuple[mozci.push.Push, ...], granularity: str
    ) -> None:
        from_date = get_from_date(granularity)

        pushes = tuple(
            push for push in pushes if datetime.utcfromtimestamp(push.date) >= from_date
        )

        if granularity == "label":
            push_data_db = test_scheduling.PUSH_DATA_LABEL_DB
        elif granularity == "group":
            push_data_db = test_scheduling.PUSH_DATA_GROUP_DB
        elif granularity == "config_group":
            push_data_db = test_scheduling.PUSH_DATA_CONFIG_GROUP_DB

        def cache_key(push: mozci.push.Push) -> str:
            return f"push_data.{granularity}.{push.rev}"

        def generate(
            futures: List[concurrent.futures.Future],
        ) -> Generator[PushResult, None, None]:
            num_cached = 0
            num_pushes = len(pushes)

            # Regenerating a large amount of data when we update the mozci regression detection
            # algorithm is currently pretty slow, so we only regenerate 1000 pushes whenever we
            # run.
            to_regenerate = 1000

            for push in tqdm(pushes):
                cached = futures.pop(0).result()

                semaphore.release()

                if cached and to_regenerate > 0:
                    value, mozci_version = cached

                    # Regenerate results which were generated when we were not cleaning
                    # up WPT groups.
                    if granularity == "group" and any(
                        runnable.startswith("/") for runnable in value[1]
                    ):
                        cached = None
                        to_regenerate -= 1

                    # Regenerate results which were generated when we didn't get a correct
                    # configuration for test-verify tasks.
                    elif granularity == "config_group" and any(
                        "test-verify" in runnable[0] for runnable in value[1]
                    ):
                        cached = None
                        to_regenerate -= 1

                    """# Regenerate results which were generated with an older version of mozci.
                    elif mozci_version != MOZCI_VERSION:
                        cached = None
                        to_regenerate -= 1"""

                if cached:
                    num_cached += 1
                    value, mozci_version = cached
                    yield value
                else:
                    logger.info(f"Analyzing {push.rev} at the {granularity} level...")

                    key = cache_key(push)

                    try:
                        if granularity == "label":
                            runnables = push.task_labels
                        elif granularity == "group":
                            runnables = push.group_summaries.keys()
                        elif granularity == "config_group":
                            runnables = push.config_group_summaries.keys()

                        value = (
                            push.revs,
                            tuple(runnables),
                            tuple(push.get_possible_regressions(granularity)),
                            tuple(push.get_likely_regressions(granularity)),
                        )
                        adr.config.cache.put(
                            key,
                            (value, MOZCI_VERSION),
                            adr.config["cache"]["retention"],
                        )
                        yield value
                    except adr.errors.MissingDataError:
                        logger.warning(
                            f"Tasks for push {push.rev} can't be found on ActiveData"
                        )
                    except Exception:
                        traceback.print_exc()

            logger.info(f"{num_cached} pushes were already cached out of {num_pushes}")

        semaphore = threading.BoundedSemaphore(256)

        def retrieve_from_cache(push):
            semaphore.acquire()
            return adr.config.cache.get(cache_key(push))

        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [executor.submit(retrieve_from_cache, push) for push in pushes]

            try:
                db.write(push_data_db, generate(futures))
            except Exception:
                for f in futures:
                    f.cancel()

                    try:
                        semaphore.release()
                    except ValueError:
                        continue

                raise

        zstd_compress(push_data_db)