예제 #1
0
파일: test_db.py 프로젝트: rajathans/bugbug
def test_download_version(tmp_path):
    """Exercise ``db.download_version`` / ``db.is_old_version`` against mocked URLs.

    The DB is registered with version 1 while the mocked endpoint serves
    "42": the version files must be written and the DB must not be reported
    as old. After re-registering with version 43, it must be reported as old.
    """
    url_zst = "https://index.taskcluster.net/v1/task/project.relman.bugbug.data_commits.latest/artifacts/public/prova.json.zst"
    url_version = "https://index.taskcluster.net/v1/task/project.relman.bugbug.data_commits.latest/artifacts/public/prova.json.version"

    db_path = tmp_path / "prova.json"
    version_path = db_path.with_suffix(db_path.suffix + ".version")
    etag_path = db_path.with_suffix(db_path.suffix + ".version.etag")

    db.register(db_path, url_zst, 1, support_files=[])

    # Mocked responses for the version URL: a HEAD carrying an ETag header
    # and a GET whose body is the version number.
    responses.add(
        responses.HEAD, url_version, status=200, headers={"ETag": "123"}
    )
    responses.add(responses.GET, url_version, status=200, body="42")

    db.download_version(db_path)

    # Both the version file and its ETag companion must now exist.
    for expected_file in (version_path, etag_path):
        assert os.path.exists(expected_file)

    assert not db.is_old_version(db_path)

    # Re-register the same DB with a version above the downloaded one.
    db.register(db_path, url_zst, 43, support_files=[])

    assert db.is_old_version(db_path)
예제 #2
0
    def update_commit_db(self):
        """Bring the commits DB up to date with the repository.

        Clones the repository into ``self.repo_dir``, downloads the commits DB
        when the local copy is missing or reported old by
        ``db.is_old_version``, then calls ``repository.download_commits``
        starting from the children of the last commit stored in the DB.
        """
        repository.clone(self.repo_dir)

        db.download_version(repository.COMMITS_DB)
        if db.is_old_version(repository.COMMITS_DB) or not os.path.exists(
            repository.COMMITS_DB
        ):
            db.download(repository.COMMITS_DB, force=True, support_files_too=True)

        # Exhaust the iterator: only the last stored commit matters.
        last_commit = None
        for last_commit in repository.get_commits():
            pass

        if last_commit is None:
            # Empty DB: nothing to resume from, so start from revision 0
            # rather than failing on an unbound loop variable.
            rev_start = 0
        else:
            rev_start = "children({})".format(last_commit["node"])

        repository.download_commits(self.repo_dir, rev_start)
예제 #3
0
    def retrieve_commits(self):
        """Check out mozilla-central, extract commit data and compress the output.

        Runs the ``robustcheckout`` hg command into ``self.repo_dir``, deletes
        the pushlog database, pulls and updates so the pushlog is generated
        again, then calls ``repository.download_commits`` and compresses the
        two resulting data files.

        Raises:
            hglib.error.CommandError: if the checkout command exits with a
                non-zero return code.
        """
        shared_dir = self.repo_dir + "-shared"
        cmd = hglib.util.cmdbuilder(
            "robustcheckout",
            "https://hg.mozilla.org/mozilla-central",
            self.repo_dir,
            purge=True,
            sharebase=shared_dir,
            networkattempts=7,
            branch=b"tip",
        )

        cmd.insert(0, hglib.HGPATH)

        proc = hglib.util.popen(cmd)
        out, err = proc.communicate()
        if proc.returncode:
            raise hglib.error.CommandError(cmd, proc.returncode, out, err)

        logger.info("mozilla-central cloned")

        try:
            os.remove(os.path.join(self.repo_dir, ".hg", "pushlog2.db"))
        except FileNotFoundError:
            logger.info("pushlog database doesn't exist")

        # Pull and update, to make sure the pushlog is generated.
        # The context manager closes the client even when the pull fails.
        with hglib.open(self.repo_dir) as hg:
            hg.pull(update=True)

        # Default: extract everything from revision 0. This is also the
        # fallback when the downloaded DB turns out to contain no commits.
        rev_start = 0

        db.download_version(repository.COMMITS_DB)
        if not db.is_old_version(repository.COMMITS_DB):
            db.download(repository.COMMITS_DB, support_files_too=True)

            # Exhaust the iterator: only the last stored commit matters.
            last_commit = None
            for last_commit in repository.get_commits():
                pass

            if last_commit is not None:
                rev_start = f"children({last_commit['node']})"

        repository.download_commits(self.repo_dir, rev_start)

        logger.info("commit data extracted from repository")

        self.compress_file("data/commits.json")
        self.compress_file("data/commit_experiences.pickle")
예제 #4
0
    def __init__(self):
        """Prepare the repository checkout, the commits DB and the model.

        When ``self.repo_dir`` does not exist yet, mozilla-central is checked
        out with ``robustcheckout`` and the pushlog database is removed. The
        repository is then pulled and updated, the commits DB is downloaded if
        it is missing or reported old, and the model is loaded.

        Raises:
            hglib.error.CommandError: if the checkout command exits with a
                non-zero return code.
        """
        self.model_class = RegressorModel

        self.repo_dir = get_login_info()["repo_dir"]

        if not os.path.exists(self.repo_dir):
            shared_dir = self.repo_dir + "-shared"
            checkout_cmd = hglib.util.cmdbuilder(
                "robustcheckout",
                "https://hg.mozilla.org/mozilla-central",
                self.repo_dir,
                purge=True,
                sharebase=shared_dir,
                networkattempts=7,
                branch=b"tip",
            )
            checkout_cmd.insert(0, hglib.HGPATH)

            process = hglib.util.popen(checkout_cmd)
            stdout, stderr = process.communicate()
            if process.returncode:
                raise hglib.error.CommandError(
                    checkout_cmd, process.returncode, stdout, stderr
                )

            logger.info("mozilla-central cloned")

            # Drop the pushlog DB so that it gets regenerated.
            pushlog_db = os.path.join(self.repo_dir, ".hg", "pushlog2.db")
            try:
                os.remove(pushlog_db)
            except FileNotFoundError:
                logger.info("pushlog database doesn't exist")

        logger.info("Pulling and updating mozilla-central")
        with hglib.open(self.repo_dir) as hg_client:
            hg_client.pull(update=True)
        logger.info("mozilla-central pulled and updated")

        db.download_version(repository.COMMITS_DB)
        needs_download = db.is_old_version(
            repository.COMMITS_DB
        ) or not os.path.exists(repository.COMMITS_DB)
        if needs_download:
            db.download(repository.COMMITS_DB, force=True, support_files_too=True)

        super().__init__()
        self.model = self.model_class.load(self.retrieve_model())
예제 #5
0
    def retrieve_commits(self):
        """Clone the repository, extract commit data and compress the output.

        When the commits DB is not reported old by ``db.is_old_version`` it is
        downloaded and extraction resumes after its last commit. Otherwise
        extraction starts from revision 0.
        """
        repository.clone(self.repo_dir)

        # Default: extract everything from revision 0. This is also the
        # fallback when the downloaded DB turns out to contain no commits.
        rev_start = 0

        db.download_version(repository.COMMITS_DB)
        if not db.is_old_version(repository.COMMITS_DB):
            db.download(repository.COMMITS_DB, support_files_too=True)

            # Exhaust the iterator: only the last stored commit matters.
            last_commit = None
            for last_commit in repository.get_commits():
                pass

            if last_commit is not None:
                rev_start = f"children({last_commit['node']})"

        repository.download_commits(self.repo_dir, rev_start)

        logger.info("commit data extracted from repository")

        self.compress_file("data/commits.json")
        self.compress_file("data/commit_experiences.pickle")
예제 #6
0
    def retrieve_bugs(self):
        """Refresh the local bugs DB and compress it to ``data/bugs.json``.

        The set of bugs to keep is the union of: bugs in the window from two
        years and six months ago to six months ago, labelled bugs, bugs linked
        to commits pushed in the last two years and six months, and the bugs
        listed in the ``regressed_by`` field of those commit-linked bugs.
        Bugs changed since the last run, or outside that union, are deleted
        before downloading. Inconsistent bugs are then re-downloaded, in up to
        three passes.
        """
        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

        db.download_version(bugzilla.BUGS_DB)
        if not db.is_old_version(bugzilla.BUGS_DB):
            db.download(bugzilla.BUGS_DB)

        # Get IDs of bugs changed since last run.
        last_modified = db.last_modified(bugzilla.BUGS_DB)
        logger.info(
            f"Retrieving IDs of bugs modified since the last run on {last_modified}"
        )
        changed_ids = bugzilla.get_ids({
            "f1": "delta_ts",
            "o1": "greaterthaneq",
            "v1": last_modified.date()
        })
        logger.info(f"Retrieved {len(changed_ids)} IDs.")

        # Get IDs of bugs between (two years and six months ago) and (six months ago).
        six_months_ago = datetime.utcnow() - relativedelta(months=6)
        two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
        logger.info(
            f"Retrieving bug IDs from {two_years_and_six_months_ago} to {six_months_ago}"
        )
        timespan_ids = bugzilla.get_ids_between(two_years_and_six_months_ago,
                                                six_months_ago)
        logger.info(f"Retrieved {len(timespan_ids)} IDs.")

        # Get IDs of labelled bugs.
        labelled_bug_ids = labels.get_all_bug_ids()
        logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

        # Get the commits DB, as we need it to get the bug IDs linked to recent commits.
        db.download_version(repository.COMMITS_DB)
        if db.is_old_version(repository.COMMITS_DB) or not os.path.exists(
                repository.COMMITS_DB):
            db.download(repository.COMMITS_DB, force=True)

        # Get IDs of bugs linked to commits (used for some commit-based models, e.g. backout and regressor).
        start_date = datetime.now() - relativedelta(years=2, months=6)
        commit_bug_ids = [
            commit["bug_id"] for commit in repository.get_commits()
            if commit["bug_id"]
            and dateutil.parser.parse(commit["pushdate"]) >= start_date
        ]
        logger.info(
            f"{len(commit_bug_ids)} bugs linked to commits to download.")

        # Set copy of the list, so each membership test while scanning the
        # bugs DB is O(1) instead of a linear search.
        commit_bug_ids_set = set(commit_bug_ids)

        def get_regressed_by_bug_ids():
            # Flatten the "regressed_by" lists of all bugs linked to commits.
            return [
                regressor_id
                for bug in bugzilla.get_bugs()
                if bug["id"] in commit_bug_ids_set
                for regressor_id in bug["regressed_by"]
            ]

        # Get IDs of bugs which caused regressions fixed by commits (useful for the regressor model).
        regressed_by_bug_ids = get_regressed_by_bug_ids()
        logger.info(
            f"{len(regressed_by_bug_ids)} bugs which caused regressions fixed by commits."
        )

        # The regressors must be part of the set of bugs to keep, otherwise
        # they would be deleted below and downloaded again on every run.
        all_ids = (timespan_ids + labelled_bug_ids + commit_bug_ids +
                   regressed_by_bug_ids)
        all_ids_set = set(all_ids)

        # We have to redownload bugs that were changed since the last download.
        # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled.
        bugzilla.delete_bugs(lambda bug: bug["id"] in changed_ids or bug["id"]
                             not in all_ids_set)

        bugzilla.download_bugs(all_ids)

        # Get regressed_by_bug_ids again (the set could have changed after downloading new bugs).
        regressed_by_bug_ids = get_regressed_by_bug_ids()
        logger.info(
            f"{len(regressed_by_bug_ids)} bugs which caused regressions fixed by commits."
        )

        bugzilla.download_bugs(regressed_by_bug_ids)

        # Try to re-download inconsistent bugs, up to three times.
        inconsistent_bugs = bugzilla.get_bugs()
        for _ in range(3):
            # We look for inconsistencies in all bugs first, then, on following passes,
            # we only look for inconsistencies in bugs that were found to be inconsistent in the first pass
            inconsistent_bugs = bug_snapshot.get_inconsistencies(
                inconsistent_bugs)
            inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

            if len(inconsistent_bug_ids) == 0:
                break

            logger.info(
                f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
            )
            bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
            bugzilla.download_bugs(inconsistent_bug_ids)

        self.compress_file("data/bugs.json")
예제 #7
0
    def retrieve_bugs(self):
        """Refresh the local bugs DB and compress it to ``data/bugs.json``.

        Keeps the bugs in the window from two years and six months ago to six
        months ago plus the labelled bugs. Bugs changed since the last run, or
        outside that selection, are deleted before downloading. Inconsistent
        bugs are then re-downloaded, in up to three passes.
        """
        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

        db.download_version(bugzilla.BUGS_DB)
        if not db.is_old_version(bugzilla.BUGS_DB):
            db.download(bugzilla.BUGS_DB)

        # IDs of the bugs that changed since the previous run.
        last_modified = db.last_modified(bugzilla.BUGS_DB)
        logger.info(
            f"Retrieving IDs of bugs modified since the last run on {last_modified}"
        )
        changed_query = {
            "f1": "delta_ts",
            "o1": "greaterthaneq",
            "v1": last_modified.date(),
        }
        changed_ids = bugzilla.get_ids(changed_query)
        logger.info(f"Retrieved {len(changed_ids)} IDs.")

        # IDs of the bugs inside the considered window.
        six_months_ago = datetime.utcnow() - relativedelta(months=6)
        two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
        logger.info(
            f"Retrieving bug IDs from {two_years_and_six_months_ago} to {six_months_ago}"
        )
        timespan_ids = bugzilla.get_ids_between(
            two_years_and_six_months_ago, six_months_ago
        )
        logger.info(f"Retrieved {len(timespan_ids)} IDs.")

        # IDs of the labelled bugs.
        labelled_bug_ids = labels.get_all_bug_ids()
        logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

        ids_to_download = timespan_ids + labelled_bug_ids
        all_ids = set(ids_to_download)

        def should_delete(bug):
            # Changed bugs must be downloaded again; bugs that are neither in
            # the window nor labelled are no longer needed.
            return bug["id"] in changed_ids or bug["id"] not in all_ids

        bugzilla.delete_bugs(should_delete)

        bugzilla.download_bugs(ids_to_download)

        # The first pass inspects every bug; later passes only re-inspect the
        # bugs the previous pass reported as inconsistent. At most three passes.
        inconsistent_bugs = bugzilla.get_bugs()
        for _ in range(3):
            inconsistent_bugs = bug_snapshot.get_inconsistencies(inconsistent_bugs)
            inconsistent_bug_ids = {bug["id"] for bug in inconsistent_bugs}

            if not inconsistent_bug_ids:
                break

            logger.info(
                f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
            )
            bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
            bugzilla.download_bugs(inconsistent_bug_ids)

        self.compress_file("data/bugs.json")
예제 #8
0
    def find_bug_introducing_commits(self, bug_fixing_commits,
                                     commits_to_ignore, tokenized):
        """Find the commits which introduced the bugs fixed by *bug_fixing_commits*.

        Results are appended to the selected DB, which is then compressed;
        nothing is returned.

        Args:
            bug_fixing_commits: list of dicts with a ``"rev"`` key; the value
                is passed to the Mercurial-to-git mapping.
            commits_to_ignore: collection of dicts with a ``"rev"`` key. It is
                iterated twice: once to filter *bug_fixing_commits* and once
                to write the ``git_hashes_to_ignore`` file.
            tokenized: when true, use the tokenized git repository, the
                tokenized revision mappings stored on ``self`` and the
                tokenized results DB; otherwise the regular ones.
        """
        if tokenized:
            db_path = TOKENIZED_BUG_INTRODUCING_COMMITS_DB
            repo_dir = self.tokenized_git_repo_dir
        else:
            db_path = BUG_INTRODUCING_COMMITS_DB
            repo_dir = self.git_repo_dir

        def git_to_mercurial(rev):
            if tokenized:
                return self.tokenized_git_to_mercurial[rev]
            else:
                return vcs_map.git_to_mercurial(rev)

        def mercurial_to_git(rev):
            if tokenized:
                return self.mercurial_to_tokenized_git[rev]
            else:
                return vcs_map.mercurial_to_git(rev)

        logger.info("Download previously found bug-introducing commits...")
        db.download_version(db_path)
        if db.is_old_version(db_path) or not os.path.exists(db_path):
            db.download(db_path, force=True)

        logger.info("Get previously found bug-introducing commits...")
        prev_bug_introducing_commits = list(db.read(db_path))
        prev_bug_introducing_commits_nodes = set(
            bug_introducing_commit["bug_fixing_rev"]
            for bug_introducing_commit in prev_bug_introducing_commits)
        logger.info(
            f"Already classified {len(prev_bug_introducing_commits)} commits..."
        )

        hashes_to_ignore = set(commit["rev"] for commit in commits_to_ignore)

        # In tokenized mode, revisions without an entry in the tokenized
        # mapping are left out of the file, since they cannot be translated.
        with open("git_hashes_to_ignore", "w") as f:
            f.writelines("{}\n".format(mercurial_to_git(commit["rev"]))
                         for commit in commits_to_ignore if not tokenized
                         or commit["rev"] in self.mercurial_to_tokenized_git)

        logger.info(f"{len(bug_fixing_commits)} commits to analyze")

        # Skip already found bug-introducing commits.
        bug_fixing_commits = [
            bug_fixing_commit for bug_fixing_commit in bug_fixing_commits if
            bug_fixing_commit["rev"] not in prev_bug_introducing_commits_nodes
        ]

        logger.info(
            f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones"
        )

        bug_fixing_commits = [
            bug_fixing_commit for bug_fixing_commit in bug_fixing_commits
            if bug_fixing_commit["rev"] not in hashes_to_ignore
        ]
        logger.info(
            f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list"
        )

        if tokenized:
            bug_fixing_commits = [
                bug_fixing_commit for bug_fixing_commit in bug_fixing_commits
                if bug_fixing_commit["rev"] in self.mercurial_to_tokenized_git
            ]
            logger.info(
                f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones with no git hash"
            )

        def _init(git_repo_dir):
            # Runs as the pool's worker initializer and stores the repository
            # handle in a module-level global read by find_bic.
            global GIT_REPO
            GIT_REPO = GitRepository(git_repo_dir)

        def find_bic(bug_fixing_commit):
            # Returns the list of result dicts for one bug-fixing commit, or
            # [None] when the commit is skipped for being too large.
            git_fix_revision = mercurial_to_git(bug_fixing_commit["rev"])

            logger.info(f"Analyzing {git_fix_revision}...")

            commit = GIT_REPO.get_commit(git_fix_revision)

            # Skip huge changes, we'll likely be wrong with them.
            if len(commit.modifications) > MAX_MODIFICATION_NUMBER:
                return [None]

            bug_introducing_modifications = GIT_REPO.get_commits_last_modified_lines(
                commit,
                hashes_to_ignore_path=os.path.realpath("git_hashes_to_ignore"))
            logger.info(bug_introducing_modifications)

            bug_introducing_commits = []
            for bug_introducing_hashes in bug_introducing_modifications.values(
            ):
                for bug_introducing_hash in bug_introducing_hashes:
                    bug_introducing_commits.append({
                        "bug_fixing_rev":
                        bug_fixing_commit["rev"],
                        "bug_introducing_rev":
                        git_to_mercurial(bug_introducing_hash),
                    })

            # Add an empty result, just so that we don't reanalyze this again.
            if len(bug_introducing_commits) == 0:
                bug_introducing_commits.append({
                    "bug_fixing_rev":
                    bug_fixing_commit["rev"],
                    "bug_introducing_rev":
                    "",
                })

            return bug_introducing_commits

        # os.cpu_count() returns None when the CPU count cannot be determined;
        # fall back to a single CPU instead of failing on "None + 1".
        max_workers = (os.cpu_count() or 1) + 1

        with concurrent.futures.ThreadPoolExecutor(initializer=_init,
                                                   initargs=(repo_dir, ),
                                                   max_workers=max_workers
                                                   ) as executor:
            bug_introducing_commits = executor.map(find_bic,
                                                   bug_fixing_commits)
            bug_introducing_commits = tqdm(bug_introducing_commits,
                                           total=len(bug_fixing_commits))
            bug_introducing_commits = list(
                itertools.chain.from_iterable(bug_introducing_commits))

        # The None placeholders mark skipped commits: count them, then drop them.
        total_results_num = len(bug_introducing_commits)
        bug_introducing_commits = list(filter(None, bug_introducing_commits))
        logger.info(
            f"Skipped {total_results_num - len(bug_introducing_commits)} commits as they were too big"
        )

        db.append(db_path, bug_introducing_commits)
        compress_file(db_path)
예제 #9
0
    def find_bug_fixing_commits(self):
        """Classify commits by the type of the bug they are linked to.

        Commits pushed between ``RELATIVE_START_DATE`` and
        ``RELATIVE_END_DATE`` ago that were not classified before are tagged
        "r", "d" or "e", using known labels first and the models otherwise.
        New results are appended to ``BUG_FIXING_COMMITS_DB``, which is then
        compressed.

        Returns:
            The previous and new entries (dicts with "rev" and "type" keys)
            whose type is "r" or "d".
        """
        # Refresh each DB read below, logging a message before each one.
        databases = (
            ("Downloading commits database...", repository.COMMITS_DB),
            ("Downloading bugs database...", bugzilla.BUGS_DB),
            ("Download previous classifications...", BUG_FIXING_COMMITS_DB),
        )
        for message, db_path in databases:
            logger.info(message)
            db.download_version(db_path)
            if db.is_old_version(db_path) or not os.path.exists(db_path):
                db.download(db_path, force=True)

        logger.info("Get previously classified commits...")
        prev_bug_fixing_commits = list(db.read(BUG_FIXING_COMMITS_DB))
        prev_bug_fixing_commits_nodes = {
            entry["rev"] for entry in prev_bug_fixing_commits
        }
        logger.info(
            f"Already classified {len(prev_bug_fixing_commits)} commits...")

        # TODO: Switch to the pure Defect model, as it's better in this case.
        logger.info("Downloading defect/enhancement/task model...")
        download_model("defectenhancementtask")
        defect_model = DefectEnhancementTaskModel.load(
            "defectenhancementtaskmodel")

        logger.info("Downloading regression model...")
        download_model("regression")
        regression_model = RegressionModel.load("regressionmodel")

        start_date = datetime.now() - RELATIVE_START_DATE
        end_date = datetime.now() - RELATIVE_END_DATE
        logger.info(
            f"Gathering bug IDs associated to commits (since {start_date} and up to {end_date})..."
        )

        # Map each bug ID to the not-yet-classified commits inside the window.
        commit_map = defaultdict(list)
        for commit in repository.get_commits():
            if commit["node"] in prev_bug_fixing_commits_nodes:
                continue

            commit_date = dateutil.parser.parse(commit["pushdate"])
            if start_date <= commit_date <= end_date:
                commit_map[commit["bug_id"]].append(commit["node"])

        logger.info(
            f"{sum(len(commit_list) for commit_list in commit_map.values())} commits found, {len(commit_map)} bugs linked to commits"
        )
        assert len(commit_map) > 0

        def get_relevant_bugs():
            return (bug for bug in bugzilla.get_bugs()
                    if bug["id"] in commit_map)

        bug_count = sum(1 for bug in get_relevant_bugs())
        logger.info(
            f"{bug_count} bugs in total, {len(commit_map) - bug_count} bugs linked to commits missing"
        )

        known_defect_labels = defect_model.get_labels()
        known_regression_labels = regression_model.get_labels()

        def bug_type(bug):
            # Known labels take precedence; the models are only applied to
            # bugs without one.
            bug_id = bug["id"]

            if (bug_id in known_regression_labels
                    and known_regression_labels[bug_id] == 1):
                return "r"

            if bug_id in known_defect_labels:
                return "d" if known_defect_labels[bug_id] == "defect" else "e"

            if defect_model.classify(bug)[0] != "defect":
                return "e"

            return "r" if regression_model.classify(bug)[0] == 1 else "d"

        bug_fixing_commits = []
        for bug in tqdm(get_relevant_bugs(), total=bug_count):
            # Ignore bugs which are not linked to the commits we care about.
            if bug["id"] not in commit_map:
                continue

            type_ = bug_type(bug)
            bug_fixing_commits.extend(
                {"rev": node, "type": type_} for node in commit_map[bug["id"]]
            )

        db.append(BUG_FIXING_COMMITS_DB, bug_fixing_commits)
        compress_file(BUG_FIXING_COMMITS_DB)

        bug_fixing_commits = prev_bug_fixing_commits + bug_fixing_commits
        return [
            entry for entry in bug_fixing_commits
            if entry["type"] in ("r", "d")
        ]
예제 #10
0
def find_bug_introducing_commits(cache_dir, git_repo_dir):
    """Find the commits which introduced the bugs fixed by bug-fixing commits.

    Downloads the Mercurial <-> git mapping, clones both repositories,
    analyzes the bug-fixing commits that were neither analyzed before nor
    listed as commits to ignore, appends the results to
    ``BUG_INTRODUCING_COMMITS_DB`` and compresses it. Nothing is returned.

    Args:
        cache_dir: directory in which the Mercurial clone is placed, as
            ``<cache_dir>/mozilla-central``.
        git_repo_dir: path passed to ``clone_gecko_dev`` and then opened with
            ``GitRepository``.
    """
    mercurial_repo_dir = os.path.join(cache_dir, "mozilla-central")

    logger.info("Downloading Mercurial <-> git mapping file...")
    vcs_map.download_mapfile()

    logger.info(f"Cloning mercurial repository to {mercurial_repo_dir}...")
    repository.clone(mercurial_repo_dir)

    logger.info(f"Cloning git repository to {git_repo_dir}...")
    clone_gecko_dev(git_repo_dir)

    logger.info("Download previously found bug-introducing commits...")
    db.download_version(BUG_INTRODUCING_COMMITS_DB)
    if db.is_old_version(BUG_INTRODUCING_COMMITS_DB) or not os.path.exists(
        BUG_INTRODUCING_COMMITS_DB
    ):
        db.download(BUG_INTRODUCING_COMMITS_DB, force=True)

    logger.info("Get previously found bug-introducing commits...")
    prev_bug_introducing_commits = list(db.read(BUG_INTRODUCING_COMMITS_DB))
    prev_bug_introducing_commits_nodes = set(
        bug_introducing_commit["bug_fixing_mercurial_rev"]
        for bug_introducing_commit in prev_bug_introducing_commits
    )
    logger.info(f"Already classified {len(prev_bug_introducing_commits)} commits...")

    commits_to_ignore = get_commits_to_ignore(mercurial_repo_dir)

    git_hashes_to_ignore = set(commit["git_rev"] for commit in commits_to_ignore)

    # The ignore list is also written to a file, whose path is handed to
    # GIT_REPO.get_commits_last_modified_lines in find_bic.
    with open("git_hashes_to_ignore", "w") as f:
        f.writelines(f"{git_hash}\n" for git_hash in git_hashes_to_ignore)

    bug_fixing_commits = find_bug_fixing_commits()

    logger.info(f"{len(bug_fixing_commits)} commits to analyze")

    # Skip already found bug-introducing commits.
    bug_fixing_commits = [
        bug_fixing_commit
        for bug_fixing_commit in bug_fixing_commits
        if bug_fixing_commit["mercurial_rev"] not in prev_bug_introducing_commits_nodes
    ]

    logger.info(
        f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones"
    )

    bug_fixing_commits = [
        bug_fixing_commit
        for bug_fixing_commit in bug_fixing_commits
        if bug_fixing_commit["git_rev"] not in git_hashes_to_ignore
    ]
    logger.info(
        f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list"
    )

    def _init(git_repo_dir):
        # Runs as the pool's worker initializer and stores the repository
        # handle in a module-level global read by find_bic.
        global GIT_REPO
        GIT_REPO = GitRepository(git_repo_dir)

    def find_bic(bug_fixing_commit):
        # Returns the list of result dicts for one bug-fixing commit, or
        # [None] when the commit is skipped for being too large.
        logger.info("Analyzing {}...".format(bug_fixing_commit["git_rev"]))

        commit = GIT_REPO.get_commit(bug_fixing_commit["git_rev"])

        # Skip huge changes, we'll likely be wrong with them.
        if len(commit.modifications) > MAX_MODIFICATION_NUMBER:
            return [None]

        bug_introducing_modifications = GIT_REPO.get_commits_last_modified_lines(
            commit, hashes_to_ignore_path=os.path.realpath("git_hashes_to_ignore")
        )
        logger.info(bug_introducing_modifications)

        bug_introducing_commits = []
        for bug_introducing_hashes in bug_introducing_modifications.values():
            for bug_introducing_hash in bug_introducing_hashes:
                bug_introducing_commits.append(
                    {
                        "bug_fixing_mercurial_rev": bug_fixing_commit["mercurial_rev"],
                        "bug_fixing_git_rev": bug_fixing_commit["git_rev"],
                        "bug_introducing_mercurial_rev": vcs_map.git_to_mercurial(
                            bug_introducing_hash
                        ),
                        "bug_introducing_git_rev": bug_introducing_hash,
                    }
                )

        # Add an empty result, just so that we don't reanalyze this again.
        if len(bug_introducing_commits) == 0:
            bug_introducing_commits.append(
                {
                    "bug_fixing_mercurial_rev": bug_fixing_commit["mercurial_rev"],
                    "bug_fixing_git_rev": bug_fixing_commit["git_rev"],
                    "bug_introducing_mercurial_rev": "",
                    "bug_introducing_git_rev": "",
                }
            )

        return bug_introducing_commits

    # os.cpu_count() returns None when the CPU count cannot be determined;
    # fall back to a single CPU instead of failing on "None + 1".
    max_workers = (os.cpu_count() or 1) + 1

    with concurrent.futures.ThreadPoolExecutor(
        initializer=_init, initargs=(git_repo_dir,), max_workers=max_workers
    ) as executor:
        bug_introducing_commits = executor.map(find_bic, bug_fixing_commits)
        bug_introducing_commits = tqdm(
            bug_introducing_commits, total=len(bug_fixing_commits)
        )
        bug_introducing_commits = list(
            itertools.chain.from_iterable(bug_introducing_commits)
        )

    # The None placeholders mark skipped commits: count them, then drop them.
    total_results_num = len(bug_introducing_commits)
    bug_introducing_commits = list(filter(None, bug_introducing_commits))
    logger.info(
        f"Skipped {total_results_num - len(bug_introducing_commits)} commits as they were too big"
    )

    db.append(BUG_INTRODUCING_COMMITS_DB, bug_introducing_commits)
    compress_file(BUG_INTRODUCING_COMMITS_DB)