Example #1
    def retrieve_commits(self, limit):
        repository.clone(self.repo_dir)

        if limit:
            # Mercurial revset supports negative integers starting from tip
            rev_start = -limit
        else:
            db.download(repository.COMMITS_DB, support_files_too=True)

            rev_start = 0
            for commit in repository.get_commits():
                rev_start = f"children({commit['node']})"

        with hglib.open(self.repo_dir) as hg:
            revs = repository.get_revs(hg, rev_start)

        # Download commit data in chunks of 70,000 revisions at a time.
        chunk_size = 70000

        for i in range(0, len(revs), chunk_size):
            repository.download_commits(self.repo_dir,
                                        revs=revs[i:(i + chunk_size)])

        logger.info("commit data extracted from repository")

        # Some commits that were already in the DB from the previous run might need
        # to be updated (e.g. coverage information).
        repository.update_commits()

        zstd_compress(repository.COMMITS_DB)
        create_tar_zst(os.path.join("data", repository.COMMIT_EXPERIENCES_DB))
Example #2
    def __init__(
        self,
        cache_root,
        git_repo_url,
        git_repo_dir,
        tokenized_git_repo_url,
        tokenized_git_repo_dir,
    ):
        self.mercurial_repo_dir = os.path.join(cache_root, "mozilla-central")
        self.git_repo_url = git_repo_url
        self.git_repo_dir = git_repo_dir
        self.tokenized_git_repo_url = tokenized_git_repo_url
        self.tokenized_git_repo_dir = tokenized_git_repo_dir

        logger.info(f"Cloning mercurial repository to {self.mercurial_repo_dir}...")
        repository.clone(self.mercurial_repo_dir)

        logger.info(f"Cloning {self.git_repo_url} to {self.git_repo_dir}...")
        self.clone_git_repo(self.git_repo_url, self.git_repo_dir)
        logger.info(
            f"Cloning {self.tokenized_git_repo_url} to {self.tokenized_git_repo_dir}..."
        )
        self.clone_git_repo(self.tokenized_git_repo_url, self.tokenized_git_repo_dir)
        logger.info(f"Initializing mapping between git and mercurial commits...")
        self.init_mapping()
Example #3
    def __init__(self, repo_dir: str) -> None:
        if not os.path.exists(repo_dir):
            repository.clone(repo_dir)
        else:
            repository.pull(repo_dir, "mozilla-central", "tip")

        logger.info("Downloading commits database...")
        assert db.download(repository.COMMITS_DB, support_files_too=True)

        logger.info("Updating commits DB...")
        for commit in repository.get_commits():
            pass

        repository.download_commits(
            repo_dir,
            rev_start="children({})".format(commit["node"]),
        )

        logger.info("Downloading revisions database...")
        assert db.download(phabricator.REVISIONS_DB)

        logger.info("Downloading bugs database...")
        assert db.download(bugzilla.BUGS_DB)

        phabricator.set_api_key(get_secret("PHABRICATOR_URL"),
                                get_secret("PHABRICATOR_TOKEN"))
Example #4
def boot_worker():
    # Preload models
    bugbug_http.models.preload_models()

    # Clone mozilla central
    repo_dir = os.environ.get("BUGBUG_REPO_DIR",
                              os.path.join(tempfile.gettempdir(), "bugbug-hg"))
    logger.info(f"Cloning mozilla-central in {repo_dir}...")
    repository.clone(repo_dir)

    # Download databases
    logger.info("Downloading test scheduling DB support file...")
    assert (db.download_support_file(
        test_scheduling.TEST_LABEL_SCHEDULING_DB,
        test_scheduling.PAST_FAILURES_LABEL_DB,
    ) or ALLOW_MISSING_MODELS)

    # Download commits DB
    logger.info("Downloading commits DB...")
    commits_db_downloaded = db.download(repository.COMMITS_DB,
                                        support_files_too=True)
    if not ALLOW_MISSING_MODELS:
        assert commits_db_downloaded

    if commits_db_downloaded:
        # And update it
        logger.info("Browsing all commits...")
        for commit in repository.get_commits():
            pass

        rev_start = "children({})".format(commit["node"])
        logger.info("Updating commits DB...")
        repository.download_commits(repo_dir, rev_start)

    logger.info("Worker boot done")
Example #5
    def __init__(self, repo_dir: str) -> None:
        repository.clone(repo_dir)

        logger.info("Downloading commits database...")
        assert db.download(repository.COMMITS_DB, support_files_too=True)

        logger.info("Updating commits DB...")
        for commit in repository.get_commits():
            pass

        repository.download_commits(
            repo_dir,
            rev_start="children({})".format(commit["node"]),
        )

        logger.info("Downloading revisions database...")
        assert db.download(phabricator.REVISIONS_DB)

        logger.info("Downloading bugs database...")
        assert db.download(bugzilla.BUGS_DB)

        logger.info("Download commit classifications...")
        assert db.download(BUG_FIXING_COMMITS_DB)

        self.regressor_model = download_and_load_model("regressor")

        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
        phabricator.set_api_key(get_secret("PHABRICATOR_URL"),
                                get_secret("PHABRICATOR_TOKEN"))
Example #6
def boot_worker():
    # Clone autoland
    logger.info(f"Cloning mozilla autoland in {REPO_DIR}...")
    repository.clone(REPO_DIR, "https://hg.mozilla.org/integration/autoland")

    # Download test scheduling DB support files.
    logger.info("Downloading test scheduling DB support files...")
    assert (db.download_support_file(
        test_scheduling.TEST_LABEL_SCHEDULING_DB,
        test_scheduling.PAST_FAILURES_LABEL_DB,
    ) or ALLOW_MISSING_MODELS)

    assert (db.download_support_file(
        test_scheduling.TEST_GROUP_SCHEDULING_DB,
        test_scheduling.PAST_FAILURES_GROUP_DB,
    ) or ALLOW_MISSING_MODELS)

    assert (db.download_support_file(
        test_scheduling.TEST_GROUP_SCHEDULING_DB,
        test_scheduling.TOUCHED_TOGETHER_DB,
    ) or ALLOW_MISSING_MODELS)

    # Download commits DB
    logger.info("Downloading commits DB...")
    commits_db_downloaded = db.download(repository.COMMITS_DB,
                                        support_files_too=True)
    if not ALLOW_MISSING_MODELS:
        assert commits_db_downloaded

    if commits_db_downloaded:
        # And update it
        logger.info("Browsing all commits...")
        for commit in repository.get_commits():
            pass

        rev_start = "children({})".format(commit["node"])
        logger.info("Updating commits DB...")
        commits = repository.download_commits(REPO_DIR,
                                              rev_start,
                                              use_single_process=True)

        if len(commits) > 0:
            # Update the touched together DB.
            update_touched_together_gen = test_scheduling.update_touched_together()
            next(update_touched_together_gen)
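            # The generator is now primed; feed it the newest commit node, then
            # send None so it can flush its remaining state and finish.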

            update_touched_together_gen.send(commits[-1]["node"])

            try:
                update_touched_together_gen.send(None)
            except StopIteration:
                pass

    # Preload models
    bugbug_http.models.preload_models()

    logger.info("Worker boot done")
Example #7
    def update_commit_db(self):
        repository.clone(self.repo_dir, update=True)

        assert db.download(repository.COMMITS_DB, support_files_too=True)

        for commit in repository.get_commits():
            pass

        repository.download_commits(self.repo_dir,
                                    rev_start="children({})".format(
                                        commit["node"]))
Example #8
    def generate(self):
        repository.clone(self.repo_dir)

        logger.info("mozilla-central cloned")

        git_user = get_secret("GIT_USER")
        git_password = get_secret("GIT_PASSWORD")

        repo_push_url = self.repo_url.replace(
            "https://", f"https://{git_user}:{git_password}@")
        git_repo_path = os.path.basename(self.repo_url)

        retry(lambda: subprocess.run(
            ["git", "clone", self.repo_url, git_repo_path], check=True))

        try:
            retry(lambda: subprocess.run(
                ["git", "pull", self.repo_url, "master"],
                cwd=git_repo_path,
                capture_output=True,
                check=True,
            ))
        except subprocess.CalledProcessError as e:
            # Tolerate the failure only when the repo is still empty.
            if b"Couldn't find remote ref master" not in e.stdout:
                raise

        retry(lambda: subprocess.run(
            ["git", "config", "--global", "http.postBuffer", "12M"],
            check=True))

        for i in range(STEPS):
            logger.info(f"Step {i} out of {STEPS}")

            done = generator.generate(
                self.repo_dir,
                git_repo_path,
                limit=TOTAL_COMMITS // STEPS,
                tokenize=self.tokenize,
                remove_comments=self.remove_comments,
            )

            with open("done", "w") as f:
                f.write(str(1 if done else 0))

            retry(lambda: subprocess.run(
                ["git", "push", repo_push_url, "master"],
                cwd=git_repo_path,
                check=True,
            ))

            if done:
                break
Example #9
    def update_commit_db(self):
        repository.clone(self.repo_dir)

        if db.is_old_version(repository.COMMITS_DB) or not db.exists(
            repository.COMMITS_DB
        ):
            db.download(repository.COMMITS_DB, force=True, support_files_too=True)

        for commit in repository.get_commits():
            pass

        rev_start = "children({})".format(commit["node"])

        repository.download_commits(self.repo_dir, rev_start)
Example #10
    def retrieve_commits(self):
        repository.clone(self.repo_dir)

        if not db.is_old_version(repository.COMMITS_DB):
            db.download(repository.COMMITS_DB, support_files_too=True)

            # Iterate to the end so `commit` holds the last commit in the DB.
            for commit in repository.get_commits():
                pass

            rev_start = f"children({commit['node']})"
        else:
            rev_start = 0

        repository.download_commits(self.repo_dir, rev_start)

        logger.info("commit data extracted from repository")

        self.compress_file("data/commits.json")
        self.compress_file("data/commit_experiences.pickle")
Example #11
    def retrieve_commits(self, limit):
        repository.clone(self.repo_dir)

        if limit:
            # Mercurial revset supports negative integers starting from tip
            rev_start = -limit
        else:
            db.download(repository.COMMITS_DB, support_files_too=True)

            rev_start = 0
            for commit in repository.get_commits():
                rev_start = f"children({commit['node']})"

        repository.download_commits(self.repo_dir, rev_start)

        logger.info("commit data extracted from repository")

        zstd_compress("data/commits.json")
        zstd_compress("data/commit_experiences.pickle")
Example #12
    def retrieve_commits(self, limit):
        repository.clone(self.repo_dir)

        if limit:
            # Mercurial revset supports negative integers starting from tip
            rev_start = -limit
        else:
            db.download(repository.COMMITS_DB, support_files_too=True)

            rev_start = 0
            for commit in repository.get_commits():
                rev_start = f"children({commit['node']})"

        repository.download_commits(self.repo_dir, rev_start=rev_start)

        logger.info("commit data extracted from repository")

        zstd_compress(repository.COMMITS_DB)
        create_tar_zst(os.path.join("data", repository.COMMIT_EXPERIENCES_DB))
Example #13
    def __init__(self, repo_dir: str) -> None:
        self.risk_bands = sorted(
            (
                parse_risk_band(risk_band)
                for risk_band in get_secret("REGRESSOR_RISK_BANDS").split(";")
            ),
            key=lambda x: x[1],
        )

        repository.clone(repo_dir)

        logger.info("Downloading commits database...")
        assert db.download(repository.COMMITS_DB, support_files_too=True)

        logger.info("Updating commits DB...")
        for commit in repository.get_commits():
            pass

        repository.download_commits(
            repo_dir,
            rev_start="children({})".format(commit["node"]),
        )

        logger.info("Downloading revisions database...")
        assert db.download(phabricator.REVISIONS_DB)

        logger.info("Downloading bugs database...")
        assert db.download(bugzilla.BUGS_DB)

        logger.info("Download commit classifications...")
        assert db.download(BUG_FIXING_COMMITS_DB)

        self.regressor_model = cast(
            RegressorModel, RegressorModel.load(download_model("regressor"))
        )

        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
        phabricator.set_api_key(
            get_secret("PHABRICATOR_URL"), get_secret("PHABRICATOR_TOKEN")
        )
Example #14
    def retrieve_commits(self, limit):
        repository.clone(self.repo_dir)

        if not db.is_old_version(repository.COMMITS_DB) and not limit:
            db.download(repository.COMMITS_DB, support_files_too=True)

            for commit in repository.get_commits():
                pass

            rev_start = f"children({commit['node']})"
        else:
            if limit:
                rev_start = (
                    -1 * limit
                )  # Mercurial revset supports negative integers starting from tip
            else:
                rev_start = 0

        repository.download_commits(self.repo_dir, rev_start)

        logger.info("commit data extracted from repository")

        zstd_compress("data/commits.json")
        zstd_compress("data/commit_experiences.pickle")
Example #15
def clone_autoland():
    logger.info(f"Cloning autoland in {REPO_DIR}...")
    repository.clone(REPO_DIR,
                     "https://hg.mozilla.org/integration/autoland")
Example #16
    def __init__(self, repo_dir: str) -> None:
        self.risk_bands = sorted(
            (parse_risk_band(risk_band)
             for risk_band in get_secret("REGRESSOR_RISK_BANDS").split(";")),
            key=lambda x: x[1],
        )

        repository.clone(repo_dir)

        logger.info("Downloading commits database...")
        assert db.download(repository.COMMITS_DB, support_files_too=True)

        logger.info("Updating commits DB...")
        for commit in repository.get_commits():
            pass

        repository.download_commits(
            repo_dir,
            rev_start="children({})".format(commit["node"]),
        )

        # Some commits that were already in the DB from the previous run might need
        # to be updated (e.g. coverage information).
        repository.update_commits()

        logger.info("Downloading revisions database...")
        assert db.download(phabricator.REVISIONS_DB)

        logger.info("Downloading bugs database...")
        assert db.download(bugzilla.BUGS_DB)

        logger.info("Download commit classifications...")
        assert db.download(BUG_FIXING_COMMITS_DB)

        self.regressor_model = cast(
            RegressorModel, RegressorModel.load(download_model("regressor")))

        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
        phabricator.set_api_key(get_secret("PHABRICATOR_URL"),
                                get_secret("PHABRICATOR_TOKEN"))

        self.path_to_component = repository.get_component_mapping()

        self.past_regressions_by = {}
        self.past_fixed_bugs_by = {}
        self.past_regression_blocked_bugs_by = {}
        self.past_fixed_bug_blocked_bugs_by = {}

        for dimension in ["component", "directory", "file", "function"]:
            self.past_regressions_by[dimension] = _download_past_bugs(
                PAST_REGRESSIONS_BY_URL.format(dimension=dimension))
            self.past_fixed_bugs_by[dimension] = _download_past_bugs(
                PAST_FIXED_BUGS_BY_URL.format(dimension=dimension))
            self.past_regression_blocked_bugs_by[dimension] = _download_past_bugs(
                PAST_REGRESSION_BLOCKED_BUGS_BY_URL.format(dimension=dimension))
            self.past_fixed_bug_blocked_bugs_by[dimension] = _download_past_bugs(
                PAST_FIXED_BUG_BLOCKED_BUGS_BY_URL.format(dimension=dimension))
Example #17
def find_bug_introducing_commits(cache_dir, git_repo_dir):
    mercurial_repo_dir = os.path.join(cache_dir, "mozilla-central")

    logger.info("Downloading Mercurial <-> git mapping file...")
    vcs_map.download_mapfile()

    logger.info(f"Cloning mercurial repository to {mercurial_repo_dir}...")
    repository.clone(mercurial_repo_dir)

    logger.info(f"Cloning git repository to {git_repo_dir}...")
    clone_gecko_dev(git_repo_dir)

    logger.info("Download previously found bug-introducing commits...")
    db.download_version(BUG_INTRODUCING_COMMITS_DB)
    if db.is_old_version(BUG_INTRODUCING_COMMITS_DB) or not os.path.exists(
        BUG_INTRODUCING_COMMITS_DB
    ):
        db.download(BUG_INTRODUCING_COMMITS_DB, force=True)

    logger.info("Get previously found bug-introducing commits...")
    prev_bug_introducing_commits = list(db.read(BUG_INTRODUCING_COMMITS_DB))
    prev_bug_introducing_commits_nodes = set(
        bug_introducing_commit["bug_fixing_mercurial_rev"]
        for bug_introducing_commit in prev_bug_introducing_commits
    )
    logger.info(f"Already classified {len(prev_bug_introducing_commits)} commits...")

    commits_to_ignore = get_commits_to_ignore(mercurial_repo_dir)

    git_hashes_to_ignore = set(commit["git_rev"] for commit in commits_to_ignore)

    with open("git_hashes_to_ignore", "w") as f:
        f.writelines(f"{git_hash}\n" for git_hash in git_hashes_to_ignore)

    bug_fixing_commits = find_bug_fixing_commits()

    logger.info(f"{len(bug_fixing_commits)} commits to analyze")

    # Skip already found bug-introducing commits.
    bug_fixing_commits = [
        bug_fixing_commit
        for bug_fixing_commit in bug_fixing_commits
        if bug_fixing_commit["mercurial_rev"] not in prev_bug_introducing_commits_nodes
    ]

    logger.info(
        f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones"
    )

    bug_fixing_commits = [
        bug_fixing_commit
        for bug_fixing_commit in bug_fixing_commits
        if bug_fixing_commit["git_rev"] not in git_hashes_to_ignore
    ]
    logger.info(
        f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list"
    )

    # Executor initializer: runs in each worker and binds a GitRepository
    # handle to the module-level GIT_REPO used by find_bic().
    def _init(git_repo_dir):
        global GIT_REPO
        GIT_REPO = GitRepository(git_repo_dir)

    def find_bic(bug_fixing_commit):
        logger.info("Analyzing {}...".format(bug_fixing_commit["git_rev"]))

        commit = GIT_REPO.get_commit(bug_fixing_commit["git_rev"])

        # Skip huge changes, we'll likely be wrong with them.
        if len(commit.modifications) > MAX_MODIFICATION_NUMBER:
            return [None]

        # SZZ-style blame: find the commits that last modified the lines
        # touched by this fix, skipping the hashes in the ignore file.
        bug_introducing_modifications = GIT_REPO.get_commits_last_modified_lines(
            commit, hashes_to_ignore_path=os.path.realpath("git_hashes_to_ignore")
        )
        logger.info(bug_introducing_modifications)

        bug_introducing_commits = []
        for bug_introducing_hashes in bug_introducing_modifications.values():
            for bug_introducing_hash in bug_introducing_hashes:
                bug_introducing_commits.append(
                    {
                        "bug_fixing_mercurial_rev": bug_fixing_commit["mercurial_rev"],
                        "bug_fixing_git_rev": bug_fixing_commit["git_rev"],
                        "bug_introducing_mercurial_rev": vcs_map.git_to_mercurial(
                            bug_introducing_hash
                        ),
                        "bug_introducing_git_rev": bug_introducing_hash,
                    }
                )

        # Add an empty result, just so that we don't reanalyze this again.
        if len(bug_introducing_commits) == 0:
            bug_introducing_commits.append(
                {
                    "bug_fixing_mercurial_rev": bug_fixing_commit["mercurial_rev"],
                    "bug_fixing_git_rev": bug_fixing_commit["git_rev"],
                    "bug_introducing_mercurial_rev": "",
                    "bug_introducing_git_rev": "",
                }
            )

        return bug_introducing_commits

    with concurrent.futures.ThreadPoolExecutor(
        initializer=_init, initargs=(git_repo_dir,), max_workers=os.cpu_count() + 1
    ) as executor:
        bug_introducing_commits = executor.map(find_bic, bug_fixing_commits)
        bug_introducing_commits = tqdm(
            bug_introducing_commits, total=len(bug_fixing_commits)
        )
        bug_introducing_commits = list(
            itertools.chain.from_iterable(bug_introducing_commits)
        )

    total_results_num = len(bug_introducing_commits)
    bug_introducing_commits = list(filter(None, bug_introducing_commits))
    logger.info(
        f"Skipped {total_results_num - len(bug_introducing_commits)} commits as they were too big"
    )

    db.append(BUG_INTRODUCING_COMMITS_DB, bug_introducing_commits)
    compress_file(BUG_INTRODUCING_COMMITS_DB)