Example No. 1
    def retrieve_revisions(self, limit: Optional[int] = None) -> None:
        phabricator.set_api_key(get_secret("PHABRICATOR_URL"),
                                get_secret("PHABRICATOR_TOKEN"))

        db.download(phabricator.REVISIONS_DB)

        # Get the commits DB, as we need it to get the revision IDs linked to recent commits.
        assert db.download(repository.COMMITS_DB)

        # Get the bugs DB, as we need it to get the revision IDs linked to bugs.
        assert db.download(bugzilla.BUGS_DB)

        # Get IDs of revisions linked to commits since a year ago.
        start_date = datetime.utcnow() - relativedelta(years=1)
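        # get_revision_id presumably returns None for commits without a linked
        # Phabricator revision; filter(None, ...) drops those entries.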
        revision_ids = list(filter(
            None,
            (repository.get_revision_id(commit)
             for commit in repository.get_commits()
             if dateutil.parser.parse(commit["pushdate"]) >= start_date),
        ))
        if limit is not None:
            revision_ids = revision_ids[-limit:]

        # Get IDs of revisions linked to bugs since a year ago.
        for bug in bugzilla.get_bugs():
            if (dateutil.parser.parse(
                    bug["creation_time"]).replace(tzinfo=None) < start_date):
                continue

            revision_ids += bugzilla.get_revision_ids(bug)

        phabricator.download_revisions(revision_ids)

        zstd_compress(phabricator.REVISIONS_DB)
Example No. 2
    def __init__(self, repo_dir: str) -> None:
        repository.clone(repo_dir)

        logger.info("Downloading commits database...")
        assert db.download(repository.COMMITS_DB, support_files_too=True)

        logger.info("Updating commits DB...")
        for commit in repository.get_commits():
            pass

        repository.download_commits(
            repo_dir,
            rev_start="children({})".format(commit["node"]),
        )

        logger.info("Downloading revisions database...")
        assert db.download(phabricator.REVISIONS_DB)

        logger.info("Downloading bugs database...")
        assert db.download(bugzilla.BUGS_DB)

        logger.info("Download commit classifications...")
        assert db.download(BUG_FIXING_COMMITS_DB)

        self.regressor_model = download_and_load_model("regressor")

        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
        phabricator.set_api_key(get_secret("PHABRICATOR_URL"),
                                get_secret("PHABRICATOR_TOKEN"))
Example No. 3
    def __init__(self, repo_dir: str) -> None:
        if not os.path.exists(repo_dir):
            repository.clone(repo_dir)
        else:
            repository.pull(repo_dir, "mozilla-central", "tip")

        logger.info("Downloading commits database...")
        assert db.download(repository.COMMITS_DB, support_files_too=True)

        logger.info("Updating commits DB...")
        for commit in repository.get_commits():
            pass

        repository.download_commits(
            repo_dir,
            rev_start="children({})".format(commit["node"]),
        )

        logger.info("Downloading revisions database...")
        assert db.download(phabricator.REVISIONS_DB)

        logger.info("Downloading bugs database...")
        assert db.download(bugzilla.BUGS_DB)

        phabricator.set_api_key(get_secret("PHABRICATOR_URL"),
                                get_secret("PHABRICATOR_TOKEN"))
Example No. 4
    def apply_phab(self, hg, diff_id):
        phabricator_api = PhabricatorAPI(
            api_key=get_secret("PHABRICATOR_TOKEN"),
            url=get_secret("PHABRICATOR_URL"))

        diffs = phabricator_api.search_diffs(diff_id=diff_id)
        assert len(diffs) == 1, "No diff available for {}".format(diff_id)
        diff = diffs[0]

        # Get the stack of patches
        base, patches = phabricator_api.load_patches_stack(hg, diff)
        assert len(patches) > 0, "No patches to apply"

        # Load all the diffs details with commits messages
        diffs = phabricator_api.search_diffs(diff_phid=[p[0] for p in patches],
                                             attachments={"commits": True})
        commits = {
            diff["phid"]: diff["attachments"]["commits"].get("commits", [])
            for diff in diffs
        }

        # First apply patches on local repo
        for diff_phid, patch in patches:
            commit = commits.get(diff_phid)

            message = ""
            if commit:
                message += "{}\n".format(commit[0]["message"])

            logger.info(f"Applying {diff_phid}")
            hg.import_(
                patches=io.BytesIO(patch.encode("utf-8")),
                message=message,
                user="******",
            )
Example No. 5
    def classify_test_select(self, commits, runnable_jobs_path):
        testfailure_probs = self.testfailure_model.classify(commits[-1],
                                                            probabilities=True)

        logger.info(f"Test failure risk: {testfailure_probs[0][1]}")

        if not runnable_jobs_path:
            runnable_jobs = {}
        elif runnable_jobs_path.startswith("http"):
            r = requests.get(runnable_jobs_path)
            r.raise_for_status()
            runnable_jobs = r.json()
        else:
            with open(runnable_jobs_path, "r") as f:
                runnable_jobs = json.load(f)

        # XXX: Remove tasks which are not in runnable jobs right away, so we avoid classifying them.
        # XXX: Consider using mozilla-central built-in rules to filter some of the tasks out, e.g. SCHEDULES.

        selected_tasks = list(
            self.model.select_tests(
                commits,
                float(get_secret(
                    "TEST_SELECTION_CONFIDENCE_THRESHOLD"))).values())

        # XXX: For now, only restrict to linux64 test tasks (as for runnable jobs above, we could remove these right away).
        selected_tasks = [
            t for t in selected_tasks if t.startswith("test-linux1804-64/")
        ]

        with open("failure_risk", "w") as f:
            f.write("1" if testfailure_probs[0][1] > float(
                get_secret("TEST_FAILURE_CONFIDENCE_THRESHOLD")) else "0")

        # This should be kept in sync with the test scheduling history retriever script.
        # Default to the selected tasks as-is when the set of runnable jobs is unknown.
        cleaned_selected_tasks = selected_tasks
        if len(runnable_jobs) > 0:
            cleaned_selected_tasks = []
            for selected_task in selected_tasks:
                if (selected_task.startswith("test-linux64")
                        and selected_task not in runnable_jobs):
                    selected_task = selected_task.replace(
                        "test-linux64-", "test-linux1804-64-")

                if (selected_task.startswith("test-linux1804-64-")
                        and selected_task not in runnable_jobs):
                    selected_task = selected_task.replace(
                        "test-linux1804-64-", "test-linux64-")

                if selected_task in runnable_jobs:
                    cleaned_selected_tasks.append(selected_task)

        # It isn't worth running the build associated with the tests if we would only run fewer than three test tasks.
        if len(cleaned_selected_tasks) < 3:
            cleaned_selected_tasks = []

        with open("selected_tasks", "w") as f:
            f.writelines(f"{selected_task}\n"
                         for selected_task in cleaned_selected_tasks)
Example No. 6
    def generate(self):
        db_path = os.path.join("data", self.git_repo_path)
        db.register(
            db_path,
            "https://s3-us-west-2.amazonaws.com/communitytc-bugbug/data/",
            VERSION,
        )

        is_old_version = db.is_old_schema(db_path)

        with ThreadPoolExecutorResult(max_workers=2) as executor:
            cloner = executor.submit(repository.clone, self.repo_dir)
            cloner.add_done_callback(
                lambda future: logger.info("mozilla-central cloned")
            )

            git_user = get_secret("GIT_USER")
            git_password = get_secret("GIT_PASSWORD")

            repo_push_url = self.repo_url.replace(
                "https://", f"https://{git_user}:{git_password}@"
            )

            if not is_old_version:
                executor.submit(self.clone_git_repo)
            else:
                executor.submit(self.init_git_repo)

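        # tenacity.retry wraps the call and is invoked immediately, retrying the
        # `git config` command up to 5 times with 30 seconds between attempts.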
        tenacity.retry(
            lambda: subprocess.run(
                ["git", "config", "--global", "http.postBuffer", "12M"], check=True
            ),
            wait=tenacity.wait_fixed(30),
            stop=tenacity.stop_after_attempt(5),
        )()

        push_args = ["git", "push", repo_push_url, "master"]
        if is_old_version:
            push_args.append("--force")

        done = False
        while not done:
            done = generator.generate(
                self.repo_dir,
                self.git_repo_path,
                limit=COMMITS_STEP,
                tokenize=self.tokenize,
                remove_comments=self.remove_comments,
            )

            tenacity.retry(
                lambda: subprocess.run(push_args, cwd=self.git_repo_path, check=True),
                wait=tenacity.wait_fixed(30),
                stop=tenacity.stop_after_attempt(5),
            )()

            # We are not using db.upload as we don't need to upload the git repo.
            upload_s3([f"{db_path}.version"])
Example No. 7
    def generate(self):
        shared_dir = self.repo_dir + "-shared"
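        # Build an `hg robustcheckout` command that clones mozilla-central into
        # self.repo_dir, shares the store via the "-shared" directory and retries
        # network operations up to 7 times.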
        cmd = hglib.util.cmdbuilder(
            "robustcheckout",
            "https://hg.mozilla.org/mozilla-central",
            self.repo_dir,
            purge=True,
            sharebase=shared_dir,
            networkattempts=7,
            branch=b"tip",
        )

        cmd.insert(0, hglib.HGPATH)

        proc = hglib.util.popen(cmd)
        out, err = proc.communicate()
        if proc.returncode:
            raise hglib.error.CommandError(cmd, proc.returncode, out, err)

        logger.info("mozilla-central cloned")

        git_user = get_secret("GIT_USER")
        git_password = get_secret("GIT_PASSWORD")

        repo_url = "https://github.com/marco-c/gecko-dev-wordified"
        repo_push_url = (
            f"https://{git_user}:{git_password}@github.com/marco-c/gecko-dev-wordified"
        )
        git_repo_path = os.path.basename(repo_url)

        retry(lambda: subprocess.run(["git", "clone", repo_url, git_repo_path],
                                     check=True))

        try:
            retry(lambda: subprocess.run(
                ["git", "pull", repo_url, "master"],
                cwd=git_repo_path,
                capture_output=True,
                check=True,
            ))
        except subprocess.CalledProcessError as e:
            # The pull fails when the repository is still empty; re-raise any other error.
            if b"Couldn't find remote ref master" not in e.stdout:
                raise

        done = generator.generate(self.repo_dir, git_repo_path, limit=10000)

        with open("done", "w") as f:
            f.write(str(1 if done else 0))

        retry(lambda: subprocess.run(
            ["git", "config", "--global", "http.postBuffer", "12M"],
            check=True))
        retry(lambda: subprocess.run(["git", "push", repo_push_url, "master"],
                                     cwd=git_repo_path,
                                     check=True))
Example No. 8
    def generate(self):
        db_path = os.path.join("data", self.git_repo_path)
        db.register(
            db_path,
            f"https://community-tc.services.mozilla.com/api/index/v1/task/project.relman.bugbug.microannotate_{self.git_repo_path}.latest/artifacts/public/",
            VERSION,
        )

        # TODO: Check the version again once we can run tasks for longer (https://bugzilla.mozilla.org/show_bug.cgi?id=1604175).
        is_old_version = False  # db.is_old_schema(db_path)

        with ThreadPoolExecutorResult(max_workers=2) as executor:
            cloner = executor.submit(repository.clone, self.repo_dir)
            cloner.add_done_callback(
                lambda future: logger.info("mozilla-central cloned"))

            git_user = get_secret("GIT_USER")
            git_password = get_secret("GIT_PASSWORD")

            repo_push_url = self.repo_url.replace(
                "https://", f"https://{git_user}:{git_password}@")

            if not is_old_version:
                executor.submit(self.clone_git_repo)
            else:
                executor.submit(self.init_git_repo)

        tenacity.retry(
            lambda: subprocess.run(
                ["git", "config", "--global", "http.postBuffer", "12M"],
                check=True),
            wait=tenacity.wait_fixed(30),
            stop=tenacity.stop_after_attempt(5),
        )()

        push_args = ["git", "push", repo_push_url, "master"]
        if is_old_version:
            push_args.append("--force")

        done = False
        while not done:
            done = generator.generate(
                self.repo_dir,
                self.git_repo_path,
                limit=COMMITS_STEP,
                tokenize=self.tokenize,
                remove_comments=self.remove_comments,
            )

            tenacity.retry(
                lambda: subprocess.run(
                    push_args, cwd=self.git_repo_path, check=True),
                wait=tenacity.wait_fixed(30),
                stop=tenacity.stop_after_attempt(5),
            )()
Example No. 9
    def generate(self):
        repository.clone(self.repo_dir)

        logger.info("mozilla-central cloned")

        git_user = get_secret("GIT_USER")
        git_password = get_secret("GIT_PASSWORD")

        repo_push_url = self.repo_url.replace(
            "https://", f"https://{git_user}:{git_password}@")
        git_repo_path = os.path.basename(self.repo_url)

        retry(lambda: subprocess.run(
            ["git", "clone", self.repo_url, git_repo_path], check=True))

        try:
            retry(lambda: subprocess.run(
                ["git", "pull", self.repo_url, "master"],
                cwd=git_repo_path,
                capture_output=True,
                check=True,
            ))
        except subprocess.CalledProcessError as e:
            # The pull fails when the repository is still empty; re-raise any other error.
            if b"Couldn't find remote ref master" not in e.stdout:
                raise

        retry(lambda: subprocess.run(
            ["git", "config", "--global", "http.postBuffer", "12M"],
            check=True))

        for i in range(STEPS):
            logger.info(f"Step {i} out of {STEPS}")

            done = generator.generate(
                self.repo_dir,
                git_repo_path,
                limit=TOTAL_COMMITS // STEPS,
                tokenize=self.tokenize,
                remove_comments=self.remove_comments,
            )

            with open("done", "w") as f:
                f.write(str(1 if done else 0))

            retry(lambda: subprocess.run(
                ["git", "push", repo_push_url, "master"],
                cwd=git_repo_path,
                check=True,
            ))

            if done:
                break
Example No. 10
    def __init__(self, cache_root, repo_url, tokenize, remove_comments):
        git_user = get_secret("GIT_USER")
        git_password = get_secret("GIT_PASSWORD")
        self.repo_url = repo_url.replace(
            "https://", f"https://{git_user}:{git_password}@")
        self.git_repo_path = os.path.basename(self.repo_url)
        self.tokenize = tokenize
        self.remove_comments = remove_comments

        assert os.path.isdir(cache_root), f"Cache root {cache_root} is not a dir."
        self.repo_dir = os.path.join(cache_root, "mozilla-central")
Example No. 11
    def retrieve_bugs(self):
        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

        six_months_ago = datetime.utcnow() - relativedelta(months=6)
        two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
        logger.info(
            "Downloading bugs from {} to {}".format(
                two_years_and_six_months_ago, six_months_ago
            )
        )
        bugzilla.download_bugs_between(two_years_and_six_months_ago, six_months_ago)

        logger.info("Downloading labelled bugs")
        bug_ids = labels.get_all_bug_ids()
        bugzilla.download_bugs(bug_ids)

        # Try to re-download inconsistent bugs, up to three times.
        for i in range(3):
            bug_ids = bug_snapshot.get_inconsistencies()
            if len(bug_ids) == 0:
                break

            logger.info(
                f"Re-downloading {len(bug_ids)} bugs, as they were inconsistent"
            )
            bugzilla.delete_bugs(bug_ids)
            bugzilla.download_bugs(bug_ids)

        self.compress_file("data/bugs.json")
Example No. 12
    def apply_phab(self, hg, diff_id):
        phabricator_api = PhabricatorAPI(
            api_key=get_secret("PHABRICATOR_TOKEN"),
            url=get_secret("PHABRICATOR_URL"))

        diffs = phabricator_api.search_diffs(diff_id=diff_id)
        assert len(diffs) == 1, f"No diff available for {diff_id}"
        diff = diffs[0]

        # Get the stack of patches
        base, patches = phabricator_api.load_patches_stack(hg, diff)
        assert len(patches) > 0, "No patches to apply"

        # Load all the diffs details with commits messages
        diffs = phabricator_api.search_diffs(diff_phid=[p[0] for p in patches],
                                             attachments={"commits": True})

        diffs_data = {}
        for diff in diffs:
            revision = phabricator_api.load_revision(
                rev_phid=diff["revisionPHID"])
            logger.info("Diff {} linked to Revision {}".format(
                diff["id"], revision["id"]))

            diffs_data[diff["phid"]] = {
                "commits": diff["attachments"]["commits"].get("commits", []),
                "revision": revision,
            }

        # First apply patches on local repo
        for diff_phid, patch in patches:
            diff_data = diffs_data.get(diff_phid)

            commits = diff_data["commits"]
            revision = diff_data["revision"]

            if commits and commits[0]["message"]:
                message = commits[0]["message"]
            else:
                message = revision["fields"]["title"]

            logger.info(f"Applying {diff_phid}")
            hg.import_(
                patches=io.BytesIO(patch.encode("utf-8")),
                message=message,
                user="******",
            )
Example No. 13
    def generate(self):

        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
            cloner = executor.submit(repository.clone, self.repo_dir)
            cloner.add_done_callback(
                lambda future: logger.info("mozilla-central cloned"))

            git_user = get_secret("GIT_USER")
            git_password = get_secret("GIT_PASSWORD")

            repo_push_url = self.repo_url.replace(
                "https://", f"https://{git_user}:{git_password}@")
            git_repo_path = os.path.basename(self.repo_url)

            executor.submit(self.clone_git_repo, git_repo_path)

        retry(lambda: subprocess.run(
            ["git", "config", "--global", "http.postBuffer", "12M"],
            check=True))

        for i in range(STEPS):
            logger.info(f"Step {i} out of {STEPS}")

            done = generator.generate(
                self.repo_dir,
                git_repo_path,
                limit=TOTAL_COMMITS // STEPS,
                tokenize=self.tokenize,
                remove_comments=self.remove_comments,
            )

            with open("done", "w") as f:
                f.write(str(1 if done else 0))

            retry(lambda: subprocess.run(
                ["git", "push", repo_push_url, "master"],
                cwd=git_repo_path,
                check=True,
            ))

            if done:
                break
Example No. 14
    def __init__(self, repo_dir: str) -> None:
        self.risk_bands = sorted(
            (
                parse_risk_band(risk_band)
                for risk_band in get_secret("REGRESSOR_RISK_BANDS").split(";")
            ),
            key=lambda x: x[1],
        )

        repository.clone(repo_dir)

        logger.info("Downloading commits database...")
        assert db.download(repository.COMMITS_DB, support_files_too=True)

        logger.info("Updating commits DB...")
        for commit in repository.get_commits():
            pass

        repository.download_commits(
            repo_dir,
            rev_start="children({})".format(commit["node"]),
        )

        logger.info("Downloading revisions database...")
        assert db.download(phabricator.REVISIONS_DB)

        logger.info("Downloading bugs database...")
        assert db.download(bugzilla.BUGS_DB)

        logger.info("Download commit classifications...")
        assert db.download(BUG_FIXING_COMMITS_DB)

        self.regressor_model = cast(
            RegressorModel, RegressorModel.load(download_model("regressor"))
        )

        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
        phabricator.set_api_key(
            get_secret("PHABRICATOR_URL"), get_secret("PHABRICATOR_TOKEN")
        )
Example No. 15
    def apply_phab(self, hg, phabricator_deployment, diff_id):
        if phabricator_deployment == PHAB_PROD:
            api_key = get_secret("PHABRICATOR_TOKEN")
            url = get_secret("PHABRICATOR_URL")
        else:
            api_key = get_secret("PHABRICATOR_DEV_TOKEN")
            url = get_secret("PHABRICATOR_DEV_URL")

        phabricator_api = PhabricatorAPI(api_key=api_key, url=url)

        # Get the stack of patches
        stack = phabricator_api.load_patches_stack(diff_id)
        assert len(stack) > 0, "No patches to apply"

        # Find the first unknown base revision
        needed_stack = []
        revisions = {}
        for patch in reversed(stack):
            needed_stack.insert(0, patch)

            # Stop as soon as a base revision is available
            if self.has_revision(hg, patch.base_revision):
                logger.info(
                    f"Stopping at diff {patch.id} and revision {patch.base_revision}"
                )
                break

        if not needed_stack:
            logger.info("All the patches are already applied")
            return

        # Load all the diff revisions
        diffs = phabricator_api.search_diffs(diff_phid=[p.phid for p in stack])
        revisions = {
            diff["phid"]:
            phabricator_api.load_revision(rev_phid=diff["revisionPHID"],
                                          attachments={"reviewers": True})
            for diff in diffs
        }

        # Update repo to base revision
        hg_base = needed_stack[0].base_revision
        if not self.has_revision(hg, hg_base):
            logger.warning(
                "Missing base revision {} from Phabricator".format(hg_base))
            hg_base = "tip"

        if hg_base:
            hg.update(rev=hg_base, clean=True)
            logger.info(f"Updated repo to {hg_base}")

            if self.git_repo_dir and hg_base != "tip":
                try:
                    self.git_base = tuple(
                        vcs_map.mercurial_to_git(self.git_repo_dir,
                                                 [hg_base]))[0]
                    subprocess.run(
                        [
                            "git", "checkout", "-b", "analysis_branch",
                            self.git_base
                        ],
                        check=True,
                        cwd=self.git_repo_dir,
                    )
                    logger.info(f"Updated git repo to {self.git_base}")
                except Exception as e:
                    logger.info(
                        f"Updating git repo to Mercurial {hg_base} failed: {e}"
                    )

        def load_user(phid):
            if phid.startswith("PHID-USER"):
                return phabricator_api.load_user(user_phid=phid)
            elif phid.startswith("PHID-PROJ"):
                # TODO: Support group reviewers somehow.
                logger.info(f"Skipping group reviewer {phid}")
            else:
                raise Exception(f"Unsupported reviewer {phid}")

        for patch in needed_stack:
            revision = revisions[patch.phid]

            message = "{}\n\n{}".format(revision["fields"]["title"],
                                        revision["fields"]["summary"])

            author_name = None
            author_email = None

            if patch.commits:
                author_name = patch.commits[0]["author"]["name"]
                author_email = patch.commits[0]["author"]["email"]

            if author_name is None:
                author = load_user(revision["fields"]["authorPHID"])
                author_name = author["fields"]["realName"]
                # XXX: Figure out a way to know the email address of the author.
                author_email = author["fields"]["username"]

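            # load_user returns None for group reviewers, so drop them before
            # collecting the reviewer usernames.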
            reviewers = list(
                filter(
                    None,
                    (load_user(reviewer["reviewerPHID"]) for reviewer in
                     revision["attachments"]["reviewers"]["reviewers"]),
                ))
            reviewers = set(reviewer["fields"]["username"]
                            for reviewer in reviewers)

            if len(reviewers):
                message = replace_reviewers(message, reviewers)

            logger.info(
                f"Applying {patch.phid} from revision {revision['id']}: {message}"
            )

            hg.import_(
                patches=io.BytesIO(patch.patch.encode("utf-8")),
                message=message.encode("utf-8"),
                user=f"{author_name} <{author_email}>".encode("utf-8"),
            )

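            # If a git clone is configured, apply the same patch there with the
            # system `patch` tool and commit it using the original author information.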
            if self.git_repo_dir:
                patch_proc = subprocess.Popen(
                    ["patch", "-p1", "--no-backup-if-mismatch", "--force"],
                    stdin=subprocess.PIPE,
                    cwd=self.git_repo_dir,
                )
                patch_proc.communicate(patch.patch.encode("utf-8"))
                assert patch_proc.returncode == 0, "Failed to apply patch"

                subprocess.run(
                    [
                        "git",
                        "-c",
                        f"user.name={author_name}",
                        "-c",
                        f"user.email={author_email}",
                        "commit",
                        "-am",
                        message,
                    ],
                    check=True,
                    cwd=self.git_repo_dir,
                )
Example No. 16
    def retrieve_bugs(self, limit=None):
        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

        db.download(bugzilla.BUGS_DB)

        # Get IDs of bugs changed since last run.
        last_modified = db.last_modified(bugzilla.BUGS_DB)
        logger.info(
            f"Retrieving IDs of bugs modified since the last run on {last_modified}"
        )
        changed_ids = bugzilla.get_ids(
            {"f1": "delta_ts", "o1": "greaterthaneq", "v1": last_modified.date()}
        )
        logger.info(f"Retrieved {len(changed_ids)} IDs.")

        # Get IDs of bugs between (two years and six months ago) and (six months ago).
        six_months_ago = datetime.utcnow() - relativedelta(months=6)
        two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
        logger.info(
            f"Retrieving bug IDs from {two_years_and_six_months_ago} to {six_months_ago}"
        )
        timespan_ids = bugzilla.get_ids_between(
            two_years_and_six_months_ago, six_months_ago
        )
        if limit:
            timespan_ids = timespan_ids[:limit]
        logger.info(f"Retrieved {len(timespan_ids)} IDs.")

        # Get IDs of labelled bugs.
        labelled_bug_ids = labels.get_all_bug_ids()
        if limit:
            labelled_bug_ids = labelled_bug_ids[:limit]
        logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

        # Get the commits DB, as we need it to get the bug IDs linked to recent commits.
        # XXX: Temporarily avoid downloading the commits DB when a limit is set, to avoid the integration test fail when the commits DB is bumped.
        if limit is None:
            assert db.download(repository.COMMITS_DB)

        # Get IDs of bugs linked to commits (used for some commit-based models, e.g. backout and regressor).
        start_date = datetime.now() - relativedelta(years=3)
        commit_bug_ids = [
            commit["bug_id"]
            for commit in repository.get_commits()
            if commit["bug_id"]
            and dateutil.parser.parse(commit["pushdate"]) >= start_date
        ]
        if limit:
            commit_bug_ids = commit_bug_ids[-limit:]
        logger.info(f"{len(commit_bug_ids)} bugs linked to commits to download.")

        # Get IDs of bugs which are regressions and bugs which caused regressions (useful for the regressor model).
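        # sum(..., []) concatenates the per-bug ID lists into a single flat list.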
        regressed_by_bug_ids = sum(
            (bug["regressed_by"] + bug["regressions"] for bug in bugzilla.get_bugs()),
            [],
        )
        if limit:
            regressed_by_bug_ids = regressed_by_bug_ids[-limit:]
        logger.info(
            f"{len(regressed_by_bug_ids)} bugs which caused regressions fixed by commits."
        )

        all_ids = (
            timespan_ids + labelled_bug_ids + commit_bug_ids + regressed_by_bug_ids
        )
        all_ids_set = set(all_ids)

        # We have to redownload bugs that were changed since the last download.
        # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled.
        bugzilla.delete_bugs(
            lambda bug: bug["id"] in changed_ids or bug["id"] not in all_ids_set
        )

        bugzilla.download_bugs(all_ids)

        # Get regressed_by_bug_ids again (the set could have changed after downloading new bugs).
        regressed_by_bug_ids = sum(
            (bug["regressed_by"] + bug["regressions"] for bug in bugzilla.get_bugs()),
            [],
        )
        logger.info(
            f"{len(regressed_by_bug_ids)} bugs which caused regressions fixed by commits."
        )

        bugzilla.download_bugs(regressed_by_bug_ids)

        # Try to re-download inconsistent bugs, up to three times.
        inconsistent_bugs = bugzilla.get_bugs(include_invalid=True)
        for i in range(3):
            # We look for inconsistencies in all bugs first, then, on following passes,
            # we only look for inconsistencies in bugs that were found to be inconsistent in the first pass
            inconsistent_bugs = bug_snapshot.get_inconsistencies(inconsistent_bugs)
            inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

            if len(inconsistent_bug_ids) == 0:
                break

            logger.info(
                f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
            )
            bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
            bugzilla.download_bugs(inconsistent_bug_ids)

        zstd_compress("data/bugs.json")
Example No. 17
def get_token() -> str:
    return get_secret("GITHUB_TOKEN")
Example No. 18
    def apply_phab(self, hg, diff_id):
        def has_revision(revision):
            if not revision:
                return False
            try:
                hg.identify(revision)
                return True
            except hglib.error.CommandError:
                return False

        phabricator_api = PhabricatorAPI(
            api_key=get_secret("PHABRICATOR_TOKEN"),
            url=get_secret("PHABRICATOR_URL"))

        # Get the stack of patches
        stack = phabricator_api.load_patches_stack(diff_id)
        assert len(stack) > 0, "No patches to apply"

        # Find the first unknown base revision
        needed_stack = []
        revisions = {}
        for patch in reversed(stack):
            needed_stack.insert(0, patch)

            # Stop as soon as a base revision is available
            if has_revision(patch.base_revision):
                logger.info(
                    f"Stopping at diff {patch.id} and revision {patch.base_revision}"
                )
                break

        if not needed_stack:
            logger.info("All the patches are already applied")
            return

        # Load all the diff revisions
        diffs = phabricator_api.search_diffs(diff_phid=[p.phid for p in stack])
        revisions = {
            diff["phid"]:
            phabricator_api.load_revision(rev_phid=diff["revisionPHID"],
                                          attachments={"reviewers": True})
            for diff in diffs
        }

        # Update repo to base revision
        hg_base = needed_stack[0].base_revision
        if not has_revision(hg_base):
            logger.warning(
                "Missing base revision {} from Phabricator".format(hg_base))
            hg_base = "tip"

        if hg_base:
            hg.update(rev=hg_base, clean=True)
            logger.info(f"Updated repo to {hg_base}")

            try:
                self.git_base = vcs_map.mercurial_to_git(hg_base)
                subprocess.run(
                    [
                        "git", "checkout", "-b", "analysis_branch",
                        self.git_base
                    ],
                    check=True,
                    cwd=self.git_repo_dir,
                )
                logger.info(f"Updated git repo to {self.git_base}")
            except Exception as e:
                logger.info(
                    f"Updating git repo to Mercurial {hg_base} failed: {e}")

        def load_user(phid):
            if phid.startswith("PHID-USER"):
                return phabricator_api.load_user(user_phid=phid)
            elif phid.startswith("PHID-PROJ"):
                # TODO: Support group reviewers somehow.
                logger.info(f"Skipping group reviewer {phid}")
            else:
                raise Exception(f"Unsupported reviewer {phid}")

        for patch in needed_stack:
            revision = revisions[patch.phid]

            message = "{}\n\n{}".format(revision["fields"]["title"],
                                        revision["fields"]["summary"])

            author_name = None
            author_email = None

            if patch.commits:
                author_name = patch.commits[0]["author"]["name"]
                author_email = patch.commits[0]["author"]["email"]

            if author_name is None:
                author = load_user(revision["fields"]["authorPHID"])
                author_name = author["fields"]["realName"]
                # XXX: Figure out a way to know the email address of the author.
                author_email = author["fields"]["username"]

            reviewers = list(
                filter(
                    None,
                    (load_user(reviewer["reviewerPHID"]) for reviewer in
                     revision["attachments"]["reviewers"]["reviewers"]),
                ))
            reviewers = set(reviewer["fields"]["username"]
                            for reviewer in reviewers)

            if len(reviewers):
                message = replace_reviewers(message, reviewers)

            logger.info(
                f"Applying {patch.phid} from revision {revision['id']}: {message}"
            )

            hg.import_(
                patches=io.BytesIO(patch.patch.encode("utf-8")),
                message=message.encode("utf-8"),
                user=f"{author_name} <{author_email}>".encode("utf-8"),
            )

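            # Mirror the patch in the git clone: write it to a temporary file,
            # apply it with a 3-way merge, and commit it with the same author and message.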
            with tempfile.TemporaryDirectory() as tmpdirname:
                temp_file = os.path.join(tmpdirname, "temp.patch")
                with open(temp_file, "w") as f:
                    f.write(patch.patch)

                subprocess.run(
                    ["git", "apply", "--3way", temp_file],
                    check=True,
                    cwd=self.git_repo_dir,
                )
                subprocess.run(
                    [
                        "git",
                        "-c",
                        f"user.name={author_name}",
                        "-c",
                        f"user.email={author_email}",
                        "commit",
                        "-am",
                        message,
                    ],
                    check=True,
                    cwd=self.git_repo_dir,
                )
Example No. 19
    def classify(self, diff_id):
        self.update_commit_db()

        with hglib.open(self.repo_dir) as hg:
            self.apply_phab(hg, diff_id)

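            # hg log lists changesets newest-first, so the first non-public (draft)
            # changeset is the tip of the stack that was just applied.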
            patch_rev = hg.log(revrange="not public()")[0].node

            # Analyze patch.
            commits = repository.download_commits(
                self.repo_dir, rev_start=patch_rev.decode("utf-8"), save=False
            )

        # We use "clean" (or "dirty") commits as the background dataset for feature importance.
        # This way, we can see the features which are most important in differentiating
        # the current commit from the "clean" (or "dirty") commits.

        if not self.use_test_history:
            probs, importance = self.model.classify(
                commits[-1],
                probabilities=True,
                importances=True,
                background_dataset=lambda v: self.X[self.y != v],
                importance_cutoff=0.05,
            )

            self.generate_feature_importance_data(probs, importance)

            with open("probs.json", "w") as f:
                json.dump(probs[0].tolist(), f)

            if self.model_name == "regressor" and self.method_defect_predictor_dir:
                self.classify_methods(commits[-1])
        else:
            testfailure_probs = self.testfailure_model.classify(
                commits[-1], probabilities=True
            )

            logger.info(f"Test failure risk: {testfailure_probs[0][1]}")

            commit_data = commit_features.merge_commits(commits)

            push_num = self.past_failures_data["push_num"]

            # XXX: Consider using mozilla-central built-in rules to filter some of these out, e.g. SCHEDULES.
            # XXX: Consider using the runnable jobs artifact from the Gecko Decision task.
            all_tasks = self.past_failures_data["all_tasks"]

            # XXX: For now, only restrict to test-linux64 tasks.
            all_tasks = [
                t
                for t in all_tasks
                if t.startswith("test-linux64/") and "test-verify" not in t
            ]

            commit_tests = []
            for data in test_scheduling.generate_data(
                self.past_failures_data, commit_data, push_num, all_tasks, [], []
            ):
                if not data["name"].startswith("test-"):
                    continue

                commit_test = commit_data.copy()
                commit_test["test_job"] = data
                commit_tests.append(commit_test)

            probs = self.model.classify(commit_tests, probabilities=True)
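            # Keep only the tasks whose predicted failure probability exceeds the
            # configured confidence threshold.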
            selected_indexes = np.argwhere(
                probs[:, 1] > float(get_secret("TEST_SELECTION_CONFIDENCE_THRESHOLD"))
            )[:, 0]
            selected_tasks = [
                commit_tests[i]["test_job"]["name"] for i in selected_indexes
            ]

            with open("failure_risk", "w") as f:
                f.write(
                    "1"
                    if testfailure_probs[0][1]
                    > float(get_secret("TEST_FAILURE_CONFIDENCE_THRESHOLD"))
                    else "0"
                )

            # It isn't worth running the build associated with the tests if we would only run fewer than three test tasks.
            if len(selected_tasks) < 3:
                selected_tasks = []

            with open("selected_tasks", "w") as f:
                f.writelines(f"{selected_task}\n" for selected_task in selected_tasks)
Example No. 20
    def classify_test_select(self, commits, runnable_jobs_path):
        testfailure_probs = self.testfailure_model.classify(commits[-1],
                                                            probabilities=True)

        logger.info(f"Test failure risk: {testfailure_probs[0][1]}")

        commit_data = commit_features.merge_commits(commits)

        push_num = self.past_failures_data["push_num"]

        # XXX: Consider using mozilla-central built-in rules to filter some of these out, e.g. SCHEDULES.
        all_tasks = self.past_failures_data["all_runnables"]

        if not runnable_jobs_path:
            runnable_jobs = {task for task in all_tasks}
        elif runnable_jobs_path.startswith("http"):
            r = requests.get(runnable_jobs_path)
            r.raise_for_status()
            runnable_jobs = r.json()
        else:
            with open(runnable_jobs_path, "r") as f:
                runnable_jobs = json.load(f)

        # XXX: For now, only restrict to linux64 test tasks.
        all_tasks = [
            t for t in all_tasks if t.startswith("test-linux1804-64/")
        ]

        # XXX: Remove tasks which are not in runnable jobs right away, so we avoid classifying them.

        commit_tests = []
        for data in test_scheduling.generate_data(self.past_failures_data,
                                                  commit_data, push_num,
                                                  all_tasks, [], []):
            if not data["name"].startswith("test-"):
                continue

            commit_test = commit_data.copy()
            commit_test["test_job"] = data
            commit_tests.append(commit_test)

        probs = self.model.classify(commit_tests, probabilities=True)
        selected_indexes = np.argwhere(probs[:, 1] > float(
            get_secret("TEST_SELECTION_CONFIDENCE_THRESHOLD")))[:, 0]
        selected_tasks = [
            commit_tests[i]["test_job"]["name"] for i in selected_indexes
        ]

        with open("failure_risk", "w") as f:
            f.write("1" if testfailure_probs[0][1] > float(
                get_secret("TEST_FAILURE_CONFIDENCE_THRESHOLD")) else "0")

        # This should be kept in sync with the test scheduling history retriever script.
        cleaned_selected_tasks = []
        for selected_task in selected_tasks:
            if (selected_task.startswith("test-linux64")
                    and selected_task not in runnable_jobs):
                selected_task = selected_task.replace("test-linux64-",
                                                      "test-linux1804-64-")

            if (selected_task.startswith("test-linux1804-64-")
                    and selected_task not in runnable_jobs):
                selected_task = selected_task.replace("test-linux1804-64-",
                                                      "test-linux64-")

            if selected_task in runnable_jobs:
                cleaned_selected_tasks.append(selected_task)

        # It isn't worth running the build associated with the tests if we would only run fewer than three test tasks.
        if len(cleaned_selected_tasks) < 3:
            cleaned_selected_tasks = []

        with open("selected_tasks", "w") as f:
            f.writelines(f"{selected_task}\n"
                         for selected_task in cleaned_selected_tasks)
Example No. 21
    def apply_phab(self, hg, diff_id):
        def has_revision(revision):
            if not revision:
                return False
            try:
                hg.identify(revision)
                return True
            except hglib.error.CommandError:
                return False

        phabricator_api = PhabricatorAPI(
            api_key=get_secret("PHABRICATOR_TOKEN"), url=get_secret("PHABRICATOR_URL")
        )

        # Get the stack of patches
        stack = phabricator_api.load_patches_stack(diff_id)
        assert len(stack) > 0, "No patches to apply"

        # Find the first unknown base revision
        needed_stack = []
        revisions = {}
        for patch in reversed(stack):
            needed_stack.insert(0, patch)

            # Stop as soon as a base revision is available
            if has_revision(patch.base_revision):
                logger.info(
                    f"Stopping at diff {patch.id} and revision {patch.base_revision}"
                )
                break

        if not needed_stack:
            logger.info("All the patches are already applied")
            return

        # Load all the diff revisions
        diffs = phabricator_api.search_diffs(diff_phid=[p.phid for p in stack])
        revisions = {
            diff["phid"]: phabricator_api.load_revision(rev_phid=diff["revisionPHID"])
            for diff in diffs
        }

        # Update repo to base revision
        hg_base = needed_stack[0].base_revision
        if not has_revision(hg_base):
            logger.warning("Missing base revision {} from Phabricator".format(hg_base))
            hg_base = "tip"

        if hg_base:
            hg.update(rev=hg_base, clean=True)
            logger.info(f"Updated repo to {hg_base}")

            try:
                self.git_base = vcs_map.mercurial_to_git(hg_base)
                subprocess.run(
                    ["git", "checkout", "-b", "analysis_branch", self.git_base],
                    check=True,
                    cwd=self.git_repo_dir,
                )
                logger.info(f"Updated git repo to {self.git_base}")
            except Exception as e:
                logger.info(f"Updating git repo to Mercurial {hg_base} failed: {e}")

        for patch in needed_stack:
            revision = revisions[patch.phid]

            if patch.commits:
                message = patch.commits[0]["message"]
                author_name = patch.commits[0]["author"]["name"]
                author_email = patch.commits[0]["author"]["email"]
            else:
                message = revision["fields"]["title"]
                author_name = "bugbug"
                author_email = "*****@*****.**"

            logger.info(
                f"Applying {patch.phid} from revision {revision['id']}: {message}"
            )

            hg.import_(
                patches=io.BytesIO(patch.patch.encode("utf-8")),
                message=message.encode("utf-8"),
                user=f"{author_name} <{author_email}>".encode("utf-8"),
            )

            with tempfile.TemporaryDirectory() as tmpdirname:
                temp_file = os.path.join(tmpdirname, "temp.patch")
                with open(temp_file, "w") as f:
                    f.write(patch.patch)

                subprocess.run(
                    ["git", "apply", "--3way", temp_file],
                    check=True,
                    cwd=self.git_repo_dir,
                )
                subprocess.run(
                    [
                        "git",
                        "-c",
                        f"user.name={author_name}",
                        "-c",
                        f"user.email={author_email}",
                        "commit",
                        "-am",
                        message,
                    ],
                    check=True,
                    cwd=self.git_repo_dir,
                )
Example No. 22
    def retrieve_bugs(self, limit: Optional[int] = None) -> None:
        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

        db.download(bugzilla.BUGS_DB)

        # Get IDs of bugs changed since last run.
        last_modified = db.last_modified(bugzilla.BUGS_DB)
        logger.info(
            f"Retrieving IDs of bugs modified since the last run on {last_modified}"
        )
        changed_ids = set(
            bugzilla.get_ids({
                "f1": "delta_ts",
                "o1": "greaterthaneq",
                "v1": last_modified.date()
            }))
        logger.info(f"Retrieved {len(changed_ids)} IDs.")

        all_components = bugzilla.get_product_component_count(9999)

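        # Also re-download bugs whose product::component pair no longer exists,
        # as they have presumably been moved to a different component.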
        deleted_component_ids = set(
            bug["id"] for bug in bugzilla.get_bugs() if "{}::{}".format(
                bug["product"], bug["component"]) not in all_components)
        logger.info(
            f"{len(deleted_component_ids)} bugs belonging to deleted components"
        )
        changed_ids |= deleted_component_ids

        # Get IDs of bugs between (two years and six months ago) and now.
        two_years_and_six_months_ago = datetime.utcnow() - relativedelta(
            years=2, months=6)
        logger.info(f"Retrieving bug IDs since {two_years_and_six_months_ago}")
        timespan_ids = bugzilla.get_ids_between(two_years_and_six_months_ago)
        if limit:
            timespan_ids = timespan_ids[-limit:]
        logger.info(f"Retrieved {len(timespan_ids)} IDs.")

        # Get IDs of labelled bugs.
        labelled_bug_ids = labels.get_all_bug_ids()
        if limit:
            labelled_bug_ids = labelled_bug_ids[-limit:]
        logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

        # Get the commits DB, as we need it to get the bug IDs linked to recent commits.
        # XXX: Temporarily avoid downloading the commits DB when a limit is set, to avoid the integration test fail when the commits DB is bumped.
        if limit is None:
            assert db.download(repository.COMMITS_DB)

        # Get IDs of bugs linked to commits (used for some commit-based models, e.g. backout and regressor).
        start_date = datetime.now() - relativedelta(years=3)
        commit_bug_ids = list(
            set(commit["bug_id"] for commit in repository.get_commits()
                if commit["bug_id"]
                and dateutil.parser.parse(commit["pushdate"]) >= start_date))
        if limit:
            commit_bug_ids = commit_bug_ids[-limit:]
        logger.info(
            f"{len(commit_bug_ids)} bugs linked to commits to download.")

        # Get IDs of bugs which are regressions, bugs which caused regressions (useful for the regressor model),
        # and blocked bugs.
        regression_related_ids: List[int] = list(
            set(
                sum(
                    (bug["regressed_by"] + bug["regressions"] + bug["blocks"]
                     for bug in bugzilla.get_bugs()),
                    [],
                )))
        if limit:
            regression_related_ids = regression_related_ids[-limit:]
        logger.info(
            f"{len(regression_related_ids)} bugs which caused regressions fixed by commits."
        )

        # Get IDs of bugs linked to intermittent failures.
        test_failure_bug_ids = [
            item["bug_id"] for item in test_scheduling.get_failure_bugs(
                two_years_and_six_months_ago, datetime.utcnow())
        ]
        if limit:
            test_failure_bug_ids = test_failure_bug_ids[-limit:]
        logger.info(f"{len(test_failure_bug_ids)} bugs about test failures.")

        all_ids = (timespan_ids + labelled_bug_ids + commit_bug_ids +
                   regression_related_ids + test_failure_bug_ids)
        all_ids_set = set(all_ids)

        # We have to redownload bugs that were changed since the last download.
        # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled.
        bugzilla.delete_bugs(lambda bug: bug["id"] in changed_ids or bug["id"]
                             not in all_ids_set)

        new_bugs = bugzilla.download_bugs(all_ids)

        # Get regression_related_ids again (the set could have changed after downloading new bugs).
        for i in range(7):
            regression_related_ids = list(
                set(
                    sum(
                        (bug["regressed_by"] + bug["regressions"] +
                         bug["blocks"] for bug in new_bugs),
                        [],
                    )))
            logger.info(
                f"{len(regression_related_ids)} bugs which caused regressions fixed by commits."
            )
            if limit:
                regression_related_ids = regression_related_ids[-limit:]

            # If we got all bugs we needed, break.
            if set(regression_related_ids).issubset(all_ids):
                break

            new_bugs = bugzilla.download_bugs(regression_related_ids)

        # Try to re-download inconsistent bugs, up to twice.
        inconsistent_bugs = bugzilla.get_bugs(include_invalid=True)
        for i in range(2):
            # We look for inconsistencies in all bugs first, then, on following passes,
            # we only look for inconsistencies in bugs that were found to be inconsistent in the first pass
            inconsistent_bugs = bug_snapshot.get_inconsistencies(
                inconsistent_bugs)
            inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

            if len(inconsistent_bug_ids) == 0:
                break

            logger.info(
                f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
            )
            bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
            bugzilla.download_bugs(inconsistent_bug_ids)

        # TODO: Figure out why.
        missing_history_bug_ids = {
            bug["id"]
            for bug in bugzilla.get_bugs() if "history" not in bug
        }
        bugzilla.delete_bugs(lambda bug: bug["id"] in missing_history_bug_ids)
        logger.info(
            f"Deleted {len(missing_history_bug_ids)} bugs as we couldn't retrieve their history"
        )

        zstd_compress(bugzilla.BUGS_DB)
Example No. 23
    def apply_phab(self, hg, diff_id):
        def has_revision(revision):
            if not revision:
                return False
            try:
                hg.identify(revision)
                return True
            except hglib.error.CommandError:
                return False

        phabricator_api = PhabricatorAPI(
            api_key=get_secret("PHABRICATOR_TOKEN"),
            url=get_secret("PHABRICATOR_URL"))

        # Get the stack of patches
        stack = phabricator_api.load_patches_stack(diff_id)
        assert len(stack) > 0, "No patches to apply"

        # Find the first unknown base revision
        needed_stack = []
        revisions = {}
        for patch in reversed(stack):
            needed_stack.insert(0, patch)

            # Stop as soon as a base revision is available
            if has_revision(patch.base_revision):
                logger.info(
                    f"Stopping at diff {patch.id} and revision {patch.base_revision}"
                )
                break

        if not needed_stack:
            logger.info("All the patches are already applied")
            return

        # Load all the diff revisions
        diffs = phabricator_api.search_diffs(diff_phid=[p.phid for p in stack])
        revisions = {
            diff["phid"]:
            phabricator_api.load_revision(rev_phid=diff["revisionPHID"])
            for diff in diffs
        }

        # Update repo to base revision
        hg_base = needed_stack[0].base_revision
        if hg_base:
            hg.update(rev=hg_base, clean=True)
            logger.info(f"Updated repo to {hg_base}")

        for patch in needed_stack:
            revision = revisions[patch.phid]

            if patch.commits:
                message = patch.commits[0]["message"]
            else:
                message = revision["fields"]["title"]

            logger.info(
                f"Applying {patch.phid} from revision {revision['id']}: {message}"
            )

            hg.import_(
                patches=io.BytesIO(patch.patch.encode("utf-8")),
                message=message,
                user="******",
            )
Example No. 24
    def retrieve_bugs(self):
        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

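        # Only reuse the previously downloaded bugs DB if its schema version is
        # still current.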
        db.download_version(bugzilla.BUGS_DB)
        if not db.is_old_version(bugzilla.BUGS_DB):
            db.download(bugzilla.BUGS_DB)

        # Get IDs of bugs changed since last run.
        last_modified = db.last_modified(bugzilla.BUGS_DB)
        logger.info(
            f"Retrieving IDs of bugs modified since the last run on {last_modified}"
        )
        changed_ids = bugzilla.get_ids({
            "f1": "delta_ts",
            "o1": "greaterthaneq",
            "v1": last_modified.date()
        })
        logger.info(f"Retrieved {len(changed_ids)} IDs.")

        # Get IDs of bugs between (two years and six months ago) and (six months ago).
        six_months_ago = datetime.utcnow() - relativedelta(months=6)
        two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
        logger.info(
            f"Retrieving bug IDs from {two_years_and_six_months_ago} to {six_months_ago}"
        )
        timespan_ids = bugzilla.get_ids_between(two_years_and_six_months_ago,
                                                six_months_ago)
        logger.info(f"Retrieved {len(timespan_ids)} IDs.")

        # Get IDs of labelled bugs.
        labelled_bug_ids = labels.get_all_bug_ids()
        logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

        all_ids = set(timespan_ids + labelled_bug_ids)

        # We have to redownload bugs that were changed since the last download.
        # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled.
        bugzilla.delete_bugs(
            lambda bug: bug["id"] in changed_ids or bug["id"] not in all_ids)

        bugzilla.download_bugs(timespan_ids + labelled_bug_ids)

        # Try to re-download inconsistent bugs, up to three times.
        inconsistent_bugs = bugzilla.get_bugs()
        for i in range(3):
            # We look for inconsistencies in all bugs on the first pass; on the
            # following passes, we only check bugs that were found to be
            # inconsistent in the previous pass.
            inconsistent_bugs = bug_snapshot.get_inconsistencies(
                inconsistent_bugs)
            inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

            if len(inconsistent_bug_ids) == 0:
                break

            logger.info(
                f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
            )
            bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
            bugzilla.download_bugs(inconsistent_bug_ids)

        self.compress_file("data/bugs.json")
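The retrieval window above is built with dateutil's relativedelta. A minimal, self-contained sketch of the same computation with a fixed reference date (the date itself is only for the example):

from datetime import datetime

from dateutil.relativedelta import relativedelta

now = datetime(2020, 7, 1)  # fixed "today" so the example is reproducible

six_months_ago = now - relativedelta(months=6)                          # 2020-01-01
two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)  # 2018-01-01

# Bugs inside this window (or explicitly labelled) are kept in the local DB;
# everything else is eligible for deletion.
print(two_years_and_six_months_ago, "->", six_months_ago)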
Example #25
0
    def __init__(self, repo_dir: str) -> None:
        self.risk_bands = sorted(
            (parse_risk_band(risk_band)
             for risk_band in get_secret("REGRESSOR_RISK_BANDS").split(";")),
            key=lambda x: x[1],
        )

        repository.clone(repo_dir)

        logger.info("Downloading commits database...")
        assert db.download(repository.COMMITS_DB, support_files_too=True)

        logger.info("Updating commits DB...")
        # Exhaust the generator so that `commit` ends up as the last commit in the DB.
        for commit in repository.get_commits():
            pass

        repository.download_commits(
            repo_dir,
            rev_start="children({})".format(commit["node"]),
        )

        # Some commits that were already in the DB from the previous run might need
        # to be updated (e.g. coverage information).
        repository.update_commits()

        logger.info("Downloading revisions database...")
        assert db.download(phabricator.REVISIONS_DB)

        logger.info("Downloading bugs database...")
        assert db.download(bugzilla.BUGS_DB)

        logger.info("Download commit classifications...")
        assert db.download(BUG_FIXING_COMMITS_DB)

        self.regressor_model = cast(
            RegressorModel, RegressorModel.load(download_model("regressor")))

        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
        phabricator.set_api_key(get_secret("PHABRICATOR_URL"),
                                get_secret("PHABRICATOR_TOKEN"))

        self.path_to_component = repository.get_component_mapping()

        self.past_regressions_by = {}
        self.past_fixed_bugs_by = {}
        self.past_regression_blocked_bugs_by = {}
        self.past_fixed_bug_blocked_bugs_by = {}

        for dimension in ["component", "directory", "file", "function"]:
            self.past_regressions_by[dimension] = _download_past_bugs(
                PAST_REGRESSIONS_BY_URL.format(dimension=dimension))
            self.past_fixed_bugs_by[dimension] = _download_past_bugs(
                PAST_FIXED_BUGS_BY_URL.format(dimension=dimension))
            self.past_regression_blocked_bugs_by[dimension] = _download_past_bugs(
                PAST_REGRESSION_BLOCKED_BUGS_BY_URL.format(dimension=dimension))
            self.past_fixed_bug_blocked_bugs_by[dimension] = _download_past_bugs(
                PAST_FIXED_BUG_BLOCKED_BUGS_BY_URL.format(dimension=dimension))
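The constructor above relies on a parse_risk_band helper that is not shown in this example; all the snippet tells us is that REGRESSOR_RISK_BANDS is ";"-separated and that the parsed bands are sorted by their second element. A hypothetical implementation consistent with that, assuming a "NAME-START-END" encoding for each band (the encoding is an assumption, not confirmed by the snippet):

def parse_risk_band(risk_band: str) -> tuple:
    # Assumed encoding: "NAME-START-END", e.g. "MEDIUM-0.5-0.8".
    name, start, end = risk_band.split("-")
    return name, float(start), float(end)


# "LOW-0.0-0.5;MEDIUM-0.5-0.8;HIGH-0.8-1.0" would parse into three bands that,
# sorted by their lower bound (key=lambda x: x[1]), come out LOW, MEDIUM, HIGH.
print(sorted(
    (parse_risk_band(band) for band in "LOW-0.0-0.5;MEDIUM-0.5-0.8;HIGH-0.8-1.0".split(";")),
    key=lambda x: x[1],
))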
Example #26
0
    def classify(self, diff_id):
        self.update_commit_db()

        with hglib.open(self.repo_dir) as hg:
            self.apply_phab(hg, diff_id)

            # The most recently applied draft changeset is the top of the imported stack.
            patch_rev = hg.log(revrange="not public()")[0].node

            # Analyze patch.
            commits = repository.download_commits(
                self.repo_dir, rev_start=patch_rev.decode("utf-8"), save=False
            )

        # We use "clean" (or "dirty") commits as the background dataset for feature importance.
        # This way, we can see the features which are most important in differentiating
        # the current commit from the "clean" (or "dirty") commits.

        if not self.use_test_history:
            probs, importance = self.model.classify(
                commits[-1],
                probabilities=True,
                importances=True,
                background_dataset=lambda v: self.X[self.y != v],
                importance_cutoff=0.05,
            )

            self.generate_feature_importance_data(probs, importance)

            with open("probs.json", "w") as f:
                json.dump(probs[0].tolist(), f)

            if self.model_name == "regressor" and self.method_defect_predictor_dir:
                self.classify_methods()
        else:
            backout_probs = self.backout_model.classify(commits[-1], probabilities=True)

            logger.info(f"Backout risk: {backout_probs[0][1]}")

            commit_data = commit_features.merge_commits(commits)

            push_num = self.past_failures_data["push_num"]

            # XXX: Consider using mozilla-central built-in rules to filter some of these out, e.g. SCHEDULES.
            # XXX: Consider using the runnable jobs artifact from the Gecko Decision task.
            all_tasks = self.past_failures_data["all_tasks"]

            selected_tasks = []
            # TODO: Classify multiple commit/test at the same time.
            for data in test_scheduling.generate_data(
                self.past_failures_data, commit_data, push_num, all_tasks, [], []
            ):
                if not data["name"].startswith("test-"):
                    continue

                commit_data["test_job"] = data

                probs = self.model.classify(commit_data, probabilities=True)

                if probs[0][1] > float(
                    get_secret("TEST_SELECTION_CONFIDENCE_THRESHOLD")
                ):
                    selected_tasks.append(data["name"])

            with open("failure_risk", "w") as f:
                f.write(
                    "1"
                    if backout_probs[0][1]
                    > float(get_secret("TEST_FAILURE_CONFIDENCE_THRESHOLD"))
                    else "0"
                )

            with open("selected_tasks", "w") as f:
                f.writelines(f"{selected_task}\n" for selected_task in selected_tasks)
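The method finishes by writing two artifacts: failure_risk (a single "1" or "0") and selected_tasks (one task label per line). A minimal sketch of how a downstream step might read them back; only the file names and formats come from the snippet above, the consuming logic is an illustrative assumption:

# Read the overall failure-risk flag written by classify().
with open("failure_risk") as f:
    risky = f.read().strip() == "1"

# Read the selected task labels, one per line.
with open("selected_tasks") as f:
    selected_tasks = [line.rstrip("\n") for line in f if line.strip()]

if risky:
    print(f"High failure risk; {len(selected_tasks)} tasks selected:", selected_tasks)
else:
    print("Low failure risk; selected tasks:", selected_tasks)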