    def get_landed_and_filed_since(self, days: int) -> List[int]:
        since = datetime.utcnow() - timedelta(days=days)

        commits = [
            commit for commit in repository.get_commits()
            if dateutil.parser.parse(commit["pushdate"]) >= since
            and commit["bug_id"]
        ]

        logger.info(f"Retrieving bug IDs since {days} days ago")
        timespan_ids = bugzilla.get_ids_between(since, datetime.utcnow())
        bugzilla.download_bugs(timespan_ids)

        bug_ids = set(commit["bug_id"] for commit in commits)
        bug_ids.update(bug["id"] for bug in bugzilla.get_bugs()
                       if dateutil.parser.parse(bug["creation_time"]).replace(
                           tzinfo=None) >= since and bug["resolution"] not in [
                               "INVALID",
                               "WONTFIX",
                               "INACTIVE",
                               "DUPLICATE",
                               "INCOMPLETE",
                               "MOVED",
                               "WORKSFORME",
                           ])

        return list(bug_ids)
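
A note on the .replace(tzinfo=None) trick above: datetime.utcnow() is timezone-naive, while Bugzilla's creation_time parses as timezone-aware, and Python refuses to compare the two directly. A minimal sketch of the pitfall (the timestamp is made up):

from datetime import datetime

import dateutil.parser

since = datetime.utcnow()  # naive
created = dateutil.parser.parse("2021-06-01T12:00:00Z")  # aware (UTC)
# created >= since  # TypeError: can't compare offset-naive and offset-aware datetimes
print(created.replace(tzinfo=None) >= since)  # OK: both naive, interpreted as UTC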
    def get_regressors_of(self, bug_ids: List[int]) -> List[int]:
        bugzilla.download_bugs(bug_ids)
        return sum(
            (bug["regressed_by"]
             for bug in bugzilla.get_bugs() if bug["id"] in bug_ids),
            [],
        )
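
The sum((...), []) construct above flattens a sequence of lists; it is quadratic, since each addition builds a fresh list. A small equivalence sketch, with itertools.chain.from_iterable as the usual linear alternative:

import itertools

lists = [[1, 2], [], [3]]
assert sum(lists, []) == list(itertools.chain.from_iterable(lists)) == [1, 2, 3]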
Example #3
    def retrieve_bugs(self):
        bugzilla.set_token(secrets[secrets.BUGZILLA_TOKEN])

        six_months_ago = datetime.utcnow() - relativedelta(months=6)
        two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
        logger.info('Downloading bugs from {} to {}'.format(
            two_years_and_six_months_ago, six_months_ago))
        bugzilla.download_bugs_between(two_years_and_six_months_ago,
                                       six_months_ago)

        logger.info('Downloading labelled bugs')
        bug_ids = labels.get_all_bug_ids()
        bugzilla.download_bugs(bug_ids)

        # Try to re-download inconsistent bugs, up to three times.
        for i in range(3):
            bug_ids = bug_snapshot.get_inconsistencies()
            if len(bug_ids) == 0:
                break

            logger.info(
                f'Re-downloading {len(bug_ids)} bugs, as they were inconsistent'
            )
            bugzilla.delete_bugs(bug_ids)
            bugzilla.download_bugs(bug_ids)

        self.compress_file('data/bugs.json')
Example #4
    def retrieve_bugs(self):
        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

        six_months_ago = datetime.utcnow() - relativedelta(months=6)
        two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
        logger.info(
            "Downloading bugs from {} to {}".format(
                two_years_and_six_months_ago, six_months_ago
            )
        )
        bugzilla.download_bugs_between(two_years_and_six_months_ago, six_months_ago)

        logger.info("Downloading labelled bugs")
        bug_ids = labels.get_all_bug_ids()
        bugzilla.download_bugs(bug_ids)

        # Try to re-download inconsistent bugs, up to three times.
        for i in range(3):
            bug_ids = bug_snapshot.get_inconsistencies()
            if len(bug_ids) == 0:
                break

            logger.info(
                f"Re-downloading {len(bug_ids)} bugs, as they were inconsistent"
            )
            bugzilla.delete_bugs(bug_ids)
            bugzilla.download_bugs(bug_ids)

        self.compress_file("data/bugs.json")
    def get_blocking_of(self,
                        bug_ids: List[int],
                        meta_only: bool = False) -> Dict[int, List[int]]:
        bugzilla.download_bugs(bug_ids)
        bug_map = {bug["id"]: bug for bug in bugzilla.get_bugs()}
        return {
            bug_id: bugzilla.find_blocking(bug_map, bug_map[bug_id])
            for bug_id in bug_ids
            if not meta_only or "meta" in bug_map[bug_id]["keywords"]
        }
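
A hedged usage sketch for the method above (bug IDs and blockers are made up): with meta_only=True, only entries whose bug carries the "meta" keyword survive the comprehension.

# Hypothetical: bug 123 has the "meta" keyword, bug 456 does not.
# self.get_blocking_of([123, 456])                 -> {123: [789], 456: [790]}
# self.get_blocking_of([123, 456], meta_only=True) -> {123: [789]}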
Example #6
    def retrieve_bugs(self):
        bugzilla.set_token(secrets[secrets.BUGZILLA_TOKEN])

        six_months_ago = datetime.utcnow() - timedelta(182)  # ~six months, in days
        two_years_and_six_months_ago = six_months_ago - timedelta(365 * 2)
        bugzilla.download_bugs_between(two_years_and_six_months_ago,
                                       six_months_ago)

        bug_ids = labels.get_all_bug_ids()
        bugzilla.download_bugs(bug_ids)

        self.compress_file('data/bugs.json')
Example #7
    def retrieve_bugs(self):
        bugzilla.set_token(secrets[secrets.BUGZILLA_TOKEN])

        six_months_ago = datetime.utcnow() - timedelta(182)  # ~six months, in days
        two_years_and_six_months_ago = six_months_ago - timedelta(365 * 2)
        logger.info('Downloading bugs from {} to {}'.format(
            two_years_and_six_months_ago, six_months_ago))
        bugzilla.download_bugs_between(two_years_and_six_months_ago,
                                       six_months_ago)

        logger.info('Downloading labelled bugs')
        bug_ids = labels.get_all_bug_ids()
        bugzilla.download_bugs(bug_ids)

        self.compress_file('data/bugs.json')
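
The variants above measure "six months" in two ways: relativedelta(months=6) is calendar-aware, while timedelta(182) is a fixed number of days. A tiny sketch of the drift:

from datetime import datetime, timedelta

from dateutil.relativedelta import relativedelta

now = datetime(2021, 3, 30)
print(now - relativedelta(months=6))  # 2020-09-30 00:00:00 (calendar-aware)
print(now - timedelta(days=182))      # 2020-09-29 00:00:00 (fixed-length)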
    def go(self, days: int) -> None:
        bugs = self.get_landed_and_filed_since(days)

        meta_bugs = self.get_meta_bugs(days)

        last_modified = db.last_modified(bugzilla.BUGS_DB)
        logger.info(
            f"Deleting bugs modified since the last run on {last_modified}")
        changed_ids = bugzilla.get_ids({
            "f1": "delta_ts",
            "o1": "greaterthaneq",
            "v1": last_modified.date()
        })
        bugzilla.delete_bugs(lambda bug: bug["id"] in changed_ids)

        bugs = list(set(bugs))

        test_infos = self.retrieve_test_info(days)
        test_info_bugs: List[int] = [
            bug["id"] for test_info in test_infos.values()
            for bug in test_info["bugs"]
        ]

        logger.info("Download bugs of interest...")
        bugzilla.download_bugs(bugs + test_info_bugs + [FUZZING_METABUG_ID] +
                               meta_bugs)

        logger.info(f"{len(bugs)} bugs to analyze.")

        bugs_set = set(bugs + test_info_bugs + meta_bugs)

        bug_map = {}
        regressor_bug_ids = set()
        for bug in bugzilla.get_bugs():
            # Only add bugs we are interested in to the map, plus bugs that block other bugs (needed for the bug_to_types call).
            if bug["id"] in bugs_set or len(bug["blocks"]) > 0:
                bug_map[bug["id"]] = bug

            if len(bug["regressions"]) > 0:
                regressor_bug_ids.add(bug["id"])

        self.generate_landings_by_date(bug_map, regressor_bug_ids, bugs,
                                       self.get_blocking_of(meta_bugs))

        self.generate_component_connections(bug_map, bugs)

        self.generate_component_test_stats(bug_map, test_infos)
    def go(self, days: int) -> None:
        bugs = self.get_landed_and_filed_since(days)

        meta_bugs = self.get_blocking_of(self.get_meta_bugs(days))
        bugs += meta_bugs.keys()
        bugs += sum(meta_bugs.values(), [])

        bugs = list(set(bugs))

        test_infos = self.retrieve_test_info(days)
        test_info_bugs: List[int] = [
            bug["id"] for test_info in test_infos.values()
            for bug in test_info["bugs"]
        ]

        logger.info("Download bugs of interest...")
        bugzilla.download_bugs(bugs + test_info_bugs)

        logger.info(f"{len(bugs)} bugs to analyze.")

        bugs_set = set(bugs + test_info_bugs)

        bug_map = {}
        regressor_bug_ids = set()
        for bug in bugzilla.get_bugs():
            # Only add bugs we are interested in to the map, plus bugs that block other bugs (needed for the bug_to_types call).
            if bug["id"] in bugs_set or len(bug["blocks"]) > 0:
                bug_map[bug["id"]] = bug

            if len(bug["regressions"]) > 0:
                regressor_bug_ids.add(bug["id"])

        self.generate_landings_by_date(bug_map, regressor_bug_ids, bugs,
                                       meta_bugs)

        self.generate_component_connections(bug_map, bugs)

        self.generate_component_test_stats(bug_map, test_infos)
    def go(self,
           bugs: List[int],
           meta_bugs: Optional[List[int]] = None) -> None:
        if meta_bugs is not None:
            bugs += meta_bugs + self.get_blocking_of(meta_bugs)

        logger.info("Download bugs of interest...")
        bugzilla.download_bugs(bugs)

        component_team_mapping = bugzilla.get_component_team_mapping()

        bugs_set = set(bugs)

        commits = [
            commit for commit in repository.get_commits()
            if commit["bug_id"] in bugs_set
        ]
        commit_map = {commit["node"]: commit for commit in commits}
        hash_to_rev = {commit["node"]: i for i, commit in enumerate(commits)}

        logger.info(f"{len(commits)} commits to analyze.")

        logger.info(f"{len(bugs_set)} bugs to analyze.")

        bug_map = {}
        regressor_bug_ids = set()
        for bug in bugzilla.get_bugs():
            bug_map[bug["id"]] = bug

            if len(bug["regressions"]) > 0:
                regressor_bug_ids.add(bug["id"])

        logger.info("Retrieve Phabricator revisions linked to commits...")
        revision_ids = set(
            filter(None,
                   (repository.get_revision_id(commit) for commit in commits)))

        logger.info("Download revisions of interest...")
        phabricator.download_revisions(revision_ids)

        revision_map = {
            revision["id"]: revision
            for revision in phabricator.get_revisions()
            if revision["id"] in revision_ids
        }

        # blocker_to_meta is read further below (for "meta_ids") even when
        # meta_bugs is None, so define it unconditionally.
        blocker_to_meta = collections.defaultdict(set)
        if meta_bugs is not None:
            for meta_bug in meta_bugs:
                if meta_bug not in bug_map:
                    continue

                for blocker_bug_id in bugzilla.find_blocking(
                        bug_map, bug_map[meta_bug]):
                    blocker_to_meta[blocker_bug_id].add(meta_bug)

        def _download_past_bugs(url: str) -> dict:
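            # The URL ends in ".zst"; stripping the last four characters yields
            # the decompressed local file name.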
            path = os.path.join("data", os.path.basename(url)[:-4])
            download_check_etag(url, path=f"{path}.zst")
            zstd_decompress(path)
            assert os.path.exists(path)
            with open(path, "r") as f:
                return json.load(f)

        past_regressions_by = {}
        past_fixed_bugs_by = {}
        past_regression_blocked_bugs_by = {}
        past_fixed_bug_blocked_bugs_by = {}

        for dimension in ["component", "directory", "file", "function"]:
            past_regressions_by[dimension] = _download_past_bugs(
                PAST_REGRESSIONS_BY_URL.format(dimension=dimension))
            past_fixed_bugs_by[dimension] = _download_past_bugs(
                PAST_FIXED_BUGS_BY_URL.format(dimension=dimension))
            past_regression_blocked_bugs_by[dimension] = _download_past_bugs(
                PAST_REGRESSION_BLOCKED_BUGS_BY_URL.format(
                    dimension=dimension))
            past_fixed_bug_blocked_bugs_by[dimension] = _download_past_bugs(
                PAST_FIXED_BUG_BLOCKED_BUGS_BY_URL.format(dimension=dimension))

        path_to_component = repository.get_component_mapping()

        def get_full_component(bug):
            return "{}::{}".format(bug["product"], bug["component"])

        def histogram(components: List[str]) -> Dict[str, float]:
            counter = collections.Counter(components)
            return {
                component: count / len(components)
                for component, count in counter.most_common()
            }
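        # e.g. (hypothetical) histogram(["DOM", "DOM", "GFX"]) returns
        # {"DOM": 0.666..., "GFX": 0.333...}; the frequencies sum to 1.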

        def component_histogram(bugs: List[dict]) -> Dict[str, float]:
            return histogram([bug["component"] for bug in bugs])

        def find_risk_band(risk: float) -> str:
            for name, start, end in self.risk_bands:
                if start <= risk <= end:
                    return name

            assert False
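        # Note: the assert above is unreachable as long as self.risk_bands covers
        # [0.0, 1.0] with contiguous bands, e.g. a hypothetical configuration:
        # [("LOW", 0.0, 0.5), ("MEDIUM", 0.5, 0.7), ("HIGH", 0.7, 1.0)].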

        def get_prev_bugs(past_bugs_by: dict,
                          commit: repository.CommitDict,
                          component: str = None) -> List[dict]:
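            # Match past bugs at the finest granularity available, falling back
            # from function to file to directory to component; once a path is
            # matched at one level it is dropped from the coarser lookups.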
            paths = [
                path for path in commit["files"]
                if component is None or (path.encode(
                    "utf-8") in path_to_component and path_to_component[
                        path.encode("utf-8")] == component.encode("utf-8"))
            ]

            past_bugs = []

            for path, f_group in commit["functions"].items():
                if path not in paths:
                    continue

                if path not in past_bugs_by["function"]:
                    continue

                found = False
                for f in f_group:
                    if f[0] not in past_bugs_by["function"][path]:
                        continue

                    found = True
                    past_bugs += past_bugs_by["function"][path][f[0]]

                if found:
                    paths.remove(path)

            # Iterate over a copy: removing items from a list while iterating
            # over it skips the element following each removal.
            for path in paths[:]:
                if path in past_bugs_by["file"]:
                    past_bugs += past_bugs_by["file"][path]
                    paths.remove(path)

            # Snapshot the list before zipping, as paths is mutated in the loop body.
            remaining = paths[:]
            for path, directories in zip(remaining,
                                         repository.get_directories(remaining)):
                found = False
                for directory in directories:
                    if directory in past_bugs_by["directory"]:
                        found = True
                        past_bugs += past_bugs_by["directory"][directory]

                if found:
                    paths.remove(path)

            components = [
                path_to_component[path.encode("utf-8")].tobytes().decode(
                    "utf-8") for path in paths
                if path.encode("utf-8") in path_to_component
            ]

            for component in components:
                if component in past_bugs_by["component"]:
                    past_bugs += past_bugs_by["component"][component]

            return past_bugs

        def get_prev_bugs_stats(
            commit_group: dict,
            commit_list: List[repository.CommitDict],
            component: str = None,
        ) -> None:
            # Find previous regressions that occurred in the same files as those touched by these commits,
            # previous bugs that were fixed by touching the same files,
            # previous bugs that were blocked by regressions in those same files,
            # and previous bugs that were blocked by bugs fixed by touching those same files.
            prev_regressions: List[Dict[str, Any]] = sum(
                (get_prev_bugs(past_regressions_by, commit, component)
                 for commit in commit_list),
                [],
            )
            prev_fixed_bugs: List[Dict[str, Any]] = sum(
                (get_prev_bugs(past_fixed_bugs_by, commit, component)
                 for commit in commit_list),
                [],
            )
            prev_regression_blocked_bugs: List[Dict[str, Any]] = sum(
                (get_prev_bugs(past_regression_blocked_bugs_by, commit,
                               component) for commit in commit_list),
                [],
            )
            prev_fixed_bug_blocked_bugs: List[Dict[str, Any]] = sum(
                (get_prev_bugs(past_fixed_bug_blocked_bugs_by, commit,
                               component) for commit in commit_list),
                [],
            )

            prev_regressions = _deduplicate(prev_regressions)
            prev_fixed_bugs = _deduplicate(prev_fixed_bugs)
            prev_regression_blocked_bugs = _deduplicate(
                prev_regression_blocked_bugs)
            prev_fixed_bug_blocked_bugs = _deduplicate(
                prev_fixed_bug_blocked_bugs)

            regression_components = component_histogram(prev_regressions)
            fixed_bugs_components = component_histogram(prev_fixed_bugs)
            regression_blocked_bug_components = component_histogram(
                prev_regression_blocked_bugs)
            fixed_bug_blocked_bug_components = component_histogram(
                prev_fixed_bug_blocked_bugs)

            commit_group["most_common_regression_components"] = regression_components
            # These are only used for component connections for the time being.
            if component:
                commit_group["prev_regressions"] = prev_regressions[-3:]
                commit_group["prev_fixed_bugs"] = prev_fixed_bugs[-3:]
                commit_group["prev_regression_blocked_bugs"] = prev_regression_blocked_bugs[-3:]
                commit_group["prev_fixed_bug_blocked_bugs"] = prev_fixed_bug_blocked_bugs[-3:]
                commit_group["most_common_fixed_bugs_components"] = fixed_bugs_components
                commit_group["most_common_regression_blocked_bug_components"] = regression_blocked_bug_components
                commit_group["most_common_fixed_bug_blocked_bug_components"] = fixed_bug_blocked_bug_components

        def get_commit_data(
                commit_list: List[repository.CommitDict]) -> List[dict]:
            if len(commit_list) == 0:
                return []

            # Evaluate risk of commits associated to this bug.
            probs = self.regressor_model.classify(commit_list,
                                                  probabilities=True)

            commits_data = []
            for i, commit in enumerate(commit_list):
                revision_id = repository.get_revision_id(commit)
                if revision_id in revision_map:
                    testing = phabricator.get_testing_project(
                        revision_map[revision_id])

                    if testing is None:
                        testing = "missing"
                else:
                    testing = None

                commits_data.append({
                    "id": commit["node"],
                    "testing": testing,
                    "risk": float(probs[i][1]),
                    "backedout": bool(commit["backedoutby"]),
                    "author": commit["author_email"],
                    "reviewers": commit["reviewers"],
                    "coverage": [
                        commit["cov_added"],
                        commit["cov_covered"],
                        commit["cov_unknown"],
                    ],
                })

            return commits_data

        # Sort commits by bug ID, so we can use itertools.groupby to group them by bug ID.
        commits.sort(key=lambda x: x["bug_id"])

        bug_to_commits = {}
        for bug_id, commit_iter in itertools.groupby(commits,
                                                     lambda x: x["bug_id"]):
            # TODO: Figure out what to do with bugs we couldn't download (security bugs).
            if bug_id not in bug_map:
                continue

            bug_to_commits[bug_id] = sorted(
                commit_iter, key=lambda x: hash_to_rev[x["node"]])

        bug_summaries = []
        for bug_id in bugs:
            if bug_id not in bug_map:
                continue

            commit_list = bug_to_commits.get(bug_id, [])
            commit_data = get_commit_data(commit_list)

            bug = bug_map[bug_id]

            bug_summary = {
                "id": bug_id,
                "regressor": bug_id in regressor_bug_ids,
                "regression": len(bug["regressed_by"]) > 0
                or any(keyword in bug["keywords"]
                       for keyword in ["regression", "talos-regression"])
                or ("cf_has_regression_range" in bug
                    and bug["cf_has_regression_range"] == "yes"),
                "whiteboard": bug["whiteboard"],
                "assignee": bug["assigned_to"]
                if bug["assigned_to"] != "*****@*****.**" else None,
                "versions": bugzilla.get_fixed_versions(bug),
                "component": get_full_component(bug),
                "team": bugzilla.component_to_team(
                    component_team_mapping, bug["product"], bug["component"]),
                "summary": bug["summary"],
                "types": bug_to_types(bug),
                "severity": bug["severity"],
                "creation_date": dateutil.parser.parse(
                    bug["creation_time"]).strftime("%Y-%m-%d"),
                "date": max(
                    dateutil.parser.parse(commit["pushdate"])
                    for commit in commit_list).strftime("%Y-%m-%d")
                if len(commit_list) > 0 else None,
                "commits": commit_data,
                "meta_ids": list(blocker_to_meta[bug_id]),
                "risk_band": find_risk_band(
                    max(commit["risk"] for commit in commit_data))
                if len(commit_data) > 0 else None,
            }

            get_prev_bugs_stats(bug_summary, commit_list)

            bug_summaries.append(bug_summary)

        landings_by_date = collections.defaultdict(list)
        for bug_summary in bug_summaries:
            landings_by_date[bug_summary["creation_date"]].append(bug_summary)

        with open("landings_by_date.json", "w") as f:
            output: dict = {
                "summaries": landings_by_date,
            }
            if meta_bugs is not None:
                output["featureMetaBugs"] = [{
                    "id":
                    meta_bug,
                    "summary":
                    bug_map[meta_bug]["summary"]
                } for meta_bug in meta_bugs]

            json.dump(output, f)

        # Retrieve components of test failures that occurred when landing patches to fix bugs in specific components.
        component_failures = collections.defaultdict(list)

        push_data_iter, push_data_count, all_runnables = test_scheduling.get_push_data(
            "group")

        for revisions, _, _, possible_regressions, likely_regressions in tqdm(
                push_data_iter(), total=push_data_count):
            commit_list = [
                commit_map[revision] for revision in revisions
                if revision in commit_map
            ]
            if len(commit_list) == 0:
                continue

            commit_bugs = [
                bug_map[commit["bug_id"]] for commit in commit_list
                if commit["bug_id"] in bug_map
            ]

            components = list(
                set(get_full_component(bug) for bug in commit_bugs))

            groups = [
                group for group in list(
                    set(possible_regressions + likely_regressions))
                if group.encode("utf-8") in path_to_component
            ]

            for group in groups:
                for component in components:
                    component_failures[component].append(path_to_component[
                        group.encode("utf-8")].tobytes().decode("utf-8"))

        # Filter out commits for which we have no bugs.
        commits = [commit for commit in commits if commit["bug_id"] in bug_map]

        # Sort commits by bug component, so we can use itertools.groupby to group them by bug component.
        commits.sort(key=lambda x: get_full_component(bug_map[x["bug_id"]]))

        commit_groups = []
        for component, commit_iter in itertools.groupby(
                commits, lambda x: get_full_component(bug_map[x["bug_id"]])):
            commit_group = {
                "component": component,
                "most_common_test_failure_components":
                histogram(component_failures[component])
                if component in component_failures else {},
            }
            get_prev_bugs_stats(commit_group, list(commit_iter), component)
            commit_groups.append(commit_group)

        with open("component_connections.json", "w") as f:
            json.dump(commit_groups, f)

        repository.close_component_mapping()
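
A hedged usage sketch for this entry point; the enclosing class and its constructor are not shown in the listing, so those names are assumptions, while the output file names come from the open(...) calls above:

# Hypothetical driver; LandingsRiskReportGenerator is an assumed class name.
generator = LandingsRiskReportGenerator()
generator.go(bugs=[1630000, 1630001], meta_bugs=[1598708])
# Side effects: writes landings_by_date.json and component_connections.json
# to the current working directory.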
    def get_blocking_of(self, bug_ids: List[int]) -> List[int]:
        bugzilla.download_bugs(bug_ids)
        bug_map = {bug["id"]: bug for bug in bugzilla.get_bugs()}
        return sum((bugzilla.find_blocking(bug_map, bug_map[bug_id])
                    for bug_id in bug_ids), [])
Example #12
    def retrieve_bugs(self, limit=None):
        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

        db.download(bugzilla.BUGS_DB)

        # Get IDs of bugs changed since last run.
        last_modified = db.last_modified(bugzilla.BUGS_DB)
        logger.info(
            f"Retrieving IDs of bugs modified since the last run on {last_modified}"
        )
        changed_ids = bugzilla.get_ids(
            {"f1": "delta_ts", "o1": "greaterthaneq", "v1": last_modified.date()}
        )
        logger.info(f"Retrieved {len(changed_ids)} IDs.")

        # Get IDs of bugs between (two years and six months ago) and (six months ago).
        six_months_ago = datetime.utcnow() - relativedelta(months=6)
        two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
        logger.info(
            f"Retrieving bug IDs from {two_years_and_six_months_ago} to {six_months_ago}"
        )
        timespan_ids = bugzilla.get_ids_between(
            two_years_and_six_months_ago, six_months_ago
        )
        if limit:
            timespan_ids = timespan_ids[:limit]
        logger.info(f"Retrieved {len(timespan_ids)} IDs.")

        # Get IDs of labelled bugs.
        labelled_bug_ids = labels.get_all_bug_ids()
        if limit:
            labelled_bug_ids = labelled_bug_ids[:limit]
        logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

        # Get the commits DB, as we need it to get the bug IDs linked to recent commits.
        # XXX: Temporarily avoid downloading the commits DB when a limit is set, to avoid the integration test failing when the commits DB is bumped.
        if limit is None:
            assert db.download(repository.COMMITS_DB)

        # Get IDs of bugs linked to commits (used for some commit-based models, e.g. backout and regressor).
        start_date = datetime.now() - relativedelta(years=3)
        commit_bug_ids = [
            commit["bug_id"]
            for commit in repository.get_commits()
            if commit["bug_id"]
            and dateutil.parser.parse(commit["pushdate"]) >= start_date
        ]
        if limit:
            commit_bug_ids = commit_bug_ids[-limit:]
        logger.info(f"{len(commit_bug_ids)} bugs linked to commits to download.")

        # Get IDs of bugs which are regressions and bugs which caused regressions (useful for the regressor model).
        regressed_by_bug_ids = sum(
            (bug["regressed_by"] + bug["regressions"] for bug in bugzilla.get_bugs()),
            [],
        )
        if limit:
            regressed_by_bug_ids = regressed_by_bug_ids[-limit:]
        logger.info(
            f"{len(regressed_by_bug_ids)} bugs which caused regressions fixed by commits."
        )

        all_ids = (
            timespan_ids + labelled_bug_ids + commit_bug_ids + regressed_by_bug_ids
        )
        all_ids_set = set(all_ids)

        # We have to redownload bugs that were changed since the last download.
        # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled.
        bugzilla.delete_bugs(
            lambda bug: bug["id"] in changed_ids or bug["id"] not in all_ids_set
        )

        bugzilla.download_bugs(all_ids)

        # Get regressed_by_bug_ids again (the set could have changed after downloading new bugs).
        regressed_by_bug_ids = sum(
            (bug["regressed_by"] + bug["regressions"] for bug in bugzilla.get_bugs()),
            [],
        )
        logger.info(
            f"{len(regressed_by_bug_ids)} bugs which caused regressions fixed by commits."
        )

        bugzilla.download_bugs(regressed_by_bug_ids)

        # Try to re-download inconsistent bugs, up to three times.
        inconsistent_bugs = bugzilla.get_bugs(include_invalid=True)
        for i in range(3):
            # We look for inconsistencies in all bugs first; on subsequent passes,
            # we only check bugs that were found to be inconsistent in the previous pass.
            inconsistent_bugs = bug_snapshot.get_inconsistencies(inconsistent_bugs)
            inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

            if len(inconsistent_bug_ids) == 0:
                break

            logger.info(
                f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
            )
            bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
            bugzilla.download_bugs(inconsistent_bug_ids)

        zstd_compress("data/bugs.json")
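
The inconsistency loop above is a bounded retry toward a fixed point; a minimal standalone sketch of the same pattern (check_fn and refetch_fn are placeholders, not bugbug APIs):

def retry_until_consistent(items, check_fn, refetch_fn, max_attempts=3):
    # check_fn returns the subset of items that is still inconsistent;
    # refetch_fn re-downloads them. Stop early once nothing is left to fix.
    for _ in range(max_attempts):
        items = check_fn(items)
        if not items:
            break
        refetch_fn(items)
    return items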
Example #13
    def go(self,
           bugs: List[int],
           meta_bugs: Optional[List[int]] = None) -> None:
        if meta_bugs is not None:
            bugs += meta_bugs + self.get_blocking_of(meta_bugs)

        logger.info("Download bugs of interest...")
        bugzilla.download_bugs(bugs)

        bugs_set = set(bugs)

        commits = [
            commit for commit in repository.get_commits()
            if commit["bug_id"] in bugs_set
        ]
        hash_to_rev = {commit["node"]: i for i, commit in enumerate(commits)}

        logger.info(f"{len(commits)} commits to analyze.")

        bug_ids = {commit["bug_id"] for commit in commits}

        logger.info(f"{len(bug_ids)} bugs to analyze.")

        bug_map = {}
        regressor_bug_ids = set()
        for bug in bugzilla.get_bugs():
            if bug["id"] in bugs_set:
                bug_map[bug["id"]] = bug

            if len(bug["regressions"]) > 0:
                regressor_bug_ids.add(bug["id"])

        logger.info("Retrieve Phabricator revisions linked to commits...")
        revision_ids = set(
            filter(None,
                   (repository.get_revision_id(commit) for commit in commits)))

        logger.info("Download revisions of interest...")
        phabricator.download_revisions(revision_ids)

        revision_map = {
            revision["id"]: revision
            for revision in phabricator.get_revisions()
            if revision["id"] in revision_ids
        }

        # blocker_to_meta is read further below (for "meta_ids") even when
        # meta_bugs is None, so define it unconditionally.
        blocker_to_meta = collections.defaultdict(set)
        if meta_bugs is not None:
            for meta_bug in meta_bugs:
                if meta_bug not in bug_map:
                    continue

                for blocker_bug_id in bugzilla.find_blocking(
                        bug_map, bug_map[meta_bug]):
                    blocker_to_meta[blocker_bug_id].add(meta_bug)

        # TODO: Use past regressions by function information too (maybe first by function and if no results by file? or prioritize function and recentness?)

        def _download_past_bugs(url: str) -> dict:
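            # The URL ends in ".zst"; stripping the last four characters yields
            # the decompressed local file name.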
            path = os.path.join("data", os.path.basename(url)[:-4])
            download_check_etag(url, path=f"{path}.zst")
            zstd_decompress(path)
            assert os.path.exists(path)
            with open(path, "r") as f:
                return json.load(f)

        past_regressions_by_file = _download_past_bugs(
            PAST_REGRESSIONS_BY_FILE_URL)
        past_fixed_bugs_by_file = _download_past_bugs(
            PAST_FIXED_BUGS_BY_FILE_URL)
        past_regression_blocked_bugs_by_file = _download_past_bugs(
            PAST_REGRESSION_BLOCKED_BUGS_BY_FILE_URL)
        past_fixed_bug_blocked_bugs_by_file = _download_past_bugs(
            PAST_FIXED_BUG_BLOCKED_BUGS_BY_FILE_URL)

        def component_histogram(bugs: List[dict]) -> Dict[str, float]:
            counter = collections.Counter(bug["component"] for bug in bugs)
            return {
                component: count / len(bugs)
                for component, count in counter.most_common()
            }

        # Sort commits by bug ID, so we can use itertools.groupby to group them by bug ID.
        commits.sort(key=lambda x: x["bug_id"])

        commit_groups = []
        for bug_id, commit_iter in itertools.groupby(commits,
                                                     lambda x: x["bug_id"]):
            # TODO: Figure out what to do with bugs we couldn't download (security bugs).
            if bug_id not in bug_map:
                continue

            commit_list = list(commit_iter)
            commit_list.sort(key=lambda x: hash_to_rev[x["node"]])

            # Find previous regressions that occurred in the same files as those touched by these commits,
            # previous bugs that were fixed by touching the same files,
            # previous bugs that were blocked by regressions in those same files,
            # and previous bugs that were blocked by bugs fixed by touching those same files.
            prev_regressions: List[Dict[str, Any]] = []
            prev_fixed_bugs: List[Dict[str, Any]] = []
            prev_regression_blocked_bugs: List[Dict[str, Any]] = []
            prev_fixed_bug_blocked_bugs: List[Dict[str, Any]] = []
            for commit in commit_list:
                for path in commit["files"]:
                    if path in past_regressions_by_file:
                        prev_regressions.extend(
                            bug_summary
                            for bug_summary in past_regressions_by_file[path])

                    if path in past_fixed_bugs_by_file:
                        prev_fixed_bugs.extend(
                            bug_summary
                            for bug_summary in past_fixed_bugs_by_file[path])

                    if path in past_regression_blocked_bugs_by_file:
                        prev_regression_blocked_bugs.extend(
                            bug_summary for bug_summary in
                            past_regression_blocked_bugs_by_file[path])

                    if path in past_fixed_bug_blocked_bugs_by_file:
                        prev_fixed_bug_blocked_bugs.extend(
                            bug_summary for bug_summary in
                            past_fixed_bug_blocked_bugs_by_file[path])

            prev_regressions = _deduplicate(prev_regressions)
            prev_fixed_bugs = _deduplicate(prev_fixed_bugs)
            prev_regression_blocked_bugs = _deduplicate(
                prev_regression_blocked_bugs)
            prev_fixed_bug_blocked_bugs = _deduplicate(
                prev_fixed_bug_blocked_bugs)

            regression_components = component_histogram(prev_regressions)
            fixed_bugs_components = component_histogram(prev_fixed_bugs)
            regression_blocked_bug_components = component_histogram(
                prev_regression_blocked_bugs)
            fixed_bug_blocked_bug_components = component_histogram(
                prev_fixed_bug_blocked_bugs)

            # Evaluate risk of commits associated to this bug.
            probs = self.regressor_model.classify(commit_list,
                                                  probabilities=True)

            commits_data = []
            for i, commit in enumerate(commit_list):
                revision_id = repository.get_revision_id(commit)
                if revision_id in revision_map:
                    testing = phabricator.get_testing_project(
                        revision_map[revision_id])

                    if testing is None:
                        testing = "none"
                else:
                    testing = None

                commits_data.append({
                    "id": commit["node"],
                    "testing": testing,
                    "risk": float(probs[i][1]),
                    "backedout": bool(commit["backedoutby"]),
                    "regressor": commit["bug_id"] in regressor_bug_ids,
                })

            bug = bug_map[bug_id]

            commit_groups.append({
                "id": bug_id,
                "versions": bugzilla.get_fixed_versions(bug),
                "component": "{}::{}".format(bug["product"], bug["component"]),
                "summary": bug["summary"],
                "date": max(
                    dateutil.parser.parse(commit["pushdate"])
                    for commit in commit_list).strftime("%Y-%m-%d"),
                "commits": commits_data,
                "meta_ids": list(blocker_to_meta[bug_id]),
                "prev_regressions": prev_regressions[-3:],
                "prev_fixed_bugs": prev_fixed_bugs[-3:],
                "prev_regression_blocked_bugs": prev_regression_blocked_bugs[-3:],
                "prev_fixed_bug_blocked_bugs": prev_fixed_bug_blocked_bugs[-3:],
                "most_common_regression_components": regression_components,
                "most_common_fixed_bugs_components": fixed_bugs_components,
                "most_common_regression_blocked_bug_components": regression_blocked_bug_components,
                "most_common_fixed_bug_blocked_bug_components": fixed_bug_blocked_bug_components,
            })

        landings_by_date = collections.defaultdict(list)
        for commit_group in commit_groups:
            landings_by_date[commit_group["date"]].append(commit_group)

        with open("landings_by_date.json", "w") as f:
            output: dict = {
                "landings": landings_by_date,
            }
            if meta_bugs is not None:
                output["featureMetaBugs"] = [{
                    "id":
                    meta_bug,
                    "summary":
                    bug_map[meta_bug]["summary"]
                } for meta_bug in meta_bugs]

            json.dump(output, f)
Example #14
    def retrieve_bugs(self):
        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

        db.download_version(bugzilla.BUGS_DB)
        if not db.is_old_version(bugzilla.BUGS_DB):
            db.download(bugzilla.BUGS_DB)

        # Get IDs of bugs changed since last run.
        last_modified = db.last_modified(bugzilla.BUGS_DB)
        logger.info(
            f"Retrieving IDs of bugs modified since the last run on {last_modified}"
        )
        changed_ids = bugzilla.get_ids({
            "f1": "delta_ts",
            "o1": "greaterthaneq",
            "v1": last_modified.date()
        })
        logger.info(f"Retrieved {len(changed_ids)} IDs.")

        # Get IDs of bugs between (two years and six months ago) and (six months ago).
        six_months_ago = datetime.utcnow() - relativedelta(months=6)
        two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
        logger.info(
            f"Retrieving bug IDs from {two_years_and_six_months_ago} to {six_months_ago}"
        )
        timespan_ids = bugzilla.get_ids_between(two_years_and_six_months_ago,
                                                six_months_ago)
        logger.info(f"Retrieved {len(timespan_ids)} IDs.")

        # Get IDs of labelled bugs.
        labelled_bug_ids = labels.get_all_bug_ids()
        logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

        all_ids = set(timespan_ids + labelled_bug_ids)

        # We have to redownload bugs that were changed since the last download.
        # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled.
        bugzilla.delete_bugs(
            lambda bug: bug["id"] in changed_ids or bug["id"] not in all_ids)

        bugzilla.download_bugs(timespan_ids + labelled_bug_ids)

        # Try to re-download inconsistent bugs, up to three times.
        inconsistent_bugs = bugzilla.get_bugs()
        for i in range(3):
            # We look for inconsistencies in all bugs first; on subsequent passes,
            # we only check bugs that were found to be inconsistent in the previous pass.
            inconsistent_bugs = bug_snapshot.get_inconsistencies(
                inconsistent_bugs)
            inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

            if len(inconsistent_bug_ids) == 0:
                break

            logger.info(
                f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
            )
            bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
            bugzilla.download_bugs(inconsistent_bug_ids)

        self.compress_file("data/bugs.json")
Example #15
    def retrieve_bugs(self, limit: Optional[int] = None) -> None:
        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

        db.download(bugzilla.BUGS_DB)

        # Get IDs of bugs changed since last run.
        last_modified = db.last_modified(bugzilla.BUGS_DB)
        logger.info(
            f"Retrieving IDs of bugs modified since the last run on {last_modified}"
        )
        changed_ids = set(
            bugzilla.get_ids({
                "f1": "delta_ts",
                "o1": "greaterthaneq",
                "v1": last_modified.date()
            }))
        logger.info(f"Retrieved {len(changed_ids)} IDs.")

        all_components = bugzilla.get_product_component_count(9999)

        deleted_component_ids = set(
            bug["id"] for bug in bugzilla.get_bugs() if "{}::{}".format(
                bug["product"], bug["component"]) not in all_components)
        logger.info(
            f"{len(deleted_component_ids)} bugs belonging to deleted components"
        )
        changed_ids |= deleted_component_ids

        # Get IDs of bugs between (two years and six months ago) and now.
        two_years_and_six_months_ago = datetime.utcnow() - relativedelta(
            years=2, months=6)
        logger.info(f"Retrieving bug IDs since {two_years_and_six_months_ago}")
        timespan_ids = bugzilla.get_ids_between(two_years_and_six_months_ago)
        if limit:
            timespan_ids = timespan_ids[-limit:]
        logger.info(f"Retrieved {len(timespan_ids)} IDs.")

        # Get IDs of labelled bugs.
        labelled_bug_ids = labels.get_all_bug_ids()
        if limit:
            labelled_bug_ids = labelled_bug_ids[-limit:]
        logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

        # Get the commits DB, as we need it to get the bug IDs linked to recent commits.
        # XXX: Temporarily avoid downloading the commits DB when a limit is set, to avoid the integration test failing when the commits DB is bumped.
        if limit is None:
            assert db.download(repository.COMMITS_DB)

        # Get IDs of bugs linked to commits (used for some commit-based models, e.g. backout and regressor).
        start_date = datetime.now() - relativedelta(years=3)
        commit_bug_ids = list(
            set(commit["bug_id"] for commit in repository.get_commits()
                if commit["bug_id"]
                and dateutil.parser.parse(commit["pushdate"]) >= start_date))
        if limit:
            commit_bug_ids = commit_bug_ids[-limit:]
        logger.info(
            f"{len(commit_bug_ids)} bugs linked to commits to download.")

        # Get IDs of bugs which are regressions, bugs which caused regressions (useful for the regressor model),
        # and blocked bugs.
        regression_related_ids: List[int] = list(
            set(
                sum(
                    (bug["regressed_by"] + bug["regressions"] + bug["blocks"]
                     for bug in bugzilla.get_bugs()),
                    [],
                )))
        if limit:
            regression_related_ids = regression_related_ids[-limit:]
        logger.info(
            f"{len(regression_related_ids)} bugs which caused regressions fixed by commits."
        )

        # Get IDs of bugs linked to intermittent failures.
        test_failure_bug_ids = [
            item["bug_id"] for item in test_scheduling.get_failure_bugs(
                two_years_and_six_months_ago, datetime.utcnow())
        ]
        if limit:
            test_failure_bug_ids = test_failure_bug_ids[-limit:]
        logger.info(f"{len(test_failure_bug_ids)} bugs about test failures.")

        all_ids = (timespan_ids + labelled_bug_ids + commit_bug_ids +
                   regression_related_ids + test_failure_bug_ids)
        all_ids_set = set(all_ids)

        # We have to redownload bugs that were changed since the last download.
        # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled.
        bugzilla.delete_bugs(lambda bug: bug["id"] in changed_ids or bug["id"]
                             not in all_ids_set)

        new_bugs = bugzilla.download_bugs(all_ids)

        # Get regression_related_ids again (the set could have changed after downloading new bugs).
        for i in range(7):
            regression_related_ids = list(
                set(
                    sum(
                        (bug["regressed_by"] + bug["regressions"] +
                         bug["blocks"] for bug in new_bugs),
                        [],
                    )))
            logger.info(
                f"{len(regression_related_ids)} bugs which caused regressions fixed by commits."
            )
            if limit:
                regression_related_ids = regression_related_ids[-limit:]

            # If we got all bugs we needed, break.
            if set(regression_related_ids).issubset(all_ids):
                break

            new_bugs = bugzilla.download_bugs(regression_related_ids)

        # Try to re-download inconsistent bugs, up to twice.
        inconsistent_bugs = bugzilla.get_bugs(include_invalid=True)
        for i in range(2):
            # We look for inconsistencies in all bugs first; on subsequent passes,
            # we only check bugs that were found to be inconsistent in the previous pass.
            inconsistent_bugs = bug_snapshot.get_inconsistencies(
                inconsistent_bugs)
            inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

            if len(inconsistent_bug_ids) == 0:
                break

            logger.info(
                f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
            )
            bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
            bugzilla.download_bugs(inconsistent_bug_ids)

        # TODO: Figure out why.
        missing_history_bug_ids = {
            bug["id"]
            for bug in bugzilla.get_bugs() if "history" not in bug
        }
        bugzilla.delete_bugs(lambda bug: bug["id"] in missing_history_bug_ids)
        logger.info(
            f"Deleted {len(missing_history_bug_ids)} bugs as we couldn't retrieve their history"
        )

        zstd_compress(bugzilla.BUGS_DB)
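
The range(7) loop in the example above is a bounded transitive-closure download: each round follows the regressed_by/regressions/blocks edges of freshly fetched bugs until no new IDs appear. A minimal sketch of the idea (fetch_fn and related_ids_fn are placeholders, not bugbug APIs):

def fetch_closure(seed_ids, fetch_fn, related_ids_fn, max_rounds=7):
    # fetch_fn(ids) downloads bugs and returns them; related_ids_fn(bugs)
    # extracts the bug IDs they reference. Stop once no unseen IDs remain.
    seen = set(seed_ids)
    bugs = fetch_fn(seed_ids)
    for _ in range(max_rounds):
        new_ids = set(related_ids_fn(bugs)) - seen
        if not new_ids:
            break
        seen |= new_ids
        bugs = fetch_fn(new_ids)
    return seen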
Example #16
    def go(self, days_start: int, days_end: int) -> None:
        commits = self.get_landed_since(days_start, days_end)

        logger.info("Retrieve Phabricator revisions linked to commits...")
        revision_ids = set(
            filter(None,
                   (repository.get_revision_id(commit) for commit in commits)))

        logger.info("Download revisions of interest...")
        phabricator.download_revisions(revision_ids)

        revision_map = {
            revision["id"]: revision
            for revision in phabricator.get_revisions()
            if revision["id"] in revision_ids
        }

        logger.info("Download bugs of interest...")
        bugzilla.download_bugs(commit["bug_id"] for commit in commits
                               if commit["bug_id"])

        # Filter out commits with no Phabricator revision linked to them.
        commits = [
            commit for commit in commits
            if repository.get_revision_id(commit) in revision_map
        ]
        logger.info(f"{len(commits)} revisions")

        # Filter out commits with no testing tags.
        commits = [
            commit for commit in commits if phabricator.get_testing_project(
                revision_map[repository.get_revision_id(commit)]) is not None
        ]
        logger.info(f"{len(commits)} revisions with testing tags")

        def list_testing_projects(
                commits: Iterable[repository.CommitDict]) -> Collection[str]:
            return list(
                filter(
                    None,
                    (phabricator.get_testing_project(
                        revision_map[repository.get_revision_id(commit)])
                     for commit in commits),
                ))

        testing_projects = list_testing_projects(commits)

        print(f"Most common testing tags (in {len(commits)} revisions):")
        for testing_project, count in collections.Counter(
                testing_projects).most_common():
            print(
                f"{testing_project} - {round(100 * count / len(testing_projects), 1)}%"
            )

        backedout_commits = [
            commit for commit in commits if commit["backedoutby"]
        ]
        backedout_testing_projects = list_testing_projects(backedout_commits)

        print(
            f"\nMost common testing tags for backed-out revisions (in {len(backedout_commits)} revisions):"
        )
        for testing_project, count in collections.Counter(
                backedout_testing_projects).most_common():
            print(
                f"{testing_project} - {round(100 * count / len(backedout_testing_projects), 1)}%"
            )

        regressor_bug_ids = {
            bug["id"]
            for bug in bugzilla.get_bugs() if len(bug["regressions"]) > 0
        }

        regressor_commits = [
            commit for commit in commits
            if commit["bug_id"] in regressor_bug_ids
        ]
        regressor_testing_projects = list_testing_projects(regressor_commits)

        print(
            f"\nMost common testing tags for revisions which caused regressions (in {len(regressor_commits)} revisions):"
        )
        for testing_project, count in collections.Counter(
                regressor_testing_projects).most_common():
            print(
                f"{testing_project} - {round(100 * count / len(regressor_testing_projects), 1)}%"
            )
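
For reference, the percentage printouts above lean on collections.Counter.most_common(); a tiny self-contained example:

import collections

tags = ["skip", "full", "skip", "none"]
for tag, count in collections.Counter(tags).most_common():
    print(f"{tag} - {round(100 * count / len(tags), 1)}%")
# skip - 50.0%
# full - 25.0%
# none - 25.0%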