Example 1
def test_download_missing(tmp_path):
    url = "https://index.taskcluster.net/v1/task/project.relman.bugbug.data_commits.latest/artifacts/public/commits.json.zst"

    db_path = tmp_path / "prova.json"
    db.register(db_path, url, 1)

    responses.add(
        responses.HEAD,
        url,
        status=404,
        headers={
            "ETag": "123",
            "Accept-Encoding": "zstd"
        },
    )

    responses.add(responses.GET,
                  url,
                  status=404,
                  body=requests.exceptions.HTTPError("HTTP error"))

    db.download(db_path)
    assert not os.path.exists(db_path)

    with pytest.raises(Exception, match="Last-Modified is not available"):
        db.last_modified(db_path)
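These snippets are excerpts and omit their module-level setup. A minimal sketch of the scaffolding they assume follows; the exact import list is an assumption on my part, and the responses mocks also need activating, e.g. via an @responses.activate decorator on each test or an autouse fixture:

import os
from datetime import datetime

import pytest
import requests
import responses

from bugbug import db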
Example 2
def test_download_different_schema(tmp_path, mock_zst):
    url = "https://community-tc.services.mozilla.com/api/index/v1/task/project.bugbug.data_commits.latest/artifacts/public/prova.json.zst"
    url_version = "https://community-tc.services.mozilla.com/api/index/v1/task/project.bugbug.data_commits.latest/artifacts/public/prova.json.version"

    db_path = tmp_path / "prova.json"
    db.register(db_path, url, 2)

    responses.add(responses.GET, url_version, status=200, body="1")

    responses.add(
        responses.HEAD,
        url,
        status=200,
        headers={
            "ETag": "123",
            "Accept-Encoding": "zstd",
            "Last-Modified": "2019-04-16",
        },
    )

    tmp_zst_path = tmp_path / "prova_tmp.zst"
    mock_zst(tmp_zst_path)

    with open(tmp_zst_path, "rb") as content:
        responses.add(responses.GET, url, status=200, body=content.read())

    assert not db.download(db_path)

    with pytest.raises(db.LastModifiedNotAvailable):
        db.last_modified(db_path)

    assert not os.path.exists(db_path)
    assert not os.path.exists(db_path.with_suffix(db_path.suffix + ".zst"))
    assert not os.path.exists(
        db_path.with_suffix(db_path.suffix + ".zst.etag"))
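Several examples rely on a mock_zst fixture that writes a small zstandard-compressed payload to the path it receives. A minimal sketch, assuming the zstandard package; the fixture body and payload are illustrative, not the project's actual conftest:

import zstandard

@pytest.fixture
def mock_zst():
    def create_zst_file(path):
        # Write a tiny zstd-compressed JSON body that db.download can decompress.
        with open(path, "wb") as f:
            f.write(zstandard.ZstdCompressor().compress(b'{"hello": "world"}'))

    return create_zst_file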
Example 3
def test_download_missing(tmp_path, mock_zst):
    url = "https://community-tc.services.mozilla.com/api/index/v1/task/project.bugbug.data_commits.latest/artifacts/public/prova.json.zst"
    url_version = "https://community-tc.services.mozilla.com/api/index/v1/task/project.bugbug.data_commits.latest/artifacts/public/prova.json.version"

    db_path = tmp_path / "prova.json"
    db.register(db_path, url, 1)

    responses.add(
        responses.HEAD,
        url,
        status=404,
        headers={
            "ETag": "123",
            "Accept-Encoding": "zstd"
        },
    )

    responses.add(responses.GET,
                  url,
                  status=404,
                  body=requests.exceptions.HTTPError("HTTP error"))

    responses.add(responses.GET, url_version, status=404)

    assert not db.download(db_path)
    assert not os.path.exists(db_path)

    with pytest.raises(db.LastModifiedNotAvailable):
        db.last_modified(db_path)
Example 4
def test_download_zst(tmp_path, mock_zst):
    url = "https://index.taskcluster.net/v1/task/project.relman.bugbug.data_commits.latest/artifacts/public/commits.json.zst"

    db_path = tmp_path / "prova.json"
    db.register(db_path, url, 1)

    responses.add(
        responses.HEAD,
        url,
        status=200,
        headers={
            "ETag": "123",
            "Accept-Encoding": "zstd",
            "Last-Modified": "2019-04-16",
        },
    )

    tmp_zst_path = tmp_path / "prova_tmp.zst"
    mock_zst(tmp_zst_path)

    with open(tmp_zst_path, "rb") as content:
        responses.add(responses.GET, url, status=200, body=content.read())

    db.download(db_path)

    assert db.last_modified(db_path) == datetime(2019, 4, 16)

    assert os.path.exists(db_path)
    assert os.path.exists(db_path.with_suffix(db_path.suffix + ".zst"))
    assert os.path.exists(db_path.with_suffix(db_path.suffix + ".zst.etag"))
Example 5
    def retrieve_issues(self, owner: str, repo: str, state: str,
                        retrieve_events: bool) -> None:

        last_modified = None
        db.download(github.GITHUB_ISSUES_DB)

        try:
            last_modified = db.last_modified(github.GITHUB_ISSUES_DB)
        except Exception:
            pass

        if last_modified:
            logger.info(
                f"Retrieving issues modified or created since the last run on {last_modified.isoformat()}"
            )
            data = github.fetch_issues_updated_since_timestamp(
                owner, repo, state, last_modified.isoformat(), retrieve_events)

            updated_ids = set(issue["id"] for issue in data)

            logger.info(
                "Deleting issues that were changed since the last run and saving updates"
            )
            github.delete_issues(lambda issue: issue["id"] in updated_ids)

            db.append(github.GITHUB_ISSUES_DB, data)
            logger.info("Updating finished")
        else:
            logger.info(
                "Retrieving all issues since last_modified is not available")
            github.download_issues(owner, repo, state, retrieve_events)

        zstd_compress(github.GITHUB_ISSUES_DB)
Example 6
    def retrieve_issues(self) -> None:

        last_modified = None
        db.download(self.github.db_path)

        try:
            last_modified = db.last_modified(self.github.db_path)
        except db.LastModifiedNotAvailable:
            pass

        if last_modified:
            logger.info(
                f"Retrieving issues modified or created since the last run on {last_modified.isoformat()}"
            )
            data = self.github.fetch_issues_updated_since_timestamp(
                last_modified.isoformat())

            if self.retrieve_private:
                logger.info(
                    "Replacing contents of auto closed public issues with private issues content"
                )
                self.replace_with_private(data)

            updated_ids = set(issue["id"] for issue in data)

            logger.info(
                "Deleting issues that were changed since the last run and saving updates"
            )
            self.github.delete_issues(lambda issue: issue["id"] in updated_ids)

            db.append(self.github.db_path, data)
            logger.info("Updating finished")
        else:
            logger.info(
                "Retrieving all issues since last_modified is not available")
            self.github.download_issues()

            if self.retrieve_private:
                logger.info(
                    "Replacing contents of auto closed public issues with private issues content"
                )

                all_issues = list(self.github.get_issues())
                updated_issues, updated_ids = self.replace_with_private(
                    all_issues)

                logger.info(
                    "Deleting public issues that were updated and saving updates"
                )
                self.github.delete_issues(
                    lambda issue: issue["id"] in updated_ids)
                db.append(self.github.db_path, updated_issues)

        zstd_compress(self.github.db_path)
Example 7
def download_modified_revisions():
    try:
        last_modified = db.last_modified(REVISIONS_DB)
    except db.LastModifiedNotAvailable:
        return

    modified_revisions = get(modified_start=last_modified)
    modified_revision_ids = set(rev["id"] for rev in modified_revisions)

    db.delete(REVISIONS_DB,
              lambda revision: revision["id"] in modified_revision_ids)

    db.append(REVISIONS_DB, modified_revisions)
Example 8
    def go(self, days: int) -> None:
        bugs = self.get_landed_and_filed_since(days)

        meta_bugs = self.get_meta_bugs(days)

        last_modified = db.last_modified(bugzilla.BUGS_DB)
        logger.info(
            f"Deleting bugs modified since the last run on {last_modified}")
        changed_ids = bugzilla.get_ids({
            "f1": "delta_ts",
            "o1": "greaterthaneq",
            "v1": last_modified.date()
        })
        bugzilla.delete_bugs(lambda bug: bug["id"] in changed_ids)

        bugs = list(set(bugs))

        test_infos = self.retrieve_test_info(days)
        test_info_bugs: List[int] = [
            bug["id"] for test_info in test_infos.values()
            for bug in test_info["bugs"]
        ]

        logger.info("Download bugs of interest...")
        bugzilla.download_bugs(bugs + test_info_bugs + [FUZZING_METABUG_ID] +
                               meta_bugs)

        logger.info(f"{len(bugs)} bugs to analyze.")

        bugs_set = set(bugs + test_info_bugs + meta_bugs)

        bug_map = {}
        regressor_bug_ids = set()
        for bug in bugzilla.get_bugs():
            # Only add to the map bugs we are interested in, and bugs that block other bugs (needed for the bug_to_types call).
            if bug["id"] in bugs_set or len(bug["blocks"]) > 0:
                bug_map[bug["id"]] = bug

            if len(bug["regressions"]) > 0:
                regressor_bug_ids.add(bug["id"])

        self.generate_landings_by_date(bug_map, regressor_bug_ids, bugs,
                                       self.get_blocking_of(meta_bugs))

        self.generate_component_connections(bug_map, bugs)

        self.generate_component_test_stats(bug_map, test_infos)
Example 9
def test_download_xz(tmp_path, mock_xz):
    url_zst = "https://index.taskcluster.net/v1/task/project.relman.bugbug.data_commits.latest/artifacts/public/commits.json.zst"
    url_xz = "https://index.taskcluster.net/v1/task/project.relman.bugbug.data_commits.latest/artifacts/public/commits.json.xz"

    db_path = tmp_path / "prova.json"
    db.register(db_path, url_zst, 1)

    responses.add(
        responses.HEAD,
        url_zst,
        status=404,
        headers={"ETag": "123", "Accept-Encoding": "zstd"},
    )

    responses.add(
        responses.GET,
        url_zst,
        status=404,
        body=requests.exceptions.HTTPError("HTTP error"),
    )

    responses.add(
        responses.HEAD,
        url_xz,
        status=200,
        headers={"ETag": "123", "Accept-Encoding": "xz", "Last-Modified": "2019-04-16"},
    )

    tmp_xz_path = tmp_path / "prova_tmp.xz"
    mock_xz(tmp_xz_path)

    with open(tmp_xz_path, "rb") as content:
        responses.add(responses.GET, url_xz, status=200, body=content.read())

    db.download(db_path)

    assert db.last_modified(db_path) == datetime(2019, 4, 16)

    assert os.path.exists(db_path)
    assert os.path.exists(db_path.with_suffix(db_path.suffix + ".xz"))
    assert os.path.exists(db_path.with_suffix(db_path.suffix + ".xz.etag"))
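The mock_xz fixture used here can be sketched the same way with the standard-library lzma module; again an illustrative assumption rather than the project's actual conftest:

import lzma

@pytest.fixture
def mock_xz():
    def create_xz_file(path):
        # Write a tiny xz-compressed JSON body for the .xz fallback path.
        with open(path, "wb") as f:
            f.write(lzma.compress(b'{"hello": "world"}'))

    return create_xz_file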
Example 10
    def retrieve_bugs(self, limit=None):
        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

        db.download(bugzilla.BUGS_DB)

        # Get IDs of bugs changed since last run.
        last_modified = db.last_modified(bugzilla.BUGS_DB)
        logger.info(
            f"Retrieving IDs of bugs modified since the last run on {last_modified}"
        )
        changed_ids = bugzilla.get_ids(
            {"f1": "delta_ts", "o1": "greaterthaneq", "v1": last_modified.date()}
        )
        logger.info(f"Retrieved {len(changed_ids)} IDs.")

        # Get IDs of bugs between (two years and six months ago) and (six months ago).
        six_months_ago = datetime.utcnow() - relativedelta(months=6)
        two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
        logger.info(
            f"Retrieving bug IDs from {two_years_and_six_months_ago} to {six_months_ago}"
        )
        timespan_ids = bugzilla.get_ids_between(
            two_years_and_six_months_ago, six_months_ago
        )
        if limit:
            timespan_ids = timespan_ids[:limit]
        logger.info(f"Retrieved {len(timespan_ids)} IDs.")

        # Get IDs of labelled bugs.
        labelled_bug_ids = labels.get_all_bug_ids()
        if limit:
            labelled_bug_ids = labelled_bug_ids[:limit]
        logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

        # Get the commits DB, as we need it to get the bug IDs linked to recent commits.
        # XXX: Temporarily avoid downloading the commits DB when a limit is set, to avoid the integration test fail when the commits DB is bumped.
        if limit is None:
            assert db.download(repository.COMMITS_DB)

        # Get IDs of bugs linked to commits (used for some commit-based models, e.g. backout and regressor).
        start_date = datetime.now() - relativedelta(years=3)
        commit_bug_ids = [
            commit["bug_id"]
            for commit in repository.get_commits()
            if commit["bug_id"]
            and dateutil.parser.parse(commit["pushdate"]) >= start_date
        ]
        if limit:
            commit_bug_ids = commit_bug_ids[-limit:]
        logger.info(f"{len(commit_bug_ids)} bugs linked to commits to download.")

        # Get IDs of bugs which are regressions and bugs which caused regressions (useful for the regressor model).
        regressed_by_bug_ids = sum(
            (bug["regressed_by"] + bug["regressions"] for bug in bugzilla.get_bugs()),
            [],
        )
        if limit:
            regressed_by_bug_ids = regressed_by_bug_ids[-limit:]
        logger.info(
            f"{len(regressed_by_bug_ids)} bugs which caused regressions fixed by commits."
        )

        all_ids = (
            timespan_ids + labelled_bug_ids + commit_bug_ids + regressed_by_bug_ids
        )
        all_ids_set = set(all_ids)

        # We have to redownload bugs that were changed since the last download.
        # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled.
        bugzilla.delete_bugs(
            lambda bug: bug["id"] in changed_ids or bug["id"] not in all_ids_set
        )

        bugzilla.download_bugs(all_ids)

        # Get regressed_by_bug_ids again (the set could have changed after downloading new bugs).
        regressed_by_bug_ids = sum(
            (bug["regressed_by"] + bug["regressions"] for bug in bugzilla.get_bugs()),
            [],
        )
        logger.info(
            f"{len(regressed_by_bug_ids)} bugs which caused regressions fixed by commits."
        )

        bugzilla.download_bugs(regressed_by_bug_ids)

        # Try to re-download inconsistent bugs, up to three times.
        inconsistent_bugs = bugzilla.get_bugs(include_invalid=True)
        for i in range(3):
            # We look for inconsistencies in all bugs first, then, on following passes,
            # we only look for inconsistencies in bugs that were found to be inconsistent in the first pass
            inconsistent_bugs = bug_snapshot.get_inconsistencies(inconsistent_bugs)
            inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

            if len(inconsistent_bug_ids) == 0:
                break

            logger.info(
                f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
            )
            bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
            bugzilla.download_bugs(inconsistent_bug_ids)

        zstd_compress("data/bugs.json")
Example 11
    def retrieve_bugs(self):
        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

        db.download_version(bugzilla.BUGS_DB)
        if not db.is_old_version(bugzilla.BUGS_DB):
            db.download(bugzilla.BUGS_DB)

        # Get IDs of bugs changed since last run.
        last_modified = db.last_modified(bugzilla.BUGS_DB)
        logger.info(
            f"Retrieving IDs of bugs modified since the last run on {last_modified}"
        )
        changed_ids = bugzilla.get_ids({
            "f1": "delta_ts",
            "o1": "greaterthaneq",
            "v1": last_modified.date()
        })
        logger.info(f"Retrieved {len(changed_ids)} IDs.")

        # Get IDs of bugs between (two years and six months ago) and (six months ago).
        six_months_ago = datetime.utcnow() - relativedelta(months=6)
        two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
        logger.info(
            f"Retrieving bug IDs from {two_years_and_six_months_ago} to {six_months_ago}"
        )
        timespan_ids = bugzilla.get_ids_between(two_years_and_six_months_ago,
                                                six_months_ago)
        logger.info(f"Retrieved {len(timespan_ids)} IDs.")

        # Get IDs of labelled bugs.
        labelled_bug_ids = labels.get_all_bug_ids()
        logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

        all_ids = set(timespan_ids + labelled_bug_ids)

        # We have to redownload bugs that were changed since the last download.
        # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled.
        bugzilla.delete_bugs(
            lambda bug: bug["id"] in changed_ids or bug["id"] not in all_ids)

        bugzilla.download_bugs(timespan_ids + labelled_bug_ids)

        # Try to re-download inconsistent bugs, up to three times.
        inconsistent_bugs = bugzilla.get_bugs()
        for i in range(3):
            # We look for inconsistencies in all bugs first, then, on following passes,
            # we only look for inconsistencies in bugs that were found to be inconsistent in the first pass
            inconsistent_bugs = bug_snapshot.get_inconsistencies(
                inconsistent_bugs)
            inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

            if len(inconsistent_bug_ids) == 0:
                break

            logger.info(
                f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
            )
            bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
            bugzilla.download_bugs(inconsistent_bug_ids)

        self.compress_file("data/bugs.json")
Example 12
    def retrieve_bugs(self, limit: int = None) -> None:
        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

        db.download(bugzilla.BUGS_DB)

        # Get IDs of bugs changed since last run.
        last_modified = db.last_modified(bugzilla.BUGS_DB)
        logger.info(
            f"Retrieving IDs of bugs modified since the last run on {last_modified}"
        )
        changed_ids = set(
            bugzilla.get_ids({
                "f1": "delta_ts",
                "o1": "greaterthaneq",
                "v1": last_modified.date()
            }))
        logger.info(f"Retrieved {len(changed_ids)} IDs.")

        all_components = bugzilla.get_product_component_count(9999)

        deleted_component_ids = set(
            bug["id"] for bug in bugzilla.get_bugs() if "{}::{}".format(
                bug["product"], bug["component"]) not in all_components)
        logger.info(
            f"{len(deleted_component_ids)} bugs belonging to deleted components"
        )
        changed_ids |= deleted_component_ids

        # Get IDs of bugs between (two years and six months ago) and now.
        two_years_and_six_months_ago = datetime.utcnow() - relativedelta(
            years=2, months=6)
        logger.info(f"Retrieving bug IDs since {two_years_and_six_months_ago}")
        timespan_ids = bugzilla.get_ids_between(two_years_and_six_months_ago)
        if limit:
            timespan_ids = timespan_ids[-limit:]
        logger.info(f"Retrieved {len(timespan_ids)} IDs.")

        # Get IDs of labelled bugs.
        labelled_bug_ids = labels.get_all_bug_ids()
        if limit:
            labelled_bug_ids = labelled_bug_ids[-limit:]
        logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

        # Get the commits DB, as we need it to get the bug IDs linked to recent commits.
        # XXX: Temporarily avoid downloading the commits DB when a limit is set, to avoid the integration test fail when the commits DB is bumped.
        if limit is None:
            assert db.download(repository.COMMITS_DB)

        # Get IDs of bugs linked to commits (used for some commit-based models, e.g. backout and regressor).
        start_date = datetime.now() - relativedelta(years=3)
        commit_bug_ids = list(
            set(commit["bug_id"] for commit in repository.get_commits()
                if commit["bug_id"]
                and dateutil.parser.parse(commit["pushdate"]) >= start_date))
        if limit:
            commit_bug_ids = commit_bug_ids[-limit:]
        logger.info(
            f"{len(commit_bug_ids)} bugs linked to commits to download.")

        # Get IDs of bugs which are regressions, bugs which caused regressions (useful for the regressor model),
        # and blocked bugs.
        regression_related_ids: List[int] = list(
            set(
                sum(
                    (bug["regressed_by"] + bug["regressions"] + bug["blocks"]
                     for bug in bugzilla.get_bugs()),
                    [],
                )))
        if limit:
            regression_related_ids = regression_related_ids[-limit:]
        logger.info(
            f"{len(regression_related_ids)} bugs which caused regressions fixed by commits."
        )

        # Get IDs of bugs linked to intermittent failures.
        test_failure_bug_ids = [
            item["bug_id"] for item in test_scheduling.get_failure_bugs(
                two_years_and_six_months_ago, datetime.utcnow())
        ]
        if limit:
            test_failure_bug_ids = test_failure_bug_ids[-limit:]
        logger.info(f"{len(test_failure_bug_ids)} bugs about test failures.")

        all_ids = (timespan_ids + labelled_bug_ids + commit_bug_ids +
                   regression_related_ids + test_failure_bug_ids)
        all_ids_set = set(all_ids)

        # We have to redownload bugs that were changed since the last download.
        # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled.
        bugzilla.delete_bugs(lambda bug: bug["id"] in changed_ids or bug["id"]
                             not in all_ids_set)

        new_bugs = bugzilla.download_bugs(all_ids)

        # Get regression_related_ids again (the set could have changed after downloading new bugs).
        for i in range(7):
            regression_related_ids = list(
                set(
                    sum(
                        (bug["regressed_by"] + bug["regressions"] +
                         bug["blocks"] for bug in new_bugs),
                        [],
                    )))
            logger.info(
                f"{len(regression_related_ids)} bugs which caused regressions fixed by commits."
            )
            if limit:
                regression_related_ids = regression_related_ids[-limit:]

            # If we got all bugs we needed, break.
            if set(regression_related_ids).issubset(all_ids):
                break

            new_bugs = bugzilla.download_bugs(regression_related_ids)

        # Try to re-download inconsistent bugs, up to twice.
        inconsistent_bugs = bugzilla.get_bugs(include_invalid=True)
        for i in range(2):
            # We look for inconsistencies in all bugs first, then, on following passes,
            # we only look for inconsistencies in bugs that were found to be inconsistent in the first pass
            inconsistent_bugs = bug_snapshot.get_inconsistencies(
                inconsistent_bugs)
            inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

            if len(inconsistent_bug_ids) == 0:
                break

            logger.info(
                f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
            )
            bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
            bugzilla.download_bugs(inconsistent_bug_ids)

        # TODO: Figure out why.
        missing_history_bug_ids = {
            bug["id"]
            for bug in bugzilla.get_bugs() if "history" not in bug
        }
        bugzilla.delete_bugs(lambda bug: bug["id"] in missing_history_bug_ids)
        logger.info(
            f"Deleted {len(missing_history_bug_ids)} bugs as we couldn't retrieve their history"
        )

        zstd_compress(bugzilla.BUGS_DB)
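Taken together, the examples trace the db module's register/download/last_modified/append lifecycle. A minimal usage sketch; the path and URL are hypothetical, for illustration only:

from bugbug import db

EXAMPLE_DB = "data/example.json"  # hypothetical path, for illustration only
db.register(
    EXAMPLE_DB,
    "https://example.com/example.json.zst",  # hypothetical artifact URL
    1,  # schema version, as in the db.register calls above
)

if db.download(EXAMPLE_DB):
    try:
        last_modified = db.last_modified(EXAMPLE_DB)
    except db.LastModifiedNotAvailable:
        last_modified = None
    db.append(EXAMPLE_DB, [{"id": 1}])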