Example #1
def print_uncaught(
    granularity: str, scheduler1: str, scheduler2: Optional[str] = None
) -> None:
    push_data_db = (
        test_scheduling.PUSH_DATA_GROUP_DB
        if granularity == "group"
        else test_scheduling.PUSH_DATA_CONFIG_GROUP_DB
    )
    assert db.download(push_data_db)

    regressions_by_rev = {}
    for revisions, _, _, possible_regressions, likely_regressions in db.read(
        push_data_db
    ):
        regressions_by_rev[revisions[0]] = get_regressions(
            granularity, likely_regressions, possible_regressions
        )

    for scheduler_stat in db.read(SHADOW_SCHEDULER_STATS_DB):
        if len(scheduler_stat["schedulers"]) == 0:
            continue

        rev = scheduler_stat["id"]

        if rev not in regressions_by_rev:
            continue

        regressions = regressions_by_rev[rev]

        if len(regressions) == 0:
            continue

        scheduled_by_scheduler = {}
        caught_by_scheduler = {}

        for scheduler in scheduler_stat["schedulers"]:
            scheduled = get_scheduled(granularity, scheduler)

            scheduled_by_scheduler[scheduler["name"]] = scheduled
            caught_by_scheduler[scheduler["name"]] = regressions & scheduled

        if scheduler1 not in caught_by_scheduler:
            continue

        if len(caught_by_scheduler[scheduler1]) == 0:
            # If there is no second scheduler to compare against (or it didn't run
            # on this push), only report the first scheduler.
            if scheduler2 is None or scheduler2 not in caught_by_scheduler:
                print(
                    f"{scheduler1} didn't catch any of the {len(regressions)} regressions on {rev}"
                )
            elif len(caught_by_scheduler[scheduler2]) == 0:
                print(
                    f"{scheduler1} and {scheduler2} didn't catch any of the {len(regressions)} regressions on {rev}"
                )
            else:
                print(
                    f"{scheduler1} didn't catch any of the {len(regressions)} regressions on {rev}, while {scheduler2} did"
                )
            print(f"Regressions: {regressions}")
            print(f"Scheduled by {scheduler1}: {scheduled_by_scheduler[scheduler1]}")
Example #2
def test_delete(mock_db, db_format, db_compression):
    db_path = mock_db(db_format, db_compression)

    db.write(db_path, range(1, 9))

    assert list(db.read(db_path)) == [1, 2, 3, 4, 5, 6, 7, 8]

    db.delete(db_path, lambda x: x == 4)

    assert list(db.read(db_path)) == [1, 2, 3, 5, 6, 7, 8]
Example #3
def test_append(mock_db, db_format, db_compression):
    db_path = mock_db(db_format, db_compression)

    db.write(db_path, range(1, 4))

    assert list(db.read(db_path)) == [1, 2, 3]

    db.append(db_path, range(4, 8))

    assert list(db.read(db_path)) == [1, 2, 3, 4, 5, 6, 7]
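The test_append and test_delete examples above rely on a mock_db pytest fixture that is not included in this collection. The sketch below is only an assumption of what such a fixture could look like, modelled on the explicit db.register calls in the later examples (it assumes the db module comes from the bugbug package; the create_db name and the file-name scheme are hypothetical).
import pytest

from bugbug import db


@pytest.fixture
def mock_db(tmp_path):
    # Hypothetical fixture: build a path such as "prova.json" or "prova.json.gz" and
    # register it so that db.read/db.write/db.append/db.delete accept it (see Example #13).
    def create_db(db_format, db_compression):
        db_name = f"prova.{db_format}"
        if db_compression:
            db_name += f".{db_compression}"

        db_path = tmp_path / db_name
        db.register(db_path, "https://alink", 1)
        return db_path

    return create_db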
Example #6
def test_append_compressed(tmp_path):
    db_path = tmp_path / 'prova.json.gz'

    db.register(db_path, 'https://alink', 1)

    db.write(db_path, range(1, 4))

    assert list(db.read(db_path)) == [1, 2, 3]

    db.append(db_path, range(4, 8))

    assert list(db.read(db_path)) == [1, 2, 3, 4, 5, 6, 7]
Example #7
def main():
    description = "Find bug-introducing commits from bug-fixing commits"
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument(
        "what", choices=["to_ignore", "bug_fixing", "bug_introducing"]
    )
    parser.add_argument(
        "--repo_dir",
        help="Path to a Gecko repository. If no repository exists, it will be cloned to this location.",
    )
    parser.add_argument(
        "--git_repo_url", help="URL to the git repository on which to run SZZ."
    )
    parser.add_argument(
        "--git_repo_dir", help="Path where the git repository will be cloned."
    )
    parser.add_argument(
        "--tokenized_git_repo_url",
        help="URL to the tokenized git repository on which to run SZZ.",
    )
    parser.add_argument(
        "--tokenized_git_repo_dir",
        help="Path where the tokenized git repository will be cloned.",
    )

    args = parser.parse_args()

    regressor_finder = RegressorFinder(
        args.repo_dir,
        args.git_repo_url,
        args.git_repo_dir,
        args.tokenized_git_repo_url,
        args.tokenized_git_repo_dir,
    )

    if args.what == "to_ignore":
        regressor_finder.get_commits_to_ignore()
    elif args.what == "bug_fixing":
        regressor_finder.find_bug_fixing_commits()
    elif args.what == "bug_introducing":
        assert args.git_repo_url or args.tokenized_git_repo_url

        if args.git_repo_url:
            assert not args.tokenized_git_repo_url
            regressor_finder.find_bug_introducing_commits(
                args.git_repo_dir, False)
            evaluate(db.read(BUG_INTRODUCING_COMMITS_DB))

        if args.tokenized_git_repo_url:
            assert not args.git_repo_url
            regressor_finder.find_bug_introducing_commits(
                args.tokenized_git_repo_dir, True)
            evaluate(db.read(TOKENIZED_BUG_INTRODUCING_COMMITS_DB))
Example #8
def test_delete_compressed(tmp_path):
    db_path = tmp_path / 'prova.json.gz'

    print(db_path)

    db.register(db_path, 'https://alink', 1)

    db.write(db_path, range(1, 9))

    assert list(db.read(db_path)) == [1, 2, 3, 4, 5, 6, 7, 8]

    db.delete(db_path, lambda x: x == 4)

    assert list(db.read(db_path)) == [1, 2, 3, 5, 6, 7, 8]
Example #9
    def get_commits_to_ignore(self):
        logger.info("Download previous commits to ignore...")
        if db.is_old_version(
                IGNORED_COMMITS_DB) or not db.exists(IGNORED_COMMITS_DB):
            db.download(IGNORED_COMMITS_DB, force=True)

        logger.info("Get previously classified commits...")
        prev_commits_to_ignore = list(db.read(IGNORED_COMMITS_DB))
        logger.info(
            f"Already found {len(prev_commits_to_ignore)} commits to ignore..."
        )

        if len(prev_commits_to_ignore) > 0:
            rev_start = "children({})".format(
                prev_commits_to_ignore[-1]["rev"])
        else:
            rev_start = 0

        # 2 days more than the end date, so we can know if a commit was backed-out.
        # We have to do this as recent commits might be missing in the mercurial <-> git map,
        # otherwise we could just use "tip".
        end_date = datetime.now() - RELATIVE_END_DATE + relativedelta(days=2)
        with hglib.open(self.mercurial_repo_dir) as hg:
            revs = repository.get_revs(
                hg, rev_start,
                "pushdate('{}')".format(end_date.strftime("%Y-%m-%d")))

        # Given that we use the pushdate, there might be cases where the starting commit is returned too (e.g. if we rerun the task on the same day).
        if len(prev_commits_to_ignore) > 0:
            found_prev = -1
            for i, rev in enumerate(revs):
                if rev.decode("utf-8") == prev_commits_to_ignore[-1]["rev"]:
                    found_prev = i
                    break
            revs = revs[found_prev + 1:]

        commits = repository.hg_log_multi(self.mercurial_repo_dir, revs)

        repository.set_commits_to_ignore(self.mercurial_repo_dir, commits)
        commits_to_ignore = []

        for commit in commits:
            if commit.ignored or commit.backedoutby:
                commits_to_ignore.append(
                    {
                        "rev": commit.node,
                        "type": "backedout" if commit.backedoutby else "",
                    }
                )

        logger.info(f"{len(commits_to_ignore)} new commits to ignore...")

        logger.info(
            "...of which {} are backed-out".format(
                sum(1 for commit in commits_to_ignore if commit["type"] == "backedout")
            )
        )

        db.append(IGNORED_COMMITS_DB, commits_to_ignore)
        zstd_compress(IGNORED_COMMITS_DB)

        return prev_commits_to_ignore + commits_to_ignore
Example #10
    def get_commits_to_ignore(self):
        logger.info("Download previous commits to ignore...")
        db.download(IGNORED_COMMITS_DB)

        logger.info("Get previously classified commits...")
        prev_commits_to_ignore = list(db.read(IGNORED_COMMITS_DB))
        logger.info(
            f"Already found {len(prev_commits_to_ignore)} commits to ignore..."
        )

        # When we already have some analyzed commits, re-analyze the last 3500 to make sure
        # we didn't miss back-outs that happened since the last analysis.
        if len(prev_commits_to_ignore) > 0:
            first_commit_to_reanalyze = (
                -3500 if len(prev_commits_to_ignore) >= 3500 else 0)
            rev_start = "children({})".format(
                prev_commits_to_ignore[first_commit_to_reanalyze]["rev"])
        else:
            rev_start = 0

        with hglib.open(self.mercurial_repo_dir) as hg:
            revs = repository.get_revs(hg, rev_start)

        commits = repository.hg_log_multi(self.mercurial_repo_dir, revs)

        with hglib.open(self.mercurial_repo_dir) as hg:
            repository.set_commits_to_ignore(hg, self.mercurial_repo_dir,
                                             commits)

        for commit in commits:
            commit.ignored |= commit.author_email == "*****@*****.**"

        chosen_commits = set()
        commits_to_ignore = []
        for commit in commits:
            if commit.ignored or commit.backedoutby:
                commits_to_ignore.append(
                    {
                        "rev": commit.node,
                        "type": "backedout" if commit.backedoutby else "",
                    }
                )
                chosen_commits.add(commit.node)

        logger.info(f"{len(commits_to_ignore)} new commits to ignore...")

        for prev_commit in prev_commits_to_ignore[::-1]:
            if prev_commit["rev"] not in chosen_commits:
                commits_to_ignore.append(prev_commit)
                chosen_commits.add(prev_commit["rev"])

        logger.info(f"{len(commits_to_ignore)} commits to ignore...")

        logger.info(
            "...of which {} are backed-out".format(
                sum(1 for commit in commits_to_ignore if commit["type"] == "backedout")
            )
        )

        db.write(IGNORED_COMMITS_DB, commits_to_ignore)
        zstd_compress(IGNORED_COMMITS_DB)
        db.upload(IGNORED_COMMITS_DB)
Example #11
    def get_labels(self):
        classes = {}

        regressors = set(r["bug_introducing_rev"]
                         for r in db.read(BUG_INTRODUCING_COMMITS_DB)
                         if r["bug_introducing_rev"])

        for commit_data in repository.get_commits():
            if commit_data["ever_backedout"]:
                continue

            node = commit_data["node"]
            if node in regressors:
                classes[node] = 1
            else:
                push_date = dateutil.parser.parse(commit_data["pushdate"])

                # The labels we have are only from two years and six months ago (see the regressor finder script).
                if push_date < datetime.utcnow() - relativedelta(years=2,
                                                                 months=6):
                    continue

                # We remove the last 6 months, as there could be regressions which haven't been filed yet.
                if push_date > datetime.utcnow() - relativedelta(months=6):
                    continue

                classes[node] = 0

        print("{} commits caused regressions".format(
            sum(1 for label in classes.values() if label == 1)))

        print("{} commits did not cause regressions".format(
            sum(1 for label in classes.values() if label == 0)))

        return classes, [0, 1]
Example #12
def go(days: int) -> None:
    logger.info("Download previous shadow scheduler statistics...")
    db.download(SHADOW_SCHEDULER_STATS_DB)

    logger.info("Get previously gathered statistics...")
    prev_scheduler_stat_revs = set(
        scheduler_stat["id"]
        for scheduler_stat in db.read(SHADOW_SCHEDULER_STATS_DB))
    logger.info(
        f"Already gathered statistics for {len(prev_scheduler_stat_revs)} pushes..."
    )

    to_date = datetime.utcnow() - relativedelta(days=3)
    from_date = to_date - relativedelta(days=days)
    pushes = mozci.push.make_push_objects(
        from_date=from_date.strftime("%Y-%m-%d"),
        to_date=to_date.strftime("%Y-%m-%d"),
        branch="autoland",
    )

    pushes = [
        push for push in pushes if push.rev not in prev_scheduler_stat_revs
    ]

    logger.info(f"{len(pushes)} left to analyze")

    db.append(SHADOW_SCHEDULER_STATS_DB, analyze_shadow_schedulers(pushes))
    utils.zstd_compress(SHADOW_SCHEDULER_STATS_DB)
Example #13
def test_write_read(tmp_path):
    db_path = tmp_path / 'prova.json'

    db.register(db_path, 'https://alink', 1)

    db.write(db_path, range(1, 8))

    assert list(db.read(db_path)) == [1, 2, 3, 4, 5, 6, 7]
Example #14
def main():
    description = "Find bug-introducing commits from bug-fixing commits"
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument("cache_root", help="Cache for repository clones.")
    parser.add_argument(
        "git_repo_url", help="URL to the git repository on which to run SZZ."
    )
    parser.add_argument(
        "git_repo_dir", help="Path where the git repository will be cloned."
    )
    parser.add_argument(
        "tokenized_git_repo_url",
        help="URL to the tokenized git repository on which to run SZZ.",
    )
    parser.add_argument(
        "tokenized_git_repo_dir",
        help="Path where the tokenized git repository will be cloned.",
    )

    args = parser.parse_args()

    regressor_finder = RegressorFinder(
        args.cache_root,
        args.git_repo_url,
        args.git_repo_dir,
        args.tokenized_git_repo_url,
        args.tokenized_git_repo_dir,
    )

    commits_to_ignore = regressor_finder.get_commits_to_ignore()

    bug_fixing_commits = regressor_finder.find_bug_fixing_commits()

    tokenized_done = regressor_finder.find_bug_introducing_commits(
        bug_fixing_commits, commits_to_ignore, True
    )
    evaluate(db.read(TOKENIZED_BUG_INTRODUCING_COMMITS_DB))

    done = regressor_finder.find_bug_introducing_commits(
        bug_fixing_commits, commits_to_ignore, False
    )
    evaluate(db.read(BUG_INTRODUCING_COMMITS_DB))

    with open("done", "w") as f:
        f.write(str(1 if tokenized_done and done else 0))
Example #15
    def retrieve_test_info(self, days: int) -> Dict[str, Any]:
        logger.info("Download previous test info...")
        db.download(TEST_INFOS_DB)

        dates = [
            datetime.utcnow() - timedelta(days=day)
            for day in reversed(range(days))
        ]

        logger.info("Get previously gathered test info...")
        test_infos = {
            test_info["date"]: test_info
            for test_info in db.read(TEST_INFOS_DB)
        }

        prev_skips = None
        for date in tqdm(dates):
            date_str = date.strftime("%Y-%m-%d")

            # Gather the latest three days again, as the data might have changed.
            if date_str in test_infos and date < datetime.utcnow() - timedelta(
                    days=3):
                prev_skips = test_infos[date_str]["skips"]
                continue

            test_infos[date_str] = {
                "date": date_str,
                "bugs": [
                    {"id": item["bug_id"], "count": item["bug_count"]}
                    for item in test_scheduling.get_failure_bugs(date, date)
                ],
                "skips": {},
            }

            try:
                test_info = test_scheduling.get_test_info(date)

                for component in test_info["tests"].keys():
                    test_infos[date_str]["skips"][component] = sum(
                        1 for test in test_info["tests"][component]
                        if "skip-if" in test)
            except requests.exceptions.HTTPError:
                # If we couldn't find a test info artifact for the given date, assume the number of skip-ifs didn't change from the previous day.
                assert prev_skips is not None
                test_infos[date_str]["skips"] = prev_skips

            prev_skips = test_infos[date_str]["skips"]

        db.write(
            TEST_INFOS_DB,
            (test_infos[date.strftime("%Y-%m-%d")]
             for date in dates if date.strftime("%Y-%m-%d") in test_infos),
        )
        zstd_compress(TEST_INFOS_DB)

        return test_infos
Example #16
def get_test_scheduling_history(granularity):
    if granularity == "label":
        test_scheduling_db = TEST_LABEL_SCHEDULING_DB
    elif granularity == "group":
        test_scheduling_db = TEST_GROUP_SCHEDULING_DB
    else:
        raise Exception(f"{granularity} granularity unsupported")

    return db.read(test_scheduling_db)
Example #17
def get_test_scheduling_history(granularity):
    if granularity == "label":
        test_scheduling_db = TEST_LABEL_SCHEDULING_DB
    elif granularity == "group":
        test_scheduling_db = TEST_GROUP_SCHEDULING_DB
    else:
        raise Exception(f"{granularity} granularity unsupported")

    for obj in db.read(test_scheduling_db):
        yield obj["revs"], obj["data"]
Example #18
def test_unregistered_db(tmp_path):
    db_path = tmp_path / "prova.json"

    with pytest.raises(AssertionError):
        list(db.read(db_path))

    with pytest.raises(AssertionError):
        db.write(db_path, range(7))

    with pytest.raises(AssertionError):
        db.append(db_path, range(7))
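For contrast with the unregistered case above, a registered path accepts the same calls. The snippet below is a minimal sketch in the spirit of Example #13, not a test taken from the source.
def test_registered_db(tmp_path):
    db_path = tmp_path / "prova.json"

    # Registering the path first is what makes write/read/append succeed.
    db.register(db_path, "https://alink", 1)

    db.write(db_path, range(7))

    assert list(db.read(db_path)) == [0, 1, 2, 3, 4, 5, 6]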
Example #20
def get_commits(
    include_no_bug: bool = False,
    include_backouts: bool = False,
    include_ignored: bool = False,
) -> Generator[CommitDict, None, None]:
    return filter_commits(
        db.read(COMMITS_DB),
        include_no_bug=include_no_bug,
        include_backouts=include_backouts,
        include_ignored=include_ignored,
    )
Example #21
    def get_labels(self):
        classes = {}

        regressors = set(
            r["bug_introducing_rev"]
            for r in db.read(BUG_INTRODUCING_COMMITS_DB)
            if r["bug_introducing_rev"]
        )

        regressor_bugs = set(
            sum((bug["regressed_by"] for bug in bugzilla.get_bugs()), [])
        )

        for commit_data in repository.get_commits():
            if commit_data["backedoutby"]:
                continue

            if repository.is_wptsync(commit_data):
                continue

            push_date = dateutil.parser.parse(commit_data["pushdate"])

            # Skip commits used for the evaluation phase.
            if push_date > datetime.utcnow() - relativedelta(months=EVALUATION_MONTHS):
                continue

            node = commit_data["node"]
            if node in regressors or commit_data["bug_id"] in regressor_bugs:
                classes[node] = 1
            else:
                # The labels we have are only from two years and six months ago (see the regressor finder script).
                if push_date < datetime.utcnow() - relativedelta(years=2, months=6):
                    continue

                # We remove the last 6 months, as there could be regressions which haven't been filed yet.
                if push_date > datetime.utcnow() - relativedelta(months=6):
                    continue

                classes[node] = 0

        print(
            "{} commits caused regressions".format(
                sum(1 for label in classes.values() if label == 1)
            )
        )

        print(
            "{} commits did not cause regressions".format(
                sum(1 for label in classes.values() if label == 0)
            )
        )

        return classes, [0, 1]
Example #22
def go(days: int) -> None:
    logger.info("Download previous shadow scheduler statistics...")
    db.download(SHADOW_SCHEDULER_STATS_DB)

    logger.info("Get previously gathered statistics...")
    prev_scheduler_stat_revs = set(
        scheduler_stat["id"]
        for scheduler_stat in db.read(SHADOW_SCHEDULER_STATS_DB))
    logger.info(
        f"Already gathered statistics for {len(prev_scheduler_stat_revs)} pushes..."
    )

    to_date = datetime.utcnow() - relativedelta(days=3)
    from_date = to_date - relativedelta(days=days)
    pushes = mozci.push.make_push_objects(
        from_date=from_date.strftime("%Y-%m-%d"),
        to_date=to_date.strftime("%Y-%m-%d"),
        branch="autoland",
    )

    pushes = [
        push for push in pushes if push.rev not in prev_scheduler_stat_revs
    ]

    logger.info(f"{len(pushes)} left to analyze")

    def compress_and_upload() -> None:
        utils.zstd_compress(SHADOW_SCHEDULER_STATS_DB)
        db.upload(SHADOW_SCHEDULER_STATS_DB)

    def results() -> Iterator[dict]:
        for i, push in enumerate(tqdm(pushes)):
            try:
                yield analyze_shadow_schedulers(push)
            except Exception:
                traceback.print_exc()

            # Upload every 42 pushes.
            if (i + 1) % 42 == 0:
                compress_and_upload()

    db.append(SHADOW_SCHEDULER_STATS_DB, results())
    compress_and_upload()
Example #23
    def push_data_iter() -> Iterator[PushResult]:
        return (
            (
                revisions,
                filter_runnables(
                    rename_runnables(granularity, push_tasks),
                    all_runnables_set,
                    granularity,
                ),
                filter_runnables(
                    rename_runnables(granularity, possible_regressions),
                    all_runnables_set,
                    granularity,
                ),
                filter_runnables(
                    rename_runnables(granularity, likely_regressions),
                    all_runnables_set,
                    granularity,
                ),
            )
            for revisions, push_tasks, possible_regressions, likely_regressions in db.read(
                push_data_db
            )
        )
Example #24
def get_test_scheduling_history():
    return db.read(TEST_SCHEDULING_DB)
Example #25
def get_bugs(include_invalid: Optional[bool] = False) -> Iterator[BugDict]:
    yield from (bug for bug in db.read(BUGS_DB)
                if include_invalid or bug["product"] != "Invalid Bugs")
Example #26
    def find_bug_introducing_commits(self, bug_fixing_commits,
                                     commits_to_ignore, tokenized):
        if tokenized:
            db_path = TOKENIZED_BUG_INTRODUCING_COMMITS_DB
            repo_dir = self.tokenized_git_repo_dir
        else:
            db_path = BUG_INTRODUCING_COMMITS_DB
            repo_dir = self.git_repo_dir

        def git_to_mercurial(rev):
            if tokenized:
                return self.tokenized_git_to_mercurial[rev]
            else:
                return vcs_map.git_to_mercurial(rev)

        def mercurial_to_git(rev):
            if tokenized:
                return self.mercurial_to_tokenized_git[rev]
            else:
                return vcs_map.mercurial_to_git(rev)

        logger.info("Download previously found bug-introducing commits...")
        if db.is_old_version(db_path) or not db.exists(db_path):
            db.download(db_path, force=True)

        logger.info("Get previously found bug-introducing commits...")
        prev_bug_introducing_commits = list(db.read(db_path))
        prev_bug_introducing_commits_nodes = set(
            bug_introducing_commit["bug_fixing_rev"]
            for bug_introducing_commit in prev_bug_introducing_commits)
        logger.info(
            f"Already classified {len(prev_bug_introducing_commits)} commits..."
        )

        hashes_to_ignore = set(commit["rev"] for commit in commits_to_ignore)

        with open("git_hashes_to_ignore", "w") as f:
            f.writelines("{}\n".format(mercurial_to_git(commit["rev"]))
                         for commit in commits_to_ignore if not tokenized
                         or commit["rev"] in self.mercurial_to_tokenized_git)

        logger.info(f"{len(bug_fixing_commits)} commits to analyze")

        # Skip already found bug-introducing commits.
        bug_fixing_commits = [
            bug_fixing_commit for bug_fixing_commit in bug_fixing_commits if
            bug_fixing_commit["rev"] not in prev_bug_introducing_commits_nodes
        ]

        logger.info(
            f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones"
        )

        bug_fixing_commits = [
            bug_fixing_commit for bug_fixing_commit in bug_fixing_commits
            if bug_fixing_commit["rev"] not in hashes_to_ignore
        ]
        logger.info(
            f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list"
        )

        if tokenized:
            bug_fixing_commits = [
                bug_fixing_commit for bug_fixing_commit in bug_fixing_commits
                if bug_fixing_commit["rev"] in self.mercurial_to_tokenized_git
            ]
            logger.info(
                f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones with no git hash"
            )

        def _init(git_repo_dir):
            thread_local.git = GitRepository(git_repo_dir)

        def find_bic(bug_fixing_commit):
            logger.info("Analyzing {}...".format(bug_fixing_commit["rev"]))

            git_fix_revision = mercurial_to_git(bug_fixing_commit["rev"])

            commit = thread_local.git.get_commit(git_fix_revision)

            # Skip huge changes, we'll likely be wrong with them.
            if len(commit.modifications) > MAX_MODIFICATION_NUMBER:
                logger.info("Skipping {} as it is too big".format(
                    bug_fixing_commit["rev"]))
                return None

            bug_introducing_modifications = thread_local.git.get_commits_last_modified_lines(
                commit,
                hashes_to_ignore_path=os.path.realpath("git_hashes_to_ignore"))

            logger.info("Found {} for {}".format(bug_introducing_modifications,
                                                 bug_fixing_commit["rev"]))

            bug_introducing_commits = []
            for bug_introducing_hashes in bug_introducing_modifications.values():
                for bug_introducing_hash in bug_introducing_hashes:
                    try:
                        bug_introducing_commits.append(
                            {
                                "bug_fixing_rev": bug_fixing_commit["rev"],
                                "bug_introducing_rev": git_to_mercurial(
                                    bug_introducing_hash
                                ),
                            }
                        )
                    except Exception as e:
                        # Skip commits that are in git but not in mercurial, as they are too old (older than "Free the lizard").
                        if not str(e).startswith("Missing git commit in the VCS map"):
                            raise

            # Add an empty result, just so that we don't reanalyze this again.
            if len(bug_introducing_commits) == 0:
                bug_introducing_commits.append(
                    {
                        "bug_fixing_rev": bug_fixing_commit["rev"],
                        "bug_introducing_rev": "",
                    }
                )

            return bug_introducing_commits

        with concurrent.futures.ThreadPoolExecutor(
            initializer=_init, initargs=(repo_dir,), max_workers=os.cpu_count() + 1
        ) as executor:

            def results():
                num_analyzed = 0

                bug_fixing_commits_queue = bug_fixing_commits.copy()

                # Analyze up to 500 commits at a time, to avoid the task running out of time.
                # Pop from the queue copy so the loop condition and the final "done"
                # check actually see it shrink.
                while len(bug_fixing_commits_queue) != 0 and num_analyzed != 500:
                    bug_introducing_commit_futures = []
                    for _ in range(
                        min(500 - num_analyzed, len(bug_fixing_commits_queue))
                    ):
                        bug_introducing_commit_futures.append(
                            executor.submit(find_bic, bug_fixing_commits_queue.pop())
                        )

                    logger.info(
                        f"Analyzing a chunk of {len(bug_introducing_commit_futures)} commits"
                    )

                    for future in tqdm(
                            concurrent.futures.as_completed(
                                bug_introducing_commit_futures),
                            total=len(bug_introducing_commit_futures),
                    ):
                        result = future.result()
                        if result is not None:
                            num_analyzed += 1
                            yield from result

                with open("done", "w") as f:
                    f.write(
                        str(1 if len(bug_fixing_commits_queue) == 0 else 0))

            db.append(db_path, results())

        zstd_compress(db_path)
Example #27
    def find_bug_fixing_commits(self):
        logger.info("Downloading commits database...")
        if db.is_old_version(
                repository.COMMITS_DB) or not db.exists(repository.COMMITS_DB):
            db.download(repository.COMMITS_DB, force=True)

        logger.info("Downloading bugs database...")
        if db.is_old_version(
                bugzilla.BUGS_DB) or not db.exists(bugzilla.BUGS_DB):
            db.download(bugzilla.BUGS_DB, force=True)

        logger.info("Download previous classifications...")
        if db.is_old_version(
                BUG_FIXING_COMMITS_DB) or not db.exists(BUG_FIXING_COMMITS_DB):
            db.download(BUG_FIXING_COMMITS_DB, force=True)

        logger.info("Get previously classified commits...")
        prev_bug_fixing_commits = list(db.read(BUG_FIXING_COMMITS_DB))
        prev_bug_fixing_commits_nodes = set(
            bug_fixing_commit["rev"]
            for bug_fixing_commit in prev_bug_fixing_commits)
        logger.info(
            f"Already classified {len(prev_bug_fixing_commits)} commits...")

        # TODO: Switch to the pure Defect model, as it's better in this case.
        logger.info("Downloading defect/enhancement/task model...")
        download_model("defectenhancementtask")
        defect_model = DefectEnhancementTaskModel.load(
            "defectenhancementtaskmodel")

        logger.info("Downloading regression model...")
        download_model("regression")
        regression_model = RegressionModel.load("regressionmodel")

        start_date = datetime.now() - RELATIVE_START_DATE
        end_date = datetime.now() - RELATIVE_END_DATE
        logger.info(
            f"Gathering bug IDs associated to commits (since {start_date} and up to {end_date})..."
        )
        commit_map = defaultdict(list)
        for commit in repository.get_commits():
            if commit["node"] in prev_bug_fixing_commits_nodes:
                continue

            commit_date = dateutil.parser.parse(commit["pushdate"])
            if commit_date < start_date or commit_date > end_date:
                continue

            commit_map[commit["bug_id"]].append(commit["node"])

        logger.info(
            f"{sum(len(commit_list) for commit_list in commit_map.values())} commits found, {len(commit_map)} bugs linked to commits"
        )
        assert len(commit_map) > 0

        def get_relevant_bugs():
            return (bug for bug in bugzilla.get_bugs()
                    if bug["id"] in commit_map)

        bug_count = sum(1 for bug in get_relevant_bugs())
        logger.info(
            f"{bug_count} bugs in total, {len(commit_map) - bug_count} bugs linked to commits missing"
        )

        # get_labels() returns a (classes, class_names) tuple (see the get_labels
        # examples above); we only need the classes mapping here.
        known_defect_labels, _ = defect_model.get_labels()
        known_regression_labels, _ = regression_model.get_labels()

        bug_fixing_commits = []

        def append_bug_fixing_commits(bug_id, type_):
            for commit in commit_map[bug_id]:
                bug_fixing_commits.append({"rev": commit, "type": type_})

        for bug in tqdm(get_relevant_bugs(), total=bug_count):
            # Ignore bugs which are not linked to the commits we care about.
            if bug["id"] not in commit_map:
                continue

            # If we know the label already, we don't need to apply the model.
            if (bug["id"] in known_regression_labels
                    and known_regression_labels[bug["id"]] == 1):
                append_bug_fixing_commits(bug["id"], "r")
                continue

            if bug["id"] in known_defect_labels:
                if known_defect_labels[bug["id"]] == "defect":
                    append_bug_fixing_commits(bug["id"], "d")
                else:
                    append_bug_fixing_commits(bug["id"], "e")
                continue

            if defect_model.classify(bug)[0] == "defect":
                if regression_model.classify(bug)[0] == 1:
                    append_bug_fixing_commits(bug["id"], "r")
                else:
                    append_bug_fixing_commits(bug["id"], "d")
            else:
                append_bug_fixing_commits(bug["id"], "e")

        db.append(BUG_FIXING_COMMITS_DB, bug_fixing_commits)
        zstd_compress(BUG_FIXING_COMMITS_DB)

        bug_fixing_commits = prev_bug_fixing_commits + bug_fixing_commits
        return [
            bug_fixing_commit for bug_fixing_commit in bug_fixing_commits
            if bug_fixing_commit["type"] in ["r", "d"]
        ]
Example #28
def go(months: int) -> None:
    logger.info("Download previous shadow scheduler statistics...")
    db.download(SHADOW_SCHEDULER_STATS_DB)

    logger.info("Get previously gathered statistics...")
    scheduler_stats = {
        scheduler_stat["id"]: scheduler_stat
        for scheduler_stat in db.read(SHADOW_SCHEDULER_STATS_DB)
    }
    logger.info(f"Already gathered statistics for {len(scheduler_stats)} pushes...")

    to_date = datetime.utcnow() - relativedelta(days=3)
    from_date = to_date - relativedelta(months=months)
    pushes = mozci.push.make_push_objects(
        from_date=from_date.strftime("%Y-%m-%d"),
        to_date=to_date.strftime("%Y-%m-%d"),
        branch="autoland",
    )

    pushes_to_analyze = [push for push in pushes if push.rev not in scheduler_stats]

    logger.info(f"{len(pushes_to_analyze)} left to analyze")

    def compress_and_upload() -> None:
        db.write(
            SHADOW_SCHEDULER_STATS_DB,
            (
                scheduler_stats[push.rev]
                for push in pushes
                if push.rev in scheduler_stats
            ),
        )

        utils.zstd_compress(SHADOW_SCHEDULER_STATS_DB)
        db.upload(SHADOW_SCHEDULER_STATS_DB)

    assert db.download(test_scheduling.PUSH_DATA_GROUP_DB)
    group_regressions = {}
    for revisions, _, _, possible_regressions, likely_regressions in db.read(
        test_scheduling.PUSH_DATA_GROUP_DB
    ):
        group_regressions[revisions[0]] = set(likely_regressions)

    assert db.download(test_scheduling.PUSH_DATA_CONFIG_GROUP_DB)
    config_group_regressions = {}
    for (
        revisions,
        _,
        _,
        possible_regressions,
        likely_regressions,
    ) in db.read(test_scheduling.PUSH_DATA_CONFIG_GROUP_DB):
        config_group_regressions[revisions[0]] = set(
            tuple(r) for r in likely_regressions
        )

    start_time = time.monotonic()
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_push = {
            executor.submit(
                analyze_shadow_schedulers,
                group_regressions[push.rev] if push.rev in group_regressions else None,
                config_group_regressions[push.rev]
                if push.rev in config_group_regressions
                else None,
                push,
            ): push
            for push in pushes_to_analyze
            if push.rev in group_regressions or push.rev in config_group_regressions
        }

        try:
            for future in tqdm(
                concurrent.futures.as_completed(future_to_push),
                total=len(future_to_push),
            ):
                push = future_to_push[future]

                try:
                    scheduler_stats[push.rev] = future.result()
                except Exception:
                    traceback.print_exc()

                # Upload every 10 minutes.
                if time.monotonic() - start_time >= 600:
                    compress_and_upload()
                    start_time = time.monotonic()
        except Exception:
            for f in future_to_push.keys():
                f.cancel()

            raise

    compress_and_upload()
Example #29
def get_issues() -> Iterator[IssueDict]:
    yield from db.read(GITHUB_ISSUES_DB)
Example #30
def get_bugs():
    return db.read(BUGS_DB)
Example #31
def get_bugs(include_invalid=False):
    yield from (bug for bug in db.read(BUGS_DB)
                if include_invalid or bug["product"] != "Invalid Bugs")
Example #33
def get_commits():
    return db.read(COMMITS_DB)
Example #34
    def go(self) -> None:
        logger.info(
            "Generate map of bug ID -> bug data for all bugs which were defects"
        )
        bug_fixing_commits = list(db.read(BUG_FIXING_COMMITS_DB))

        bug_fixing_commits_nodes = set(
            bug_fixing_commit["rev"]
            for bug_fixing_commit in bug_fixing_commits
            if bug_fixing_commit["type"] in ("d", "r"))

        logger.info(
            f"{len(bug_fixing_commits_nodes)} bug-fixing commits to analyze")

        all_bug_ids = set(commit["bug_id"]
                          for commit in repository.get_commits())

        bug_map = {
            bug["id"]: bug
            for bug in bugzilla.get_bugs() if bug["id"] in all_bug_ids
        }

        logger.info(
            "Generate a map from files/functions to the bugs which were fixed/introduced by touching them"
        )

        # TODO: Support "moving" past bugs between files when they are renamed and between functions when they are
        # moved across files.

        past_regressions_by_file: Dict[str, List[int]] = defaultdict(list)
        past_fixed_bugs_by_file: Dict[str, List[int]] = defaultdict(list)
        past_regression_blocked_bugs_by_file: Dict[str, List[int]] = defaultdict(list)
        past_fixed_bug_blocked_bugs_by_file: Dict[str, List[int]] = defaultdict(list)
        past_regressions_by_function: Dict[str, Dict[str, List[int]]] = defaultdict(
            lambda: defaultdict(list)
        )
        past_fixed_bugs_by_function: Dict[str, Dict[str, List[int]]] = defaultdict(
            lambda: defaultdict(list)
        )
        past_regression_blocked_bugs_by_function: Dict[
            str, Dict[str, List[int]]
        ] = defaultdict(lambda: defaultdict(list))
        past_fixed_bug_blocked_bugs_by_function: Dict[
            str, Dict[str, List[int]]
        ] = defaultdict(lambda: defaultdict(list))

        for commit in tqdm(repository.get_commits()):
            if commit["bug_id"] not in bug_map:
                continue

            bug = bug_map[commit["bug_id"]]

            if len(bug["regressions"]) > 0:
                for path in commit["files"]:
                    past_regressions_by_file[path].extend(
                        bug_id for bug_id in bug["regressions"]
                        if bug_id in bug_map)

                    past_regression_blocked_bugs_by_file[path].extend(
                        bugzilla.find_blocked_by(bug_map, bug))

                for path, f_group in commit["functions"].items():
                    for f in f_group:
                        past_regressions_by_function[path][f[0]].extend(
                            bug_id for bug_id in bug["regressions"]
                            if bug_id in bug_map)

                        past_regression_blocked_bugs_by_function[path][
                            f[0]].extend(bugzilla.find_blocked_by(
                                bug_map, bug))

            if commit["node"] in bug_fixing_commits_nodes:
                for path in commit["files"]:
                    past_fixed_bugs_by_file[path].append(bug["id"])

                    past_fixed_bug_blocked_bugs_by_file[path].extend(
                        bugzilla.find_blocked_by(bug_map, bug))

                for path, f_group in commit["functions"].items():
                    for f in f_group:
                        past_fixed_bugs_by_function[path][f[0]].append(
                            bug["id"])

                        past_fixed_bug_blocked_bugs_by_function[path][
                            f[0]].extend(bugzilla.find_blocked_by(
                                bug_map, bug))

        def _transform(bug_ids: List[int]) -> List[dict]:
            seen = set()
            results = []
            for bug_id in bug_ids:
                if bug_id in seen:
                    continue
                seen.add(bug_id)

                bug = bug_map[bug_id]
                results.append(
                    {
                        "id": bug_id,
                        "summary": bug["summary"],
                        "component": "{}::{}".format(bug["product"], bug["component"]),
                    }
                )

            return results

        past_regression_summaries_by_file = {
            path: _transform(bug_ids)
            for path, bug_ids in past_regressions_by_file.items()
        }
        past_fixed_bug_summaries_by_file = {
            path: _transform(bug_ids)
            for path, bug_ids in past_fixed_bugs_by_file.items()
        }
        past_regression_blocked_bug_summaries_by_file = {
            path: _transform(bug_ids)
            for path, bug_ids in past_regression_blocked_bugs_by_file.items()
        }
        past_fixed_bug_blocked_bug_summaries_by_file = {
            path: _transform(bug_ids)
            for path, bug_ids in past_fixed_bug_blocked_bugs_by_file.items()
        }
        past_regression_summaries_by_function = {
            path: {
                func: _transform(bug_ids)
                for func, bug_ids in funcs_bugs.items()
            }
            for path, funcs_bugs in past_regressions_by_function.items()
        }
        past_fixed_bug_summaries_by_function = {
            path: {
                func: _transform(bug_ids)
                for func, bug_ids in funcs_bugs.items()
            }
            for path, funcs_bugs in past_fixed_bugs_by_function.items()
        }
        past_regression_blocked_bug_summaries_by_function = {
            path: {
                func: _transform(bug_ids)
                for func, bug_ids in funcs_bugs.items()
            }
            for path, funcs_bugs in
            past_regression_blocked_bugs_by_function.items()
        }
        past_fixed_bug_blocked_bug_summaries_by_function = {
            path: {
                func: _transform(bug_ids)
                for func, bug_ids in funcs_bugs.items()
            }
            for path, funcs_bugs in
            past_fixed_bug_blocked_bugs_by_function.items()
        }

        with open("data/past_regressions_by_file.json", "w") as f:
            json.dump(past_regression_summaries_by_file, f)
        zstd_compress("data/past_regressions_by_file.json")

        with open("data/past_fixed_bugs_by_file.json", "w") as f:
            json.dump(past_fixed_bug_summaries_by_file, f)
        zstd_compress("data/past_fixed_bugs_by_file.json")

        with open("data/past_regression_blocked_bugs_by_file.json", "w") as f:
            json.dump(past_regression_blocked_bug_summaries_by_file, f)
        zstd_compress("data/past_regression_blocked_bugs_by_file.json")

        with open("data/past_fixed_bug_blocked_bugs_by_file.json", "w") as f:
            json.dump(past_fixed_bug_blocked_bug_summaries_by_file, f)
        zstd_compress("data/past_fixed_bug_blocked_bugs_by_file.json")

        with open("data/past_regressions_by_function.json", "w") as f:
            json.dump(past_regression_summaries_by_function, f)
        zstd_compress("data/past_regressions_by_function.json")

        with open("data/past_fixed_bugs_by_function.json", "w") as f:
            json.dump(past_fixed_bug_summaries_by_function, f)
        zstd_compress("data/past_fixed_bugs_by_function.json")

        with open("data/past_regression_blocked_bugs_by_function.json", "w") as f:
            json.dump(past_regression_blocked_bug_summaries_by_function, f)
        zstd_compress("data/past_regression_blocked_bugs_by_function.json")

        with open("data/past_fixed_bug_blocked_bugs_by_function.json", "w") as f:
            json.dump(past_fixed_bug_blocked_bug_summaries_by_function, f)
        zstd_compress("data/past_fixed_bug_blocked_bugs_by_function.json")