Example 1
def test_download_version(tmp_path):
    url_zst = "https://index.taskcluster.net/v1/task/project.relman.bugbug.data_commits.latest/artifacts/public/prova.json.zst"
    url_version = "https://index.taskcluster.net/v1/task/project.relman.bugbug.data_commits.latest/artifacts/public/prova.json.version"

    db_path = tmp_path / "prova.json"
    db.register(db_path, url_zst, 1, support_files=[])

    responses.add(responses.HEAD,
                  url_version,
                  status=200,
                  headers={"ETag": "123"})

    responses.add(responses.GET, url_version, status=200, body="42")

    db.download_version(db_path)

    assert os.path.exists(db_path.with_suffix(db_path.suffix + ".version"))
    assert os.path.exists(db_path.with_suffix(db_path.suffix +
                                              ".version.etag"))

    assert not db.is_old_version(db_path)

    db.register(db_path, url_zst, 43, support_files=[])

    assert db.is_old_version(db_path)
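For orientation (this is a hedged mental model, not bugbug's actual implementation): db.register records which version the code expects for a given path, and db.is_old_version compares that against the integer served by the remote ".version" file, treating a missing or unreachable version file as old. A minimal sketch with hypothetical names:

# Hedged sketch only -- names and storage are made up for illustration.
import requests

REGISTERED = {}  # path -> {"url_version": ..., "version": ...}

def register(path, url_zst, version):
    REGISTERED[str(path)] = {
        "url_version": url_zst.replace(".zst", ".version"),
        "version": version,
    }

def is_old_version(path):
    entry = REGISTERED[str(path)]
    r = requests.get(entry["url_version"])
    if not r.ok:
        # No reachable remote version file: treat the remote db as old.
        return True
    # The remote db is old if the code now expects a newer version.
    return entry["version"] > int(r.text)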
Example 2
def test_is_old_version(tmp_path):
    url_zst = "https://index.taskcluster.net/v1/task/project.relman.bugbug.data_commits.latest/artifacts/public/prova.json.zst"
    url_version = "https://index.taskcluster.net/v1/task/project.relman.bugbug.data_commits.latest/artifacts/public/prova.json.version"

    db_path = tmp_path / "prova.json"
    db.register(db_path, url_zst, 1, support_files=[])

    assert os.path.exists(db_path.with_suffix(db_path.suffix + ".version"))

    responses.add(responses.GET, url_version, status=404)
    responses.add(responses.GET, url_version, status=424)
    responses.add(responses.GET, url_version, status=200, body="1")
    responses.add(responses.GET, url_version, status=200, body="42")

    # When the remote version file doesn't exist, we consider the db as being old.
    assert db.is_old_version(db_path)

    # When the request for the remote version file fails (the 424 response above), we also consider the db as being old.
    assert db.is_old_version(db_path)

    # When the remote version file exists and returns the same version as the current db, we consider the remote db as not being old.
    assert not db.is_old_version(db_path)

    # When the remote version file exists and returns a newer version than the current db, we consider the remote db as not being old.
    assert not db.is_old_version(db_path)

    db.register(db_path, url_zst, 43, support_files=[])

    # When the remote version file exists and returns an older version than the current db, we consider the remote db as being old.
    assert db.is_old_version(db_path)
Example 3
def test_register_db(tmp_path):
    db_path = tmp_path / "prova.json"

    db.register(db_path, "https://alink", 1)

    assert not db.is_old_version(db_path)

    db.register(db_path, "https://alink", 2)

    assert db.is_old_version(db_path)
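Most of the snippets that follow guard database access with the same two checks before reading; a hedged sketch of that recurring pattern, using the bugbug API exactly as it appears in the examples below (the helper name is made up):

from bugbug import db, repository

def ensure_commits_db():
    # Download a fresh copy when db.is_old_version reports a version mismatch
    # or when there is no local copy yet.
    if db.is_old_version(repository.COMMITS_DB) or not db.exists(repository.COMMITS_DB):
        db.download(repository.COMMITS_DB, force=True, support_files_too=True)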
Example 4
    def get_commits_to_ignore(self):
        logger.info("Download previous commits to ignore...")
        if db.is_old_version(
                IGNORED_COMMITS_DB) or not db.exists(IGNORED_COMMITS_DB):
            db.download(IGNORED_COMMITS_DB, force=True)

        logger.info("Get previously classified commits...")
        prev_commits_to_ignore = list(db.read(IGNORED_COMMITS_DB))
        logger.info(
            f"Already found {len(prev_commits_to_ignore)} commits to ignore..."
        )

        if len(prev_commits_to_ignore) > 0:
            rev_start = "children({})".format(
                prev_commits_to_ignore[-1]["rev"])
        else:
            rev_start = 0

        # 2 days more than the end date, so we can know if a commit was backed-out.
        # We have to do this as recent commits might be missing in the mercurial <-> git map,
        # otherwise we could just use "tip".
        end_date = datetime.now() - RELATIVE_END_DATE + relativedelta(days=2)
        with hglib.open(self.mercurial_repo_dir) as hg:
            revs = repository.get_revs(
                hg, rev_start,
                "pushdate('{}')".format(end_date.strftime("%Y-%m-%d")))

        # Given that we use the pushdate, there might be cases where the starting commit is returned too (e.g. if we rerun the task on the same day).
        if len(prev_commits_to_ignore) > 0:
            found_prev = -1
            for i, rev in enumerate(revs):
                if rev.decode("utf-8") == prev_commits_to_ignore[-1]["rev"]:
                    found_prev = i
                    break
            revs = revs[found_prev + 1:]

        commits = repository.hg_log_multi(self.mercurial_repo_dir, revs)

        repository.set_commits_to_ignore(self.mercurial_repo_dir, commits)
        commits_to_ignore = []

        for commit in commits:
            if commit.ignored or commit.backedoutby:
                commits_to_ignore.append({
                    "rev": commit.node,
                    "type": "backedout" if commit.backedoutby else "",
                })

        logger.info(f"{len(commits_to_ignore)} new commits to ignore...")

        logger.info("...of which {} are backed-out".format(
            sum(1 for commit in commits_to_ignore
                if commit["type"] == "backedout")))

        db.append(IGNORED_COMMITS_DB, commits_to_ignore)
        zstd_compress(IGNORED_COMMITS_DB)

        return prev_commits_to_ignore + commits_to_ignore
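repository.get_revs above is driven by Mercurial revset expressions built as plain strings; purely for reference, these are the two expressions the method constructs (the hash and date below are made-up values):

# Hypothetical values, only to show the revset strings being built.
last_rev = "abcdef0123456789"
rev_start = "children({})".format(last_rev)      # 'children(abcdef0123456789)'
rev_end = "pushdate('{}')".format("2019-11-02")  # "pushdate('2019-11-02')"
print(rev_start, rev_end)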
Example 5
    def update_commit_db(self):
        repository.clone(self.repo_dir)

        if db.is_old_version(repository.COMMITS_DB) or not db.exists(
            repository.COMMITS_DB
        ):
            db.download(repository.COMMITS_DB, force=True, support_files_too=True)

        for commit in repository.get_commits():
            pass

        rev_start = "children({})".format(commit["node"])

        repository.download_commits(self.repo_dir, rev_start)
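The "for commit in repository.get_commits(): pass" idiom above (and in several later examples) only serves to keep the last element yielded by the generator, so that rev_start can point at its children. If that reads oddly, an equivalent helper (assuming a non-empty iterable) would be:

from collections import deque

def last(iterable):
    # Drain the iterable, keeping only the most recent item.
    buf = deque(iterable, maxlen=1)
    if not buf:
        raise ValueError("empty iterable")
    return buf[0]

# e.g. rev_start = "children({})".format(last(repository.get_commits())["node"])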
Example 6
    def retrieve_commits(self):
        shared_dir = self.repo_dir + "-shared"
        cmd = hglib.util.cmdbuilder(
            "robustcheckout",
            "https://hg.mozilla.org/mozilla-central",
            self.repo_dir,
            purge=True,
            sharebase=shared_dir,
            networkattempts=7,
            branch=b"tip",
        )

        cmd.insert(0, hglib.HGPATH)

        proc = hglib.util.popen(cmd)
        out, err = proc.communicate()
        if proc.returncode:
            raise hglib.error.CommandError(cmd, proc.returncode, out, err)

        logger.info("mozilla-central cloned")

        try:
            os.remove(os.path.join(self.repo_dir, ".hg", "pushlog2.db"))
        except FileNotFoundError:
            logger.info("pushlog database doesn't exist")

        # Pull and update, to make sure the pushlog is generated.
        hg = hglib.open(self.repo_dir)
        hg.pull(update=True)
        hg.close()

        db.download_version(repository.COMMITS_DB)
        if not db.is_old_version(repository.COMMITS_DB):
            db.download(repository.COMMITS_DB, support_files_too=True)

            for commit in repository.get_commits():
                pass

            rev_start = f"children({commit['node']})"
        else:
            rev_start = 0

        repository.download_commits(self.repo_dir, rev_start)

        logger.info("commit data extracted from repository")

        self.compress_file("data/commits.json")
        self.compress_file("data/commit_experiences.pickle")
Example 7
    def retrieve_push_data(self):
        # Download previous cache.
        cache_path = os.path.splitext(ADR_CACHE_DB)[0]
        if not db.is_old_version(ADR_CACHE_DB):
            db.download(ADR_CACHE_DB)
            if os.path.exists(ADR_CACHE_DB):
                with tarfile.open(ADR_CACHE_DB, "r") as tar:
                    tar.extractall()
                assert os.path.exists(
                    cache_path), "Decompressed adr cache exists"

        # Setup adr cache configuration.
        os.makedirs(os.path.expanduser("~/.config/adr"), exist_ok=True)
        with open(os.path.expanduser("~/.config/adr/config.toml"), "w") as f:
            f.write(f"""[adr.cache.stores]
file = {{ driver = "file", path = "{os.path.abspath(cache_path)}" }}
""")

        # We'll use the past TRAINING_MONTHS months only for training the model,
        # but we use 3 months more than that to calculate the failure statistics.
        subprocess.run(
            [
                "run-adr",
                "ahal/ci-recipes",
                "recipe",
                "-o",
                os.path.abspath("push_data.json"),
                "-f",
                "json",
                "push_data",
                "--",
                "--from",
                f"today-{TRAINING_MONTHS + 3}month",
                "--to",
                "today-2day",
                "--branch",
                "autoland",
            ],
            check=True,
            # Redirect to /dev/null, as the logs are too big otherwise.
            stdout=subprocess.DEVNULL,
        )

        with tarfile.open(ADR_CACHE_DB, "w") as tar:
            tar.add(cache_path)
        zstd_compress(ADR_CACHE_DB)

        zstd_compress("push_data.json")
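zstd_compress, zstd_decompress and open_tar_zst come from bugbug's utilities and are not shown here; as a rough illustration of what such helpers typically look like on top of the zstandard package (an assumption about their shape, not bugbug's exact code):

import zstandard

def zstd_compress(path):
    # Write <path>.zst next to the original file.
    cctx = zstandard.ZstdCompressor()
    with open(path, "rb") as fin, open(f"{path}.zst", "wb") as fout:
        cctx.copy_stream(fin, fout)

def zstd_decompress(path):
    # Recreate <path> from <path>.zst.
    dctx = zstandard.ZstdDecompressor()
    with open(f"{path}.zst", "rb") as fin, open(path, "wb") as fout:
        dctx.copy_stream(fin, fout)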
Example 8
    def __init__(self):
        self.model_class = RegressorModel

        self.repo_dir = get_login_info()["repo_dir"]

        if not os.path.exists(self.repo_dir):
            cmd = hglib.util.cmdbuilder(
                "robustcheckout",
                "https://hg.mozilla.org/mozilla-central",
                self.repo_dir,
                purge=True,
                sharebase=self.repo_dir + "-shared",
                networkattempts=7,
                branch=b"tip",
            )

            cmd.insert(0, hglib.HGPATH)

            proc = hglib.util.popen(cmd)
            out, err = proc.communicate()
            if proc.returncode:
                raise hglib.error.CommandError(cmd, proc.returncode, out, err)

            logger.info("mozilla-central cloned")

            # Remove pushlog DB to make sure it's regenerated.
            try:
                os.remove(os.path.join(self.repo_dir, ".hg", "pushlog2.db"))
            except FileNotFoundError:
                logger.info("pushlog database doesn't exist")

        logger.info("Pulling and updating mozilla-central")
        with hglib.open(self.repo_dir) as hg:
            hg.pull(update=True)
        logger.info("mozilla-central pulled and updated")

        db.download_version(repository.COMMITS_DB)
        if db.is_old_version(repository.COMMITS_DB) or not os.path.exists(
                repository.COMMITS_DB):
            db.download(repository.COMMITS_DB,
                        force=True,
                        support_files_too=True)

        super().__init__()
        self.model = self.model_class.load(self.retrieve_model())
Example 9
def main():
    args = parse_args(sys.argv[1:])

    logger.info("Downloading bugs database...")

    if db.is_old_version(bugzilla.BUGS_DB) or not db.exists(bugzilla.BUGS_DB):
        db.download(bugzilla.BUGS_DB, force=True)

    if args.algorithm == "neighbors_tfidf_bigrams":
        model = similarity.model_name_to_class[args.algorithm](
            vectorizer=TfidfVectorizer(ngram_range=(1, 2)),
            cleanup_urls=args.cleanup_urls,
            nltk_tokenizer=args.nltk_tokenizer,
        )
    else:
        model = similarity.model_name_to_class[args.algorithm](
            cleanup_urls=args.cleanup_urls, nltk_tokenizer=args.nltk_tokenizer)

    model.save()
Example 10
    def retrieve_commits(self):
        repository.clone(self.repo_dir)

        if not db.is_old_version(repository.COMMITS_DB):
            db.download(repository.COMMITS_DB, support_files_too=True)

            for commit in repository.get_commits():
                pass

            rev_start = f"children({commit['node']})"
        else:
            rev_start = 0

        repository.download_commits(self.repo_dir, rev_start)

        logger.info("commit data extracted from repository")

        self.compress_file("data/commits.json")
        self.compress_file("data/commit_experiences.pickle")
Example 11
    def retrieve_commits(self, limit):
        repository.clone(self.repo_dir)

        if not db.is_old_version(repository.COMMITS_DB) and not limit:
            db.download(repository.COMMITS_DB, support_files_too=True)

            for commit in repository.get_commits():
                pass

            rev_start = f"children({commit['node']})"
        else:
            if limit:
                # Mercurial revset supports negative integers starting from tip
                rev_start = -limit
            else:
                rev_start = 0

        repository.download_commits(self.repo_dir, rev_start)

        logger.info("commit data extracted from repository")

        zstd_compress("data/commits.json")
        zstd_compress("data/commit_experiences.pickle")
Example 12
    def retrieve_bugs(self):
        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

        db.download_version(bugzilla.BUGS_DB)
        if not db.is_old_version(bugzilla.BUGS_DB):
            db.download(bugzilla.BUGS_DB)

        # Get IDs of bugs changed since last run.
        last_modified = db.last_modified(bugzilla.BUGS_DB)
        logger.info(
            f"Retrieving IDs of bugs modified since the last run on {last_modified}"
        )
        changed_ids = bugzilla.get_ids({
            "f1": "delta_ts",
            "o1": "greaterthaneq",
            "v1": last_modified.date()
        })
        logger.info(f"Retrieved {len(changed_ids)} IDs.")

        # Get IDs of bugs between (two years and six months ago) and (six months ago).
        six_months_ago = datetime.utcnow() - relativedelta(months=6)
        two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
        logger.info(
            f"Retrieving bug IDs from {two_years_and_six_months_ago} to {six_months_ago}"
        )
        timespan_ids = bugzilla.get_ids_between(two_years_and_six_months_ago,
                                                six_months_ago)
        logger.info(f"Retrieved {len(timespan_ids)} IDs.")

        # Get IDs of labelled bugs.
        labelled_bug_ids = labels.get_all_bug_ids()
        logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

        all_ids = set(timespan_ids + labelled_bug_ids)

        # We have to redownload bugs that were changed since the last download.
        # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled.
        bugzilla.delete_bugs(
            lambda bug: bug["id"] in changed_ids or bug["id"] not in all_ids)

        bugzilla.download_bugs(timespan_ids + labelled_bug_ids)

        # Try to re-download inconsistent bugs, up to three times.
        inconsistent_bugs = bugzilla.get_bugs()
        for i in range(3):
            # We look for inconsistencies in all bugs first, then, on following passes,
            # we only look for inconsistencies in bugs that were found to be inconsistent in the first pass
            inconsistent_bugs = bug_snapshot.get_inconsistencies(
                inconsistent_bugs)
            inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

            if len(inconsistent_bug_ids) == 0:
                break

            logger.info(
                f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
            )
            bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
            bugzilla.download_bugs(inconsistent_bug_ids)

        self.compress_file("data/bugs.json")
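The {"f1": "delta_ts", "o1": "greaterthaneq", "v1": ...} dict passed to bugzilla.get_ids mirrors Bugzilla's custom-search triplets (field / operator / value). As a hedged sketch of what such a query looks like against the Bugzilla REST API directly (not bugbug's get_ids implementation, and the endpoint details below are an assumption):

import requests

def ids_modified_since(date_str):
    # Custom search: field f1 (delta_ts) compared with operator o1
    # (greaterthaneq) against value v1; only bug IDs are requested back.
    params = {
        "f1": "delta_ts",
        "o1": "greaterthaneq",
        "v1": date_str,
        "include_fields": "id",
    }
    r = requests.get("https://bugzilla.mozilla.org/rest/bug", params=params)
    r.raise_for_status()
    return [bug["id"] for bug in r.json()["bugs"]]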
Example 13
    def find_bug_introducing_commits(self, bug_fixing_commits,
                                     commits_to_ignore, tokenized):
        if tokenized:
            db_path = TOKENIZED_BUG_INTRODUCING_COMMITS_DB
            repo_dir = self.tokenized_git_repo_dir
        else:
            db_path = BUG_INTRODUCING_COMMITS_DB
            repo_dir = self.git_repo_dir

        def git_to_mercurial(rev):
            if tokenized:
                return self.tokenized_git_to_mercurial[rev]
            else:
                return vcs_map.git_to_mercurial(rev)

        def mercurial_to_git(rev):
            if tokenized:
                return self.mercurial_to_tokenized_git[rev]
            else:
                return vcs_map.mercurial_to_git(rev)

        logger.info("Download previously found bug-introducing commits...")
        if db.is_old_version(db_path) or not db.exists(db_path):
            db.download(db_path, force=True)

        logger.info("Get previously found bug-introducing commits...")
        prev_bug_introducing_commits = list(db.read(db_path))
        prev_bug_introducing_commits_nodes = set(
            bug_introducing_commit["bug_fixing_rev"]
            for bug_introducing_commit in prev_bug_introducing_commits)
        logger.info(
            f"Already classified {len(prev_bug_introducing_commits)} commits..."
        )

        hashes_to_ignore = set(commit["rev"] for commit in commits_to_ignore)

        with open("git_hashes_to_ignore", "w") as f:
            f.writelines("{}\n".format(mercurial_to_git(commit["rev"]))
                         for commit in commits_to_ignore if not tokenized
                         or commit["rev"] in self.mercurial_to_tokenized_git)

        logger.info(f"{len(bug_fixing_commits)} commits to analyze")

        # Skip already found bug-introducing commits.
        bug_fixing_commits = [
            bug_fixing_commit for bug_fixing_commit in bug_fixing_commits if
            bug_fixing_commit["rev"] not in prev_bug_introducing_commits_nodes
        ]

        logger.info(
            f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones"
        )

        bug_fixing_commits = [
            bug_fixing_commit for bug_fixing_commit in bug_fixing_commits
            if bug_fixing_commit["rev"] not in hashes_to_ignore
        ]
        logger.info(
            f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list"
        )

        if tokenized:
            bug_fixing_commits = [
                bug_fixing_commit for bug_fixing_commit in bug_fixing_commits
                if bug_fixing_commit["rev"] in self.mercurial_to_tokenized_git
            ]
            logger.info(
                f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones with no git hash"
            )

        def _init(git_repo_dir):
            thread_local.git = GitRepository(git_repo_dir)

        def find_bic(bug_fixing_commit):
            logger.info("Analyzing {}...".format(bug_fixing_commit["rev"]))

            git_fix_revision = mercurial_to_git(bug_fixing_commit["rev"])

            commit = thread_local.git.get_commit(git_fix_revision)

            # Skip huge changes, we'll likely be wrong with them.
            if len(commit.modifications) > MAX_MODIFICATION_NUMBER:
                logger.info("Skipping {} as it is too big".format(
                    bug_fixing_commit["rev"]))
                return None

            bug_introducing_modifications = thread_local.git.get_commits_last_modified_lines(
                commit,
                hashes_to_ignore_path=os.path.realpath("git_hashes_to_ignore"))

            logger.info("Found {} for {}".format(bug_introducing_modifications,
                                                 bug_fixing_commit["rev"]))

            bug_introducing_commits = []
            for bug_introducing_hashes in bug_introducing_modifications.values():
                for bug_introducing_hash in bug_introducing_hashes:
                    try:
                        bug_introducing_commits.append({
                            "bug_fixing_rev": bug_fixing_commit["rev"],
                            "bug_introducing_rev": git_to_mercurial(bug_introducing_hash),
                        })
                    except Exception as e:
                        # Skip commits that are in git but not in mercurial, as they are too old (older than "Free the lizard").
                        if not str(e).startswith(
                                "Missing git commit in the VCS map"):
                            raise

            # Add an empty result, just so that we don't reanalyze this again.
            if len(bug_introducing_commits) == 0:
                bug_introducing_commits.append({
                    "bug_fixing_rev": bug_fixing_commit["rev"],
                    "bug_introducing_rev": "",
                })

            return bug_introducing_commits

        with concurrent.futures.ThreadPoolExecutor(initializer=_init,
                                                   initargs=(repo_dir, ),
                                                   max_workers=os.cpu_count() +
                                                   1) as executor:

            def results():
                num_analyzed = 0

                bug_fixing_commits_queue = bug_fixing_commits.copy()

                # Analyze up to 500 commits at a time, to avoid the task running out of time.
                while len(bug_fixing_commits_queue) != 0 and num_analyzed != 500:
                    bug_introducing_commit_futures = []
                    # Pop from the queue, so the loop terminates and the "done"
                    # flag below reflects what is actually left to analyze.
                    for _ in range(
                            min(500 - num_analyzed,
                                len(bug_fixing_commits_queue))):
                        bug_introducing_commit_futures.append(
                            executor.submit(find_bic,
                                            bug_fixing_commits_queue.pop()))

                    logger.info(
                        f"Analyzing a chunk of {len(bug_introducing_commit_futures)} commits"
                    )

                    for future in tqdm(
                            concurrent.futures.as_completed(
                                bug_introducing_commit_futures),
                            total=len(bug_introducing_commit_futures),
                    ):
                        result = future.result()
                        if result is not None:
                            num_analyzed += 1
                            yield from result

                with open("done", "w") as f:
                    f.write(
                        str(1 if len(bug_fixing_commits_queue) == 0 else 0))

            db.append(db_path, results())

        zstd_compress(db_path)
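The _init / thread_local pair above relies on ThreadPoolExecutor's initializer hook so that every worker thread gets its own GitRepository handle. A stripped-down, self-contained sketch of that pattern (with a dummy resource standing in for the repository):

import concurrent.futures
import threading

thread_local = threading.local()

def _init(name):
    # Runs once per worker thread; each thread gets its own resource.
    thread_local.resource = f"handle-for-{name}-{threading.get_ident()}"

def work(item):
    return item, thread_local.resource

with concurrent.futures.ThreadPoolExecutor(
    initializer=_init, initargs=("repo",), max_workers=4
) as executor:
    for item, handle in executor.map(work, range(8)):
        print(item, handle)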
Example 14
    def generate_test_scheduling_history(self):
        if not os.path.exists("push_data.json"):
            download_check_etag(PUSH_DATA_URL, "push_data.json.zst")
            zstd_decompress("push_data.json")
            assert os.path.exists(
                "push_data.json"), "Decompressed push data file exists"

        # Get the commits DB.
        if db.is_old_version(
                repository.COMMITS_DB) or not db.exists(repository.COMMITS_DB):
            db.download(repository.COMMITS_DB, force=True)

        HISTORY_DATE_START = datetime.now() - relativedelta(
            months=TRAINING_MONTHS)

        HISTORICAL_TIMESPAN = 56

        if not db.is_old_version(test_scheduling.TEST_SCHEDULING_DB):
            db.download(test_scheduling.TEST_SCHEDULING_DB,
                        support_files_too=True)

            for test_data in test_scheduling.get_test_scheduling_history():
                pass

            last_node = test_data["revs"][0]
        else:
            last_node = None

        past_failures = shelve.Shelf(
            LMDBDict("data/past_failures.lmdb"),
            protocol=pickle.HIGHEST_PROTOCOL,
            writeback=True,
        )

        push_num = past_failures["push_num"] if "push_num" in past_failures else 0

        def get_and_update_past_failures(type_, task, items, push_num,
                                         is_regression):
            values_total = []
            values_prev_7 = []
            values_prev_14 = []
            values_prev_28 = []
            values_prev_56 = []

            key = f"{type_}${task}$"

            for item in items:
                full_key = key + item

                if full_key not in past_failures:
                    cur = past_failures[full_key] = ExpQueue(
                        push_num, HISTORICAL_TIMESPAN + 1, 0)
                else:
                    cur = past_failures[full_key]

                value = cur[push_num]

                values_total.append(value)
                values_prev_7.append(value - cur[push_num - 7])
                values_prev_14.append(value - cur[push_num - 14])
                values_prev_28.append(value - cur[push_num - 28])
                values_prev_56.append(value - cur[push_num - 56])

                if is_regression:
                    cur[push_num] = value + 1

            return (
                sum(values_total),
                sum(values_prev_7),
                sum(values_prev_14),
                sum(values_prev_28),
                sum(values_prev_56),
            )

        def generate_data():
            nonlocal push_num
            saved_nodes = set()
            skipped_no_commits = 0
            skipped_too_big_commits = 0
            skipped_no_tasks = 0

            # We can start once we get to the last revision we added in the previous run.
            can_start = True if last_node is None else False

            commit_map = {}
            for commit_data in tqdm(repository.get_commits()):
                if not can_start:
                    if last_node == commit_data["node"]:
                        can_start = True

                    continue

                commit_map[commit_data["node"]] = commit_data

            with open("push_data.json", "r") as f:
                push_data = json.load(f)[1:]

            logger.info(f"push data nodes: {len(push_data)}")

            # In the last 28 pushes, we definitely run all possible tasks.
            all_tasks_set = set(
                sum((push_tasks for _, push_tasks, _, _ in push_data[-28:]),
                    []))
            # Filter tasks we don't need.
            all_tasks = filter_tasks(list(all_tasks_set), all_tasks_set)
            all_tasks_set = set(all_tasks)
            logger.info(
                f"{len(all_tasks_set)} tasks run in the last 28 pushes")

            # We can start once we get to the last revision we added in the previous run.
            can_start = True if last_node is None else False

            for i in tqdm(range(len(push_data))):
                (
                    revisions,
                    push_tasks,
                    possible_regressions,
                    likely_regressions,
                ) = push_data.pop(0)

                if not can_start:
                    if last_node == revisions[0]:
                        can_start = True

                    continue

                push_num += 1

                # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them.
                commits = tuple(
                    commit_map.pop(revision) for revision in revisions
                    if revision in commit_map)
                if len(commits) == 0:
                    skipped_no_commits += 1
                    continue

                merged_commits = commit_features.merge_commits(commits)

                # XXX: For now, skip commits which are too large.
                # In the future we can either:
                #  - Improve shelve perf and go back to consider all files;
                #  - Consider only files which appear with a given frequency, like the "files" feature in commit_features;
                #  - Keep a limit of number of files.
                if len(merged_commits["files"]) > 20:
                    skipped_too_big_commits += 1
                    continue

                # If we considered all_tasks, we'd generate a huge amount of data.
                # So we consider only the tasks which run in this push, and the possible and likely regressions
                # from this push.
                tasks_to_consider = list(
                    set(push_tasks + possible_regressions +
                        likely_regressions))
                tasks_to_consider = filter_tasks(tasks_to_consider,
                                                 all_tasks_set)

                if len(tasks_to_consider) == 0:
                    skipped_no_tasks += 1
                    continue

                # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!).
                if i % 250 == 0:
                    past_failures.sync()

                pushdate = dateutil.parser.parse(merged_commits["pushdate"])

                for task in tasks_to_consider:
                    is_regression = (task in possible_regressions
                                     or task in likely_regressions)

                    (
                        total_failures,
                        past_7_pushes_failures,
                        past_14_pushes_failures,
                        past_28_pushes_failures,
                        past_56_pushes_failures,
                    ) = get_and_update_past_failures("all", task, ["all"],
                                                     push_num, is_regression)

                    (
                        total_types_failures,
                        past_7_pushes_types_failures,
                        past_14_pushes_types_failures,
                        past_28_pushes_types_failures,
                        past_56_pushes_types_failures,
                    ) = get_and_update_past_failures("type", task,
                                                     merged_commits["types"],
                                                     push_num, is_regression)

                    (
                        total_files_failures,
                        past_7_pushes_files_failures,
                        past_14_pushes_files_failures,
                        past_28_pushes_files_failures,
                        past_56_pushes_files_failures,
                    ) = get_and_update_past_failures("file", task,
                                                     merged_commits["files"],
                                                     push_num, is_regression)

                    (
                        total_directories_failures,
                        past_7_pushes_directories_failures,
                        past_14_pushes_directories_failures,
                        past_28_pushes_directories_failures,
                        past_56_pushes_directories_failures,
                    ) = get_and_update_past_failures(
                        "directory",
                        task,
                        merged_commits["directories"],
                        push_num,
                        is_regression,
                    )

                    (
                        total_components_failures,
                        past_7_pushes_components_failures,
                        past_14_pushes_components_failures,
                        past_28_pushes_components_failures,
                        past_56_pushes_components_failures,
                    ) = get_and_update_past_failures(
                        "component",
                        task,
                        merged_commits["components"],
                        push_num,
                        is_regression,
                    )

                    if pushdate > HISTORY_DATE_START:
                        saved_nodes.add(i)

                        yield {
                            "revs": revisions,
                            "name": task,
                            "failures": total_failures,
                            "failures_past_7_pushes": past_7_pushes_failures,
                            "failures_past_14_pushes": past_14_pushes_failures,
                            "failures_past_28_pushes": past_28_pushes_failures,
                            "failures_past_56_pushes": past_56_pushes_failures,
                            "failures_in_types": total_types_failures,
                            "failures_past_7_pushes_in_types": past_7_pushes_types_failures,
                            "failures_past_14_pushes_in_types": past_14_pushes_types_failures,
                            "failures_past_28_pushes_in_types": past_28_pushes_types_failures,
                            "failures_past_56_pushes_in_types": past_56_pushes_types_failures,
                            "failures_in_files": total_files_failures,
                            "failures_past_7_pushes_in_files": past_7_pushes_files_failures,
                            "failures_past_14_pushes_in_files": past_14_pushes_files_failures,
                            "failures_past_28_pushes_in_files": past_28_pushes_files_failures,
                            "failures_past_56_pushes_in_files": past_56_pushes_files_failures,
                            "failures_in_directories": total_directories_failures,
                            "failures_past_7_pushes_in_directories": past_7_pushes_directories_failures,
                            "failures_past_14_pushes_in_directories": past_14_pushes_directories_failures,
                            "failures_past_28_pushes_in_directories": past_28_pushes_directories_failures,
                            "failures_past_56_pushes_in_directories": past_56_pushes_directories_failures,
                            "failures_in_components": total_components_failures,
                            "failures_past_7_pushes_in_components": past_7_pushes_components_failures,
                            "failures_past_14_pushes_in_components": past_14_pushes_components_failures,
                            "failures_past_28_pushes_in_components": past_28_pushes_components_failures,
                            "failures_past_56_pushes_in_components": past_56_pushes_components_failures,
                            "is_possible_regression": task in possible_regressions,
                            "is_likely_regression": task in likely_regressions,
                        }

            logger.info(f"saved push data nodes: {len(saved_nodes)}")
            logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
            logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
            logger.info(f"skipped {skipped_no_tasks} (no interesting tasks)")

        db.append(test_scheduling.TEST_SCHEDULING_DB, generate_data())

        zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

        past_failures["push_num"] = push_num
        past_failures.close()
        with open_tar_zst("data/past_failures.lmdb.tar.zst") as tar:
            tar.add("data/past_failures.lmdb")
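ExpQueue (from bugbug) is the bounded per-push counter structure behind get_and_update_past_failures; without reproducing it, the features above boil down to cumulative failure counts differenced over 7/14/28/56-push windows. A toy illustration with a plain list of cumulative counts (made-up numbers, not the ExpQueue API):

# Cumulative failures for one (type, task, item) key, indexed by push number.
cumulative = [0, 0, 1, 1, 2, 3, 3, 4, 5, 5]

def window_failures(push_num, window):
    past = max(push_num - window, 0)
    return cumulative[push_num] - cumulative[past]

push_num = 9
print(cumulative[push_num])          # total failures so far: 5
print(window_failures(push_num, 7))  # failures in the last 7 pushes: 5 - 1 = 4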
Example 15
    def retrieve_test_scheduling_history(self):
        os.makedirs("data", exist_ok=True)

        # Download previous cache.
        cache_path = os.path.abspath("data/adr_cache")
        if not os.path.exists(cache_path):
            try:
                download_check_etag(URL, "adr_cache.tar.xz")
                with tarfile.open("adr_cache.tar.xz", "r:xz") as tar:
                    tar.extractall()
                assert os.path.exists(
                    "data/adr_cache"), "Decompressed adr cache exists"
            except requests.exceptions.HTTPError:
                logger.info("The adr cache is not available yet")

        # Setup adr cache configuration.
        os.makedirs(os.path.expanduser("~/.config/adr"), exist_ok=True)
        with open(os.path.expanduser("~/.config/adr/config.toml"), "w") as f:
            f.write(f"""[adr.cache.stores]
file = {{ driver = "file", path = "{cache_path}" }}
            """)

        # Get the commits DB.
        if db.is_old_version(
                repository.COMMITS_DB) or not db.exists(repository.COMMITS_DB):
            db.download(repository.COMMITS_DB, force=True)

        # We'll use the past TRAINING_MONTHS months only for training the model,
        # but we use 3 months more than that to calculate the failure statistics.
        subprocess.run(
            [
                "run-adr",
                "ahal/ci-recipes",
                "recipe",
                "-o",
                os.path.abspath("push_data.json"),
                "-f",
                "json",
                "push_data",
                "--",
                "--from",
                f"today-{TRAINING_MONTHS + 3}month",
                "--to",
                "today-2day",
                "--branch",
                "autoland",
            ],
            check=True,
            # Redirect to /dev/null, as the logs are too big otherwise.
            stdout=subprocess.DEVNULL,
        )

        HISTORY_DATE_START = datetime.now() - relativedelta(
            months=TRAINING_MONTHS)

        with open("push_data.json", "r") as f:
            data = json.load(f)

        push_data = {}
        for row in data[1:]:
            # Revision -> (all tasks, possible regressions, likely regressions)
            push_data[row[0]] = (row[1], row[2], row[3])

        HISTORICAL_TIMESPAN = 56

        past_failures = {}

        def get_past_failures(task, push_num):
            if task not in past_failures:
                past_failures[task] = repository.exp_queue(
                    push_num, HISTORICAL_TIMESPAN + 1, 0)

            return past_failures[task][push_num]

        def generate_data():
            commits_with_data = set()
            saved_nodes = set()

            push_num = 0
            for commit_data in tqdm(repository.get_commits()):
                node = commit_data["node"]

                if node not in push_data:
                    continue

                commits_with_data.add(node)

                commit_push_data = push_data[node]

                for task in commit_push_data[0]:
                    if not any(task.startswith(j) for j in JOBS_TO_CONSIDER):
                        continue

                    total_failures = get_past_failures(task, push_num)
                    past_7_pushes_failures = total_failures - get_past_failures(
                        task, push_num - 7)
                    past_14_pushes_failures = total_failures - get_past_failures(
                        task, push_num - 14)
                    past_28_pushes_failures = total_failures - get_past_failures(
                        task, push_num - 28)
                    past_56_pushes_failures = total_failures - get_past_failures(
                        task, push_num - 56)

                    pushdate = dateutil.parser.parse(commit_data["pushdate"])
                    if pushdate > HISTORY_DATE_START:
                        saved_nodes.add(node)

                        yield {
                            "rev": node,
                            "name": task,
                            "failures": total_failures,
                            "failures_past_7_pushes": past_7_pushes_failures,
                            "failures_past_14_pushes": past_14_pushes_failures,
                            "failures_past_28_pushes": past_28_pushes_failures,
                            "failures_past_56_pushes": past_56_pushes_failures,
                            "is_possible_regression": task
                            in commit_push_data[1],
                            "is_likely_regression": task
                            in commit_push_data[2],
                        }

                    if task in commit_push_data[1] or task in commit_push_data[2]:
                        past_failures[task][push_num] = total_failures + 1

                push_num += 1

            logger.info(f"push data nodes: {len(push_data)}")

            logger.info(
                f"commits linked to push data: {len(commits_with_data)}")

            logger.info(f"saved push data nodes: {len(saved_nodes)}")

        db.write(test_scheduling.TEST_SCHEDULING_DB, generate_data())

        zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

        with tarfile.open("data/adr_cache.tar.xz", "w:xz") as tar:
            tar.add("data/adr_cache")
Example 16
    def find_bug_fixing_commits(self):
        logger.info("Downloading commits database...")
        if db.is_old_version(
                repository.COMMITS_DB) or not db.exists(repository.COMMITS_DB):
            db.download(repository.COMMITS_DB, force=True)

        logger.info("Downloading bugs database...")
        if db.is_old_version(
                bugzilla.BUGS_DB) or not db.exists(bugzilla.BUGS_DB):
            db.download(bugzilla.BUGS_DB, force=True)

        logger.info("Download previous classifications...")
        if db.is_old_version(
                BUG_FIXING_COMMITS_DB) or not db.exists(BUG_FIXING_COMMITS_DB):
            db.download(BUG_FIXING_COMMITS_DB, force=True)

        logger.info("Get previously classified commits...")
        prev_bug_fixing_commits = list(db.read(BUG_FIXING_COMMITS_DB))
        prev_bug_fixing_commits_nodes = set(
            bug_fixing_commit["rev"]
            for bug_fixing_commit in prev_bug_fixing_commits)
        logger.info(
            f"Already classified {len(prev_bug_fixing_commits)} commits...")

        # TODO: Switch to the pure Defect model, as it's better in this case.
        logger.info("Downloading defect/enhancement/task model...")
        download_model("defectenhancementtask")
        defect_model = DefectEnhancementTaskModel.load(
            "defectenhancementtaskmodel")

        logger.info("Downloading regression model...")
        download_model("regression")
        regression_model = RegressionModel.load("regressionmodel")

        start_date = datetime.now() - RELATIVE_START_DATE
        end_date = datetime.now() - RELATIVE_END_DATE
        logger.info(
            f"Gathering bug IDs associated to commits (since {start_date} and up to {end_date})..."
        )
        commit_map = defaultdict(list)
        for commit in repository.get_commits():
            if commit["node"] in prev_bug_fixing_commits_nodes:
                continue

            commit_date = dateutil.parser.parse(commit["pushdate"])
            if commit_date < start_date or commit_date > end_date:
                continue

            commit_map[commit["bug_id"]].append(commit["node"])

        logger.info(
            f"{sum(len(commit_list) for commit_list in commit_map.values())} commits found, {len(commit_map)} bugs linked to commits"
        )
        assert len(commit_map) > 0

        def get_relevant_bugs():
            return (bug for bug in bugzilla.get_bugs()
                    if bug["id"] in commit_map)

        bug_count = sum(1 for bug in get_relevant_bugs())
        logger.info(
            f"{bug_count} bugs in total, {len(commit_map) - bug_count} bugs linked to commits missing"
        )

        known_defect_labels = defect_model.get_labels()
        known_regression_labels = regression_model.get_labels()

        bug_fixing_commits = []

        def append_bug_fixing_commits(bug_id, type_):
            for commit in commit_map[bug_id]:
                bug_fixing_commits.append({"rev": commit, "type": type_})

        for bug in tqdm(get_relevant_bugs(), total=bug_count):
            # Ignore bugs which are not linked to the commits we care about.
            if bug["id"] not in commit_map:
                continue

            # If we know the label already, we don't need to apply the model.
            if (bug["id"] in known_regression_labels
                    and known_regression_labels[bug["id"]] == 1):
                append_bug_fixing_commits(bug["id"], "r")
                continue

            if bug["id"] in known_defect_labels:
                if known_defect_labels[bug["id"]] == "defect":
                    append_bug_fixing_commits(bug["id"], "d")
                else:
                    append_bug_fixing_commits(bug["id"], "e")
                continue

            if defect_model.classify(bug)[0] == "defect":
                if regression_model.classify(bug)[0] == 1:
                    append_bug_fixing_commits(bug["id"], "r")
                else:
                    append_bug_fixing_commits(bug["id"], "d")
            else:
                append_bug_fixing_commits(bug["id"], "e")

        db.append(BUG_FIXING_COMMITS_DB, bug_fixing_commits)
        zstd_compress(BUG_FIXING_COMMITS_DB)

        bug_fixing_commits = prev_bug_fixing_commits + bug_fixing_commits
        return [
            bug_fixing_commit for bug_fixing_commit in bug_fixing_commits
            if bug_fixing_commit["type"] in ["r", "d"]
        ]
Example 17
    def retrieve_bugs(self, limit=None):
        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

        if not db.is_old_version(bugzilla.BUGS_DB):
            db.download(bugzilla.BUGS_DB)

        # Get IDs of bugs changed since last run.
        last_modified = db.last_modified(bugzilla.BUGS_DB)
        logger.info(
            f"Retrieving IDs of bugs modified since the last run on {last_modified}"
        )
        changed_ids = bugzilla.get_ids({
            "f1": "delta_ts",
            "o1": "greaterthaneq",
            "v1": last_modified.date()
        })
        logger.info(f"Retrieved {len(changed_ids)} IDs.")

        # Get IDs of bugs between (two years and six months ago) and (six months ago).
        six_months_ago = datetime.utcnow() - relativedelta(months=6)
        two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
        logger.info(
            f"Retrieving bug IDs from {two_years_and_six_months_ago} to {six_months_ago}"
        )
        timespan_ids = bugzilla.get_ids_between(two_years_and_six_months_ago,
                                                six_months_ago)
        if limit:
            timespan_ids = timespan_ids[:limit]
        logger.info(f"Retrieved {len(timespan_ids)} IDs.")

        # Get IDs of labelled bugs.
        labelled_bug_ids = labels.get_all_bug_ids()
        if limit:
            labelled_bug_ids = labelled_bug_ids[:limit]
        logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

        # Get the commits DB, as we need it to get the bug IDs linked to recent commits.
        if db.is_old_version(
                repository.COMMITS_DB) or not db.exists(repository.COMMITS_DB):
            db.download(repository.COMMITS_DB, force=True)

        # Get IDs of bugs linked to commits (used for some commit-based models, e.g. backout and regressor).
        start_date = datetime.now() - relativedelta(years=2, months=6)
        commit_bug_ids = [
            commit["bug_id"] for commit in repository.get_commits()
            if commit["bug_id"]
            and dateutil.parser.parse(commit["pushdate"]) >= start_date
        ]
        if limit:
            commit_bug_ids = commit_bug_ids[-limit:]
        logger.info(
            f"{len(commit_bug_ids)} bugs linked to commits to download.")

        # Get IDs of bugs which caused regressions fixed by commits (useful for the regressor model).
        regressed_by_bug_ids = sum(
            [
                bug["regressed_by"]
                for bug in bugzilla.get_bugs() if bug["id"] in commit_bug_ids
            ],
            [],
        )
        logger.info(
            f"{len(regressed_by_bug_ids)} bugs which caused regressions fixed by commits."
        )

        all_ids = (timespan_ids + labelled_bug_ids + commit_bug_ids +
                   regressed_by_bug_ids)
        all_ids_set = set(all_ids)

        # We have to redownload bugs that were changed since the last download.
        # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled.
        bugzilla.delete_bugs(lambda bug: bug["id"] in changed_ids or bug["id"]
                             not in all_ids_set)

        bugzilla.download_bugs(all_ids)

        # Get regressed_by_bug_ids again (the set could have changed after downloading new bugs).
        regressed_by_bug_ids = sum(
            [
                bug["regressed_by"]
                for bug in bugzilla.get_bugs() if bug["id"] in commit_bug_ids
            ],
            [],
        )
        logger.info(
            f"{len(regressed_by_bug_ids)} bugs which caused regressions fixed by commits."
        )

        bugzilla.download_bugs(regressed_by_bug_ids)

        # Try to re-download inconsistent bugs, up to three times.
        inconsistent_bugs = bugzilla.get_bugs()
        for i in range(3):
            # We look for inconsistencies in all bugs first, then, on following passes,
            # we only look for inconsistencies in bugs that were found to be inconsistent in the first pass
            inconsistent_bugs = bug_snapshot.get_inconsistencies(
                inconsistent_bugs)
            inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

            if len(inconsistent_bug_ids) == 0:
                break

            logger.info(
                f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
            )
            bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
            bugzilla.download_bugs(inconsistent_bug_ids)

        zstd_compress("data/bugs.json")
Example 18
    def find_bug_introducing_commits(
        self, bug_fixing_commits, commits_to_ignore, tokenized
    ):
        if tokenized:
            db_path = TOKENIZED_BUG_INTRODUCING_COMMITS_DB
            repo_dir = self.tokenized_git_repo_dir
        else:
            db_path = BUG_INTRODUCING_COMMITS_DB
            repo_dir = self.git_repo_dir

        def git_to_mercurial(rev):
            if tokenized:
                return self.tokenized_git_to_mercurial[rev]
            else:
                return vcs_map.git_to_mercurial(rev)

        def mercurial_to_git(rev):
            if tokenized:
                return self.mercurial_to_tokenized_git[rev]
            else:
                return vcs_map.mercurial_to_git(rev)

        logger.info("Download previously found bug-introducing commits...")
        if db.is_old_version(db_path) or not db.exists(db_path):
            db.download(db_path, force=True)

        logger.info("Get previously found bug-introducing commits...")
        prev_bug_introducing_commits = list(db.read(db_path))
        prev_bug_introducing_commits_nodes = set(
            bug_introducing_commit["bug_fixing_rev"]
            for bug_introducing_commit in prev_bug_introducing_commits
        )
        logger.info(
            f"Already classified {len(prev_bug_introducing_commits)} commits..."
        )

        hashes_to_ignore = set(commit["rev"] for commit in commits_to_ignore)

        with open("git_hashes_to_ignore", "w") as f:
            f.writelines(
                "{}\n".format(mercurial_to_git(commit["rev"]))
                for commit in commits_to_ignore
                if not tokenized or commit["rev"] in self.mercurial_to_tokenized_git
            )

        logger.info(f"{len(bug_fixing_commits)} commits to analyze")

        # Skip already found bug-introducing commits.
        bug_fixing_commits = [
            bug_fixing_commit
            for bug_fixing_commit in bug_fixing_commits
            if bug_fixing_commit["rev"] not in prev_bug_introducing_commits_nodes
        ]

        logger.info(
            f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones"
        )

        bug_fixing_commits = [
            bug_fixing_commit
            for bug_fixing_commit in bug_fixing_commits
            if bug_fixing_commit["rev"] not in hashes_to_ignore
        ]
        logger.info(
            f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list"
        )

        if tokenized:
            bug_fixing_commits = [
                bug_fixing_commit
                for bug_fixing_commit in bug_fixing_commits
                if bug_fixing_commit["rev"] in self.mercurial_to_tokenized_git
            ]
            logger.info(
                f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones with no git hash"
            )

        # Analyze up to 500 commits at a time, to avoid the task running out of time.
        done = True
        if len(bug_fixing_commits) > 500:
            bug_fixing_commits = bug_fixing_commits[-500:]
            done = False

        with open("done", "w") as f:
            f.write(str(1 if done else 0))

        def _init(git_repo_dir):
            global GIT_REPO
            GIT_REPO = GitRepository(git_repo_dir)

        def find_bic(bug_fixing_commit):
            logger.info("Analyzing {}...".format(bug_fixing_commit["rev"]))

            git_fix_revision = mercurial_to_git(bug_fixing_commit["rev"])

            commit = GIT_REPO.get_commit(git_fix_revision)

            # Skip huge changes, we'll likely be wrong with them.
            if len(commit.modifications) > MAX_MODIFICATION_NUMBER:
                return [None]

            bug_introducing_modifications = GIT_REPO.get_commits_last_modified_lines(
                commit, hashes_to_ignore_path=os.path.realpath("git_hashes_to_ignore")
            )
            logger.info(
                "Found {} for {}".format(
                    bug_introducing_modifications, bug_fixing_commit["rev"]
                )
            )

            bug_introducing_commits = []
            for bug_introducing_hashes in bug_introducing_modifications.values():
                for bug_introducing_hash in bug_introducing_hashes:
                    try:
                        bug_introducing_commits.append(
                            {
                                "bug_fixing_rev": bug_fixing_commit["rev"],
                                "bug_introducing_rev": git_to_mercurial(
                                    bug_introducing_hash
                                ),
                            }
                        )
                    except Exception as e:
                        # Skip commits that are in git but not in mercurial, as they are too old (older than "Free the lizard").
                        if not str(e).startswith("Missing git commit in the VCS map"):
                            raise

            # Add an empty result, just so that we don't reanalyze this again.
            if len(bug_introducing_commits) == 0:
                bug_introducing_commits.append(
                    {
                        "bug_fixing_rev": bug_fixing_commit["rev"],
                        "bug_introducing_rev": "",
                    }
                )

            return bug_introducing_commits

        with concurrent.futures.ThreadPoolExecutor(
            initializer=_init, initargs=(repo_dir,), max_workers=os.cpu_count() + 1
        ) as executor:
            bug_introducing_commits = executor.map(find_bic, bug_fixing_commits)
            bug_introducing_commits = tqdm(
                bug_introducing_commits, total=len(bug_fixing_commits)
            )
            bug_introducing_commits = list(
                itertools.chain.from_iterable(bug_introducing_commits)
            )

        total_results_num = len(bug_introducing_commits)
        bug_introducing_commits = list(filter(None, bug_introducing_commits))
        logger.info(
            f"Skipped {total_results_num - len(bug_introducing_commits)} commits as they were too big"
        )

        db.append(db_path, bug_introducing_commits)
        compress_file(db_path)
Example 19
    def generate_test_scheduling_history(self):
        if not os.path.exists("push_data.json"):
            download_check_etag(PUSH_DATA_URL, "push_data.json.zst")
            zstd_decompress("push_data.json")
            assert os.path.exists(
                "push_data.json"), "Decompressed push data file exists"

        # Get the commits DB.
        if db.is_old_version(
                repository.COMMITS_DB) or not db.exists(repository.COMMITS_DB):
            db.download(repository.COMMITS_DB, force=True)

        HISTORY_DATE_START = datetime.now() - relativedelta(
            months=TRAINING_MONTHS)

        with open("push_data.json", "r") as f:
            data = json.load(f)

        push_data = {}
        for row in data[1:]:
            # Revision -> (all tasks, possible regressions, likely regressions)
            push_data[row[0]] = (row[1], row[2], row[3])

        logger.info(f"push data nodes: {len(push_data)}")

        # Maximum number of past pushes to look back on when computing failure counts.
        HISTORICAL_TIMESPAN = 56

        if not db.is_old_version(test_scheduling.TEST_SCHEDULING_DB):
            db.download(test_scheduling.TEST_SCHEDULING_DB,
                        support_files_too=True)

            # Iterate the existing DB to find the last revision we already
            # processed (assumes the downloaded DB is not empty).
            for test_data in test_scheduling.get_test_scheduling_history():
                pass

            last_node = test_data["rev"]
        else:
            last_node = None

        past_failures = shelve.open(
            "data/past_failures.shelve",
            protocol=pickle.HIGHEST_PROTOCOL,
            writeback=True,
        )

        push_num = past_failures.get("push_num", 0)

        def get_and_update_past_failures(type_, task, items, push_num,
                                         is_regression):
            values_total = []
            values_prev_7 = []
            values_prev_14 = []
            values_prev_28 = []
            values_prev_56 = []

            key = f"{type_}${task}$"

            for item in items:
                full_key = key + item

                if full_key not in past_failures:
                    cur = past_failures[full_key] = ExpQueue(
                        push_num, HISTORICAL_TIMESPAN + 1, 0)
                else:
                    cur = past_failures[full_key]

                value = cur[push_num]

                values_total.append(value)
                values_prev_7.append(value - cur[push_num - 7])
                values_prev_14.append(value - cur[push_num - 14])
                values_prev_28.append(value - cur[push_num - 28])
                values_prev_56.append(value - cur[push_num - 56])

                if is_regression:
                    cur[push_num] = value + 1

            return (
                sum(values_total),
                sum(values_prev_7),
                sum(values_prev_14),
                sum(values_prev_28),
                sum(values_prev_56),
            )

        def generate_data():
            nonlocal push_num
            commits_with_data = set()
            saved_nodes = set()

            # We can start once we get to the last revision we added in the previous run.
            can_start = True if last_node is None else False
            for commit_data in tqdm(repository.get_commits()):
                node = commit_data["node"]

                # Sync DB every 1000 commits, so we cleanup the shelve cache (we'd run OOM otherwise!).
                if len(commits_with_data) % 1000 == 0:
                    past_failures.sync()

                if node == last_node:
                    can_start = True
                    continue

                if not can_start:
                    continue

                if node not in push_data:
                    continue

                commits_with_data.add(node)

                commit_push_data = push_data[node]

                for task in commit_push_data[0]:
                    if not any(task.startswith(j) for j in JOBS_TO_CONSIDER):
                        continue

                    is_regression = (task in commit_push_data[1]
                                     or task in commit_push_data[2])

                    (
                        total_failures,
                        past_7_pushes_failures,
                        past_14_pushes_failures,
                        past_28_pushes_failures,
                        past_56_pushes_failures,
                    ) = get_and_update_past_failures("all", task, ["all"],
                                                     push_num, is_regression)

                    (
                        total_types_failures,
                        past_7_pushes_types_failures,
                        past_14_pushes_types_failures,
                        past_28_pushes_types_failures,
                        past_56_pushes_types_failures,
                    ) = get_and_update_past_failures("type", task,
                                                     commit_data["types"],
                                                     push_num, is_regression)

                    (
                        total_files_failures,
                        past_7_pushes_files_failures,
                        past_14_pushes_files_failures,
                        past_28_pushes_files_failures,
                        past_56_pushes_files_failures,
                    ) = get_and_update_past_failures("file", task,
                                                     commit_data["files"],
                                                     push_num, is_regression)

                    (
                        total_directories_failures,
                        past_7_pushes_directories_failures,
                        past_14_pushes_directories_failures,
                        past_28_pushes_directories_failures,
                        past_56_pushes_directories_failures,
                    ) = get_and_update_past_failures(
                        "directory",
                        task,
                        commit_data["directories"],
                        push_num,
                        is_regression,
                    )

                    (
                        total_components_failures,
                        past_7_pushes_components_failures,
                        past_14_pushes_components_failures,
                        past_28_pushes_components_failures,
                        past_56_pushes_components_failures,
                    ) = get_and_update_past_failures(
                        "component",
                        task,
                        commit_data["components"],
                        push_num,
                        is_regression,
                    )

                    pushdate = dateutil.parser.parse(commit_data["pushdate"])
                    if pushdate > HISTORY_DATE_START:
                        saved_nodes.add(node)

                        yield {
                            "rev": node,
                            "name": task,
                            "failures": total_failures,
                            "failures_past_7_pushes": past_7_pushes_failures,
                            "failures_past_14_pushes": past_14_pushes_failures,
                            "failures_past_28_pushes": past_28_pushes_failures,
                            "failures_past_56_pushes": past_56_pushes_failures,
                            "failures_in_types": total_types_failures,
                            "failures_past_7_pushes_in_types":
                            past_7_pushes_types_failures,
                            "failures_past_14_pushes_in_types":
                            past_14_pushes_types_failures,
                            "failures_past_28_pushes_in_types":
                            past_28_pushes_types_failures,
                            "failures_past_56_pushes_in_types":
                            past_56_pushes_types_failures,
                            "failures_in_files": total_files_failures,
                            "failures_past_7_pushes_in_files":
                            past_7_pushes_files_failures,
                            "failures_past_14_pushes_in_files":
                            past_14_pushes_files_failures,
                            "failures_past_28_pushes_in_files":
                            past_28_pushes_files_failures,
                            "failures_past_56_pushes_in_files":
                            past_56_pushes_files_failures,
                            "failures_in_directories":
                            total_directories_failures,
                            "failures_past_7_pushes_in_directories":
                            past_7_pushes_directories_failures,
                            "failures_past_14_pushes_in_directories":
                            past_14_pushes_directories_failures,
                            "failures_past_28_pushes_in_directories":
                            past_28_pushes_directories_failures,
                            "failures_past_56_pushes_in_directories":
                            past_56_pushes_directories_failures,
                            "failures_in_components":
                            total_components_failures,
                            "failures_past_7_pushes_in_components":
                            past_7_pushes_components_failures,
                            "failures_past_14_pushes_in_components":
                            past_14_pushes_components_failures,
                            "failures_past_28_pushes_in_components":
                            past_28_pushes_components_failures,
                            "failures_past_56_pushes_in_components":
                            past_56_pushes_components_failures,
                            "is_possible_regression": task
                            in commit_push_data[1],
                            "is_likely_regression": task
                            in commit_push_data[2],
                        }

                # We no longer need the push data for this node, we can free the memory.
                del push_data[node]

                push_num += 1

            logger.info(
                f"commits linked to push data: {len(commits_with_data)}")

            logger.info(f"saved push data nodes: {len(saved_nodes)}")

        db.append(test_scheduling.TEST_SCHEDULING_DB, generate_data())

        zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

        past_failures["push_num"] = push_num
        past_failures.close()
        zstd_compress("data/past_failures.shelve")
Ejemplo n.º 20
0
    def generate_test_scheduling_history(self):
        if not os.path.exists("push_data.json"):
            download_check_etag(PUSH_DATA_URL, "push_data.json.zst")
            zstd_decompress("push_data.json")
            assert os.path.exists(
                "push_data.json"
            ), "Decompressed push data file exists"

        # Get the commits DB.
        if db.is_old_version(repository.COMMITS_DB) or not db.exists(
            repository.COMMITS_DB
        ):
            db.download(repository.COMMITS_DB, force=True)

        HISTORY_DATE_START = datetime.now() - relativedelta(months=TRAINING_MONTHS)

        with open("push_data.json", "r") as f:
            data = json.load(f)

        push_data = {}
        for row in data[1:]:
            # Revision -> (all tasks, possible regressions, likely regressions)
            push_data[row[0]] = (row[1], row[2], row[3])

        HISTORICAL_TIMESPAN = 56

        if not db.is_old_version(test_scheduling.TEST_SCHEDULING_DB):
            db.download(test_scheduling.TEST_SCHEDULING_DB, support_files_too=True)

            for test_data in test_scheduling.get_test_scheduling_history():
                pass

            last_node = test_data["rev"]
        else:
            last_node = None

        try:
            with open("data/past_failures.pickle", "rb") as f:
                past_failures, push_num = pickle.load(f)
        except FileNotFoundError:
            past_failures = {}
            push_num = 0

        def get_and_update_past_failures(type_, task, items, push_num, is_regression):
            if type_ not in past_failures:
                past_failures[type_] = {}

            if task not in past_failures[type_]:
                past_failures[type_][task] = {}

            values_total = []
            values_prev_7 = []
            values_prev_14 = []
            values_prev_28 = []
            values_prev_56 = []

            for item in items:
                if item not in past_failures[type_][task]:
                    past_failures[type_][task][item] = ExpQueue(
                        push_num, HISTORICAL_TIMESPAN + 1, 0
                    )

                value = past_failures[type_][task][item][push_num]

                values_total.append(value)
                values_prev_7.append(
                    value - past_failures[type_][task][item][push_num - 7]
                )
                values_prev_14.append(
                    value - past_failures[type_][task][item][push_num - 14]
                )
                values_prev_28.append(
                    value - past_failures[type_][task][item][push_num - 28]
                )
                values_prev_56.append(
                    value - past_failures[type_][task][item][push_num - 56]
                )

                if is_regression:
                    past_failures[type_][task][item][push_num] = value + 1

            return (
                sum(values_total),
                sum(values_prev_7),
                sum(values_prev_14),
                sum(values_prev_28),
                sum(values_prev_56),
            )

        def generate_data():
            nonlocal push_num
            commits_with_data = set()
            saved_nodes = set()

            # We can start once we get to the last revision we added in the previous run.
            can_start = True if last_node is None else False
            for commit_data in tqdm(repository.get_commits()):
                node = commit_data["node"]

                if node == last_node:
                    can_start = True
                    continue

                if not can_start:
                    continue

                if node not in push_data:
                    continue

                commits_with_data.add(node)

                commit_push_data = push_data[node]

                for task in commit_push_data[0]:
                    if not any(task.startswith(j) for j in JOBS_TO_CONSIDER):
                        continue

                    is_regression = (
                        task in commit_push_data[1] or task in commit_push_data[2]
                    )

                    (
                        total_failures,
                        past_7_pushes_failures,
                        past_14_pushes_failures,
                        past_28_pushes_failures,
                        past_56_pushes_failures,
                    ) = get_and_update_past_failures(
                        "all", task, ["all"], push_num, is_regression
                    )

                    (
                        total_types_failures,
                        past_7_pushes_types_failures,
                        past_14_pushes_types_failures,
                        past_28_pushes_types_failures,
                        past_56_pushes_types_failures,
                    ) = get_and_update_past_failures(
                        "type", task, commit_data["types"], push_num, is_regression
                    )

                    (
                        total_files_failures,
                        past_7_pushes_files_failures,
                        past_14_pushes_files_failures,
                        past_28_pushes_files_failures,
                        past_56_pushes_files_failures,
                    ) = get_and_update_past_failures(
                        "file", task, commit_data["files"], push_num, is_regression
                    )

                    (
                        total_directories_failures,
                        past_7_pushes_directories_failures,
                        past_14_pushes_directories_failures,
                        past_28_pushes_directories_failures,
                        past_56_pushes_directories_failures,
                    ) = get_and_update_past_failures(
                        "directory",
                        task,
                        commit_data["directories"],
                        push_num,
                        is_regression,
                    )

                    (
                        total_components_failures,
                        past_7_pushes_components_failures,
                        past_14_pushes_components_failures,
                        past_28_pushes_components_failures,
                        past_56_pushes_components_failures,
                    ) = get_and_update_past_failures(
                        "component",
                        task,
                        commit_data["components"],
                        push_num,
                        is_regression,
                    )

                    pushdate = dateutil.parser.parse(commit_data["pushdate"])
                    if pushdate > HISTORY_DATE_START:
                        saved_nodes.add(node)

                        yield {
                            "rev": node,
                            "name": task,
                            "failures": total_failures,
                            "failures_past_7_pushes": past_7_pushes_failures,
                            "failures_past_14_pushes": past_14_pushes_failures,
                            "failures_past_28_pushes": past_28_pushes_failures,
                            "failures_past_56_pushes": past_56_pushes_failures,
                            "failures_in_types": total_types_failures,
                            "failures_past_7_pushes_in_types": past_7_pushes_types_failures,
                            "failures_past_14_pushes_in_types": past_14_pushes_types_failures,
                            "failures_past_28_pushes_in_types": past_28_pushes_types_failures,
                            "failures_past_56_pushes_in_types": past_56_pushes_types_failures,
                            "failures_in_files": total_files_failures,
                            "failures_past_7_pushes_in_files": past_7_pushes_files_failures,
                            "failures_past_14_pushes_in_files": past_14_pushes_files_failures,
                            "failures_past_28_pushes_in_files": past_28_pushes_files_failures,
                            "failures_past_56_pushes_in_files": past_56_pushes_files_failures,
                            "failures_in_directories": total_directories_failures,
                            "failures_past_7_pushes_in_directories": past_7_pushes_directories_failures,
                            "failures_past_14_pushes_in_directories": past_14_pushes_directories_failures,
                            "failures_past_28_pushes_in_directories": past_28_pushes_directories_failures,
                            "failures_past_56_pushes_in_directories": past_56_pushes_directories_failures,
                            "failures_in_components": total_components_failures,
                            "failures_past_7_pushes_in_components": past_7_pushes_components_failures,
                            "failures_past_14_pushes_in_components": past_14_pushes_components_failures,
                            "failures_past_28_pushes_in_components": past_28_pushes_components_failures,
                            "failures_past_56_pushes_in_components": past_56_pushes_components_failures,
                            "is_possible_regression": task in commit_push_data[1],
                            "is_likely_regression": task in commit_push_data[2],
                        }

                push_num += 1

            logger.info(f"push data nodes: {len(push_data)}")

            logger.info(f"commits linked to push data: {len(commits_with_data)}")

            logger.info(f"saved push data nodes: {len(saved_nodes)}")

        db.append(test_scheduling.TEST_SCHEDULING_DB, generate_data())

        zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

        with open("data/past_failures.pickle", "wb") as f:
            pickle.dump((past_failures, push_num), f, protocol=pickle.HIGHEST_PROTOCOL)

        zstd_compress("data/past_failures.pickle")
Ejemplo n.º 21
0
def find_bug_introducing_commits(cache_dir, git_repo_dir):
    mercurial_repo_dir = os.path.join(cache_dir, "mozilla-central")

    logger.info("Downloading Mercurial <-> git mapping file...")
    vcs_map.download_mapfile()

    logger.info(f"Cloning mercurial repository to {mercurial_repo_dir}...")
    repository.clone(mercurial_repo_dir)

    logger.info(f"Cloning git repository to {git_repo_dir}...")
    clone_gecko_dev(git_repo_dir)

    logger.info("Download previously found bug-introducing commits...")
    db.download_version(BUG_INTRODUCING_COMMITS_DB)
    if db.is_old_version(BUG_INTRODUCING_COMMITS_DB) or not os.path.exists(
        BUG_INTRODUCING_COMMITS_DB
    ):
        db.download(BUG_INTRODUCING_COMMITS_DB, force=True)

    logger.info("Get previously found bug-introducing commits...")
    prev_bug_introducing_commits = list(db.read(BUG_INTRODUCING_COMMITS_DB))
    prev_bug_introducing_commits_nodes = set(
        bug_introducing_commit["bug_fixing_mercurial_rev"]
        for bug_introducing_commit in prev_bug_introducing_commits
    )
    logger.info(f"Already classified {len(prev_bug_introducing_commits)} commits...")

    commits_to_ignore = get_commits_to_ignore(mercurial_repo_dir)

    git_hashes_to_ignore = set(commit["git_rev"] for commit in commits_to_ignore)

    with open("git_hashes_to_ignore", "w") as f:
        f.writelines(f"{git_hash}\n" for git_hash in git_hashes_to_ignore)

    bug_fixing_commits = find_bug_fixing_commits()

    logger.info(f"{len(bug_fixing_commits)} commits to analyze")

    # Skip already found bug-introducing commits.
    bug_fixing_commits = [
        bug_fixing_commit
        for bug_fixing_commit in bug_fixing_commits
        if bug_fixing_commit["mercurial_rev"] not in prev_bug_introducing_commits_nodes
    ]

    logger.info(
        f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones"
    )

    bug_fixing_commits = [
        bug_fixing_commit
        for bug_fixing_commit in bug_fixing_commits
        if bug_fixing_commit["git_rev"] not in git_hashes_to_ignore
    ]
    logger.info(
        f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list"
    )

    def _init(git_repo_dir):
        global GIT_REPO
        GIT_REPO = GitRepository(git_repo_dir)

    def find_bic(bug_fixing_commit):
        logger.info("Analyzing {}...".format(bug_fixing_commit["git_rev"]))

        commit = GIT_REPO.get_commit(bug_fixing_commit["git_rev"])

        # Skip huge changes, we'll likely be wrong with them.
        if len(commit.modifications) > MAX_MODIFICATION_NUMBER:
            return [None]

        bug_introducing_modifications = GIT_REPO.get_commits_last_modified_lines(
            commit, hashes_to_ignore_path=os.path.realpath("git_hashes_to_ignore")
        )
        logger.info(bug_introducing_modifications)

        bug_introducing_commits = []
        for bug_introducing_hashes in bug_introducing_modifications.values():
            for bug_introducing_hash in bug_introducing_hashes:
                bug_introducing_commits.append(
                    {
                        "bug_fixing_mercurial_rev": bug_fixing_commit["mercurial_rev"],
                        "bug_fixing_git_rev": bug_fixing_commit["git_rev"],
                        "bug_introducing_mercurial_rev": vcs_map.git_to_mercurial(
                            bug_introducing_hash
                        ),
                        "bug_introducing_git_rev": bug_introducing_hash,
                    }
                )

        # Add an empty result, just so that we don't reanalyze this again.
        if len(bug_introducing_commits) == 0:
            bug_introducing_commits.append(
                {
                    "bug_fixing_mercurial_rev": bug_fixing_commit["mercurial_rev"],
                    "bug_fixing_git_rev": bug_fixing_commit["git_rev"],
                    "bug_introducing_mercurial_rev": "",
                    "bug_introducing_git_rev": "",
                }
            )

        return bug_introducing_commits

    with concurrent.futures.ThreadPoolExecutor(
        initializer=_init, initargs=(git_repo_dir,), max_workers=os.cpu_count() + 1
    ) as executor:
        bug_introducing_commits = executor.map(find_bic, bug_fixing_commits)
        bug_introducing_commits = tqdm(
            bug_introducing_commits, total=len(bug_fixing_commits)
        )
        bug_introducing_commits = list(
            itertools.chain.from_iterable(bug_introducing_commits)
        )

    total_results_num = len(bug_introducing_commits)
    bug_introducing_commits = list(filter(None, bug_introducing_commits))
    logger.info(
        f"Skipped {total_results_num - len(bug_introducing_commits)} commits as they were too big"
    )

    db.append(BUG_INTRODUCING_COMMITS_DB, bug_introducing_commits)
    compress_file(BUG_INTRODUCING_COMMITS_DB)
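
At the end of this example, each bug-fixing commit maps to a list of results ([None] when the commit was skipped as too large); the lists are flattened with itertools.chain.from_iterable, and filter(None, ...) drops the None placeholders so the number of skipped commits can be logged. A minimal sketch of that flatten-and-filter step:

# Flatten per-commit result lists and drop the None markers used for
# commits that were skipped because they touched too many files.
import itertools

results_per_commit = [
    [{"fix": "a", "introducer": "x"}],
    [None],  # commit skipped because it touched too many files
    [{"fix": "c", "introducer": ""}],  # analyzed, but nothing found
]

flat = list(itertools.chain.from_iterable(results_per_commit))
kept = list(filter(None, flat))
print(f"Skipped {len(flat) - len(kept)} commits as they were too big")
print(kept)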