Example 1
    def find_bug_introducing_commits(self, bug_fixing_commits,
                                     commits_to_ignore, tokenized):
        if tokenized:
            db_path = TOKENIZED_BUG_INTRODUCING_COMMITS_DB
            repo_dir = self.tokenized_git_repo_dir
        else:
            db_path = BUG_INTRODUCING_COMMITS_DB
            repo_dir = self.git_repo_dir

        def git_to_mercurial(rev):
            if tokenized:
                return self.tokenized_git_to_mercurial[rev]
            else:
                return vcs_map.git_to_mercurial(rev)

        def mercurial_to_git(rev):
            if tokenized:
                return self.mercurial_to_tokenized_git[rev]
            else:
                return vcs_map.mercurial_to_git(rev)

        logger.info("Download previously found bug-introducing commits...")
        if db.is_old_version(db_path) or not db.exists(db_path):
            db.download(db_path, force=True)

        logger.info("Get previously found bug-introducing commits...")
        prev_bug_introducing_commits = list(db.read(db_path))
        prev_bug_introducing_commits_nodes = set(
            bug_introducing_commit["bug_fixing_rev"]
            for bug_introducing_commit in prev_bug_introducing_commits)
        logger.info(
            f"Already classified {len(prev_bug_introducing_commits)} commits..."
        )

        hashes_to_ignore = set(commit["rev"] for commit in commits_to_ignore)

        with open("git_hashes_to_ignore", "w") as f:
            f.writelines("{}\n".format(mercurial_to_git(commit["rev"]))
                         for commit in commits_to_ignore if not tokenized
                         or commit["rev"] in self.mercurial_to_tokenized_git)

        logger.info(f"{len(bug_fixing_commits)} commits to analyze")

        # Skip already found bug-introducing commits.
        bug_fixing_commits = [
            bug_fixing_commit for bug_fixing_commit in bug_fixing_commits if
            bug_fixing_commit["rev"] not in prev_bug_introducing_commits_nodes
        ]

        logger.info(
            f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones"
        )

        bug_fixing_commits = [
            bug_fixing_commit for bug_fixing_commit in bug_fixing_commits
            if bug_fixing_commit["rev"] not in hashes_to_ignore
        ]
        logger.info(
            f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list"
        )

        if tokenized:
            bug_fixing_commits = [
                bug_fixing_commit for bug_fixing_commit in bug_fixing_commits
                if bug_fixing_commit["rev"] in self.mercurial_to_tokenized_git
            ]
            logger.info(
                f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones with no git hash"
            )

        def _init(git_repo_dir):
            thread_local.git = GitRepository(git_repo_dir)

        def find_bic(bug_fixing_commit):
            logger.info("Analyzing {}...".format(bug_fixing_commit["rev"]))

            git_fix_revision = mercurial_to_git(bug_fixing_commit["rev"])

            commit = thread_local.git.get_commit(git_fix_revision)

            # Skip huge changes, we'll likely be wrong with them.
            if len(commit.modifications) > MAX_MODIFICATION_NUMBER:
                logger.info("Skipping {} as it is too big".format(
                    bug_fixing_commit["rev"]))
                return None

            bug_introducing_modifications = thread_local.git.get_commits_last_modified_lines(
                commit,
                hashes_to_ignore_path=os.path.realpath("git_hashes_to_ignore"))

            logger.info("Found {} for {}".format(bug_introducing_modifications,
                                                 bug_fixing_commit["rev"]))

            bug_introducing_commits = []
            for bug_introducing_hashes in bug_introducing_modifications.values(
            ):
                for bug_introducing_hash in bug_introducing_hashes:
                    try:
                        bug_introducing_commits.append({
                            "bug_fixing_rev":
                            bug_fixing_commit["rev"],
                            "bug_introducing_rev":
                            git_to_mercurial(bug_introducing_hash),
                        })
                    except Exception as e:
                        # Skip commits that are in git but not in mercurial, as they are too old (older than "Free the lizard").
                        if not str(e).startswith(
                                "Missing git commit in the VCS map"):
                            raise

            # Add an empty result, just so that we don't reanalyze this again.
            if len(bug_introducing_commits) == 0:
                bug_introducing_commits.append({
                    "bug_fixing_rev":
                    bug_fixing_commit["rev"],
                    "bug_introducing_rev":
                    "",
                })

            return bug_introducing_commits

        bug_fixing_commits_queue = bug_fixing_commits.copy()

        with concurrent.futures.ThreadPoolExecutor(initializer=_init,
                                                   initargs=(repo_dir, ),
                                                   max_workers=os.cpu_count() +
                                                   1) as executor:

            def results():
                num_analyzed = 0

                # Analyze up to 500 commits at a time, to avoid the task running out of time.
                while len(
                        bug_fixing_commits_queue) != 0 and num_analyzed != 500:
                    bug_introducing_commit_futures = []
                    for _ in range(
                            min(500 - num_analyzed,
                                len(bug_fixing_commits_queue))):
                        bug_introducing_commit_futures.append(
                            executor.submit(find_bic,
                                            bug_fixing_commits_queue.pop()))

                    logger.info(
                        f"Analyzing a chunk of {len(bug_introducing_commit_futures)} commits"
                    )

                    for future in tqdm(
                            concurrent.futures.as_completed(
                                bug_introducing_commit_futures),
                            total=len(bug_introducing_commit_futures),
                    ):
                        result = future.result()
                        if result is not None:
                            num_analyzed += 1
                            yield from result

            db.append(db_path, results())

        zstd_compress(db_path)

        return len(bug_fixing_commits_queue) == 0
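The method above submits work to a thread pool in bounded chunks and streams results through a generator into db.append. Below is a minimal, self-contained sketch of that submit-and-stream pattern; the analyze function, the chunk size, and the fact that the sketch drains the whole queue (instead of stopping after 500 items as the method above does) are illustrative assumptions, not part of the bugbug API.

import concurrent.futures
from typing import Iterator, List


def analyze(item: int) -> List[int]:
    # Hypothetical stand-in for find_bic: return zero or more results per input.
    return [item * item]


def stream_in_chunks(items: List[int], chunk_size: int = 500) -> Iterator[int]:
    # Submit at most chunk_size tasks at a time and yield results as they
    # complete, so a consumer (such as db.append above) can persist them
    # incrementally instead of waiting for the whole queue.
    queue = items.copy()
    with concurrent.futures.ThreadPoolExecutor() as executor:
        while queue:
            chunk = [queue.pop() for _ in range(min(chunk_size, len(queue)))]
            futures = [executor.submit(analyze, item) for item in chunk]
            for future in concurrent.futures.as_completed(futures):
                result = future.result()
                if result is not None:
                    yield from result


if __name__ == "__main__":
    print(sum(stream_in_chunks(list(range(10)))))  # 285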
Example 2
    def generate_test_scheduling_history(self, granularity):
        push_data_path = f"push_data_{granularity}.json"
        updated = download_check_etag(
            test_scheduling.PUSH_DATA_URL.format(granularity=granularity))
        if updated:
            zstd_decompress(push_data_path)
        assert os.path.exists(
            push_data_path), "Decompressed push data file exists"

        # Get the commits DB.
        assert db.download(repository.COMMITS_DB)

        HISTORY_DATE_START = datetime.now() - relativedelta(
            months=TRAINING_MONTHS[granularity])

        if granularity == "label":
            test_scheduling_db = test_scheduling.TEST_LABEL_SCHEDULING_DB
            past_failures_db = os.path.join(
                "data", test_scheduling.PAST_FAILURES_LABEL_DB)
        elif granularity == "group":
            test_scheduling_db = test_scheduling.TEST_GROUP_SCHEDULING_DB
            past_failures_db = os.path.join(
                "data", test_scheduling.PAST_FAILURES_GROUP_DB)
            touched_together_db = os.path.join(
                "data", test_scheduling.TOUCHED_TOGETHER_DB)

        db.download(test_scheduling_db, support_files_too=True)

        last_node = None
        for test_data in test_scheduling.get_test_scheduling_history(
                granularity):
            last_node = test_data["revs"][0]

        def generate_all_data():
            past_failures = test_scheduling.get_past_failures(granularity)

            push_num = past_failures[
                "push_num"] if "push_num" in past_failures else 0

            # We can start once we get to the last revision we added in the previous run.
            can_start = True if last_node is None else False

            commit_map = {}
            for commit_data in tqdm(repository.get_commits()):
                if not can_start:
                    if last_node == commit_data["node"]:
                        can_start = True

                    continue

                commit_map[commit_data["node"]] = commit_data

            with open(push_data_path, "r") as f:
                push_data = json.load(f)

            logger.info(f"push data nodes: {len(push_data)}")

            if granularity == "label":
                push_data = [(
                    revisions,
                    rename_tasks(push_tasks),
                    rename_tasks(possible_regressions),
                    rename_tasks(likely_regressions),
                ) for revisions, push_tasks, possible_regressions,
                             likely_regressions in push_data]

            # In the last 28 pushes, we definitely run all possible runnables.
            all_runnables_set = set(
                sum((push_runnables
                     for _, push_runnables, _, _ in push_data[-28:]), []))
            # Filter runnables we don't need.
            all_runnables = filter_runnables(list(all_runnables_set),
                                             all_runnables_set, granularity)
            all_runnables_set = set(all_runnables)
            logger.info(
                f"{len(all_runnables_set)} runnables run in the last 28 pushes"
            )

            # Store all runnables in the past_failures DB so it can be used in the evaluation phase.
            past_failures["all_runnables"] = all_runnables
            # XXX: Should we recreate the DB from scratch if the previous all_runnables are not the
            # same as the current ones?

            saved_nodes = set()
            skipped_no_commits = 0
            skipped_too_big_commits = 0
            skipped_no_runnables = 0

            # We can start once we get to the last revision we added in the previous run.
            can_start = True if last_node is None else False

            if granularity == "group":
                update_touched_together_gen = test_scheduling.update_touched_together(
                )
                next(update_touched_together_gen)

            for i in tqdm(range(len(push_data))):
                (
                    revisions,
                    push_runnables,
                    possible_regressions,
                    likely_regressions,
                ) = push_data.pop(0)

                if not can_start:
                    if last_node == revisions[0]:
                        can_start = True

                    continue

                push_num += 1

                # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them.
                commits = tuple(
                    commit_map.pop(revision) for revision in revisions
                    if revision in commit_map)
                if len(commits) == 0:
                    skipped_no_commits += 1
                    continue

                merged_commits = commit_features.merge_commits(commits)

                # XXX: For now, skip commits which are too large.
                # In the future we can either:
                #  - Improve shelve perf and go back to consider all files;
                #  - Consider only files which appear with a given frequency, like the "files" feature in commit_features;
                #  - Keep a limit of number of files.
                if len(merged_commits["files"]) > 50:
                    skipped_too_big_commits += 1
                    continue

                # If we considered all_runnables, we'd generate a huge amount of data.
                # So we consider only the runnables which run in this push, and the possible and likely regressions
                # from this push.
                runnables_to_consider = list(
                    set(push_runnables + possible_regressions +
                        likely_regressions))
                runnables_to_consider = filter_runnables(
                    runnables_to_consider, all_runnables_set, granularity)

                if len(runnables_to_consider) == 0:
                    skipped_no_runnables += 1
                    continue

                # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!).
                if i % 250 == 0:
                    past_failures.sync()

                pushdate = dateutil.parser.parse(merged_commits["pushdate"])

                if granularity == "group":
                    update_touched_together_gen.send(commits[0]["node"])

                for data in test_scheduling.generate_data(
                        past_failures,
                        merged_commits,
                        push_num,
                        runnables_to_consider,
                        possible_regressions,
                        likely_regressions,
                ):
                    if pushdate > HISTORY_DATE_START:
                        saved_nodes.add(i)
                        data["revs"] = revisions
                        yield data

            if granularity == "group":
                try:
                    update_touched_together_gen.send(None)
                except StopIteration:
                    pass

            logger.info(f"saved push data nodes: {len(saved_nodes)}")
            logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
            logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
            logger.info(
                f"skipped {skipped_no_runnables} (no interesting runnables)")

            past_failures["push_num"] = push_num
            past_failures.close()

        db.append(test_scheduling_db, generate_all_data())

        zstd_compress(test_scheduling_db)

        with open_tar_zst(past_failures_db) as tar:
            tar.add(past_failures_db[:-len(".tar.zst")])

        if granularity == "group":
            with open_tar_zst(touched_together_db) as tar:
                tar.add(touched_together_db[:-len(".tar.zst")])
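Both loops above resume incremental generation by skipping records until the last revision stored in the previous run (last_node) is seen. A minimal stand-alone sketch of that resume pattern, with hypothetical record values:

def resume_after(records, last_node):
    # Yield nothing until last_node is encountered, then yield everything after it.
    # If last_node is None (first run), yield everything.
    can_start = last_node is None
    for record in records:
        if not can_start:
            if record == last_node:
                can_start = True
            continue
        yield record


assert list(resume_after(["a", "b", "c", "d"], "b")) == ["c", "d"]
assert list(resume_after(["a", "b"], None)) == ["a", "b"]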
Example 3
    def generate_push_data(self, runnable):
        # We keep in the cache the fact that we failed to analyze a push for 10
        # days, so if we re-run often we don't retry the same pushes many times.
        MISSING_CACHE_RETENTION = 10 * 24 * 60

        # We'll use the past TRAINING_MONTHS months only for training the model,
        # but we use half TRAINING_MONTHS months more than that to calculate the
        # failure statistics.
        from_months = TRAINING_MONTHS[runnable] + math.floor(
            TRAINING_MONTHS[runnable] / 2)

        pushes = mozci.push.make_push_objects(
            from_date=f"today-{from_months}month",
            to_date="today-3day",
            branch="autoland",
        )

        start_time = time.monotonic()

        num_cached = 0

        push_data = []

        def cache_key(push):
            return f"push_data.{runnable}.{push.rev}"

        # Regenerating a large amount of data when we update the mozci regression detection
        # algorithm is currently pretty slow, so we only regenerate 1000 pushes whenever we
        # run.
        to_regenerate = set()
        """for push in pushes[::-1]:
            cached = adr.config.cache.get(cache_key(push))
            if not cached:
                continue

            value, mozci_version = cached
            if mozci_version != MOZCI_VERSION and len(to_regenerate) < 1000:
                to_regenerate.add(value[0][0])"""

        for push in tqdm(pushes):
            key = cache_key(push)

            if adr.config.cache.has(key) and push.revs[0] not in to_regenerate:
                num_cached += 1
                cached = adr.config.cache.get(key)
                if cached:
                    value, mozci_version = cached
                    push_data.append(value)
            else:
                logger.info(f"Analyzing {push.rev} at the {runnable} level...")

                try:
                    if runnable == "label":
                        runnables = push.task_labels
                    elif runnable == "group":
                        runnables = push.group_summaries.keys()

                    value = [
                        push.revs,
                        list(runnables),
                        list(push.get_possible_regressions(runnable)),
                        list(push.get_likely_regressions(runnable)),
                    ]
                    push_data.append(value)
                    adr.config.cache.forever(key, (value, MOZCI_VERSION))
                except adr.errors.MissingDataError:
                    logger.warning(
                        f"Tasks for push {push.rev} can't be found on ActiveData"
                    )
                    adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)
                except Exception:
                    traceback.print_exc()
                    adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)

            if time.monotonic() - start_time >= 10800:
                self.upload_adr_cache()
                start_time = time.monotonic()

        logger.info(
            f"{num_cached} pushes were already cached out of {len(pushes)}")

        with open(f"push_data_{runnable}.json", "w") as f:
            json.dump(push_data, f)

        zstd_compress(f"push_data_{runnable}.json")
    def generate_push_data(self, granularity: str, training_months: int,
                           reretrieve: int) -> None:
        # We'll use the past training_months months only for training the model,
        # but we use half training_months months more than that to calculate the
        # failure statistics.
        from_months = training_months + math.floor(training_months / 2)

        # We use the actual date instead of 'today-X' aliases to avoid adr caching
        # this query.
        from_date = datetime.utcnow() - relativedelta(months=from_months)
        to_date = datetime.utcnow() - relativedelta(days=3)

        pushes = mozci.push.make_push_objects(
            from_date=from_date.strftime("%Y-%m-%d"),
            to_date=to_date.strftime("%Y-%m-%d"),
            branch="autoland",
        )

        if granularity == "label":
            push_data_db = test_scheduling.PUSH_DATA_LABEL_DB
        elif granularity == "group":
            push_data_db = test_scheduling.PUSH_DATA_GROUP_DB
        elif granularity == "config_group":
            push_data_db = test_scheduling.PUSH_DATA_CONFIG_GROUP_DB

        def cache_key(push: mozci.push.Push) -> str:
            return f"push_data.{granularity}.{push.rev}"

        def generate(
            futures: List[concurrent.futures.Future],
        ) -> Generator[PushResult, None, None]:
            nonlocal reretrieve
            num_cached = 0
            num_pushes = len(pushes)

            for _ in tqdm(range(num_pushes)):
                push = pushes.pop(0)
                cached = futures.pop(0).result()

                semaphore.release()

                # Regenerating a large amount of data when we update the mozci regression detection
                # algorithm is currently pretty slow, so we only regenerate a subset of pushes whenever we
                # run.
                if cached:
                    value, mozci_version = cached

                    # Regenerate results which were generated with an older version of mozci.
                    if reretrieve > 0 and mozci_version != MOZCI_VERSION:
                        cached = None
                        reretrieve -= 1

                    # Regenerate results which don't contain the fix revision.
                    elif len(value) != 5:
                        cached = None

                if cached:
                    num_cached += 1
                    value, mozci_version = cached
                    assert len(value) == 5
                    yield value
                else:
                    logger.info(
                        f"Analyzing {push.rev} at the {granularity} level...")

                    key = cache_key(push)

                    try:
                        if granularity == "label":
                            runnables = push.task_labels
                        elif granularity == "group":
                            runnables = push.group_summaries.keys()
                        elif granularity == "config_group":
                            runnables = push.config_group_summaries.keys()

                        value = (
                            tuple(push.revs),
                            push.backedoutby or push.bustage_fixed_by,
                            tuple(runnables),
                            tuple(push.get_possible_regressions(granularity)),
                            tuple(push.get_likely_regressions(granularity)),
                        )
                        adr.config.cache.put(
                            key,
                            (value, MOZCI_VERSION),
                            adr.config["cache"]["retention"],
                        )
                        assert len(value) == 5
                        yield value
                    except adr.errors.MissingDataError:
                        logger.warning(
                            f"Tasks for push {push.rev} can't be found on ActiveData"
                        )
                    except Exception:
                        traceback.print_exc()

            logger.info(
                f"{num_cached} pushes were already cached out of {num_pushes}")

        semaphore = threading.BoundedSemaphore(256)

        def retrieve_from_cache(push):
            semaphore.acquire()
            return adr.config.cache.get(cache_key(push))

        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(retrieve_from_cache, push) for push in pushes
            ]

            try:
                db.write(push_data_db, generate(futures))
            except Exception:
                for f in futures:
                    f.cancel()

                    try:
                        semaphore.release()
                    except ValueError:
                        continue

                raise

        zstd_compress(push_data_db)
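The retrieval above bounds how far the prefetching threads can run ahead of the consumer: each cache lookup takes a semaphore permit before running, and the consumer returns a permit only after it has taken the corresponding result. A small self-contained sketch of that back-pressure pattern, with a hypothetical fetch function and deliberately tiny sizes (permits greater than the worker count, as with the 256 permits in the code above):

import concurrent.futures
import threading

semaphore = threading.BoundedSemaphore(4)  # at most 4 unconsumed fetches in flight


def fetch(key):
    semaphore.acquire()
    return f"value-for-{key}"


keys = [f"push-{i}" for i in range(10)]

with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
    futures = [executor.submit(fetch, key) for key in keys]
    for future in futures:
        value = future.result()
        semaphore.release()  # let another pending fetch start
        print(value)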
    def generate_push_data(self, runnable):
        # We keep in the cache the fact that we failed to analyze a push for 10
        # days, so if we re-run often we don't retry the same pushes many times.
        MISSING_CACHE_RETENTION = 10 * 24 * 60

        # We'll use the past TRAINING_MONTHS months only for training the model,
        # but we use half TRAINING_MONTHS months more than that to calculate the
        # failure statistics.
        from_months = TRAINING_MONTHS[runnable] + math.floor(
            TRAINING_MONTHS[runnable] / 2)

        # We use the actual date instead of 'today-X' aliases to avoid adr caching
        # this query.
        from_date = datetime.utcnow() - relativedelta(months=from_months)
        to_date = datetime.utcnow() - relativedelta(days=3)

        pushes = mozci.push.make_push_objects(
            from_date=from_date.strftime("%Y-%m-%d"),
            to_date=to_date.strftime("%Y-%m-%d"),
            branch="autoland",
        )

        num_cached = 0

        push_data = []

        def cache_key(push):
            return f"push_data.{runnable}.{push.rev}"

        # Regenerating a large amount of data when we update the mozci regression detection
        # algorithm is currently pretty slow, so we only regenerate 1000 pushes whenever we
        # run.
        to_regenerate = set()
        """for push in pushes[::-1]:
            cached = adr.config.cache.get(cache_key(push))
            if not cached:
                continue

            value, mozci_version = cached
            if mozci_version != MOZCI_VERSION and len(to_regenerate) < 1000:
                to_regenerate.add(value[0][0])"""

        def periodically_upload_adr_cache():
            start_time = time.monotonic()
            while not upload_thread_stop.is_set():
                if time.monotonic() - start_time >= 10800:
                    self.upload_adr_cache()
                    start_time = time.monotonic()

                upload_thread_stop.wait(timeout=7)

        upload_thread = threading.Thread(target=periodically_upload_adr_cache)
        upload_thread_stop = threading.Event()
        upload_thread.start()

        s3_store = adr.util.cache_stores.S3Store({
            "bucket": "communitytc-bugbug",
            "prefix": "data/adr_cache/",
        })

        s3_store.set_serializer(CompressedPickleSerializer())

        for push in tqdm(pushes):
            key = cache_key(push)

            if adr.config.cache.has(key) and push.revs[0] not in to_regenerate:
                num_cached += 1
                cached = adr.config.cache.get(key)
                if cached:
                    s3_store.put(key, cached, adr.config["cache"]["retention"])
                    value, mozci_version = cached
                    push_data.append(value)
            else:
                logger.info(f"Analyzing {push.rev} at the {runnable} level...")

                try:
                    if runnable == "label":
                        runnables = push.task_labels
                    elif runnable == "group":
                        runnables = push.group_summaries.keys()

                    value = [
                        push.revs,
                        list(runnables),
                        list(push.get_possible_regressions(runnable)),
                        list(push.get_likely_regressions(runnable)),
                    ]
                    push_data.append(value)
                    adr.config.cache.put(key, (value, MOZCI_VERSION),
                                         adr.config["cache"]["retention"])
                    s3_store.put(key, (value, MOZCI_VERSION),
                                 adr.config["cache"]["retention"])
                except adr.errors.MissingDataError:
                    logger.warning(
                        f"Tasks for push {push.rev} can't be found on ActiveData"
                    )
                    adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)
                except Exception:
                    traceback.print_exc()
                    adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)

        upload_thread_stop.set()
        upload_thread.join()

        logger.info(
            f"{num_cached} pushes were already cached out of {len(pushes)}")

        with open(f"push_data_{runnable}.json", "w") as f:
            json.dump(push_data, f)

        zstd_compress(f"push_data_{runnable}.json")
Example 6
    def find_bug_introducing_commits(
        self, bug_fixing_commits, commits_to_ignore, tokenized
    ):
        if tokenized:
            db_path = TOKENIZED_BUG_INTRODUCING_COMMITS_DB
            repo_dir = self.tokenized_git_repo_dir
        else:
            db_path = BUG_INTRODUCING_COMMITS_DB
            repo_dir = self.git_repo_dir

        def git_to_mercurial(rev):
            if tokenized:
                return self.tokenized_git_to_mercurial[rev]
            else:
                return vcs_map.git_to_mercurial(rev)

        def mercurial_to_git(rev):
            if tokenized:
                return self.mercurial_to_tokenized_git[rev]
            else:
                return vcs_map.mercurial_to_git(rev)

        logger.info("Download previously found bug-introducing commits...")
        if db.is_old_version(db_path) or not db.exists(db_path):
            db.download(db_path, force=True)

        logger.info("Get previously found bug-introducing commits...")
        prev_bug_introducing_commits = list(db.read(db_path))
        prev_bug_introducing_commits_nodes = set(
            bug_introducing_commit["bug_fixing_rev"]
            for bug_introducing_commit in prev_bug_introducing_commits
        )
        logger.info(
            f"Already classified {len(prev_bug_introducing_commits)} commits..."
        )

        hashes_to_ignore = set(commit["rev"] for commit in commits_to_ignore)

        with open("git_hashes_to_ignore", "w") as f:
            f.writelines(
                "{}\n".format(mercurial_to_git(commit["rev"]))
                for commit in commits_to_ignore
                if not tokenized or commit["rev"] in self.mercurial_to_tokenized_git
            )

        logger.info(f"{len(bug_fixing_commits)} commits to analyze")

        # Skip already found bug-introducing commits.
        bug_fixing_commits = [
            bug_fixing_commit
            for bug_fixing_commit in bug_fixing_commits
            if bug_fixing_commit["rev"] not in prev_bug_introducing_commits_nodes
        ]

        logger.info(
            f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones"
        )

        bug_fixing_commits = [
            bug_fixing_commit
            for bug_fixing_commit in bug_fixing_commits
            if bug_fixing_commit["rev"] not in hashes_to_ignore
        ]
        logger.info(
            f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list"
        )

        if tokenized:
            bug_fixing_commits = [
                bug_fixing_commit
                for bug_fixing_commit in bug_fixing_commits
                if bug_fixing_commit["rev"] in self.mercurial_to_tokenized_git
            ]
            logger.info(
                f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones with no git hash"
            )

        # Analyze up to 500 commits at a time, to avoid the task running out of time.
        done = True
        if len(bug_fixing_commits) > 500:
            bug_fixing_commits = bug_fixing_commits[-500:]
            done = False

        with open("done", "w") as f:
            f.write(str(1 if done else 0))

        def _init(git_repo_dir):
            global GIT_REPO
            GIT_REPO = GitRepository(git_repo_dir)

        def find_bic(bug_fixing_commit):
            logger.info("Analyzing {}...".format(bug_fixing_commit["rev"]))

            git_fix_revision = mercurial_to_git(bug_fixing_commit["rev"])

            commit = GIT_REPO.get_commit(git_fix_revision)

            # Skip huge changes, we'll likely be wrong with them.
            if len(commit.modifications) > MAX_MODIFICATION_NUMBER:
                return [None]

            bug_introducing_modifications = GIT_REPO.get_commits_last_modified_lines(
                commit, hashes_to_ignore_path=os.path.realpath("git_hashes_to_ignore")
            )
            logger.info(
                "Found {} for {}".format(
                    bug_introducing_modifications, bug_fixing_commit["rev"]
                )
            )

            bug_introducing_commits = []
            for bug_introducing_hashes in bug_introducing_modifications.values():
                for bug_introducing_hash in bug_introducing_hashes:
                    try:
                        bug_introducing_commits.append(
                            {
                                "bug_fixing_rev": bug_fixing_commit["rev"],
                                "bug_introducing_rev": git_to_mercurial(
                                    bug_introducing_hash
                                ),
                            }
                        )
                    except Exception as e:
                        # Skip commits that are in git but not in mercurial, as they are too old (older than "Free the lizard").
                        if not str(e).startswith("Missing git commit in the VCS map"):
                            raise

            # Add an empty result, just so that we don't reanalyze this again.
            if len(bug_introducing_commits) == 0:
                bug_introducing_commits.append(
                    {
                        "bug_fixing_rev": bug_fixing_commit["rev"],
                        "bug_introducing_rev": "",
                    }
                )

            return bug_introducing_commits

        with concurrent.futures.ThreadPoolExecutor(
            initializer=_init, initargs=(repo_dir,), max_workers=os.cpu_count() + 1
        ) as executor:
            bug_introducing_commits = executor.map(find_bic, bug_fixing_commits)
            bug_introducing_commits = tqdm(
                bug_introducing_commits, total=len(bug_fixing_commits)
            )
            bug_introducing_commits = list(
                itertools.chain.from_iterable(bug_introducing_commits)
            )

        total_results_num = len(bug_introducing_commits)
        bug_introducing_commits = list(filter(None, bug_introducing_commits))
        logger.info(
            f"Skipped {total_results_num - len(bug_introducing_commits)} commits as they were too big"
        )

        db.append(db_path, bug_introducing_commits)
        zstd_compress(db_path)
Example 7
    def compress_and_upload():
        zstd_compress(db_path)
        db.upload(db_path)
    def go(self):
        logger.info(
            "Generate map of bug ID -> bug data for all bugs which were defects"
        )
        bug_fixing_commits = list(db.read(BUG_FIXING_COMMITS_DB))

        bug_fixing_commits_nodes = set(
            bug_fixing_commit["rev"]
            for bug_fixing_commit in bug_fixing_commits
            if bug_fixing_commit["type"] in ("d", "r")
        )

        all_bug_ids = set(
            commit["bug_id"]
            for commit in repository.get_commits()
            if commit["node"] in bug_fixing_commits_nodes
        )

        bug_map = {}

        for bug in bugzilla.get_bugs():
            if bug["id"] not in all_bug_ids:
                continue

            bug_map[bug["id"]] = bug

        logger.info(
            "Generate a map from function to the three last bugs which were fixed by touching that function"
        )

        past_bugs_by_function = {}

        for commit in tqdm(repository.get_commits()):
            if commit["node"] not in bug_fixing_commits_nodes:
                continue

            if commit["bug_id"] not in bug_map:
                continue

            bug = bug_map[commit["bug_id"]]

            bug_str = "Bug {} - {}".format(bug["id"], bug["summary"])

            for path, f_group in commit["functions"].items():
                if path not in past_bugs_by_function:
                    past_bugs_by_function[path] = {}

                for f in f_group:
                    if f[0] not in past_bugs_by_function[path]:
                        bugs_deque = deque(maxlen=3)
                    else:
                        bugs_deque = past_bugs_by_function[path][f[0]]["bugs"]

                    bugs_deque.append(bug_str)

                    past_bugs_by_function[path][f[0]] = {
                        "start": f[1],
                        "end": f[2],
                        "bugs": bugs_deque,
                    }

        with open("data/past_bugs_by_function.pickle", "wb") as f:
            pickle.dump(past_bugs_by_function, f, protocol=pickle.HIGHEST_PROTOCOL)
        zstd_compress("data/past_bugs_by_function.pickle")
    def generate_test_scheduling_history(self):
        updated = download_check_etag(PUSH_DATA_URL)
        if updated:
            zstd_decompress("push_data.json")
        assert os.path.exists(
            "push_data.json"), "Decompressed push data file exists"

        # Get the commits DB.
        assert db.download(repository.COMMITS_DB)

        HISTORY_DATE_START = datetime.now() - relativedelta(
            months=TRAINING_MONTHS)

        db.download(test_scheduling.TEST_SCHEDULING_DB, support_files_too=True)

        last_node = None
        for test_data in test_scheduling.get_test_scheduling_history():
            last_node = test_data["revs"][0]

        def generate_all_data():
            past_failures = test_scheduling.get_past_failures()

            push_num = past_failures[
                "push_num"] if "push_num" in past_failures else 0

            # We can start once we get to the last revision we added in the previous run.
            can_start = True if last_node is None else False

            commit_map = {}
            for commit_data in tqdm(repository.get_commits()):
                if not can_start:
                    if last_node == commit_data["node"]:
                        can_start = True

                    continue

                commit_map[commit_data["node"]] = commit_data

            with open("push_data.json", "r") as f:
                push_data = json.load(f)[1:]

            logger.info(f"push data nodes: {len(push_data)}")

            # In the last 28 pushes, we definitely run all possible tasks.
            all_tasks_set = set(
                sum((push_tasks for _, push_tasks, _, _ in push_data[-28:]),
                    []))
            # Filter tasks we don't need.
            all_tasks = filter_tasks(list(all_tasks_set), all_tasks_set)
            all_tasks_set = set(all_tasks)
            logger.info(
                f"{len(all_tasks_set)} tasks run in the last 28 pushes")

            # Store all tasks in the past_failures DB so it can be used in the evaluation phase.
            past_failures["all_tasks"] = all_tasks
            # XXX: Should we recreate the DB from scratch if the previous all_tasks are not the
            # same as the current ones?

            saved_nodes = set()
            skipped_no_commits = 0
            skipped_too_big_commits = 0
            skipped_no_tasks = 0

            # We can start once we get to the last revision we added in the previous run.
            can_start = True if last_node is None else False

            for i in tqdm(range(len(push_data))):
                (
                    revisions,
                    push_tasks,
                    possible_regressions,
                    likely_regressions,
                ) = push_data.pop(0)

                if not can_start:
                    if last_node == revisions[0]:
                        can_start = True

                    continue

                push_num += 1

                # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them.
                commits = tuple(
                    commit_map.pop(revision) for revision in revisions
                    if revision in commit_map)
                if len(commits) == 0:
                    skipped_no_commits += 1
                    continue

                merged_commits = commit_features.merge_commits(commits)

                # XXX: For now, skip commits which are too large.
                # In the future we can either:
                #  - Improve shelve perf and go back to consider all files;
                #  - Consider only files which appear with a given frequency, like the "files" feature in commit_features;
                #  - Keep a limit of number of files.
                if len(merged_commits["files"]) > 50:
                    skipped_too_big_commits += 1
                    continue

                # If we considered all_tasks, we'd generate a huge amount of data.
                # So we consider only the tasks which run in this push, and the possible and likely regressions
                # from this push.
                tasks_to_consider = list(
                    set(push_tasks + possible_regressions +
                        likely_regressions))
                tasks_to_consider = filter_tasks(tasks_to_consider,
                                                 all_tasks_set)

                if len(tasks_to_consider) == 0:
                    skipped_no_tasks += 1
                    continue

                # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!).
                if i % 250 == 0:
                    past_failures.sync()

                pushdate = dateutil.parser.parse(merged_commits["pushdate"])

                for data in test_scheduling.generate_data(
                        past_failures,
                        merged_commits,
                        push_num,
                        tasks_to_consider,
                        possible_regressions,
                        likely_regressions,
                ):
                    if pushdate > HISTORY_DATE_START:
                        saved_nodes.add(i)
                        data["revs"] = revisions
                        yield data

            logger.info(f"saved push data nodes: {len(saved_nodes)}")
            logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
            logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
            logger.info(f"skipped {skipped_no_tasks} (no interesting tasks)")

            past_failures["push_num"] = push_num
            past_failures.close()

        db.append(test_scheduling.TEST_SCHEDULING_DB, generate_all_data())

        zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

        with open_tar_zst("data/past_failures.lmdb.tar.zst") as tar:
            tar.add("data/past_failures.lmdb")
Example 10
    def generate_test_scheduling_history(self):
        if not os.path.exists("push_data.json"):
            download_check_etag(PUSH_DATA_URL, "push_data.json.zst")
            zstd_decompress("push_data.json")
            assert os.path.exists(
                "push_data.json"
            ), "Decompressed push data file exists"

        # Get the commits DB.
        if db.is_old_version(repository.COMMITS_DB) or not db.exists(
            repository.COMMITS_DB
        ):
            db.download(repository.COMMITS_DB, force=True)

        HISTORY_DATE_START = datetime.now() - relativedelta(months=TRAINING_MONTHS)

        with open("push_data.json", "r") as f:
            data = json.load(f)

        push_data = {}
        for row in data[1:]:
            # Revision -> (all tasks, possible regressions, likely regressions)
            push_data[row[0]] = (row[1], row[2], row[3])

        logger.info(f"push data nodes: {len(push_data)}")

        HISTORICAL_TIMESPAN = 56

        if not db.is_old_version(test_scheduling.TEST_SCHEDULING_DB):
            db.download(test_scheduling.TEST_SCHEDULING_DB, support_files_too=True)

            for test_data in test_scheduling.get_test_scheduling_history():
                pass

            last_node = test_data["rev"]
        else:
            last_node = None

        past_failures = shelve.open(
            "data/past_failures.shelve",
            protocol=pickle.HIGHEST_PROTOCOL,
            writeback=True,
        )

        push_num = past_failures["push_num"] if "push_num" in past_failures else 0

        def get_and_update_past_failures(type_, task, items, push_num, is_regression):
            values_total = []
            values_prev_7 = []
            values_prev_14 = []
            values_prev_28 = []
            values_prev_56 = []

            key = f"{type_}${task}$"

            for item in items:
                full_key = key + item

                if full_key not in past_failures:
                    cur = past_failures[full_key] = ExpQueue(
                        push_num, HISTORICAL_TIMESPAN + 1, 0
                    )
                else:
                    cur = past_failures[full_key]

                value = cur[push_num]

                values_total.append(value)
                values_prev_7.append(value - cur[push_num - 7])
                values_prev_14.append(value - cur[push_num - 14])
                values_prev_28.append(value - cur[push_num - 28])
                values_prev_56.append(value - cur[push_num - 56])

                if is_regression:
                    cur[push_num] = value + 1

            return (
                sum(values_total),
                sum(values_prev_7),
                sum(values_prev_14),
                sum(values_prev_28),
                sum(values_prev_56),
            )

        def generate_data():
            nonlocal push_num
            commits_with_data = set()
            saved_nodes = set()

            # We can start once we get to the last revision we added in the previous run.
            can_start = True if last_node is None else False
            for commit_data in tqdm(repository.get_commits()):
                node = commit_data["node"]

                # Sync DB every 1000 commits, so we cleanup the shelve cache (we'd run OOM otherwise!).
                if len(commits_with_data) % 1000 == 0:
                    past_failures.sync()

                if node == last_node:
                    can_start = True
                    continue

                if not can_start:
                    continue

                if node not in push_data:
                    continue

                commits_with_data.add(node)

                commit_push_data = push_data[node]

                for task in commit_push_data[0]:
                    if not any(task.startswith(j) for j in JOBS_TO_CONSIDER):
                        continue

                    is_regression = (
                        task in commit_push_data[1] or task in commit_push_data[2]
                    )

                    total_failures, past_7_pushes_failures, past_14_pushes_failures, past_28_pushes_failures, past_56_pushes_failures = get_and_update_past_failures(
                        "all", task, ["all"], push_num, is_regression
                    )

                    total_types_failures, past_7_pushes_types_failures, past_14_pushes_types_failures, past_28_pushes_types_failures, past_56_pushes_types_failures = get_and_update_past_failures(
                        "type", task, commit_data["types"], push_num, is_regression
                    )

                    total_files_failures, past_7_pushes_files_failures, past_14_pushes_files_failures, past_28_pushes_files_failures, past_56_pushes_files_failures = get_and_update_past_failures(
                        "file", task, commit_data["files"], push_num, is_regression
                    )

                    total_directories_failures, past_7_pushes_directories_failures, past_14_pushes_directories_failures, past_28_pushes_directories_failures, past_56_pushes_directories_failures = get_and_update_past_failures(
                        "directory",
                        task,
                        commit_data["directories"],
                        push_num,
                        is_regression,
                    )

                    total_components_failures, past_7_pushes_components_failures, past_14_pushes_components_failures, past_28_pushes_components_failures, past_56_pushes_components_failures = get_and_update_past_failures(
                        "component",
                        task,
                        commit_data["components"],
                        push_num,
                        is_regression,
                    )

                    pushdate = dateutil.parser.parse(commit_data["pushdate"])
                    if pushdate > HISTORY_DATE_START:
                        saved_nodes.add(node)

                        yield {
                            "rev": node,
                            "name": task,
                            "failures": total_failures,
                            "failures_past_7_pushes": past_7_pushes_failures,
                            "failures_past_14_pushes": past_14_pushes_failures,
                            "failures_past_28_pushes": past_28_pushes_failures,
                            "failures_past_56_pushes": past_56_pushes_failures,
                            "failures_in_types": total_types_failures,
                            "failures_past_7_pushes_in_types": past_7_pushes_types_failures,
                            "failures_past_14_pushes_in_types": past_14_pushes_types_failures,
                            "failures_past_28_pushes_in_types": past_28_pushes_types_failures,
                            "failures_past_56_pushes_in_types": past_56_pushes_types_failures,
                            "failures_in_files": total_files_failures,
                            "failures_past_7_pushes_in_files": past_7_pushes_files_failures,
                            "failures_past_14_pushes_in_files": past_14_pushes_files_failures,
                            "failures_past_28_pushes_in_files": past_28_pushes_files_failures,
                            "failures_past_56_pushes_in_files": past_56_pushes_files_failures,
                            "failures_in_directories": total_directories_failures,
                            "failures_past_7_pushes_in_directories": past_7_pushes_directories_failures,
                            "failures_past_14_pushes_in_directories": past_14_pushes_directories_failures,
                            "failures_past_28_pushes_in_directories": past_28_pushes_directories_failures,
                            "failures_past_56_pushes_in_directories": past_56_pushes_directories_failures,
                            "failures_in_components": total_components_failures,
                            "failures_past_7_pushes_in_components": past_7_pushes_components_failures,
                            "failures_past_14_pushes_in_components": past_14_pushes_components_failures,
                            "failures_past_28_pushes_in_components": past_28_pushes_components_failures,
                            "failures_past_56_pushes_in_components": past_56_pushes_components_failures,
                            "is_possible_regression": task in commit_push_data[1],
                            "is_likely_regression": task in commit_push_data[2],
                        }

                # We no longer need the push data for this node, we can free the memory.
                del push_data[node]

                push_num += 1

            logger.info(f"commits linked to push data: {len(commits_with_data)}")

            logger.info(f"saved push data nodes: {len(saved_nodes)}")

        db.append(test_scheduling.TEST_SCHEDULING_DB, generate_data())

        zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

        past_failures["push_num"] = push_num
        past_failures.close()
        zstd_compress("data/past_failures.shelve")
    def generate_push_data(self, pushes: List[mozci.push.Push],
                           granularity: str) -> None:
        # We keep in the cache the fact that we failed to analyze a push for 10
        # days, so if we re-run often we don't retry the same pushes many times.
        MISSING_CACHE_RETENTION = 10 * 24 * 60

        from_date = get_from_date(granularity)

        pushes = [
            push for push in pushes
            if datetime.utcfromtimestamp(push.date) >= from_date
        ]

        if granularity == "label":
            push_data_db = test_scheduling.PUSH_DATA_LABEL_DB
        elif granularity == "group":
            push_data_db = test_scheduling.PUSH_DATA_GROUP_DB
        elif granularity == "config_group":
            push_data_db = test_scheduling.PUSH_DATA_CONFIG_GROUP_DB

        def cache_key(push: mozci.push.Push) -> str:
            return f"push_data.{granularity}.{push.rev}"

        def generate(executor) -> Generator[PushResult, None, None]:
            num_cached = 0
            num_pushes = len(pushes)

            # Regenerating a large amount of data when we update the mozci regression detection
            # algorithm is currently pretty slow, so we only regenerate 1000 pushes whenever we
            # run.
            to_regenerate = 1000

            semaphore = threading.BoundedSemaphore(256)

            def retrieve_from_cache(push):
                semaphore.acquire()
                return adr.config.cache.get(cache_key(push))

            futures = tuple(
                executor.submit(retrieve_from_cache, push) for push in pushes)

            for push, future in zip(tqdm(pushes), futures):
                exc = future.exception()
                if exc is not None:
                    logger.info(f"Exception {exc} while getting {push.rev}")
                    for f in futures:
                        f.cancel()

                cached = future.result()

                semaphore.release()

                if cached and to_regenerate > 0:
                    value, mozci_version = cached

                    # Regenerate results which were generated when we were not cleaning
                    # up WPT groups.
                    if any(runnable.startswith("/") for runnable in value[1]):
                        cached = None
                        to_regenerate -= 1
                    """# Regenerate results which were generated with an older version of mozci.
                    elif mozci_version != MOZCI_VERSION and to_regenerate > 0:
                        cached = None
                        to_regenerate -= 1"""

                if cached is not None:
                    num_cached += 1
                    if cached:
                        value, mozci_version = cached
                        yield value
                else:
                    logger.info(
                        f"Analyzing {push.rev} at the {granularity} level...")

                    key = cache_key(push)

                    try:
                        if granularity == "label":
                            runnables = push.task_labels
                        elif granularity == "group":
                            runnables = push.group_summaries.keys()
                        elif granularity == "config_group":
                            runnables = push.config_group_summaries.keys()

                        value = (
                            push.revs,
                            tuple(runnables),
                            tuple(push.get_possible_regressions(granularity)),
                            tuple(push.get_likely_regressions(granularity)),
                        )
                        adr.config.cache.put(
                            key,
                            (value, MOZCI_VERSION),
                            adr.config["cache"]["retention"],
                        )
                        yield value
                    except adr.errors.MissingDataError:
                        logger.warning(
                            f"Tasks for push {push.rev} can't be found on ActiveData"
                        )
                        adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)
                    except Exception:
                        traceback.print_exc()
                        adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)

            logger.info(
                f"{num_cached} pushes were already cached out of {num_pushes}")

        with concurrent.futures.ThreadPoolExecutor() as executor:
            db.write(push_data_db, generate(executor))
        zstd_compress(push_data_db)
Example 12
    def generate_test_scheduling_history(self):
        if not os.path.exists("push_data.json"):
            download_check_etag(PUSH_DATA_URL, "push_data.json.zst")
            zstd_decompress("push_data.json")
            assert os.path.exists(
                "push_data.json"), "Decompressed push data file exists"

        # Get the commits DB.
        if db.is_old_version(
                repository.COMMITS_DB) or not db.exists(repository.COMMITS_DB):
            db.download(repository.COMMITS_DB, force=True)

        HISTORY_DATE_START = datetime.now() - relativedelta(
            months=TRAINING_MONTHS)

        HISTORICAL_TIMESPAN = 56

        if not db.is_old_version(test_scheduling.TEST_SCHEDULING_DB):
            db.download(test_scheduling.TEST_SCHEDULING_DB,
                        support_files_too=True)

            for test_data in test_scheduling.get_test_scheduling_history():
                pass

            last_node = test_data["revs"][0]
        else:
            last_node = None

        past_failures = shelve.Shelf(
            LMDBDict("data/past_failures.lmdb"),
            protocol=pickle.HIGHEST_PROTOCOL,
            writeback=True,
        )

        push_num = past_failures[
            "push_num"] if "push_num" in past_failures else 0

        def get_and_update_past_failures(type_, task, items, push_num,
                                         is_regression):
            values_total = []
            values_prev_7 = []
            values_prev_14 = []
            values_prev_28 = []
            values_prev_56 = []

            key = f"{type_}${task}$"

            for item in items:
                full_key = key + item

                if full_key not in past_failures:
                    cur = past_failures[full_key] = ExpQueue(
                        push_num, HISTORICAL_TIMESPAN + 1, 0)
                else:
                    cur = past_failures[full_key]

                value = cur[push_num]

                values_total.append(value)
                values_prev_7.append(value - cur[push_num - 7])
                values_prev_14.append(value - cur[push_num - 14])
                values_prev_28.append(value - cur[push_num - 28])
                values_prev_56.append(value - cur[push_num - 56])

                if is_regression:
                    cur[push_num] = value + 1

            return (
                sum(values_total),
                sum(values_prev_7),
                sum(values_prev_14),
                sum(values_prev_28),
                sum(values_prev_56),
            )

        def generate_data():
            nonlocal push_num
            saved_nodes = set()
            skipped_no_commits = 0
            skipped_too_big_commits = 0
            skipped_no_tasks = 0

            # We can start once we get to the last revision we added in the previous run.
            can_start = last_node is None

            commit_map = {}
            for commit_data in tqdm(repository.get_commits()):
                if not can_start:
                    if last_node == commit_data["node"]:
                        can_start = True

                    continue

                commit_map[commit_data["node"]] = commit_data

            with open("push_data.json", "r") as f:
                push_data = json.load(f)[1:]

            logger.info(f"push data nodes: {len(push_data)}")

            # In the last 28 pushes, we definitely run all possible tasks.
            all_tasks_set = set(
                sum((push_tasks for _, push_tasks, _, _ in push_data[-28:]),
                    []))
            # Filter tasks we don't need.
            all_tasks = filter_tasks(list(all_tasks_set), all_tasks_set)
            all_tasks_set = set(all_tasks)
            logger.info(
                f"{len(all_tasks_set)} tasks run in the last 28 pushes")

            # We can start once we get to the last revision we added in the previous run.
            can_start = last_node is None

            for i in tqdm(range(len(push_data))):
                (
                    revisions,
                    push_tasks,
                    possible_regressions,
                    likely_regressions,
                ) = push_data.pop(0)

                if not can_start:
                    if last_node == revisions[0]:
                        can_start = True

                    continue

                push_num += 1

                # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them.
                commits = tuple(
                    commit_map.pop(revision) for revision in revisions
                    if revision in commit_map)
                if len(commits) == 0:
                    skipped_no_commits += 1
                    continue

                merged_commits = commit_features.merge_commits(commits)

                # XXX: For now, skip commits which are too large.
                # In the future we can either:
                #  - Improve shelve perf and go back to consider all files;
                #  - Consider only files which appear with a given frequency, like the "files" feature in commit_features;
                #  - Keep a limit of number of files.
                if len(merged_commits["files"]) > 20:
                    skipped_too_big_commits += 1
                    continue

                # If we considered all_tasks, we'd generate a huge amount of data.
                # So we consider only the tasks which run in this push, and the possible and likely regressions
                # from this push.
                tasks_to_consider = list(
                    set(push_tasks + possible_regressions +
                        likely_regressions))
                tasks_to_consider = filter_tasks(tasks_to_consider,
                                                 all_tasks_set)

                if len(tasks_to_consider) == 0:
                    skipped_no_tasks += 1
                    continue

                # Sync DB every 250 pushes, so we clean up the shelve cache (we'd run OOM otherwise!).
                if i % 250 == 0:
                    past_failures.sync()

                pushdate = dateutil.parser.parse(merged_commits["pushdate"])

                for task in tasks_to_consider:
                    is_regression = (task in possible_regressions
                                     or task in likely_regressions)

                    (
                        total_failures,
                        past_7_pushes_failures,
                        past_14_pushes_failures,
                        past_28_pushes_failures,
                        past_56_pushes_failures,
                    ) = get_and_update_past_failures("all", task, ["all"],
                                                     push_num, is_regression)

                    (
                        total_types_failures,
                        past_7_pushes_types_failures,
                        past_14_pushes_types_failures,
                        past_28_pushes_types_failures,
                        past_56_pushes_types_failures,
                    ) = get_and_update_past_failures("type", task,
                                                     merged_commits["types"],
                                                     push_num, is_regression)

                    (
                        total_files_failures,
                        past_7_pushes_files_failures,
                        past_14_pushes_files_failures,
                        past_28_pushes_files_failures,
                        past_56_pushes_files_failures,
                    ) = get_and_update_past_failures("file", task,
                                                     merged_commits["files"],
                                                     push_num, is_regression)

                    (
                        total_directories_failures,
                        past_7_pushes_directories_failures,
                        past_14_pushes_directories_failures,
                        past_28_pushes_directories_failures,
                        past_56_pushes_directories_failures,
                    ) = get_and_update_past_failures(
                        "directory",
                        task,
                        merged_commits["directories"],
                        push_num,
                        is_regression,
                    )

                    (
                        total_components_failures,
                        past_7_pushes_components_failures,
                        past_14_pushes_components_failures,
                        past_28_pushes_components_failures,
                        past_56_pushes_components_failures,
                    ) = get_and_update_past_failures(
                        "component",
                        task,
                        merged_commits["components"],
                        push_num,
                        is_regression,
                    )

                    if pushdate > HISTORY_DATE_START:
                        saved_nodes.add(i)

                        yield {
                            "revs": revisions,
                            "name": task,
                            "failures": total_failures,
                            "failures_past_7_pushes": past_7_pushes_failures,
                            "failures_past_14_pushes": past_14_pushes_failures,
                            "failures_past_28_pushes": past_28_pushes_failures,
                            "failures_past_56_pushes": past_56_pushes_failures,
                            "failures_in_types": total_types_failures,
                            "failures_past_7_pushes_in_types":
                            past_7_pushes_types_failures,
                            "failures_past_14_pushes_in_types":
                            past_14_pushes_types_failures,
                            "failures_past_28_pushes_in_types":
                            past_28_pushes_types_failures,
                            "failures_past_56_pushes_in_types":
                            past_56_pushes_types_failures,
                            "failures_in_files": total_files_failures,
                            "failures_past_7_pushes_in_files":
                            past_7_pushes_files_failures,
                            "failures_past_14_pushes_in_files":
                            past_14_pushes_files_failures,
                            "failures_past_28_pushes_in_files":
                            past_28_pushes_files_failures,
                            "failures_past_56_pushes_in_files":
                            past_56_pushes_files_failures,
                            "failures_in_directories":
                            total_directories_failures,
                            "failures_past_7_pushes_in_directories":
                            past_7_pushes_directories_failures,
                            "failures_past_14_pushes_in_directories":
                            past_14_pushes_directories_failures,
                            "failures_past_28_pushes_in_directories":
                            past_28_pushes_directories_failures,
                            "failures_past_56_pushes_in_directories":
                            past_56_pushes_directories_failures,
                            "failures_in_components":
                            total_components_failures,
                            "failures_past_7_pushes_in_components":
                            past_7_pushes_components_failures,
                            "failures_past_14_pushes_in_components":
                            past_14_pushes_components_failures,
                            "failures_past_28_pushes_in_components":
                            past_28_pushes_components_failures,
                            "failures_past_56_pushes_in_components":
                            past_56_pushes_components_failures,
                            "is_possible_regression": task
                            in possible_regressions,
                            "is_likely_regression": task in likely_regressions,
                        }

            logger.info(f"saved push data nodes: {len(saved_nodes)}")
            logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
            logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
            logger.info(f"skipped {skipped_no_tasks} (no interesting tasks)")

        db.append(test_scheduling.TEST_SCHEDULING_DB, generate_data())

        zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

        past_failures["push_num"] = push_num
        past_failures.close()
        with open_tar_zst("data/past_failures.lmdb.tar.zst") as tar:
            tar.add("data/past_failures.lmdb")
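A minimal sketch of the sliding-window feature computation used by get_and_update_past_failures in the example above: failure counts are kept cumulatively per push, so the count over the last N pushes is a difference of two cumulative values. CumulativeCounts is a deliberately simplified stand-in for bugbug's ExpQueue, not its real implementation.

class CumulativeCounts:
    def __init__(self):
        self.counts = {}  # push_num -> cumulative failures up to that push
        self.total = 0

    def record(self, push_num, failed):
        if failed:
            self.total += 1
        self.counts[push_num] = self.total

    def at(self, push_num):
        # Cumulative count at the given push (0 before any recorded push).
        keys = [k for k in self.counts if k <= push_num]
        return self.counts[max(keys)] if keys else 0

    def window(self, push_num, size):
        # Failures in the last `size` pushes, as a difference of cumulative counts.
        return self.at(push_num) - self.at(push_num - size)


history = CumulativeCounts()
for push_num, failed in enumerate([True, False, True, True, False, False, True]):
    history.record(push_num, failed)

print(history.window(6, 7))  # 4 failures in the last 7 pushes
print(history.window(6, 3))  # 1 failure in the last 3 pushes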
    def generate_push_data(self, granularity: str) -> None:
        # We keep in the cache the fact that we failed to analyze a push for 10
        # days, so if we re-run often we don't retry the same pushes many times.
        MISSING_CACHE_RETENTION = 10 * 24 * 60

        # We'll use the past TRAINING_MONTHS months only for training the model,
        # but we use half TRAINING_MONTHS months more than that to calculate the
        # failure statistics.
        from_months = TRAINING_MONTHS[granularity] + math.floor(
            TRAINING_MONTHS[granularity] / 2
        )

        # We use the actual date instead of 'today-X' aliases to avoid adr caching
        # this query.
        from_date = datetime.utcnow() - relativedelta(months=from_months)
        to_date = datetime.utcnow() - relativedelta(days=3)

        pushes = mozci.push.make_push_objects(
            from_date=from_date.strftime("%Y-%m-%d"),
            to_date=to_date.strftime("%Y-%m-%d"),
            branch="autoland",
        )

        if granularity == "label":
            push_data_db = test_scheduling.PUSH_DATA_LABEL_DB
        elif granularity == "group":
            push_data_db = test_scheduling.PUSH_DATA_GROUP_DB

        cache: Dict[mozci.push.Push, Tuple[PushResult, int]] = {}

        def cache_key(push: mozci.push.Push) -> str:
            return f"push_data.{granularity}.{push.rev}"

        with concurrent.futures.ThreadPoolExecutor() as executor:
            future_to_push = {
                executor.submit(
                    lambda push: adr.config.cache.get(cache_key(push)), push
                ): push
                for push in pushes
            }

            for future in tqdm(
                concurrent.futures.as_completed(future_to_push),
                total=len(future_to_push),
            ):
                push = future_to_push[future]

                exc = future.exception()
                if exc is not None:
                    logger.info(f"Exception {exc} while getting {push.rev}")
                    for f in future_to_push.keys():
                        f.cancel()

                cache[push] = future.result()

        # Regenerating a large amount of data when we update the mozci regression detection
        # algorithm is currently pretty slow, so we only regenerate 1000 pushes whenever we
        # run.
        """to_regenerate = 0
        for push in pushes[::-1]:
            cached = cache[push]
            if not cached:
                continue

            value, mozci_version = cached
            if mozci_version != MOZCI_VERSION and to_regenerate < 1000:
                cache[push] = None
                to_regenerate += 1"""

        to_regenerate = 0
        for push in pushes[::-1]:
            cached = cache[push]
            if not cached:
                continue

            if to_regenerate < 1000:
                del cache[push]
                adr.config.cache.put(push.push_uuid, {}, 0)
                to_regenerate += 1

        def generate() -> Generator[PushResult, None, None]:
            num_cached = 0

            for push in tqdm(pushes):
                key = cache_key(push)

                if push in cache and cache[push] is not None:
                    num_cached += 1
                    cached = cache[push]
                    if cached:
                        value, mozci_version = cached
                        yield value
                else:
                    logger.info(f"Analyzing {push.rev} at the {granularity} level...")

                    try:
                        if granularity == "label":
                            runnables = push.task_labels
                        elif granularity == "group":
                            runnables = push.group_summaries.keys()

                        value = (
                            push.revs,
                            list(runnables),
                            list(push.get_possible_regressions(granularity)),
                            list(push.get_likely_regressions(granularity)),
                        )
                        adr.config.cache.put(
                            key,
                            (value, MOZCI_VERSION),
                            adr.config["cache"]["retention"],
                        )
                        yield value
                    except adr.errors.MissingDataError:
                        logger.warning(
                            f"Tasks for push {push.rev} can't be found on ActiveData"
                        )
                        adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)
                    except Exception:
                        traceback.print_exc()
                        adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)

            logger.info(f"{num_cached} pushes were already cached out of {len(pushes)}")

        db.write(push_data_db, generate())
        zstd_compress(push_data_db)
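A small sketch of the future-to-input bookkeeping used above to prefetch cached push data concurrently: each submitted future is mapped back to the item it was created for, so results can be matched up as they complete. slow_lookup and the items list are illustrative placeholders.

import concurrent.futures
import time


def slow_lookup(item):
    time.sleep(0.01)  # stand-in for a cache or network read
    return item * item


items = [1, 2, 3, 4, 5]
results = {}

with concurrent.futures.ThreadPoolExecutor() as executor:
    future_to_item = {executor.submit(slow_lookup, item): item for item in items}

    for future in concurrent.futures.as_completed(future_to_item):
        item = future_to_item[future]
        results[item] = future.result()

print(results)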
Example n. 14
    def get_commits_to_ignore(self):
        logger.info("Download previous commits to ignore...")
        db.download(IGNORED_COMMITS_DB)

        logger.info("Get previously classified commits...")
        prev_commits_to_ignore = list(db.read(IGNORED_COMMITS_DB))
        logger.info(f"Already found {len(prev_commits_to_ignore)} commits to ignore...")

        # When we already have some analyzed commits, re-analyze the last 3500 to make sure
        # we didn't miss back-outs that happened since the last analysis.
        if len(prev_commits_to_ignore) > 0:
            first_commit_to_reanalyze = (
                -3500 if len(prev_commits_to_ignore) >= 3500 else 0
            )
            rev_start = "children({})".format(
                prev_commits_to_ignore[first_commit_to_reanalyze]["rev"]
            )
        else:
            rev_start = 0

        with hglib.open(self.mercurial_repo_dir) as hg:
            revs = repository.get_revs(hg, rev_start)

        # Drop commits which are not yet present in the mercurial <-> git mapping.
        while len(revs) > 0:
            try:
                vcs_map.mercurial_to_git(revs[-1].decode("ascii"))
                break
            except Exception as e:
                if not str(e).startswith("Missing mercurial commit in the VCS map"):
                    raise

                revs.pop()

        commits = repository.hg_log_multi(self.mercurial_repo_dir, revs)

        repository.set_commits_to_ignore(self.mercurial_repo_dir, commits)

        chosen_commits = set()
        commits_to_ignore = []
        for commit in commits:
            if commit.ignored or commit.backedoutby:
                commits_to_ignore.append(
                    {
                        "rev": commit.node,
                        "type": "backedout" if commit.backedoutby else "",
                    }
                )
                chosen_commits.add(commit.node)

        logger.info(f"{len(commits_to_ignore)} new commits to ignore...")

        for prev_commit in prev_commits_to_ignore[::-1]:
            if prev_commit["rev"] not in chosen_commits:
                commits_to_ignore.append(prev_commit)
                chosen_commits.add(prev_commit["rev"])

        logger.info(f"{len(commits_to_ignore)} commits to ignore...")

        logger.info(
            "...of which {} are backed-out".format(
                sum(1 for commit in commits_to_ignore if commit["type"] == "backedout")
            )
        )

        db.write(IGNORED_COMMITS_DB, commits_to_ignore)
        zstd_compress(IGNORED_COMMITS_DB)
        db.upload(IGNORED_COMMITS_DB)
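A minimal sketch of the "drop trailing items until a lookup succeeds" step used above to skip revisions not yet present in the mercurial <-> git map. The mapping dict and revs list are illustrative placeholders for vcs_map and the revisions returned by repository.get_revs.

mapping = {"rev1": "git1", "rev2": "git2", "rev3": "git3"}
revs = ["rev1", "rev2", "rev3", "rev4", "rev5"]  # the last two are not mapped yet

while len(revs) > 0:
    try:
        mapping[revs[-1]]
        break
    except KeyError:
        # The newest revisions are the ones most likely to be missing, so we
        # trim from the end until we find a mapped one.
        revs.pop()

print(revs)  # ['rev1', 'rev2', 'rev3']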
Example n. 15
 def retrieve_issues(
     self, owner: str, repo: str, state: str, retrieve_events: bool
 ) -> None:
     db.download(github.GITHUB_ISSUES_DB)
     github.download_issues(owner, repo, state, retrieve_events)
     zstd_compress(github.GITHUB_ISSUES_DB)
Example n. 16
    def generate_push_data(self, granularity: str, training_months: int,
                           reretrieve: int) -> None:
        # We'll use the past training_months months only for training the model,
        # but we use half training_months months more than that to calculate the
        # failure statistics.
        from_months = training_months + math.floor(training_months / 2)

        # We use the actual date instead of 'today-X' aliases to avoid mozci caching
        # this query.
        from_date = datetime.utcnow() - relativedelta(months=from_months)
        to_date = datetime.utcnow() - relativedelta(days=3)

        if granularity == "label":
            push_data_db = test_scheduling.PUSH_DATA_LABEL_DB
        elif granularity == "group":
            push_data_db = test_scheduling.PUSH_DATA_GROUP_DB
        elif granularity == "config_group":
            push_data_db = test_scheduling.PUSH_DATA_CONFIG_GROUP_DB

        def cache_key(push: mozci.push.Push) -> str:
            return f"push_data.{granularity}.{push.rev}"

        def generate(
            progress_bar: tqdm,
            pushes: list[mozci.push.Push],
            futures: list[concurrent.futures.Future],
        ) -> Generator[PushResult, None, None]:
            nonlocal reretrieve
            num_cached = 0
            num_pushes = len(pushes)
            num_errors = 0

            for push, future in zip(pushes, futures):
                cached = future.result()

                # Regenerating a large amount of data when we update the mozci regression detection
                # algorithm is currently pretty slow, so we only regenerate a subset of pushes whenever we
                # run.
                if cached:
                    value, mozci_version = cached

                    # Regenerate results which were generated with an older version of mozci.
                    if reretrieve > 0 and mozci_version != MOZCI_VERSION:
                        cached = None
                        reretrieve -= 1

                if cached:
                    num_cached += 1
                    value, mozci_version = cached
                    assert len(value) == 5
                    if value != "ERROR":
                        yield value
                    else:
                        num_errors += 1
                else:
                    logger.info(
                        f"Analyzing {push.rev} at the {granularity} level...")

                    key = cache_key(push)

                    try:
                        if granularity == "label":
                            runnables = push.label_summaries.keys()
                        elif granularity == "group":
                            runnables = push.group_summaries.keys()
                        elif granularity == "config_group":
                            runnables = push.config_group_summaries.keys()

                        value = (
                            tuple(push.revs),
                            push.backedoutby or push.bustage_fixed_by,
                            tuple(runnables),
                            tuple(push.get_possible_regressions(granularity)),
                            tuple(push.get_likely_regressions(granularity)),
                        )
                        mozci.config.cache.put(
                            key,
                            (value, MOZCI_VERSION),
                            mozci.config["cache"]["retention"],
                        )
                        assert len(value) == 5
                        yield value
                    except mozci.errors.MissingDataError:
                        logger.warning(
                            f"Tasks for push {push.rev} can't be found on ActiveData"
                        )
                    except Exception:
                        num_errors += 1
                        traceback.print_exc()
                        mozci.config.cache.put(
                            key,
                            ("ERROR", MOZCI_VERSION),
                            mozci.config["cache"]["retention"],
                        )

                progress_bar.update(1)

            logger.info(
                f"{num_cached} pushes were already cached out of {num_pushes}")
            logger.info(f"There were errors in {num_errors} pushes")

        def retrieve_from_cache(push):
            return mozci.config.cache.get(cache_key(push))

        total_pushes = len(
            mozci.push.make_push_objects(
                from_date=from_date.strftime("%Y-%m-%d"),
                to_date=to_date.strftime("%Y-%m-%d"),
                branch="autoland",
            ))

        with concurrent.futures.ThreadPoolExecutor() as executor:
            with tqdm(total=total_pushes) as progress_bar:
                # Run in batches of 7 days to avoid running out of memory (given that mozci pushes
                # consume a lot of memory, and they all have references to each other through "parent"
                # and "child" links so they are basically never released while we run this).
                while from_date < to_date:
                    next_from_date = from_date + relativedelta(days=7)
                    if next_from_date > to_date:
                        next_from_date = to_date

                    pushes = mozci.push.make_push_objects(
                        from_date=from_date.strftime("%Y-%m-%d"),
                        to_date=next_from_date.strftime("%Y-%m-%d"),
                        branch="autoland",
                    )

                    futures = [
                        executor.submit(retrieve_from_cache, push)
                        for push in pushes
                    ]

                    try:
                        db.append(push_data_db,
                                  generate(progress_bar, pushes, futures))
                    except Exception:
                        for f in futures:
                            f.cancel()

                        raise

                    from_date = next_from_date

        zstd_compress(push_data_db)
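A minimal sketch of the 7-day batching loop used above to keep memory usage bounded: the [from_date, to_date] range is walked in one-week steps, and the last step is clamped to to_date. The process() function and the concrete dates are placeholders.

from datetime import datetime

from dateutil.relativedelta import relativedelta


def process(start, end):
    print(f"processing pushes from {start.date()} to {end.date()}")


from_date = datetime(2020, 1, 1)
to_date = datetime(2020, 1, 20)

while from_date < to_date:
    # Clamp the last window so we never go past to_date.
    next_from_date = min(from_date + relativedelta(days=7), to_date)
    process(from_date, next_from_date)
    from_date = next_from_date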
Example n. 17
    def retrieve_bugs(self, limit=None):
        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

        db.download(bugzilla.BUGS_DB)

        # Get IDs of bugs changed since last run.
        last_modified = db.last_modified(bugzilla.BUGS_DB)
        logger.info(
            f"Retrieving IDs of bugs modified since the last run on {last_modified}"
        )
        changed_ids = bugzilla.get_ids({
            "f1": "delta_ts",
            "o1": "greaterthaneq",
            "v1": last_modified.date()
        })
        logger.info(f"Retrieved {len(changed_ids)} IDs.")

        # Get IDs of bugs between (two years and six months ago) and (six months ago).
        six_months_ago = datetime.utcnow() - relativedelta(months=6)
        two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
        logger.info(
            f"Retrieving bug IDs from {two_years_and_six_months_ago} to {six_months_ago}"
        )
        timespan_ids = bugzilla.get_ids_between(two_years_and_six_months_ago,
                                                six_months_ago)
        if limit:
            timespan_ids = timespan_ids[:limit]
        logger.info(f"Retrieved {len(timespan_ids)} IDs.")

        # Get IDs of labelled bugs.
        labelled_bug_ids = labels.get_all_bug_ids()
        if limit:
            labelled_bug_ids = labelled_bug_ids[:limit]
        logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

        # Get the commits DB, as we need it to get the bug IDs linked to recent commits.
        # XXX: Temporarily avoid downloading the commits DB when a limit is set, to avoid the integration test failing when the commits DB is bumped.
        if limit is None:
            assert db.download(repository.COMMITS_DB)

        # Get IDs of bugs linked to commits (used for some commit-based models, e.g. backout and regressor).
        start_date = datetime.now() - relativedelta(years=2, months=6)
        commit_bug_ids = [
            commit["bug_id"] for commit in repository.get_commits()
            if commit["bug_id"]
            and dateutil.parser.parse(commit["pushdate"]) >= start_date
        ]
        if limit:
            commit_bug_ids = commit_bug_ids[-limit:]
        logger.info(
            f"{len(commit_bug_ids)} bugs linked to commits to download.")

        # Get IDs of bugs which caused regressions fixed by commits (useful for the regressor model).
        regressed_by_bug_ids = sum(
            [
                bug["regressed_by"]
                for bug in bugzilla.get_bugs() if bug["id"] in commit_bug_ids
            ],
            [],
        )
        if limit:
            regressed_by_bug_ids = regressed_by_bug_ids[-limit:]
        logger.info(
            f"{len(regressed_by_bug_ids)} bugs which caused regressions fixed by commits."
        )

        all_ids = (timespan_ids + labelled_bug_ids + commit_bug_ids +
                   regressed_by_bug_ids)
        all_ids_set = set(all_ids)

        # We have to redownload bugs that were changed since the last download.
        # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled.
        bugzilla.delete_bugs(lambda bug: bug["id"] in changed_ids or bug["id"]
                             not in all_ids_set)

        bugzilla.download_bugs(all_ids)

        # Get regressed_by_bug_ids again (the set could have changed after downloading new bugs).
        regressed_by_bug_ids = sum(
            [
                bug["regressed_by"]
                for bug in bugzilla.get_bugs() if bug["id"] in commit_bug_ids
            ],
            [],
        )
        logger.info(
            f"{len(regressed_by_bug_ids)} bugs which caused regressions fixed by commits."
        )

        bugzilla.download_bugs(regressed_by_bug_ids)

        # Try to re-download inconsistent bugs, up to three times.
        inconsistent_bugs = bugzilla.get_bugs(include_invalid=True)
        for i in range(3):
            # We look for inconsistencies in all bugs first, then, on following passes,
            # we only look for inconsistencies in bugs that were found to be inconsistent in the first pass
            inconsistent_bugs = bug_snapshot.get_inconsistencies(
                inconsistent_bugs)
            inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

            if len(inconsistent_bug_ids) == 0:
                break

            logger.info(
                f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
            )
            bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
            bugzilla.download_bugs(inconsistent_bug_ids)

        zstd_compress("data/bugs.json")
Example n. 18
    def go(self) -> None:
        logger.info(
            "Generate map of bug ID -> bug data for all bugs which were defects"
        )
        bug_fixing_commits = list(db.read(BUG_FIXING_COMMITS_DB))

        bug_fixing_commits_nodes = set(
            bug_fixing_commit["rev"]
            for bug_fixing_commit in bug_fixing_commits
            if bug_fixing_commit["type"] in ("d", "r"))

        logger.info(
            f"{len(bug_fixing_commits_nodes)} bug-fixing commits to analyze")

        all_bug_ids = set(commit["bug_id"]
                          for commit in repository.get_commits())

        bug_map = {
            bug["id"]: bug
            for bug in bugzilla.get_bugs() if bug["id"] in all_bug_ids
        }

        logger.info(
            "Generate a map from files/functions to the bugs which were fixed/introduced by touching them"
        )

        # TODO: Support "moving" past bugs between files when they are renamed and between functions when they are
        # moved across files.

        by_dimensions = ["file", "directory", "component"]

        def dimension_to_field(dimension: str) -> str:
            return f"{dimension}s" if dimension != "directory" else "directories"

        past_regressions_by: dict[str, dict[str, list[int]]] = defaultdict(
            lambda: defaultdict(list))
        past_fixed_bugs_by: dict[str, dict[str, list[int]]] = defaultdict(
            lambda: defaultdict(list))
        past_regression_blocked_bugs_by: dict[str, dict[
            str, list[int]]] = defaultdict(lambda: defaultdict(list))
        past_fixed_bug_blocked_bugs_by: dict[str, dict[
            str, list[int]]] = defaultdict(lambda: defaultdict(list))
        past_regressions_by_function: dict[str, dict[
            str, list[int]]] = defaultdict(lambda: defaultdict(list))
        past_fixed_bugs_by_function: dict[str, dict[
            str, list[int]]] = defaultdict(lambda: defaultdict(list))
        past_regression_blocked_bugs_by_function: dict[str, dict[
            str, list[int]]] = defaultdict(lambda: defaultdict(list))
        past_fixed_bug_blocked_bugs_by_function: dict[str, dict[
            str, list[int]]] = defaultdict(lambda: defaultdict(list))

        for commit in tqdm(repository.get_commits()):
            if commit["bug_id"] not in bug_map:
                continue

            if commit["backedoutby"]:
                continue

            bug = bug_map[commit["bug_id"]]

            if len(bug["regressions"]) > 0:
                for dimension in by_dimensions:
                    for path in commit[dimension_to_field(dimension)]:
                        past_regressions_by[dimension][path].extend(
                            bug_id for bug_id in bug["regressions"]
                            if bug_id in bug_map)

                        past_regression_blocked_bugs_by[dimension][
                            path].extend(bugzilla.find_blocked_by(
                                bug_map, bug))

                for path, f_group in commit["functions"].items():
                    for f in f_group:
                        past_regressions_by_function[path][f["name"]].extend(
                            bug_id for bug_id in bug["regressions"]
                            if bug_id in bug_map)

                        past_regression_blocked_bugs_by_function[path][
                            f["name"]].extend(
                                bugzilla.find_blocked_by(bug_map, bug))

            if commit["node"] in bug_fixing_commits_nodes:
                for dimension in by_dimensions:
                    for path in commit[dimension_to_field(dimension)]:
                        past_fixed_bugs_by[dimension][path].append(bug["id"])

                        past_fixed_bug_blocked_bugs_by[dimension][path].extend(
                            bugzilla.find_blocked_by(bug_map, bug))

                for path, f_group in commit["functions"].items():
                    for f in f_group:
                        past_fixed_bugs_by_function[path][f["name"]].append(
                            bug["id"])

                        past_fixed_bug_blocked_bugs_by_function[path][
                            f["name"]].extend(
                                bugzilla.find_blocked_by(bug_map, bug))

        def _transform(bug_ids: list[int]) -> list[dict]:
            seen = set()
            results = []
            for bug_id in bug_ids:
                if bug_id in seen:
                    continue
                seen.add(bug_id)

                bug = bug_map[bug_id]
                results.append({
                    "id":
                    bug_id,
                    "summary":
                    bug["summary"],
                    "component":
                    "{}::{}".format(bug["product"], bug["component"]),
                })

            return results

        def past_bug_ids_to_summaries(
                past_bugs_by: dict[str, list[int]]) -> dict[str, list[dict]]:
            return {
                path: _transform(bug_ids)
                for path, bug_ids in past_bugs_by.items()
            }

        for dimension in by_dimensions:
            with open(f"data/past_regressions_by_{dimension}.json", "w") as f:
                json.dump(
                    past_bug_ids_to_summaries(past_regressions_by[dimension]),
                    f)
            zstd_compress(f"data/past_regressions_by_{dimension}.json")

            with open(f"data/past_fixed_bugs_by_{dimension}.json", "w") as f:
                json.dump(
                    past_bug_ids_to_summaries(past_fixed_bugs_by[dimension]),
                    f)
            zstd_compress(f"data/past_fixed_bugs_by_{dimension}.json")

            with open(f"data/past_regression_blocked_bugs_by_{dimension}.json",
                      "w") as f:
                json.dump(
                    past_bug_ids_to_summaries(
                        past_regression_blocked_bugs_by[dimension]),
                    f,
                )
            zstd_compress(
                f"data/past_regression_blocked_bugs_by_{dimension}.json")

            with open(f"data/past_fixed_bug_blocked_bugs_by_{dimension}.json",
                      "w") as f:
                json.dump(
                    past_bug_ids_to_summaries(
                        past_fixed_bug_blocked_bugs_by[dimension]),
                    f,
                )
            zstd_compress(
                f"data/past_fixed_bug_blocked_bugs_by_{dimension}.json")

        def past_function_bug_ids_to_summaries(
            past_bugs: dict[str, dict[str, list[int]]]
        ) -> dict[str, dict[str, list[dict]]]:
            return {
                path: {
                    func: _transform(bug_ids)
                    for func, bug_ids in funcs_bugs.items()
                }
                for path, funcs_bugs in past_bugs.items()
            }

        with open("data/past_regressions_by_function.json", "w") as f:
            json.dump(
                past_function_bug_ids_to_summaries(
                    past_regressions_by_function), f)
        zstd_compress("data/past_regressions_by_function.json")

        with open("data/past_fixed_bugs_by_function.json", "w") as f:
            json.dump(
                past_function_bug_ids_to_summaries(
                    past_fixed_bugs_by_function), f)
        zstd_compress("data/past_fixed_bugs_by_function.json")

        with open("data/past_regression_blocked_bugs_by_function.json",
                  "w") as f:
            json.dump(
                past_function_bug_ids_to_summaries(
                    past_regression_blocked_bugs_by_function),
                f,
            )
        zstd_compress("data/past_regression_blocked_bugs_by_function.json")

        with open("data/past_fixed_bug_blocked_bugs_by_function.json",
                  "w") as f:
            json.dump(
                past_function_bug_ids_to_summaries(
                    past_fixed_bug_blocked_bugs_by_function),
                f,
            )
        zstd_compress("data/past_fixed_bug_blocked_bugs_by_function.json")
Example n. 19
    def find_bug_fixing_commits(self):
        logger.info("Downloading commits database...")
        assert db.download(repository.COMMITS_DB)

        logger.info("Downloading bugs database...")
        assert db.download(bugzilla.BUGS_DB)

        logger.info("Download previous classifications...")
        db.download(BUG_FIXING_COMMITS_DB)

        logger.info("Get previously classified commits...")
        prev_bug_fixing_commits = list(db.read(BUG_FIXING_COMMITS_DB))
        prev_bug_fixing_commits_nodes = set(
            bug_fixing_commit["rev"]
            for bug_fixing_commit in prev_bug_fixing_commits)
        logger.info(
            f"Already classified {len(prev_bug_fixing_commits)} commits...")

        # TODO: Switch to the pure Defect model, as it's better in this case.
        logger.info("Downloading defect/enhancement/task model...")
        defect_model = download_and_load_model("defectenhancementtask")

        logger.info("Downloading regression model...")
        regression_model = download_and_load_model("regression")

        start_date = datetime.now() - RELATIVE_START_DATE
        end_date = datetime.now() - RELATIVE_END_DATE
        logger.info(
            f"Gathering bug IDs associated to commits (since {start_date} and up to {end_date})..."
        )
        commit_map = defaultdict(list)
        for commit in repository.get_commits():
            if commit["node"] in prev_bug_fixing_commits_nodes:
                continue

            commit_date = dateutil.parser.parse(commit["pushdate"])
            if commit_date < start_date or commit_date > end_date:
                continue

            commit_map[commit["bug_id"]].append(commit["node"])

        logger.info(
            f"{sum(len(commit_list) for commit_list in commit_map.values())} commits found, {len(commit_map)} bugs linked to commits"
        )
        assert len(commit_map) > 0

        def get_relevant_bugs():
            return (bug for bug in bugzilla.get_bugs()
                    if bug["id"] in commit_map)

        bug_count = sum(1 for bug in get_relevant_bugs())
        logger.info(
            f"{bug_count} bugs in total, {len(commit_map) - bug_count} bugs linked to commits missing"
        )

        known_defect_labels = defect_model.get_labels()
        known_regression_labels = regression_model.get_labels()

        bug_fixing_commits = []

        def append_bug_fixing_commits(bug_id, type_):
            for commit in commit_map[bug_id]:
                bug_fixing_commits.append({"rev": commit, "type": type_})

        for bug in tqdm(get_relevant_bugs(), total=bug_count):
            # Ignore bugs which are not linked to the commits we care about.
            if bug["id"] not in commit_map:
                continue

            # If we know the label already, we don't need to apply the model.
            if (bug["id"] in known_regression_labels
                    and known_regression_labels[bug["id"]] == 1):
                append_bug_fixing_commits(bug["id"], "r")
                continue

            if bug["id"] in known_defect_labels:
                if known_defect_labels[bug["id"]] == "defect":
                    append_bug_fixing_commits(bug["id"], "d")
                else:
                    append_bug_fixing_commits(bug["id"], "e")
                continue

            if defect_model.classify(bug)[0] == "defect":
                if regression_model.classify(bug)[0] == 1:
                    append_bug_fixing_commits(bug["id"], "r")
                else:
                    append_bug_fixing_commits(bug["id"], "d")
            else:
                append_bug_fixing_commits(bug["id"], "e")

        db.append(BUG_FIXING_COMMITS_DB, bug_fixing_commits)
        zstd_compress(BUG_FIXING_COMMITS_DB)
        db.upload(BUG_FIXING_COMMITS_DB)
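A minimal sketch of the "use known labels first, fall back to the model" pattern in find_bug_fixing_commits above. known_labels and classify are illustrative placeholders for the model's training labels and defect_model.classify.

known_labels = {1000: "defect", 1001: "enhancement"}


def classify(bug):
    # Stand-in for the trained classifier.
    return "defect" if "crash" in bug["summary"].lower() else "enhancement"


def label_for(bug):
    # Prefer the label we already know from training data; only run the
    # (more expensive) classifier for unseen bugs.
    if bug["id"] in known_labels:
        return known_labels[bug["id"]]
    return classify(bug)


print(label_for({"id": 1000, "summary": "Some bug"}))            # defect (known label)
print(label_for({"id": 2000, "summary": "Crash when loading"}))  # defect (classified)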
    def generate_push_data(self, runnable):
        def upload_adr_cache():
            cache_path = os.path.splitext(ADR_CACHE_DB)[0]
            assert os.path.abspath(
                adr.config["cache"]["stores"]["file"]["path"]
            ) == os.path.abspath(cache_path)

            with open_tar_zst(f"{ADR_CACHE_DB}.zst") as tar:
                tar.add(cache_path)

            db.upload(ADR_CACHE_DB)

        # We'll use the past TRAINING_MONTHS months only for training the model,
        # but we use half TRAINING_MONTHS months more than that to calculate the
        # failure statistics.
        from_months = TRAINING_MONTHS[runnable] + math.floor(
            TRAINING_MONTHS[runnable] / 2
        )

        pushes = mozci.push.make_push_objects(
            from_date=f"today-{from_months}month",
            to_date="today-3day",
            branch="autoland",
        )

        start_time = time.monotonic()

        num_cached = 0

        push_data = []

        for push in tqdm(pushes):
            key = f"push_data.{runnable}.{push.rev}"

            logger.info(f"Analyzing {push.rev} at the {runnable} level...")

            if adr.config.cache.has(key):
                num_cached += 1
                push_data.append(adr.config.cache.get(key))
            else:
                try:
                    if runnable == "label":
                        runnables = push.task_labels
                    elif runnable == "group":
                        runnables = push.group_summaries.keys()

                    value = [
                        push.revs,
                        list(runnables),
                        list(push.get_possible_regressions(runnable)),
                        list(push.get_likely_regressions(runnable)),
                    ]
                    push_data.append(value)
                    adr.config.cache.forever(key, value)
                except adr.errors.MissingDataError:
                    logger.warning(
                        f"Tasks for push {push.rev} can't be found on ActiveData"
                    )
                except Exception:
                    traceback.print_exc()

            if time.monotonic() - start_time >= 3600:
                upload_adr_cache()
                start_time = time.monotonic()

        logger.info(f"{num_cached} pushes were already cached out of {len(pushes)}")

        upload_adr_cache()

        with open(f"push_data_{runnable}.json", "w") as f:
            json.dump(push_data, f)

        zstd_compress(f"push_data_{runnable}.json")
    def generate_test_scheduling_history(self, granularity: str,
                                         training_months: int) -> None:
        if granularity != "config_group":
            # Get the commits DB.
            assert db.download(repository.COMMITS_DB)

        HISTORY_DATE_START = datetime.now() - relativedelta(
            months=training_months)

        if granularity == "label":
            test_scheduling_db = test_scheduling.TEST_LABEL_SCHEDULING_DB
            past_failures_db = os.path.join(
                "data", test_scheduling.PAST_FAILURES_LABEL_DB)
            failing_together_db = os.path.join(
                "data", test_scheduling.FAILING_TOGETHER_LABEL_DB)
        elif granularity == "group":
            test_scheduling_db = test_scheduling.TEST_GROUP_SCHEDULING_DB
            past_failures_db = os.path.join(
                "data", test_scheduling.PAST_FAILURES_GROUP_DB)
            touched_together_db = os.path.join(
                "data", test_scheduling.TOUCHED_TOGETHER_DB)
        elif granularity == "config_group":
            test_scheduling_db = test_scheduling.TEST_CONFIG_GROUP_SCHEDULING_DB
            past_failures_db = os.path.join(
                "data", test_scheduling.PAST_FAILURES_CONFIG_GROUP_DB)
            failing_together_db = os.path.join(
                "data", test_scheduling.FAILING_TOGETHER_CONFIG_GROUP_DB)

        push_data_iter, push_data_count, all_runnables = test_scheduling.get_push_data(
            granularity)

        if granularity in ("label", "config_group"):
            test_scheduling.generate_failing_together_probabilities(
                granularity, push_data_iter(), push_data_count)

        def generate_all_data() -> Generator[Dict[str, Any], None, None]:
            past_failures = test_scheduling.get_past_failures(
                granularity, False)

            push_num = past_failures[
                "push_num"] if "push_num" in past_failures else 0

            commit_map = {}
            for commit_data in tqdm(repository.get_commits()):
                commit_map[commit_data["node"]] = commit_data

            # Store all runnables in the past_failures DB so it can be used in the evaluation phase.
            past_failures["all_runnables"] = all_runnables
            # XXX: Should we recreate the DB from scratch if the previous all_runnables are not the
            # same as the current ones?

            saved_nodes = set()
            skipped_no_commits = 0
            skipped_too_big_commits = 0
            skipped_no_runnables = 0

            if granularity in ("group", "config_group"):
                update_touched_together_gen = test_scheduling.update_touched_together(
                )
                next(update_touched_together_gen)

            for (
                    i,
                (
                    revisions,
                    fix_revision,
                    push_runnables,
                    possible_regressions,
                    likely_regressions,
                ),
            ) in enumerate(tqdm(push_data_iter(), total=push_data_count)):
                push_num += 1

                # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them.
                commits = tuple(
                    commit_map.pop(revision) for revision in revisions
                    if revision in commit_map)
                if len(commits) == 0:
                    skipped_no_commits += 1
                    continue

                merged_commits = commit_features.merge_commits(commits)

                # XXX: For now, skip commits which are too large.
                # In the future we can either:
                #  - Improve shelve perf and go back to consider all files;
                #  - Consider only files which appear with a given frequency, like the "files" feature in commit_features;
                #  - Keep a limit of number of files.
                if len(merged_commits["files"]) > 50:
                    skipped_too_big_commits += 1
                    continue

                # If we considered all_runnables, we'd generate a huge amount of data.
                # We consider only the runnables which run in this push, and the possible and likely regressions
                # from this push. We can't consider all runnables because we can't be sure that a task that didn't
                # run on a push would have been successful.
                runnables_to_consider = list(
                    set(push_runnables + possible_regressions +
                        likely_regressions))

                if len(runnables_to_consider) == 0:
                    skipped_no_runnables += 1
                    continue

                # Sync DB every 250 pushes, so we clean up the shelve cache (we'd run OOM otherwise!).
                if i % 250 == 0:
                    past_failures.sync()

                pushdate = dateutil.parser.parse(merged_commits["pushdate"])

                if granularity in ("group", "config_group"):
                    update_touched_together_gen.send(commits[0]["node"])

                result_data = []
                for data in test_scheduling.generate_data(
                        granularity,
                        past_failures,
                        merged_commits,
                        push_num,
                        runnables_to_consider,
                        possible_regressions,
                        likely_regressions,
                ):
                    if pushdate > HISTORY_DATE_START:
                        result_data.append(data)

                if pushdate > HISTORY_DATE_START:
                    saved_nodes.add(i)
                    yield {
                        "revs": revisions,
                        "data": result_data,
                    }

            if granularity == "group":
                try:
                    update_touched_together_gen.send(None)
                except StopIteration:
                    pass

            logger.info(f"saved push data nodes: {len(saved_nodes)}")
            logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
            logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
            logger.info(
                f"skipped {skipped_no_runnables} (no interesting runnables)")

            past_failures["push_num"] = push_num
            past_failures.close()

        # For the config/group granularity, we are only interested in the failing together DB.
        if granularity != "config_group":
            db.append(test_scheduling_db, generate_all_data())

            zstd_compress(test_scheduling_db)
            create_tar_zst(past_failures_db)

        if granularity == "group":
            create_tar_zst(touched_together_db)

        if granularity in ("label", "config_group"):
            create_tar_zst(failing_together_db)
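A minimal sketch of the generator-as-coroutine protocol used above for update_touched_together: the generator is primed with next(), fed values with send(), and told to finish by sending None. This illustrates only the calling convention, not the real touched-together computation.

def accumulator():
    total = 0
    while True:
        value = yield
        if value is None:
            break
        total += value
    print(f"final total: {total}")


gen = accumulator()
next(gen)  # prime the generator up to the first yield

gen.send(3)
gen.send(4)

try:
    gen.send(None)  # signal completion, as in update_touched_together_gen.send(None)
except StopIteration:
    pass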
Example n. 22
 def compress_and_upload() -> None:
     utils.zstd_compress(SHADOW_SCHEDULER_STATS_DB)
     db.upload(SHADOW_SCHEDULER_STATS_DB)
Example n. 23
    def retrieve_bugs(self, limit: int = None) -> None:
        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

        db.download(bugzilla.BUGS_DB)

        # Get IDs of bugs changed since last run.
        last_modified = db.last_modified(bugzilla.BUGS_DB)
        logger.info(
            f"Retrieving IDs of bugs modified since the last run on {last_modified}"
        )
        changed_ids = set(
            bugzilla.get_ids({
                "f1": "delta_ts",
                "o1": "greaterthaneq",
                "v1": last_modified.date()
            }))
        logger.info(f"Retrieved {len(changed_ids)} IDs.")

        all_components = bugzilla.get_product_component_count(9999)

        deleted_component_ids = set(
            bug["id"] for bug in bugzilla.get_bugs() if "{}::{}".format(
                bug["product"], bug["component"]) not in all_components)
        logger.info(
            f"{len(deleted_component_ids)} bugs belonging to deleted components"
        )
        changed_ids |= deleted_component_ids

        # Get IDs of bugs between (two years and six months ago) and now.
        two_years_and_six_months_ago = datetime.utcnow() - relativedelta(
            years=2, months=6)
        logger.info(f"Retrieving bug IDs since {two_years_and_six_months_ago}")
        timespan_ids = bugzilla.get_ids_between(two_years_and_six_months_ago)
        if limit:
            timespan_ids = timespan_ids[-limit:]
        logger.info(f"Retrieved {len(timespan_ids)} IDs.")

        # Get IDs of labelled bugs.
        labelled_bug_ids = labels.get_all_bug_ids()
        if limit:
            labelled_bug_ids = labelled_bug_ids[-limit:]
        logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

        # Get the commits DB, as we need it to get the bug IDs linked to recent commits.
        # XXX: Temporarily avoid downloading the commits DB when a limit is set, to avoid the integration test failing when the commits DB is bumped.
        if limit is None:
            assert db.download(repository.COMMITS_DB)

        # Get IDs of bugs linked to commits (used for some commit-based models, e.g. backout and regressor).
        start_date = datetime.now() - relativedelta(years=3)
        commit_bug_ids = list(
            set(commit["bug_id"] for commit in repository.get_commits()
                if commit["bug_id"]
                and dateutil.parser.parse(commit["pushdate"]) >= start_date))
        if limit:
            commit_bug_ids = commit_bug_ids[-limit:]
        logger.info(
            f"{len(commit_bug_ids)} bugs linked to commits to download.")

        # Get IDs of bugs which are regressions, bugs which caused regressions (useful for the regressor model),
        # and blocked bugs.
        regression_related_ids: List[int] = list(
            set(
                sum(
                    (bug["regressed_by"] + bug["regressions"] + bug["blocks"]
                     for bug in bugzilla.get_bugs()),
                    [],
                )))
        if limit:
            regression_related_ids = regression_related_ids[-limit:]
        logger.info(
            f"{len(regression_related_ids)} bugs which caused regressions fixed by commits."
        )

        # Get IDs of bugs linked to intermittent failures.
        test_failure_bug_ids = [
            item["bug_id"] for item in test_scheduling.get_failure_bugs(
                two_years_and_six_months_ago, datetime.utcnow())
        ]
        if limit:
            test_failure_bug_ids = test_failure_bug_ids[-limit:]
        logger.info(f"{len(test_failure_bug_ids)} bugs about test failures.")

        all_ids = (timespan_ids + labelled_bug_ids + commit_bug_ids +
                   regression_related_ids + test_failure_bug_ids)
        all_ids_set = set(all_ids)

        # We have to redownload bugs that were changed since the last download.
        # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled.
        bugzilla.delete_bugs(lambda bug: bug["id"] in changed_ids or bug["id"]
                             not in all_ids_set)

        new_bugs = bugzilla.download_bugs(all_ids)

        # Get regression_related_ids again (the set could have changed after downloading new bugs).
        for i in range(7):
            regression_related_ids = list(
                set(
                    sum(
                        (bug["regressed_by"] + bug["regressions"] +
                         bug["blocks"] for bug in new_bugs),
                        [],
                    )))
            logger.info(
                f"{len(regression_related_ids)} bugs which caused regressions fixed by commits."
            )
            if limit:
                regression_related_ids = regression_related_ids[-limit:]

            # If we got all bugs we needed, break.
            if set(regression_related_ids).issubset(all_ids):
                break

            new_bugs = bugzilla.download_bugs(regression_related_ids)

        # Try to re-download inconsistent bugs, up to twice.
        inconsistent_bugs = bugzilla.get_bugs(include_invalid=True)
        for i in range(2):
            # We look for inconsistencies in all bugs first, then, on following passes,
            # we only look for inconsistencies in bugs that were found to be inconsistent in the first pass
            inconsistent_bugs = bug_snapshot.get_inconsistencies(
                inconsistent_bugs)
            inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

            if len(inconsistent_bug_ids) == 0:
                break

            logger.info(
                f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
            )
            bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
            bugzilla.download_bugs(inconsistent_bug_ids)

        # Some bugs come back without their history.
        # TODO: Figure out why their history can't be retrieved.
        missing_history_bug_ids = {
            bug["id"]
            for bug in bugzilla.get_bugs() if "history" not in bug
        }
        bugzilla.delete_bugs(lambda bug: bug["id"] in missing_history_bug_ids)
        logger.info(
            f"Deleted {len(missing_history_bug_ids)} bugs as we couldn't retrieve their history"
        )

        zstd_compress(bugzilla.BUGS_DB)
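The method above repeatedly re-downloads bugs (for at most seven rounds) until every regression-related ID referenced by the downloaded bugs is itself in the downloaded set. Below is a minimal, self-contained sketch of that bounded-closure idea; the REMOTE store and the fetch()/download_closure() helpers are hypothetical and not part of the code above.

# A minimal, self-contained sketch (not the code above) of the bounded-closure
# pattern: keep downloading bugs referenced by the bugs we already have until
# the referenced set stops growing, or until a fixed number of rounds.

# Hypothetical in-memory "remote": bug id -> ids of bugs it references.
REMOTE = {1: [2, 3], 2: [4], 3: [], 4: [5], 5: []}


def fetch(ids):
    """Pretend to download bugs; return {id: referenced ids}."""
    return {i: REMOTE.get(i, []) for i in ids}


def download_closure(seed_ids, max_rounds=7):
    downloaded = fetch(seed_ids)
    for _ in range(max_rounds):
        referenced = {r for refs in downloaded.values() for r in refs}
        missing = referenced - set(downloaded)
        if not missing:
            # Every referenced bug is already downloaded: the set is closed.
            break
        downloaded.update(fetch(missing))
    return downloaded


print(sorted(download_closure([1])))  # [1, 2, 3, 4, 5]
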
Example no. 24
    def retrieve_test_scheduling_history(self):
        os.makedirs("data", exist_ok=True)

        # Download previous cache.
        cache_path = os.path.abspath("data/adr_cache")
        if not os.path.exists(cache_path):
            try:
                download_check_etag(URL, "adr_cache.tar.xz")
                with tarfile.open("adr_cache.tar.xz", "r:xz") as tar:
                    tar.extractall()
                assert os.path.exists(
                    "data/adr_cache"), "Decompressed adr cache exists"
            except requests.exceptions.HTTPError:
                logger.info("The adr cache is not available yet")

        # Setup adr cache configuration.
        os.makedirs(os.path.expanduser("~/.config/adr"), exist_ok=True)
        with open(os.path.expanduser("~/.config/adr/config.toml"), "w") as f:
            f.write(f"""[adr.cache.stores]
file = {{ driver = "file", path = "{cache_path}" }}
            """)

        # Get the commits DB.
        if db.is_old_version(
                repository.COMMITS_DB) or not db.exists(repository.COMMITS_DB):
            db.download(repository.COMMITS_DB, force=True)

        # We'll use the past TRAINING_MONTHS months only for training the model,
        # but we use 3 months more than that to calculate the failure statistics.
        subprocess.run(
            [
                "run-adr",
                "ahal/ci-recipes",
                "recipe",
                "-o",
                os.path.abspath("push_data.json"),
                "-f",
                "json",
                "push_data",
                "--",
                "--from",
                f"today-{TRAINING_MONTHS + 3}month",
                "--to",
                "today-2day",
                "--branch",
                "autoland",
            ],
            check=True,
            stdout=subprocess.DEVNULL,  # Redirect to /dev/null, as the logs are too big otherwise.
        )

        HISTORY_DATE_START = datetime.now() - relativedelta(
            months=TRAINING_MONTHS)

        with open("push_data.json", "r") as f:
            data = json.load(f)

        push_data = {}
        for row in data[1:]:
            # Revision -> (all tasks, possible regressions, likely regressions)
            push_data[row[0]] = (row[1], row[2], row[3])

        HISTORICAL_TIMESPAN = 56

        past_failures = {}

        def get_past_failures(task, push_num):
            if task not in past_failures:
                past_failures[task] = repository.exp_queue(
                    push_num, HISTORICAL_TIMESPAN + 1, 0)

            return past_failures[task][push_num]

        def generate_data():
            commits_with_data = set()
            saved_nodes = set()

            push_num = 0
            for commit_data in tqdm(repository.get_commits()):
                node = commit_data["node"]

                if node not in push_data:
                    continue

                commits_with_data.add(node)

                commit_push_data = push_data[node]

                for task in commit_push_data[0]:
                    if not any(task.startswith(j) for j in JOBS_TO_CONSIDER):
                        continue

                    total_failures = get_past_failures(task, push_num)
                    past_7_pushes_failures = total_failures - get_past_failures(
                        task, push_num - 7)
                    past_14_pushes_failures = total_failures - get_past_failures(
                        task, push_num - 14)
                    past_28_pushes_failures = total_failures - get_past_failures(
                        task, push_num - 28)
                    past_56_pushes_failures = total_failures - get_past_failures(
                        task, push_num - 56)

                    pushdate = dateutil.parser.parse(commit_data["pushdate"])
                    if pushdate > HISTORY_DATE_START:
                        saved_nodes.add(node)

                        yield {
                            "rev": node,
                            "name": task,
                            "failures": total_failures,
                            "failures_past_7_pushes": past_7_pushes_failures,
                            "failures_past_14_pushes": past_14_pushes_failures,
                            "failures_past_28_pushes": past_28_pushes_failures,
                            "failures_past_56_pushes": past_56_pushes_failures,
                            "is_possible_regression": task
                            in commit_push_data[1],
                            "is_likely_regression": task
                            in commit_push_data[2],
                        }

                    if task in commit_push_data[1] or task in commit_push_data[2]:
                        past_failures[task][push_num] = total_failures + 1

                push_num += 1

            logger.info(f"push data nodes: {len(push_data)}")

            logger.info(
                f"commits linked to push data: {len(commits_with_data)}")

            logger.info(f"saved push data nodes: {len(saved_nodes)}")

        db.write(test_scheduling.TEST_SCHEDULING_DB, generate_data())

        zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

        with tarfile.open("data/adr_cache.tar.xz", "w:xz") as tar:
            tar.add("data/adr_cache")
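get_past_failures() above keeps one running failure total per task, indexed by push number, so that "failures in the last N pushes" is simply the difference between two totals. Below is a minimal sketch of that cumulative-count idea; the CumulativeFailures class is a hypothetical, much simpler stand-in for the exp_queue used above.

# A minimal sketch (much simpler than the exp_queue used above) of the
# cumulative-count trick: store one running failure total per (task, push),
# then "failures in the last N pushes" is the difference of two totals.
from collections import defaultdict


class CumulativeFailures:
    def __init__(self):
        # task -> {push_num: cumulative failure count at that push}
        self._totals = defaultdict(dict)

    def get(self, task, push_num):
        # Totals never decrease, so a push we have no entry for inherits the
        # total of the closest earlier push (or 0 if there is none).
        totals = self._totals[task]
        earlier = [p for p in totals if p <= push_num]
        return totals[max(earlier)] if earlier else 0

    def record_failure(self, task, push_num):
        self._totals[task][push_num] = self.get(task, push_num) + 1

    def window(self, task, push_num, n):
        """Failures for `task` in the `n` pushes up to and including `push_num`."""
        return self.get(task, push_num) - self.get(task, push_num - n)


cf = CumulativeFailures()
for push in (3, 5, 9):
    cf.record_failure("test-linux64/opt-mochitest", push)
print(cf.window("test-linux64/opt-mochitest", 10, 7))  # 2 failures in pushes 4..10
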
Example no. 25
    def generate_test_scheduling_history(self, granularity):
        push_data_path = f"push_data_{granularity}.json"
        updated = download_check_etag(
            test_scheduling.PUSH_DATA_URL.format(granularity=granularity))
        if updated:
            zstd_decompress(push_data_path)
            os.remove(f"{push_data_path}.zst")
        assert os.path.exists(
            push_data_path), "Decompressed push data file exists"

        # Get the commits DB.
        assert db.download(repository.COMMITS_DB)

        HISTORY_DATE_START = datetime.now() - relativedelta(
            months=TRAINING_MONTHS[granularity])

        if granularity == "label":
            test_scheduling_db = test_scheduling.TEST_LABEL_SCHEDULING_DB
            past_failures_db = os.path.join(
                "data", test_scheduling.PAST_FAILURES_LABEL_DB)
            failing_together_db = os.path.join(
                "data", test_scheduling.FAILING_TOGETHER_LABEL_DB)
        elif granularity == "group":
            test_scheduling_db = test_scheduling.TEST_GROUP_SCHEDULING_DB
            past_failures_db = os.path.join(
                "data", test_scheduling.PAST_FAILURES_GROUP_DB)
            touched_together_db = os.path.join(
                "data", test_scheduling.TOUCHED_TOGETHER_DB)

        db.download(test_scheduling_db, support_files_too=True)

        last_node = None
        for revs, _ in test_scheduling.get_test_scheduling_history(
                granularity):
            last_node = revs[0]

        def generate_failing_together_probabilities(push_data):
            # TODO: we should consider the probabilities of `task1 failure -> task2 failure` and
            # `task2 failure -> task1 failure` separately, as they could be different.

            count_runs = collections.Counter()
            count_single_failures = collections.Counter()
            count_both_failures = collections.Counter()

            for revisions, tasks, likely_regressions, candidate_regressions in tqdm(
                    push_data):
                failures = set(likely_regressions + candidate_regressions)
                all_tasks = list(set(tasks) | failures)

                for task1, task2 in itertools.combinations(
                        sorted(all_tasks), 2):
                    count_runs[(task1, task2)] += 1

                    if task1 in failures:
                        if task2 in failures:
                            count_both_failures[(task1, task2)] += 1
                        else:
                            count_single_failures[(task1, task2)] += 1
                    elif task2 in failures:
                        count_single_failures[(task1, task2)] += 1

            stats = {}

            skipped = 0

            for couple, run_count in count_runs.most_common():
                failure_count = count_both_failures[couple]
                support = failure_count / run_count

                if support < 1 / 700:
                    skipped += 1
                    continue

                if failure_count != 0:
                    confidence = failure_count / (
                        count_single_failures[couple] + failure_count)
                else:
                    confidence = 0.0

                stats[couple] = (support, confidence)

            logger.info(
                f"{skipped} couples skipped because their support was too low")

            logger.info(
                "Redundancies with the highest support and confidence:")
            for couple, (support, confidence) in sorted(
                    stats.items(), key=lambda k: (-k[1][1], -k[1][0]))[:7]:
                failure_count = count_both_failures[couple]
                run_count = count_runs[couple]
                logger.info(
                    f"{couple[0]} - {couple[1]} redundancy confidence {confidence}, support {support} ({failure_count} over {run_count})."
                )

            logger.info(
                "Redundancies with the highest confidence and lowest support:")
            for couple, (support, confidence) in sorted(
                    stats.items(), key=lambda k: (-k[1][1], k[1][0]))[:7]:
                failure_count = count_both_failures[couple]
                run_count = count_runs[couple]
                logger.info(
                    f"{couple[0]} - {couple[1]} redundancy confidence {confidence}, support {support} ({failure_count} over {run_count})."
                )

            failing_together = test_scheduling.get_failing_together_db()
            count_redundancies = collections.Counter()
            for couple, (support, confidence) in stats.items():
                if confidence == 1.0:
                    count_redundancies["==100%"] += 1
                if confidence > 0.9:
                    count_redundancies[">=90%"] += 1
                if confidence > 0.8:
                    count_redundancies[">=80%"] += 1
                if confidence > 0.7:
                    count_redundancies[">=70%"] += 1

                if confidence < 0.7:
                    continue

                key = f"{couple[0]}${couple[1]}".encode("utf-8")
                failing_together[key] = struct.pack("ff", support, confidence)

            for percentage, count in count_redundancies.most_common():
                logger.info(f"{count} with {percentage} confidence")

            test_scheduling.close_failing_together_db()

        def generate_all_data():
            past_failures = test_scheduling.get_past_failures(granularity)

            push_num = past_failures["push_num"] if "push_num" in past_failures else 0

            # We can start once we get to the last revision we added in the previous run.
            can_start = last_node is None

            commit_map = {}
            for commit_data in tqdm(repository.get_commits()):
                if not can_start:
                    if last_node == commit_data["node"]:
                        can_start = True

                    continue

                commit_map[commit_data["node"]] = commit_data

            with open(push_data_path, "r") as f:
                push_data = json.load(f)

            logger.info(f"push data nodes: {len(push_data)}")

            if granularity == "label":
                push_data = [(
                    revisions,
                    rename_tasks(push_tasks),
                    rename_tasks(possible_regressions),
                    rename_tasks(likely_regressions),
                ) for revisions, push_tasks, possible_regressions,
                             likely_regressions in push_data]

            # In the last 14 pushes, every possible runnable has definitely run at least once.
            all_runnables_set = set(
                sum((push_runnables
                     for _, push_runnables, _, _ in push_data[-14:]), []))
            # Filter runnables we don't need.
            all_runnables = filter_runnables(list(all_runnables_set),
                                             all_runnables_set, granularity)
            all_runnables_set = set(all_runnables)
            logger.info(
                f"{len(all_runnables_set)} runnables run in the last 14 pushes"
            )

            push_data = [(
                revisions,
                filter_runnables(push_tasks, all_runnables_set, granularity),
                filter_runnables(possible_regressions, all_runnables_set,
                                 granularity),
                filter_runnables(likely_regressions, all_runnables_set,
                                 granularity),
            ) for revisions, push_tasks, possible_regressions,
                         likely_regressions in push_data]

            if granularity == "label":
                generate_failing_together_probabilities(push_data)

            # Store all runnables in the past_failures DB so it can be used in the evaluation phase.
            past_failures["all_runnables"] = all_runnables
            # XXX: Should we recreate the DB from scratch if the previous all_runnables are not the
            # same as the current ones?

            saved_nodes = set()
            skipped_no_commits = 0
            skipped_too_big_commits = 0
            skipped_no_runnables = 0

            # We can start once we get to the last revision we added in the previous run.
            can_start = last_node is None

            if granularity == "group":
                update_touched_together_gen = test_scheduling.update_touched_together()
                next(update_touched_together_gen)

            for i in tqdm(range(len(push_data))):
                (
                    revisions,
                    push_runnables,
                    possible_regressions,
                    likely_regressions,
                ) = push_data.pop(0)

                if not can_start:
                    if last_node == revisions[0]:
                        can_start = True

                    continue

                push_num += 1

                # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them.
                commits = tuple(
                    commit_map.pop(revision) for revision in revisions
                    if revision in commit_map)
                if len(commits) == 0:
                    skipped_no_commits += 1
                    continue

                merged_commits = commit_features.merge_commits(commits)

                # XXX: For now, skip commits which are too large.
                # In the future we can either:
                #  - Improve shelve perf and go back to consider all files;
                #  - Consider only files which appear with a given frequency, like the "files" feature in commit_features;
                #  - Keep a limit of number of files.
                if len(merged_commits["files"]) > 50:
                    skipped_too_big_commits += 1
                    continue

                # If we considered all_runnables, we'd generate a huge amount of data.
                # We consider only the runnables which run in this push, and the possible and likely regressions
                # from this push. We can't consider all runnables because we can't be sure that a task that didn't
                # run on a push would have been successful.
                runnables_to_consider = list(
                    set(push_runnables + possible_regressions +
                        likely_regressions))

                if len(runnables_to_consider) == 0:
                    skipped_no_runnables += 1
                    continue

                # Sync the DB every 250 pushes to clean up the shelve cache (we'd run out of memory otherwise!).
                if i % 250 == 0:
                    past_failures.sync()

                pushdate = dateutil.parser.parse(merged_commits["pushdate"])

                if granularity == "group":
                    update_touched_together_gen.send(commits[0]["node"])

                result = {
                    "revs": revisions,
                    "data": [],
                }
                for data in test_scheduling.generate_data(
                        past_failures,
                        merged_commits,
                        push_num,
                        runnables_to_consider,
                        possible_regressions,
                        likely_regressions,
                ):
                    if pushdate > HISTORY_DATE_START:
                        result["data"].append(data)

                if pushdate > HISTORY_DATE_START:
                    saved_nodes.add(i)
                    yield result

            if granularity == "group":
                try:
                    update_touched_together_gen.send(None)
                except StopIteration:
                    pass

            logger.info(f"saved push data nodes: {len(saved_nodes)}")
            logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
            logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
            logger.info(
                f"skipped {skipped_no_runnables} (no interesting runnables)")

            past_failures["push_num"] = push_num
            past_failures.close()

        db.append(test_scheduling_db, generate_all_data())

        zstd_compress(test_scheduling_db)
        create_tar_zst(past_failures_db)

        if granularity == "group":
            create_tar_zst(touched_together_db)

        if granularity == "label":
            create_tar_zst(failing_together_db)
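generate_failing_together_probabilities() above computes, for each pair of tasks, support (pushes where both failed divided by pushes where both ran) and confidence (pushes where both failed divided by pushes where at least one of them failed). Below is a small self-contained sketch of the same computation on made-up push data.

# A small, self-contained sketch (with made-up push data) of the support and
# confidence metrics computed above for pairs of tasks:
#   support    = pushes where both failed / pushes where both ran
#   confidence = pushes where both failed / pushes where at least one failed
import collections
import itertools

# Hypothetical pushes: (tasks that ran, tasks that failed).
pushes = [
    ({"A", "B", "C"}, {"A", "B"}),
    ({"A", "B"}, {"A"}),
    ({"A", "B", "C"}, set()),
    ({"A", "B"}, {"A", "B"}),
]

count_runs = collections.Counter()
count_single_failures = collections.Counter()
count_both_failures = collections.Counter()

for ran, failed in pushes:
    for task1, task2 in itertools.combinations(sorted(ran), 2):
        count_runs[(task1, task2)] += 1
        if task1 in failed and task2 in failed:
            count_both_failures[(task1, task2)] += 1
        elif task1 in failed or task2 in failed:
            count_single_failures[(task1, task2)] += 1

for couple, run_count in count_runs.most_common():
    both = count_both_failures[couple]
    support = both / run_count
    confidence = both / (both + count_single_failures[couple]) if both else 0.0
    print(couple, f"support={support:.2f}", f"confidence={confidence:.2f}")
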
Example no. 26
    def generate_push_data(
        self, pushes: Tuple[mozci.push.Push, ...], granularity: str
    ) -> None:
        from_date = get_from_date(granularity)

        pushes = tuple(
            push for push in pushes if datetime.utcfromtimestamp(push.date) >= from_date
        )

        if granularity == "label":
            push_data_db = test_scheduling.PUSH_DATA_LABEL_DB
        elif granularity == "group":
            push_data_db = test_scheduling.PUSH_DATA_GROUP_DB
        elif granularity == "config_group":
            push_data_db = test_scheduling.PUSH_DATA_CONFIG_GROUP_DB

        def cache_key(push: mozci.push.Push) -> str:
            return f"push_data.{granularity}.{push.rev}"

        def generate(
            futures: List[concurrent.futures.Future],
        ) -> Generator[PushResult, None, None]:
            num_cached = 0
            num_pushes = len(pushes)

            # Regenerating a large amount of data when we update the mozci regression detection
            # algorithm is currently pretty slow, so we only regenerate 1000 pushes whenever we
            # run.
            to_regenerate = 1000

            for push in tqdm(pushes):
                cached = futures.pop(0).result()

                semaphore.release()

                if cached and to_regenerate > 0:
                    value, mozci_version = cached

                    # Regenerate results which were generated when we were not cleaning
                    # up WPT groups.
                    if granularity == "group" and any(
                        runnable.startswith("/") for runnable in value[1]
                    ):
                        cached = None
                        to_regenerate -= 1

                    # Regenerate results which were generated when we didn't get a correct
                    # configuration for test-verify tasks.
                    elif granularity == "config_group" and any(
                        "test-verify" in runnable[0] for runnable in value[1]
                    ):
                        cached = None
                        to_regenerate -= 1

                    """# Regenerate results which were generated with an older version of mozci.
                    elif mozci_version != MOZCI_VERSION:
                        cached = None
                        to_regenerate -= 1"""

                if cached:
                    num_cached += 1
                    value, mozci_version = cached
                    yield value
                else:
                    logger.info(f"Analyzing {push.rev} at the {granularity} level...")

                    key = cache_key(push)

                    try:
                        if granularity == "label":
                            runnables = push.task_labels
                        elif granularity == "group":
                            runnables = push.group_summaries.keys()
                        elif granularity == "config_group":
                            runnables = push.config_group_summaries.keys()

                        value = (
                            push.revs,
                            tuple(runnables),
                            tuple(push.get_possible_regressions(granularity)),
                            tuple(push.get_likely_regressions(granularity)),
                        )
                        adr.config.cache.put(
                            key,
                            (value, MOZCI_VERSION),
                            adr.config["cache"]["retention"],
                        )
                        yield value
                    except adr.errors.MissingDataError:
                        logger.warning(
                            f"Tasks for push {push.rev} can't be found on ActiveData"
                        )
                    except Exception:
                        traceback.print_exc()

            logger.info(f"{num_cached} pushes were already cached out of {num_pushes}")

        semaphore = threading.BoundedSemaphore(256)

        def retrieve_from_cache(push):
            semaphore.acquire()
            return adr.config.cache.get(cache_key(push))

        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [executor.submit(retrieve_from_cache, push) for push in pushes]

            try:
                db.write(push_data_db, generate(futures))
            except Exception:
                for f in futures:
                    f.cancel()

                    try:
                        semaphore.release()
                    except ValueError:
                        continue

                raise

        zstd_compress(push_data_db)
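generate() above consumes cache lookups that were all submitted to a thread pool up front, while a BoundedSemaphore keeps the lookups from running too far ahead of the consumer. Below is a self-contained sketch of that bounded-prefetch pattern, independent of adr/mozci; the item list and fetch() are made up, and the slot count stays well above the worker count, as with the 256 slots above.

# A self-contained sketch (independent of adr/mozci) of the bounded-prefetch
# pattern above: one future per item is submitted up front, but a semaphore
# keeps at most SLOTS lookups "in flight ahead" of the consumer, which frees a
# slot as it consumes each result. Keep SLOTS well above the worker count, as
# with the 256 slots above, so the future the consumer is waiting on is never
# the one stuck waiting for a slot.
import concurrent.futures
import threading
import time

SLOTS = 64
semaphore = threading.BoundedSemaphore(SLOTS)


def fetch(item):
    semaphore.acquire()  # wait until the consumer has room for another result
    time.sleep(0.01)     # stand-in for a slow cache / network lookup
    return item * item


def consume(futures):
    for future in futures:
        value = future.result()
        semaphore.release()  # free a slot so another fetch may start
        yield value


items = list(range(200))
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
    futures = [executor.submit(fetch, item) for item in items]
    print(sum(consume(futures)))
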
Example no. 27
    def generate_push_data(self, runnable):
        def upload_adr_cache():
            cache_path = os.path.splitext(ADR_CACHE_DB)[0]
            assert os.path.abspath(
                adr.config["cache"]["stores"]["file"]["path"]
            ) == os.path.abspath(cache_path)

            with open_tar_zst(f"{ADR_CACHE_DB}.zst") as tar:
                tar.add(cache_path)

            db.upload(ADR_CACHE_DB)

        # Remember for 10 days that we failed to analyze a push, so that
        # frequent re-runs don't retry the same pushes over and over.
        MISSING_CACHE_RETENTION = 10 * 24 * 60

        # We'll use the past TRAINING_MONTHS months only for training the model,
        # but we fetch an extra TRAINING_MONTHS / 2 months of data to calculate
        # the failure statistics.
        from_months = TRAINING_MONTHS[runnable] + math.floor(
            TRAINING_MONTHS[runnable] / 2)

        pushes = mozci.push.make_push_objects(
            from_date=f"today-{from_months}month",
            to_date="today-3day",
            branch="autoland",
        )

        start_time = time.monotonic()

        num_cached = 0

        push_data = []

        for push in tqdm(pushes):
            key = f"push_data.{runnable}.{push.rev}"

            logger.info(f"Analyzing {push.rev} at the {runnable} level...")

            if adr.config.cache.has(key):
                num_cached += 1
                cached = adr.config.cache.get(key)
                if cached:
                    # XXX: We have to support items in the cache that were added
                    # before the mozci version was stored. We can drop the if
                    # when all items have been switched over.
                    value = cached[0] if isinstance(cached, tuple) else cached
                    push_data.append(value)
            else:
                try:
                    if runnable == "label":
                        runnables = push.task_labels
                    elif runnable == "group":
                        runnables = push.group_summaries.keys()

                    value = [
                        push.revs,
                        list(runnables),
                        list(push.get_possible_regressions(runnable)),
                        list(push.get_likely_regressions(runnable)),
                    ]
                    push_data.append(value)
                    adr.config.cache.forever(key, (value, MOZCI_VERSION))
                except adr.errors.MissingDataError:
                    logger.warning(
                        f"Tasks for push {push.rev} can't be found on ActiveData"
                    )
                    adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)
                except Exception:
                    traceback.print_exc()
                    adr.config.cache.put(key, (), MISSING_CACHE_RETENTION)

            if time.monotonic() - start_time >= 3600:
                upload_adr_cache()
                start_time = time.monotonic()

        logger.info(
            f"{num_cached} pushes were already cached out of {len(pushes)}")

        upload_adr_cache()

        with open(f"push_data_{runnable}.json", "w") as f:
            json.dump(push_data, f)

        zstd_compress(f"push_data_{runnable}.json")
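generate_push_data() above uploads its cache roughly once per hour, measuring elapsed time with time.monotonic() so wall-clock adjustments cannot skew the interval. Below is a minimal sketch of that periodic-checkpoint pattern; checkpoint() and the per-item work are hypothetical stand-ins for upload_adr_cache() and the per-push analysis.

# A minimal sketch (checkpoint() and the per-item work are hypothetical
# stand-ins) of the periodic-checkpoint pattern above: during a long loop,
# persist intermediate state whenever a given interval has elapsed, and once
# more at the end.
import json
import time

CHECKPOINT_INTERVAL = 3600  # seconds, mirroring the hourly cache upload above


def checkpoint(state, path="checkpoint.json"):
    # Stand-in for upload_adr_cache(): persist whatever we have so far.
    with open(path, "w") as f:
        json.dump(state, f)


def process(items):
    results = []
    # time.monotonic() is immune to wall-clock adjustments, which makes it the
    # right clock for measuring elapsed time.
    last_checkpoint = time.monotonic()

    for item in items:
        results.append(item * 2)  # stand-in for analyzing one push

        if time.monotonic() - last_checkpoint >= CHECKPOINT_INTERVAL:
            checkpoint(results)
            last_checkpoint = time.monotonic()

    checkpoint(results)  # final save, like the upload_adr_cache() at the end
    return results


process(list(range(10)))
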
Example no. 28
    def generate_test_scheduling_history(self):
        if not os.path.exists("push_data.json"):
            download_check_etag(PUSH_DATA_URL, "push_data.json.zst")
            zstd_decompress("push_data.json")
            assert os.path.exists(
                "push_data.json"
            ), "Decompressed push data file exists"

        # Get the commits DB.
        if db.is_old_version(repository.COMMITS_DB) or not db.exists(
            repository.COMMITS_DB
        ):
            db.download(repository.COMMITS_DB, force=True)

        HISTORY_DATE_START = datetime.now() - relativedelta(months=TRAINING_MONTHS)

        with open("push_data.json", "r") as f:
            data = json.load(f)

        push_data = {}
        for row in data[1:]:
            # Revision -> (all tasks, possible regressions, likely regressions)
            push_data[row[0]] = (row[1], row[2], row[3])

        HISTORICAL_TIMESPAN = 56

        if not db.is_old_version(test_scheduling.TEST_SCHEDULING_DB):
            db.download(test_scheduling.TEST_SCHEDULING_DB, support_files_too=True)

            for test_data in test_scheduling.get_test_scheduling_history():
                pass

            last_node = test_data["rev"]
        else:
            last_node = None

        try:
            with open("data/past_failures.pickle", "rb") as f:
                past_failures, push_num = pickle.load(f)
        except FileNotFoundError:
            past_failures = {}
            push_num = 0

        def get_and_update_past_failures(type_, task, items, push_num, is_regression):
            if type_ not in past_failures:
                past_failures[type_] = {}

            if task not in past_failures[type_]:
                past_failures[type_][task] = {}

            values_total = []
            values_prev_7 = []
            values_prev_14 = []
            values_prev_28 = []
            values_prev_56 = []

            for item in items:
                if item not in past_failures[type_][task]:
                    past_failures[type_][task][item] = ExpQueue(
                        push_num, HISTORICAL_TIMESPAN + 1, 0
                    )

                value = past_failures[type_][task][item][push_num]

                values_total.append(value)
                values_prev_7.append(
                    value - past_failures[type_][task][item][push_num - 7]
                )
                values_prev_14.append(
                    value - past_failures[type_][task][item][push_num - 14]
                )
                values_prev_28.append(
                    value - past_failures[type_][task][item][push_num - 28]
                )
                values_prev_56.append(
                    value - past_failures[type_][task][item][push_num - 56]
                )

                if is_regression:
                    past_failures[type_][task][item][push_num] = value + 1

            return (
                sum(values_total),
                sum(values_prev_7),
                sum(values_prev_14),
                sum(values_prev_28),
                sum(values_prev_56),
            )

        def generate_data():
            nonlocal push_num
            commits_with_data = set()
            saved_nodes = set()

            # We can start once we get to the last revision we added in the previous run.
            can_start = last_node is None
            for commit_data in tqdm(repository.get_commits()):
                node = commit_data["node"]

                if node == last_node:
                    can_start = True
                    continue

                if not can_start:
                    continue

                if node not in push_data:
                    continue

                commits_with_data.add(node)

                commit_push_data = push_data[node]

                for task in commit_push_data[0]:
                    if not any(task.startswith(j) for j in JOBS_TO_CONSIDER):
                        continue

                    is_regression = (
                        task in commit_push_data[1] or task in commit_push_data[2]
                    )

                    total_failures, past_7_pushes_failures, past_14_pushes_failures, past_28_pushes_failures, past_56_pushes_failures = get_and_update_past_failures(
                        "all", task, ["all"], push_num, is_regression
                    )

                    total_types_failures, past_7_pushes_types_failures, past_14_pushes_types_failures, past_28_pushes_types_failures, past_56_pushes_types_failures = get_and_update_past_failures(
                        "type", task, commit_data["types"], push_num, is_regression
                    )

                    total_files_failures, past_7_pushes_files_failures, past_14_pushes_files_failures, past_28_pushes_files_failures, past_56_pushes_files_failures = get_and_update_past_failures(
                        "file", task, commit_data["files"], push_num, is_regression
                    )

                    total_directories_failures, past_7_pushes_directories_failures, past_14_pushes_directories_failures, past_28_pushes_directories_failures, past_56_pushes_directories_failures = get_and_update_past_failures(
                        "directory",
                        task,
                        commit_data["directories"],
                        push_num,
                        is_regression,
                    )

                    total_components_failures, past_7_pushes_components_failures, past_14_pushes_components_failures, past_28_pushes_components_failures, past_56_pushes_components_failures = get_and_update_past_failures(
                        "component",
                        task,
                        commit_data["components"],
                        push_num,
                        is_regression,
                    )

                    pushdate = dateutil.parser.parse(commit_data["pushdate"])
                    if pushdate > HISTORY_DATE_START:
                        saved_nodes.add(node)

                        yield {
                            "rev": node,
                            "name": task,
                            "failures": total_failures,
                            "failures_past_7_pushes": past_7_pushes_failures,
                            "failures_past_14_pushes": past_14_pushes_failures,
                            "failures_past_28_pushes": past_28_pushes_failures,
                            "failures_past_56_pushes": past_56_pushes_failures,
                            "failures_in_types": total_types_failures,
                            "failures_past_7_pushes_in_types": past_7_pushes_types_failures,
                            "failures_past_14_pushes_in_types": past_14_pushes_types_failures,
                            "failures_past_28_pushes_in_types": past_28_pushes_types_failures,
                            "failures_past_56_pushes_in_types": past_56_pushes_types_failures,
                            "failures_in_files": total_files_failures,
                            "failures_past_7_pushes_in_files": past_7_pushes_files_failures,
                            "failures_past_14_pushes_in_files": past_14_pushes_files_failures,
                            "failures_past_28_pushes_in_files": past_28_pushes_files_failures,
                            "failures_past_56_pushes_in_files": past_56_pushes_files_failures,
                            "failures_in_directories": total_directories_failures,
                            "failures_past_7_pushes_in_directories": past_7_pushes_directories_failures,
                            "failures_past_14_pushes_in_directories": past_14_pushes_directories_failures,
                            "failures_past_28_pushes_in_directories": past_28_pushes_directories_failures,
                            "failures_past_56_pushes_in_directories": past_56_pushes_directories_failures,
                            "failures_in_components": total_components_failures,
                            "failures_past_7_pushes_in_components": past_7_pushes_components_failures,
                            "failures_past_14_pushes_in_components": past_14_pushes_components_failures,
                            "failures_past_28_pushes_in_components": past_28_pushes_components_failures,
                            "failures_past_56_pushes_in_components": past_56_pushes_components_failures,
                            "is_possible_regression": task in commit_push_data[1],
                            "is_likely_regression": task in commit_push_data[2],
                        }

                push_num += 1

            logger.info(f"push data nodes: {len(push_data)}")

            logger.info(f"commits linked to push data: {len(commits_with_data)}")

            logger.info(f"saved push data nodes: {len(saved_nodes)}")

        db.append(test_scheduling.TEST_SCHEDULING_DB, generate_data())

        zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

        with open("data/past_failures.pickle", "wb") as f:
            pickle.dump((past_failures, push_num), f, protocol=pickle.HIGHEST_PROTOCOL)

        zstd_compress("data/past_failures.pickle")