Example #1
    def download_issues(self) -> None:
        # Fetches all issues sorted by date of creation in ascending order
        url = "https://api.github.com/repos/{}/{}/issues".format(
            self.owner, self.repo)
        start_page = self.get_start_page()

        params = {
            "state": self.state,
            "sort": "created",
            "direction": "asc",
            "per_page": PER_PAGE,
            "page": start_page,
        }

        data, response_links = self.fetch_issues(
            url=url, retrieve_events=self.retrieve_events, params=params)

        db.append(self.db_path, data)
        # Fetch next page
        while "next" in response_links.keys():
            next_page_data, response_links = self.fetch_issues(
                response_links["next"]["url"], self.retrieve_events)
            db.append(self.db_path, next_page_data)

        logger.info("Done downloading")
Example #2
    def get_commits_to_ignore(self):
        logger.info("Download previous commits to ignore...")
        if db.is_old_version(
                IGNORED_COMMITS_DB) or not db.exists(IGNORED_COMMITS_DB):
            db.download(IGNORED_COMMITS_DB, force=True)

        logger.info("Get previously classified commits...")
        prev_commits_to_ignore = list(db.read(IGNORED_COMMITS_DB))
        logger.info(
            f"Already found {len(prev_commits_to_ignore)} commits to ignore..."
        )

        if len(prev_commits_to_ignore) > 0:
            rev_start = "children({})".format(
                prev_commits_to_ignore[-1]["rev"])
        else:
            rev_start = 0

        # 2 days more than the end date, so we can know if a commit was backed-out.
        # We have to do this as recent commits might be missing in the mercurial <-> git map,
        # otherwise we could just use "tip".
        end_date = datetime.now() - RELATIVE_END_DATE + relativedelta(days=2)
        with hglib.open(self.mercurial_repo_dir) as hg:
            revs = repository.get_revs(
                hg, rev_start,
                "pushdate('{}')".format(end_date.strftime("%Y-%m-%d")))

        # Given that we use the pushdate, there might be cases where the starting commit is returned too (e.g. if we rerun the task on the same day).
        if len(prev_commits_to_ignore) > 0:
            found_prev = -1
            for i, rev in enumerate(revs):
                if rev.decode("utf-8") == prev_commits_to_ignore[-1]["rev"]:
                    found_prev = i
                    break
            revs = revs[found_prev + 1:]

        commits = repository.hg_log_multi(self.mercurial_repo_dir, revs)

        repository.set_commits_to_ignore(self.mercurial_repo_dir, commits)
        commits_to_ignore = []

        for commit in commits:
            if commit.ignored or commit.backedoutby:
                commits_to_ignore.append({
                    "rev": commit.node,
                    "type": "backedout" if commit.backedoutby else "",
                })

        logger.info(f"{len(commits_to_ignore)} new commits to ignore...")

        logger.info("...of which {} are backed-out".format(
            sum(1 for commit in commits_to_ignore
                if commit["type"] == "backedout")))

        db.append(IGNORED_COMMITS_DB, commits_to_ignore)
        zstd_compress(IGNORED_COMMITS_DB)

        return prev_commits_to_ignore + commits_to_ignore
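
The method above resumes from the last record it appended on a previous run (using its "rev" field to build a children() revset) and falls back to revision 0 on the first run. A minimal sketch of that resume step, assuming the same db module; resume_point and db_path are hypothetical names.

from bugbug import db


def resume_point(db_path):
    # Read what was appended by previous runs; on the first run the database
    # is empty and we start from revision 0, mirroring the example above.
    previous = list(db.read(db_path))
    if previous:
        # Start from the children of the last processed revision.
        return "children({})".format(previous[-1]["rev"])
    return 0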
Example #3
def download_issues(owner: str,
                    repo: str,
                    state: str,
                    retrieve_events: bool = False) -> None:
    # Fetches all issues sorted by date of creation in ascending order
    url = "https://api.github.com/repos/{}/{}/issues".format(owner, repo)
    start_page = get_start_page()

    params = {
        "state": state,
        "sort": "created",
        "direction": "asc",
        "per_page": PER_PAGE,
        "page": start_page,
    }

    data, response_links = fetch_issues(url=url,
                                        retrieve_events=retrieve_events,
                                        params=params)

    db.append(GITHUB_ISSUES_DB, data)
    # Fetch next page
    while "next" in response_links.keys():
        next_page_data, response_links = fetch_issues(
            response_links["next"]["url"], retrieve_events)
        db.append(GITHUB_ISSUES_DB, next_page_data)

    logger.info("Done downloading")
Example #4
def download_bugs(bug_ids, products=None, security=False):
    old_bug_count = 0
    old_bugs = []
    new_bug_ids = set(int(bug_id) for bug_id in bug_ids)
    for bug in get_bugs():
        old_bug_count += 1
        if int(bug['id']) in new_bug_ids:
            old_bugs.append(bug)
            new_bug_ids.remove(bug['id'])

    print(f'Loaded {old_bug_count} bugs.')
    print(f'To download {len(new_bug_ids)} bugs.')

    new_bug_ids = sorted(list(new_bug_ids))

    total_downloaded = 0
    chunks = (new_bug_ids[i:(i + 500)] for i in range(0, len(new_bug_ids), 500))
    for chunk in chunks:
        new_bugs = _download(chunk)

        total_downloaded += len(new_bugs)

        print(f'Downloaded {total_downloaded} bugs')

        if not security:
            new_bugs = {bug_id: bug for bug_id, bug in new_bugs.items() if len(bug['groups']) == 0}

        if products is not None:
            new_bugs = {bug_id: bug for bug_id, bug in new_bugs.items() if bug['product'] in products}

        db.append(BUGS_DB, new_bugs.values())
Example #5
def go(days: int) -> None:
    logger.info("Download previous shadow scheduler statistics...")
    db.download(SHADOW_SCHEDULER_STATS_DB)

    logger.info("Get previously gathered statistics...")
    prev_scheduler_stat_revs = set(
        scheduler_stat["id"]
        for scheduler_stat in db.read(SHADOW_SCHEDULER_STATS_DB))
    logger.info(
        f"Already gathered statistics for {len(prev_scheduler_stat_revs)} pushes..."
    )

    to_date = datetime.utcnow() - relativedelta(days=3)
    from_date = to_date - relativedelta(days=days)
    pushes = mozci.push.make_push_objects(
        from_date=from_date.strftime("%Y-%m-%d"),
        to_date=to_date.strftime("%Y-%m-%d"),
        branch="autoland",
    )

    pushes = [
        push for push in pushes if push.rev not in prev_scheduler_stat_revs
    ]

    logger.info(f"{len(pushes)} left to analyze")

    db.append(SHADOW_SCHEDULER_STATS_DB, analyze_shadow_schedulers(pushes))
    utils.zstd_compress(SHADOW_SCHEDULER_STATS_DB)
Example #6
def download_commits(repo_dir, rev_start=0, ret=False, save=True):
    hg = hglib.open(repo_dir)

    revs = get_revs(hg, rev_start)

    assert (
        len(revs) > 0
    ), "There should definitely be more than 0 commits, something is wrong"

    first_pushdate = hg_log(hg, [b"0"])[0].pushdate

    hg.close()

    processes = multiprocessing.cpu_count()

    print(f"Mining {len(revs)} commits using {processes} processes...")

    CHUNK_SIZE = 256
    revs_groups = [
        revs[i:(i + CHUNK_SIZE)] for i in range(0, len(revs), CHUNK_SIZE)
    ]

    with concurrent.futures.ProcessPoolExecutor(
            initializer=_init, initargs=(repo_dir, )) as executor:
        commits = executor.map(_hg_log, revs_groups, chunksize=20)
        commits = tqdm(commits, total=len(revs_groups))
        commits = list(itertools.chain.from_iterable(commits))

    print("Downloading file->component mapping...")

    download_component_mapping()

    commits_to_ignore = get_commits_to_ignore(repo_dir, commits)
    print(f"{len(commits_to_ignore)} commits to ignore")

    calculate_experiences(commits, commits_to_ignore, first_pushdate, save)

    # Exclude commits to ignore.
    commits = [commit for commit in commits if commit not in commits_to_ignore]

    commits_num = len(commits)

    print(f"Mining {commits_num} commits using {processes} processes...")

    global rs_parsepatch
    import rs_parsepatch

    with concurrent.futures.ProcessPoolExecutor(
            initializer=_init, initargs=(repo_dir, )) as executor:
        commits = executor.map(_transform, commits, chunksize=64)
        commits = tqdm(commits, total=commits_num)

        if ret:
            commits = list(commits)

        if save:
            db.append(COMMITS_DB, commits)

        if ret:
            return commits
Example #7
def download_bugs(bug_ids, products=None, security=False):
    old_bug_count = 0
    old_bugs = []
    new_bug_ids = set(int(bug_id) for bug_id in bug_ids)
    for bug in get_bugs():
        old_bug_count += 1
        if int(bug['id']) in new_bug_ids:
            old_bugs.append(bug)
            new_bug_ids.remove(bug['id'])

    print(f'Loaded {old_bug_count} bugs.')

    new_bug_ids = sorted(list(new_bug_ids))

    CHUNK_SIZE = 100

    chunks = (new_bug_ids[i:(i + CHUNK_SIZE)] for i in range(0, len(new_bug_ids), CHUNK_SIZE))
    with tqdm(total=len(new_bug_ids)) as progress_bar:
        for chunk in chunks:
            new_bugs = _download(chunk)

            progress_bar.update(len(chunk))

            if not security:
                new_bugs = {bug_id: bug for bug_id, bug in new_bugs.items() if len(bug['groups']) == 0}

            if products is not None:
                new_bugs = {bug_id: bug for bug_id, bug in new_bugs.items() if bug['product'] in products}

            db.append(BUGS_DB, new_bugs.values())
Example #8
    def retrieve_issues(self, owner: str, repo: str, state: str,
                        retrieve_events: bool) -> None:

        last_modified = None
        db.download(github.GITHUB_ISSUES_DB)

        try:
            last_modified = db.last_modified(github.GITHUB_ISSUES_DB)
        except Exception:
            pass

        if last_modified:
            logger.info(
                f"Retrieving issues modified or created since the last run on {last_modified.isoformat()}"
            )
            data = github.fetch_issues_updated_since_timestamp(
                owner, repo, state, last_modified.isoformat(), retrieve_events)

            updated_ids = set(issue["id"] for issue in data)

            logger.info(
                "Deleting issues that were changed since the last run and saving updates"
            )
            github.delete_issues(lambda issue: issue["id"] in updated_ids)

            db.append(github.GITHUB_ISSUES_DB, data)
            logger.info("Updating finished")
        else:
            logger.info(
                "Retrieving all issues since last_modified is not available")
            github.download_issues(owner, repo, state, retrieve_events)

        zstd_compress(github.GITHUB_ISSUES_DB)
Example #9
def test_bad_format_compression(tmp_path, db_name):
    db_path = tmp_path / db_name
    db.register(db_path, "https://alink", 1)

    with pytest.raises(AssertionError):
        db.write(db_path, range(7))

    with pytest.raises(AssertionError):
        db.append(db_path, range(7))
Example #10
def test_bad_format_compression(tmp_path, db_name):
    db_path = tmp_path / db_name
    db.register(db_path, "https://alink")

    with pytest.raises(AssertionError):
        db.write(db_path, range(7))

    with pytest.raises(AssertionError):
        db.append(db_path, range(7))
Example #11
def test_append(mock_db, db_format, db_compression):
    db_path = mock_db(db_format, db_compression)

    db.write(db_path, range(1, 4))

    assert list(db.read(db_path)) == [1, 2, 3]

    db.append(db_path, range(4, 8))

    assert list(db.read(db_path)) == [1, 2, 3, 4, 5, 6, 7]
Example #13
def download_commits(repo_dir, rev_start=0, save=True, use_single_process=False):
    with hglib.open(repo_dir) as hg:
        revs = get_revs(hg, rev_start)
        if len(revs) == 0:
            print("No commits to analyze")
            return []

    first_pushdate = get_first_pushdate(repo_dir)

    print(f"Mining {len(revs)} commits...")

    if not use_single_process:
        print(f"Using {os.cpu_count()} processes...")
        commits = hg_log_multi(repo_dir, revs)
    else:
        with hglib.open(repo_dir) as hg:
            commits = hg_log(hg, revs)

    print("Downloading file->component mapping...")

    download_component_mapping()

    set_commits_to_ignore(repo_dir, commits)

    commits_num = len(commits)

    print(f"Mining {commits_num} commits...")

    global rs_parsepatch
    import rs_parsepatch

    global code_analysis_server
    code_analysis_server = rust_code_analysis_server.RustCodeAnalysisServer()

    if not use_single_process:
        with concurrent.futures.ProcessPoolExecutor(
            initializer=_init_process, initargs=(repo_dir,)
        ) as executor:
            commits = executor.map(_transform, commits, chunksize=64)
            commits = tqdm(commits, total=commits_num)
            commits = list(commits)
    else:
        with hglib.open(repo_dir) as hg:
            commits = [transform(hg, repo_dir, c) for c in commits]

    code_analysis_server.terminate()

    calculate_experiences(commits, first_pushdate, save)

    commits = [commit.to_dict() for commit in commits if not commit.ignored]

    if save:
        db.append(COMMITS_DB, commits)

    return commits
Example #14
def test_unregistered_db(tmp_path):
    db_path = tmp_path / "prova.json"

    with pytest.raises(AssertionError):
        list(db.read(db_path))

    with pytest.raises(AssertionError):
        db.write(db_path, range(7))

    with pytest.raises(AssertionError):
        db.append(db_path, range(7))
Example #16
    def retrieve_issues(self) -> None:

        last_modified = None
        db.download(self.github.db_path)

        try:
            last_modified = db.last_modified(self.github.db_path)
        except db.LastModifiedNotAvailable:
            pass

        if last_modified:
            logger.info(
                f"Retrieving issues modified or created since the last run on {last_modified.isoformat()}"
            )
            data = self.github.fetch_issues_updated_since_timestamp(
                last_modified.isoformat())

            if self.retrieve_private:
                logger.info(
                    "Replacing contents of auto closed public issues with private issues content"
                )
                self.replace_with_private(data)

            updated_ids = set(issue["id"] for issue in data)

            logger.info(
                "Deleting issues that were changed since the last run and saving updates"
            )
            self.github.delete_issues(lambda issue: issue["id"] in updated_ids)

            db.append(self.github.db_path, data)
            logger.info("Updating finished")
        else:
            logger.info(
                "Retrieving all issues since last_modified is not available")
            self.github.download_issues()

            if self.retrieve_private:
                logger.info(
                    "Replacing contents of auto closed public issues with private issues content"
                )

                all_issues = list(self.github.get_issues())
                updated_issues, updated_ids = self.replace_with_private(
                    all_issues)

                logger.info(
                    "Deleting public issues that were updated and saving updates"
                )
                self.github.delete_issues(
                    lambda issue: issue["id"] in updated_ids)
                db.append(self.github.db_path, updated_issues)

        zstd_compress(self.github.db_path)
Example #17
def test_append_compressed(tmp_path):
    db_path = tmp_path / 'prova.json.gz'

    db.register(db_path, 'https://alink', 1)

    db.write(db_path, range(1, 4))

    assert list(db.read(db_path)) == [1, 2, 3]

    db.append(db_path, range(4, 8))

    assert list(db.read(db_path)) == [1, 2, 3, 4, 5, 6, 7]
Example #18
def download_modified_revisions():
    try:
        last_modified = db.last_modified(REVISIONS_DB)
    except LastModifiedNotAvailable:
        return

    modified_revisions = get(modified_start=last_modified)
    modified_revision_ids = set(rev["id"] for rev in modified_revisions)

    db.delete(REVISIONS_DB,
              lambda revision: revision["id"] in modified_revision_ids)

    db.append(REVISIONS_DB, modified_revisions)
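
Because db.append only ever adds records, the snippet above refreshes modified revisions by deleting the stale copies first and then appending the fresh ones. A minimal sketch of that delete-then-append update, assuming the same db module; refresh_changed and fetch_changed are hypothetical names standing in for the actual query.

from bugbug import db


def refresh_changed(db_path, fetch_changed):
    # fetch_changed is a hypothetical callable returning the records that
    # changed since the last run, each carrying a unique "id".
    changed = list(fetch_changed())
    changed_ids = set(item["id"] for item in changed)

    # Drop the stale copies, then append the fresh ones.
    db.delete(db_path, lambda item: item["id"] in changed_ids)
    db.append(db_path, changed)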
Example #19
def download_commits(repo_dir, rev_start=0, ret=False, save=True):
    hg = hglib.open(repo_dir)

    revs = get_revs(hg, rev_start)
    if len(revs) == 0:
        print("No commits to analyze")
        return []

    first_pushdate = hg_log(hg, [b"0"])[0].pushdate

    hg.close()

    print(f"Mining {len(revs)} commits using {os.cpu_count()} processes...")

    commits = hg_log_multi(repo_dir, revs)

    print("Downloading file->component mapping...")

    download_component_mapping()

    commits_to_ignore = get_commits_to_ignore(repo_dir, commits)
    print(f"{len(commits_to_ignore)} commits to ignore")

    calculate_experiences(commits, commits_to_ignore, first_pushdate, save)

    # Exclude commits to ignore.
    commits = [commit for commit in commits if commit not in commits_to_ignore]

    commits_num = len(commits)

    print(f"Mining {commits_num} commits using {os.cpu_count()} processes...")

    global rs_parsepatch
    import rs_parsepatch

    with concurrent.futures.ProcessPoolExecutor(
            initializer=_init, initargs=(repo_dir, )) as executor:
        commits = executor.map(_transform, commits, chunksize=64)
        commits = tqdm(commits, total=commits_num)

        if ret:
            commits = list(commits)

        if save:
            db.append(COMMITS_DB, commits)

        if ret:
            return commits
Example #20
def go(days: int) -> None:
    logger.info("Download previous shadow scheduler statistics...")
    db.download(SHADOW_SCHEDULER_STATS_DB)

    logger.info("Get previously gathered statistics...")
    prev_scheduler_stat_revs = set(
        scheduler_stat["id"]
        for scheduler_stat in db.read(SHADOW_SCHEDULER_STATS_DB))
    logger.info(
        f"Already gathered statistics for {len(prev_scheduler_stat_revs)} pushes..."
    )

    to_date = datetime.utcnow() - relativedelta(days=3)
    from_date = to_date - relativedelta(days=days)
    pushes = mozci.push.make_push_objects(
        from_date=from_date.strftime("%Y-%m-%d"),
        to_date=to_date.strftime("%Y-%m-%d"),
        branch="autoland",
    )

    pushes = [
        push for push in pushes if push.rev not in prev_scheduler_stat_revs
    ]

    logger.info(f"{len(pushes)} left to analyze")

    def compress_and_upload() -> None:
        utils.zstd_compress(SHADOW_SCHEDULER_STATS_DB)
        db.upload(SHADOW_SCHEDULER_STATS_DB)

    def results() -> Iterator[dict]:
        for i, push in enumerate(tqdm(pushes)):
            try:
                yield analyze_shadow_schedulers(push)
            except Exception:
                traceback.print_exc()

            # Upload every 42 pushes.
            if (i + 1) % 42 == 0:
                compress_and_upload()

    db.append(SHADOW_SCHEDULER_STATS_DB, results())
    compress_and_upload()
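
db.append accepts any iterable, which is what the results() generator above relies on: each push is analyzed lazily and persisted as it is produced, with a compressed snapshot uploaded every 42 pushes so partial progress survives an interruption. A condensed sketch of the same shape, assuming the same db and utils modules; append_lazily, items, analyze and upload_every are hypothetical names.

from bugbug import db, utils


def append_lazily(db_path, items, analyze, upload_every=42):
    def results():
        for i, item in enumerate(items):
            yield analyze(item)

            # Periodically compress and upload a snapshot so partial progress
            # survives an interruption; the interval is arbitrary.
            if (i + 1) % upload_every == 0:
                utils.zstd_compress(db_path)
                db.upload(db_path)

    # db.append drains the generator, so analysis and persistence interleave.
    db.append(db_path, results())
    utils.zstd_compress(db_path)
    db.upload(db_path)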
Example #21
def download_bugs(bug_ids, products=None, security=False):
    old_bug_count = 0
    new_bug_ids = set(int(bug_id) for bug_id in bug_ids)
    for bug in get_bugs(include_invalid=True):
        old_bug_count += 1
        if int(bug["id"]) in new_bug_ids:
            new_bug_ids.remove(bug["id"])

    print(f"Loaded {old_bug_count} bugs.")

    new_bug_ids = sorted(list(new_bug_ids))

    CHUNK_SIZE = 100

    chunks = (new_bug_ids[i:(i + CHUNK_SIZE)]
              for i in range(0, len(new_bug_ids), CHUNK_SIZE))

    @tenacity.retry(
        stop=tenacity.stop_after_attempt(7),
        wait=tenacity.wait_exponential(multiplier=1, min=16, max=64),
    )
    def get_chunk(chunk):
        # Work on a list of bugs so both filters below can be applied in turn.
        new_bugs = list(get(chunk).values())

        if not security:
            new_bugs = [bug for bug in new_bugs if len(bug["groups"]) == 0]

        if products is not None:
            new_bugs = [bug for bug in new_bugs if bug["product"] in products]

        return new_bugs

    with tqdm(total=len(new_bug_ids)) as progress_bar:
        for chunk in chunks:
            new_bugs = get_chunk(chunk)

            progress_bar.update(len(chunk))

            db.append(BUGS_DB, new_bugs)
Example #22
def download_bugs(bug_ids: Iterable[int],
                  security: bool = False) -> List[BugDict]:
    old_bug_count = 0
    new_bug_ids_set = set(int(bug_id) for bug_id in bug_ids)
    for bug in get_bugs(include_invalid=True):
        old_bug_count += 1
        new_bug_ids_set.discard(int(bug["id"]))

    print(f"Loaded {old_bug_count} bugs.")

    new_bug_ids = sorted(list(new_bug_ids_set))

    chunks = (
        new_bug_ids[i:(i + Bugzilla.BUGZILLA_CHUNK_SIZE)]
        for i in range(0, len(new_bug_ids), Bugzilla.BUGZILLA_CHUNK_SIZE))

    @tenacity.retry(
        stop=tenacity.stop_after_attempt(7),
        wait=tenacity.wait_exponential(multiplier=1, min=16, max=64),
    )
    def get_chunk(chunk: List[int]) -> List[BugDict]:
        new_bugs = get(chunk)

        if not security:
            new_bugs = [
                bug for bug in new_bugs.values() if len(bug["groups"]) == 0
            ]

        return new_bugs

    all_new_bugs = []

    with tqdm(total=len(new_bug_ids)) as progress_bar:
        for chunk in chunks:
            new_bugs = get_chunk(chunk)

            progress_bar.update(len(chunk))

            db.append(BUGS_DB, new_bugs)

            all_new_bugs += new_bugs

    return all_new_bugs
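
A recurring shape in the bug-downloading examples: collect the ids that still need fetching, request them in fixed-size chunks, and append each chunk as soon as it arrives so an interrupted run only loses the chunk in flight. A minimal sketch of that loop, assuming the same db module; download_in_chunks, fetch and chunk_size are hypothetical names.

from bugbug import db


def download_in_chunks(db_path, ids, fetch, chunk_size=100):
    # fetch is a hypothetical callable taking a list of ids and returning an
    # iterable of records for them.
    ids = sorted(set(ids))

    for i in range(0, len(ids), chunk_size):
        chunk = ids[i:i + chunk_size]
        # Appending per chunk keeps memory flat and makes progress durable.
        db.append(db_path, fetch(chunk))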
Example #23
def download_bugs(bug_ids, products=None, security=False):
    old_bug_count = 0
    old_bugs = []
    new_bug_ids = set(int(bug_id) for bug_id in bug_ids)
    for bug in get_bugs():
        old_bug_count += 1
        if int(bug["id"]) in new_bug_ids:
            old_bugs.append(bug)
            new_bug_ids.remove(bug["id"])

    print(f"Loaded {old_bug_count} bugs.")

    new_bug_ids = sorted(list(new_bug_ids))

    CHUNK_SIZE = 100

    chunks = (
        new_bug_ids[i : (i + CHUNK_SIZE)]
        for i in range(0, len(new_bug_ids), CHUNK_SIZE)
    )
    with tqdm(total=len(new_bug_ids)) as progress_bar:
        for chunk in chunks:
            new_bugs = _download(chunk)

            progress_bar.update(len(chunk))

            if not security:
                new_bugs = {
                    bug_id: bug
                    for bug_id, bug in new_bugs.items()
                    if len(bug["groups"]) == 0
                }

            if products is not None:
                new_bugs = {
                    bug_id: bug
                    for bug_id, bug in new_bugs.items()
                    if bug["product"] in products
                }

            db.append(BUGS_DB, new_bugs.values())
Example #24
def download_revisions(rev_ids: Collection[int]) -> None:
    old_rev_count = 0
    new_rev_ids = set(int(rev_id) for rev_id in rev_ids)
    for rev in get_revisions():
        old_rev_count += 1
        if rev["id"] in new_rev_ids:
            new_rev_ids.remove(rev["id"])

    print(f"Loaded {old_rev_count} revisions.")

    new_rev_ids_list = sorted(list(new_rev_ids))
    rev_ids_groups = (new_rev_ids_list[i:i + 100]
                      for i in range(0, len(new_rev_ids_list), 100))

    with tqdm(total=len(new_rev_ids)) as progress_bar:
        for rev_ids_group in rev_ids_groups:
            revisions = get(rev_ids_group)

            progress_bar.update(len(rev_ids_group))

            db.append(REVISIONS_DB, revisions)
Example #25
    def generate_test_scheduling_history(self):
        if not os.path.exists("push_data.json"):
            download_check_etag(PUSH_DATA_URL, "push_data.json.zst")
            zstd_decompress("push_data.json")
            assert os.path.exists(
                "push_data.json"), "Decompressed push data file exists"

        # Get the commits DB.
        if db.is_old_version(
                repository.COMMITS_DB) or not db.exists(repository.COMMITS_DB):
            db.download(repository.COMMITS_DB, force=True)

        HISTORY_DATE_START = datetime.now() - relativedelta(
            months=TRAINING_MONTHS)

        with open("push_data.json", "r") as f:
            data = json.load(f)

        push_data = {}
        for row in data[1:]:
            # Revision -> (all tasks, possible regressions, likely regressions)
            push_data[row[0]] = (row[1], row[2], row[3])

        logger.info(f"push data nodes: {len(push_data)}")

        HISTORICAL_TIMESPAN = 56

        if not db.is_old_version(test_scheduling.TEST_SCHEDULING_DB):
            db.download(test_scheduling.TEST_SCHEDULING_DB,
                        support_files_too=True)

            for test_data in test_scheduling.get_test_scheduling_history():
                pass

            last_node = test_data["rev"]
        else:
            last_node = None

        past_failures = shelve.open(
            "data/past_failures.shelve",
            protocol=pickle.HIGHEST_PROTOCOL,
            writeback=True,
        )

        push_num = past_failures["push_num"] if "push_num" in past_failures else 0

        def get_and_update_past_failures(type_, task, items, push_num,
                                         is_regression):
            values_total = []
            values_prev_7 = []
            values_prev_14 = []
            values_prev_28 = []
            values_prev_56 = []

            key = f"{type_}${task}$"

            for item in items:
                full_key = key + item

                if full_key not in past_failures:
                    cur = past_failures[full_key] = ExpQueue(
                        push_num, HISTORICAL_TIMESPAN + 1, 0)
                else:
                    cur = past_failures[full_key]

                value = cur[push_num]

                values_total.append(value)
                values_prev_7.append(value - cur[push_num - 7])
                values_prev_14.append(value - cur[push_num - 14])
                values_prev_28.append(value - cur[push_num - 28])
                values_prev_56.append(value - cur[push_num - 56])

                if is_regression:
                    cur[push_num] = value + 1

            return (
                sum(values_total),
                sum(values_prev_7),
                sum(values_prev_14),
                sum(values_prev_28),
                sum(values_prev_56),
            )

        def generate_data():
            nonlocal push_num
            commits_with_data = set()
            saved_nodes = set()

            # We can start once we get to the last revision we added in the previous run.
            can_start = True if last_node is None else False
            for commit_data in tqdm(repository.get_commits()):
                node = commit_data["node"]

                # Sync DB every 1000 commits, so we cleanup the shelve cache (we'd run OOM otherwise!).
                if len(commits_with_data) % 1000 == 0:
                    past_failures.sync()

                if node == last_node:
                    can_start = True
                    continue

                if not can_start:
                    continue

                if node not in push_data:
                    continue

                commits_with_data.add(node)

                commit_push_data = push_data[node]

                for task in commit_push_data[0]:
                    if not any(task.startswith(j) for j in JOBS_TO_CONSIDER):
                        continue

                    is_regression = (task in commit_push_data[1]
                                     or task in commit_push_data[2])

                    (
                        total_failures,
                        past_7_pushes_failures,
                        past_14_pushes_failures,
                        past_28_pushes_failures,
                        past_56_pushes_failures,
                    ) = get_and_update_past_failures("all", task, ["all"],
                                                     push_num, is_regression)

                    (
                        total_types_failures,
                        past_7_pushes_types_failures,
                        past_14_pushes_types_failures,
                        past_28_pushes_types_failures,
                        past_56_pushes_types_failures,
                    ) = get_and_update_past_failures("type", task,
                                                     commit_data["types"],
                                                     push_num, is_regression)

                    (
                        total_files_failures,
                        past_7_pushes_files_failures,
                        past_14_pushes_files_failures,
                        past_28_pushes_files_failures,
                        past_56_pushes_files_failures,
                    ) = get_and_update_past_failures("file", task,
                                                     commit_data["files"],
                                                     push_num, is_regression)

                    (
                        total_directories_failures,
                        past_7_pushes_directories_failures,
                        past_14_pushes_directories_failures,
                        past_28_pushes_directories_failures,
                        past_56_pushes_directories_failures,
                    ) = get_and_update_past_failures(
                        "directory",
                        task,
                        commit_data["directories"],
                        push_num,
                        is_regression,
                    )

                    (
                        total_components_failures,
                        past_7_pushes_components_failures,
                        past_14_pushes_components_failures,
                        past_28_pushes_components_failures,
                        past_56_pushes_components_failures,
                    ) = get_and_update_past_failures(
                        "component",
                        task,
                        commit_data["components"],
                        push_num,
                        is_regression,
                    )

                    pushdate = dateutil.parser.parse(commit_data["pushdate"])
                    if pushdate > HISTORY_DATE_START:
                        saved_nodes.add(node)

                        yield {
                            "rev": node,
                            "name": task,
                            "failures": total_failures,
                            "failures_past_7_pushes": past_7_pushes_failures,
                            "failures_past_14_pushes": past_14_pushes_failures,
                            "failures_past_28_pushes": past_28_pushes_failures,
                            "failures_past_56_pushes": past_56_pushes_failures,
                            "failures_in_types": total_types_failures,
                            "failures_past_7_pushes_in_types":
                            past_7_pushes_types_failures,
                            "failures_past_14_pushes_in_types":
                            past_14_pushes_types_failures,
                            "failures_past_28_pushes_in_types":
                            past_28_pushes_types_failures,
                            "failures_past_56_pushes_in_types":
                            past_56_pushes_types_failures,
                            "failures_in_files": total_files_failures,
                            "failures_past_7_pushes_in_files":
                            past_7_pushes_files_failures,
                            "failures_past_14_pushes_in_files":
                            past_14_pushes_files_failures,
                            "failures_past_28_pushes_in_files":
                            past_28_pushes_files_failures,
                            "failures_past_56_pushes_in_files":
                            past_56_pushes_files_failures,
                            "failures_in_directories":
                            total_directories_failures,
                            "failures_past_7_pushes_in_directories":
                            past_7_pushes_directories_failures,
                            "failures_past_14_pushes_in_directories":
                            past_14_pushes_directories_failures,
                            "failures_past_28_pushes_in_directories":
                            past_28_pushes_directories_failures,
                            "failures_past_56_pushes_in_directories":
                            past_56_pushes_directories_failures,
                            "failures_in_components":
                            total_components_failures,
                            "failures_past_7_pushes_in_components":
                            past_7_pushes_components_failures,
                            "failures_past_14_pushes_in_components":
                            past_14_pushes_components_failures,
                            "failures_past_28_pushes_in_components":
                            past_28_pushes_components_failures,
                            "failures_past_56_pushes_in_components":
                            past_56_pushes_components_failures,
                            "is_possible_regression": task
                            in commit_push_data[1],
                            "is_likely_regression": task
                            in commit_push_data[2],
                        }

                # We no longer need the push data for this node, we can free the memory.
                del push_data[node]

                push_num += 1

            logger.info(
                f"commits linked to push data: {len(commits_with_data)}")

            logger.info(f"saved push data nodes: {len(saved_nodes)}")

        db.append(test_scheduling.TEST_SCHEDULING_DB, generate_data())

        zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

        past_failures["push_num"] = push_num
        past_failures.close()
        zstd_compress("data/past_failures.shelve")
Example #26
    def find_bug_introducing_commits(
        self, bug_fixing_commits, commits_to_ignore, tokenized
    ):
        if tokenized:
            db_path = TOKENIZED_BUG_INTRODUCING_COMMITS_DB
            repo_dir = self.tokenized_git_repo_dir
        else:
            db_path = BUG_INTRODUCING_COMMITS_DB
            repo_dir = self.git_repo_dir

        def git_to_mercurial(rev):
            if tokenized:
                return self.tokenized_git_to_mercurial[rev]
            else:
                return vcs_map.git_to_mercurial(rev)

        def mercurial_to_git(rev):
            if tokenized:
                return self.mercurial_to_tokenized_git[rev]
            else:
                return vcs_map.mercurial_to_git(rev)

        logger.info("Download previously found bug-introducing commits...")
        if db.is_old_version(db_path) or not db.exists(db_path):
            db.download(db_path, force=True)

        logger.info("Get previously found bug-introducing commits...")
        prev_bug_introducing_commits = list(db.read(db_path))
        prev_bug_introducing_commits_nodes = set(
            bug_introducing_commit["bug_fixing_rev"]
            for bug_introducing_commit in prev_bug_introducing_commits
        )
        logger.info(
            f"Already classified {len(prev_bug_introducing_commits)} commits..."
        )

        hashes_to_ignore = set(commit["rev"] for commit in commits_to_ignore)

        with open("git_hashes_to_ignore", "w") as f:
            f.writelines(
                "{}\n".format(mercurial_to_git(commit["rev"]))
                for commit in commits_to_ignore
                if not tokenized or commit["rev"] in self.mercurial_to_tokenized_git
            )

        logger.info(f"{len(bug_fixing_commits)} commits to analyze")

        # Skip already found bug-introducing commits.
        bug_fixing_commits = [
            bug_fixing_commit
            for bug_fixing_commit in bug_fixing_commits
            if bug_fixing_commit["rev"] not in prev_bug_introducing_commits_nodes
        ]

        logger.info(
            f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones"
        )

        bug_fixing_commits = [
            bug_fixing_commit
            for bug_fixing_commit in bug_fixing_commits
            if bug_fixing_commit["rev"] not in hashes_to_ignore
        ]
        logger.info(
            f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list"
        )

        if tokenized:
            bug_fixing_commits = [
                bug_fixing_commit
                for bug_fixing_commit in bug_fixing_commits
                if bug_fixing_commit["rev"] in self.mercurial_to_tokenized_git
            ]
            logger.info(
                f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones with no git hash"
            )

        # Analyze up to 500 commits at a time, to avoid the task running out of time.
        done = True
        if len(bug_fixing_commits) > 500:
            bug_fixing_commits = bug_fixing_commits[-500:]
            done = False

        with open("done", "w") as f:
            f.write(str(1 if done else 0))

        def _init(git_repo_dir):
            global GIT_REPO
            GIT_REPO = GitRepository(git_repo_dir)

        def find_bic(bug_fixing_commit):
            logger.info("Analyzing {}...".format(bug_fixing_commit["rev"]))

            git_fix_revision = mercurial_to_git(bug_fixing_commit["rev"])

            commit = GIT_REPO.get_commit(git_fix_revision)

            # Skip huge changes, we'll likely be wrong with them.
            if len(commit.modifications) > MAX_MODIFICATION_NUMBER:
                return [None]

            bug_introducing_modifications = GIT_REPO.get_commits_last_modified_lines(
                commit, hashes_to_ignore_path=os.path.realpath("git_hashes_to_ignore")
            )
            logger.info(
                "Found {} for {}".format(
                    bug_introducing_modifications, bug_fixing_commit["rev"]
                )
            )

            bug_introducing_commits = []
            for bug_introducing_hashes in bug_introducing_modifications.values():
                for bug_introducing_hash in bug_introducing_hashes:
                    try:
                        bug_introducing_commits.append(
                            {
                                "bug_fixing_rev": bug_fixing_commit["rev"],
                                "bug_introducing_rev": git_to_mercurial(
                                    bug_introducing_hash
                                ),
                            }
                        )
                    except Exception as e:
                        # Skip commits that are in git but not in mercurial, as they are too old (older than "Free the lizard").
                        if not str(e).startswith("Missing git commit in the VCS map"):
                            raise

            # Add an empty result, just so that we don't reanalyze this again.
            if len(bug_introducing_commits) == 0:
                bug_introducing_commits.append(
                    {
                        "bug_fixing_rev": bug_fixing_commit["rev"],
                        "bug_introducing_rev": "",
                    }
                )

            return bug_introducing_commits

        with concurrent.futures.ThreadPoolExecutor(
            initializer=_init, initargs=(repo_dir,), max_workers=os.cpu_count() + 1
        ) as executor:
            bug_introducing_commits = executor.map(find_bic, bug_fixing_commits)
            bug_introducing_commits = tqdm(
                bug_introducing_commits, total=len(bug_fixing_commits)
            )
            bug_introducing_commits = list(
                itertools.chain.from_iterable(bug_introducing_commits)
            )

        total_results_num = len(bug_introducing_commits)
        bug_introducing_commits = list(filter(None, bug_introducing_commits))
        logger.info(
            f"Skipped {total_results_num - len(bug_introducing_commits)} commits as they were too big"
        )

        db.append(db_path, bug_introducing_commits)
        compress_file(db_path)
Example #27
def download_commits(
    repo_dir: str,
    rev_start: str = None,
    revs: List[bytes] = None,
    save: bool = True,
    use_single_process: bool = False,
    include_no_bug: bool = False,
    include_backouts: bool = False,
    include_ignored: bool = False,
) -> Tuple[CommitDict, ...]:
    assert revs is not None or rev_start is not None

    with hglib.open(repo_dir) as hg:
        if revs is None:
            revs = get_revs(hg, rev_start)

        if len(revs) == 0:
            logger.info("No commits to analyze")
            return tuple()

        first_pushdate = get_first_pushdate(repo_dir)

        logger.info(f"Mining {len(revs)} commits...")

        if not use_single_process:
            logger.info(f"Using {os.cpu_count()} processes...")
            commits = hg_log_multi(repo_dir, revs)
        else:
            commits = hg_log(hg, revs)

        if save or not os.path.exists("data/component_mapping.lmdb"):
            logger.info("Downloading file->component mapping...")
            download_component_mapping()

        set_commits_to_ignore(hg, repo_dir, commits)

        commits_num = len(commits)

        logger.info(f"Mining {commits_num} patches...")

        global code_analysis_server
        code_analysis_server = rust_code_analysis_server.RustCodeAnalysisServer()

        if not use_single_process:
            with concurrent.futures.ProcessPoolExecutor(
                    initializer=_init_process,
                    initargs=(repo_dir, )) as executor:
                commits = executor.map(_transform, commits, chunksize=64)
                commits = tqdm(commits, total=commits_num)
                commits = tuple(commits)
        else:
            get_component_mapping()

            commits = tuple(transform(hg, repo_dir, c) for c in commits)

            close_component_mapping()

    code_analysis_server.terminate()

    calculate_experiences(commits, first_pushdate, save)

    logger.info("Applying final commits filtering...")

    commits = tuple(commit.to_dict() for commit in commits)

    if save:
        db.append(COMMITS_DB, commits)

    return tuple(
        filter_commits(
            commits,
            include_no_bug=include_no_bug,
            include_backouts=include_backouts,
            include_ignored=include_ignored,
        ))
Example #28
    def find_bug_introducing_commits(self, repo_dir, tokenized):
        from pydriller import GitRepository
        from pydriller.domain.commit import ModificationType

        logger.info("Download commits to ignore...")
        assert db.download(IGNORED_COMMITS_DB)
        commits_to_ignore = list(db.read(IGNORED_COMMITS_DB))

        logger.info("Download bug-fixing classifications...")
        assert db.download(BUG_FIXING_COMMITS_DB)
        bug_fixing_commits = [
            bug_fixing_commit
            for bug_fixing_commit in db.read(BUG_FIXING_COMMITS_DB)
            if bug_fixing_commit["type"] in ["r", "d"]
        ]

        if tokenized:
            db_path = TOKENIZED_BUG_INTRODUCING_COMMITS_DB
        else:
            db_path = BUG_INTRODUCING_COMMITS_DB

        def git_to_mercurial(rev):
            if tokenized:
                return self.tokenized_git_to_mercurial[rev]
            else:
                return vcs_map.git_to_mercurial(repo_dir, rev)

        def mercurial_to_git(rev):
            if tokenized:
                return self.mercurial_to_tokenized_git[rev]
            else:
                return vcs_map.mercurial_to_git(repo_dir, rev)

        logger.info("Download previously found bug-introducing commits...")
        db.download(db_path)

        logger.info("Get previously found bug-introducing commits...")
        prev_bug_introducing_commits = list(db.read(db_path))
        prev_bug_introducing_commits_nodes = set(
            bug_introducing_commit["bug_fixing_rev"]
            for bug_introducing_commit in prev_bug_introducing_commits)
        logger.info(
            f"Already classified {len(prev_bug_introducing_commits)} commits..."
        )

        hashes_to_ignore = set(commit["rev"] for commit in commits_to_ignore)

        with open("git_hashes_to_ignore", "w") as f:
            f.writelines("{}\n".format(mercurial_to_git(commit["rev"]))
                         for commit in commits_to_ignore if not tokenized
                         or commit["rev"] in self.mercurial_to_tokenized_git)

        logger.info(f"{len(bug_fixing_commits)} commits to analyze")

        # Skip already found bug-introducing commits.
        bug_fixing_commits = [
            bug_fixing_commit for bug_fixing_commit in bug_fixing_commits if
            bug_fixing_commit["rev"] not in prev_bug_introducing_commits_nodes
        ]

        logger.info(
            f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones"
        )

        bug_fixing_commits = [
            bug_fixing_commit for bug_fixing_commit in bug_fixing_commits
            if bug_fixing_commit["rev"] not in hashes_to_ignore
        ]
        logger.info(
            f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list"
        )

        if tokenized:
            bug_fixing_commits = [
                bug_fixing_commit for bug_fixing_commit in bug_fixing_commits
                if bug_fixing_commit["rev"] in self.mercurial_to_tokenized_git
            ]
            logger.info(
                f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones with no git hash"
            )

        git_init_lock = threading.Lock()

        def _init(git_repo_dir):
            with git_init_lock:
                thread_local.git = GitRepository(git_repo_dir)
                # Call get_head in order to make pydriller initialize the repository.
                thread_local.git.get_head()

        def find_bic(bug_fixing_commit):
            logger.info("Analyzing {}...".format(bug_fixing_commit["rev"]))

            git_fix_revision = mercurial_to_git(bug_fixing_commit["rev"])

            commit = thread_local.git.get_commit(git_fix_revision)

            # Skip huge changes, we'll likely be wrong with them.
            if len(commit.modifications) > MAX_MODIFICATION_NUMBER:
                logger.info("Skipping {} as it is too big".format(
                    bug_fixing_commit["rev"]))
                return None

            def get_modification_path(mod):
                path = mod.new_path
                if (mod.change_type == ModificationType.RENAME
                        or mod.change_type == ModificationType.DELETE):
                    path = mod.old_path
                return path

            bug_introducing_modifications = {}
            for modification in commit.modifications:
                if (get_modification_path(modification) ==
                        "testing/web-platform/meta/MANIFEST.json"):
                    continue

                bug_introducing_modifications.update(
                    thread_local.git.get_commits_last_modified_lines(
                        commit,
                        modification=modification,
                        hashes_to_ignore_path=os.path.realpath(
                            "git_hashes_to_ignore"),
                    ))

            logger.info("Found {} for {}".format(bug_introducing_modifications,
                                                 bug_fixing_commit["rev"]))

            bug_introducing_commits = []
            for bug_introducing_hashes in bug_introducing_modifications.values():
                for bug_introducing_hash in bug_introducing_hashes:
                    try:
                        bug_introducing_commits.append({
                            "bug_fixing_rev": bug_fixing_commit["rev"],
                            "bug_introducing_rev": git_to_mercurial(bug_introducing_hash),
                        })
                    except Exception as e:
                        # Skip commits that are in git but not in mercurial, as they are too old (older than "Free the lizard").
                        if not str(e).startswith(
                                "Missing git commit in the VCS map"):
                            raise

            # Add an empty result, just so that we don't reanalyze this again.
            if len(bug_introducing_commits) == 0:
                bug_introducing_commits.append({
                    "bug_fixing_rev": bug_fixing_commit["rev"],
                    "bug_introducing_rev": "",
                })

            return bug_introducing_commits

        def compress_and_upload():
            zstd_compress(db_path)
            db.upload(db_path)

        workers = os.cpu_count() + 1
        logger.info(
            f"Analyzing {len(bug_fixing_commits)} commits using {workers} workers..."
        )

        with concurrent.futures.ThreadPoolExecutor(
                initializer=_init, initargs=(repo_dir, ),
                max_workers=workers) as executor:

            def results():
                start_time = time.monotonic()

                futures = {
                    executor.submit(find_bic, bug_fixing_commit):
                    bug_fixing_commit["rev"]
                    for bug_fixing_commit in bug_fixing_commits
                }

                for future in tqdm(
                        concurrent.futures.as_completed(futures),
                        total=len(futures),
                ):
                    exc = future.exception()
                    if exc is not None:
                        logger.info(
                            f"Exception {exc} while analyzing {futures[future]}"
                        )
                        for f in futures:
                            f.cancel()

                    result = future.result()
                    if result is not None:
                        yield from result

                    if time.monotonic() - start_time >= 3600:
                        compress_and_upload()
                        start_time = time.monotonic()

            db.append(db_path, results())

        compress_and_upload()
Example #29
    def find_bug_fixing_commits(self):
        logger.info("Downloading commits database...")
        if db.is_old_version(
                repository.COMMITS_DB) or not db.exists(repository.COMMITS_DB):
            db.download(repository.COMMITS_DB, force=True)

        logger.info("Downloading bugs database...")
        if db.is_old_version(
                bugzilla.BUGS_DB) or not db.exists(bugzilla.BUGS_DB):
            db.download(bugzilla.BUGS_DB, force=True)

        logger.info("Download previous classifications...")
        if db.is_old_version(
                BUG_FIXING_COMMITS_DB) or not db.exists(BUG_FIXING_COMMITS_DB):
            db.download(BUG_FIXING_COMMITS_DB, force=True)

        logger.info("Get previously classified commits...")
        prev_bug_fixing_commits = list(db.read(BUG_FIXING_COMMITS_DB))
        prev_bug_fixing_commits_nodes = set(
            bug_fixing_commit["rev"]
            for bug_fixing_commit in prev_bug_fixing_commits)
        logger.info(
            f"Already classified {len(prev_bug_fixing_commits)} commits...")

        # TODO: Switch to the pure Defect model, as it's better in this case.
        logger.info("Downloading defect/enhancement/task model...")
        download_model("defectenhancementtask")
        defect_model = DefectEnhancementTaskModel.load(
            "defectenhancementtaskmodel")

        logger.info("Downloading regression model...")
        download_model("regression")
        regression_model = RegressionModel.load("regressionmodel")

        start_date = datetime.now() - RELATIVE_START_DATE
        end_date = datetime.now() - RELATIVE_END_DATE
        logger.info(
            f"Gathering bug IDs associated to commits (since {start_date} and up to {end_date})..."
        )
        commit_map = defaultdict(list)
        for commit in repository.get_commits():
            if commit["node"] in prev_bug_fixing_commits_nodes:
                continue

            commit_date = dateutil.parser.parse(commit["pushdate"])
            if commit_date < start_date or commit_date > end_date:
                continue

            commit_map[commit["bug_id"]].append(commit["node"])

        logger.info(
            f"{sum(len(commit_list) for commit_list in commit_map.values())} commits found, {len(commit_map)} bugs linked to commits"
        )
        assert len(commit_map) > 0

        def get_relevant_bugs():
            return (bug for bug in bugzilla.get_bugs()
                    if bug["id"] in commit_map)

        bug_count = sum(1 for bug in get_relevant_bugs())
        logger.info(
            f"{bug_count} bugs in total, {len(commit_map) - bug_count} bugs linked to commits missing"
        )

        known_defect_labels = defect_model.get_labels()
        known_regression_labels = regression_model.get_labels()

        bug_fixing_commits = []

        def append_bug_fixing_commits(bug_id, type_):
            for commit in commit_map[bug_id]:
                bug_fixing_commits.append({"rev": commit, "type": type_})

        for bug in tqdm(get_relevant_bugs(), total=bug_count):
            # Ignore bugs which are not linked to the commits we care about.
            if bug["id"] not in commit_map:
                continue

            # If we know the label already, we don't need to apply the model.
            if (bug["id"] in known_regression_labels
                    and known_regression_labels[bug["id"]] == 1):
                append_bug_fixing_commits(bug["id"], "r")
                continue

            if bug["id"] in known_defect_labels:
                if known_defect_labels[bug["id"]] == "defect":
                    append_bug_fixing_commits(bug["id"], "d")
                else:
                    append_bug_fixing_commits(bug["id"], "e")
                continue

            if defect_model.classify(bug)[0] == "defect":
                if regression_model.classify(bug)[0] == 1:
                    append_bug_fixing_commits(bug["id"], "r")
                else:
                    append_bug_fixing_commits(bug["id"], "d")
            else:
                append_bug_fixing_commits(bug["id"], "e")

        db.append(BUG_FIXING_COMMITS_DB, bug_fixing_commits)
        zstd_compress(BUG_FIXING_COMMITS_DB)

        bug_fixing_commits = prev_bug_fixing_commits + bug_fixing_commits
        return [
            bug_fixing_commit for bug_fixing_commit in bug_fixing_commits
            if bug_fixing_commit["type"] in ["r", "d"]
        ]
Example #30
    def find_bug_introducing_commits(self, bug_fixing_commits,
                                     commits_to_ignore, tokenized):
        if tokenized:
            db_path = TOKENIZED_BUG_INTRODUCING_COMMITS_DB
            repo_dir = self.tokenized_git_repo_dir
        else:
            db_path = BUG_INTRODUCING_COMMITS_DB
            repo_dir = self.git_repo_dir

        def git_to_mercurial(rev):
            if tokenized:
                return self.tokenized_git_to_mercurial[rev]
            else:
                return vcs_map.git_to_mercurial(rev)

        def mercurial_to_git(rev):
            if tokenized:
                return self.mercurial_to_tokenized_git[rev]
            else:
                return vcs_map.mercurial_to_git(rev)

        logger.info("Download previously found bug-introducing commits...")
        if db.is_old_version(db_path) or not db.exists(db_path):
            db.download(db_path, force=True)

        logger.info("Get previously found bug-introducing commits...")
        prev_bug_introducing_commits = list(db.read(db_path))
        prev_bug_introducing_commits_nodes = set(
            bug_introducing_commit["bug_fixing_rev"]
            for bug_introducing_commit in prev_bug_introducing_commits)
        logger.info(
            f"Already classified {len(prev_bug_introducing_commits)} commits..."
        )

        hashes_to_ignore = set(commit["rev"] for commit in commits_to_ignore)

        with open("git_hashes_to_ignore", "w") as f:
            f.writelines("{}\n".format(mercurial_to_git(commit["rev"]))
                         for commit in commits_to_ignore if not tokenized
                         or commit["rev"] in self.mercurial_to_tokenized_git)

        logger.info(f"{len(bug_fixing_commits)} commits to analyze")

        # Skip already found bug-introducing commits.
        bug_fixing_commits = [
            bug_fixing_commit for bug_fixing_commit in bug_fixing_commits if
            bug_fixing_commit["rev"] not in prev_bug_introducing_commits_nodes
        ]

        logger.info(
            f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones"
        )

        bug_fixing_commits = [
            bug_fixing_commit for bug_fixing_commit in bug_fixing_commits
            if bug_fixing_commit["rev"] not in hashes_to_ignore
        ]
        logger.info(
            f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list"
        )

        if tokenized:
            bug_fixing_commits = [
                bug_fixing_commit for bug_fixing_commit in bug_fixing_commits
                if bug_fixing_commit["rev"] in self.mercurial_to_tokenized_git
            ]
            logger.info(
                f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones with no git hash"
            )

        def _init(git_repo_dir):
            thread_local.git = GitRepository(git_repo_dir)

        def find_bic(bug_fixing_commit):
            logger.info("Analyzing {}...".format(bug_fixing_commit["rev"]))

            git_fix_revision = mercurial_to_git(bug_fixing_commit["rev"])

            commit = thread_local.git.get_commit(git_fix_revision)

            # Skip huge changes, as we'd likely get them wrong.
            if len(commit.modifications) > MAX_MODIFICATION_NUMBER:
                logger.info("Skipping {} as it is too big".format(
                    bug_fixing_commit["rev"]))
                return None

            bug_introducing_modifications = thread_local.git.get_commits_last_modified_lines(
                commit,
                hashes_to_ignore_path=os.path.realpath("git_hashes_to_ignore"))

            logger.info("Found {} for {}".format(bug_introducing_modifications,
                                                 bug_fixing_commit["rev"]))

            bug_introducing_commits = []
            for bug_introducing_hashes in bug_introducing_modifications.values():
                for bug_introducing_hash in bug_introducing_hashes:
                    try:
                        bug_introducing_commits.append({
                            "bug_fixing_rev":
                            bug_fixing_commit["rev"],
                            "bug_introducing_rev":
                            git_to_mercurial(bug_introducing_hash),
                        })
                    except Exception as e:
                        # Skip commits that are in git but not in mercurial, as they are too old (older than "Free the lizard").
                        if not str(e).startswith(
                                "Missing git commit in the VCS map"):
                            raise

            # Add an empty result, just so that we don't reanalyze this again.
            if len(bug_introducing_commits) == 0:
                bug_introducing_commits.append({
                    "bug_fixing_rev":
                    bug_fixing_commit["rev"],
                    "bug_introducing_rev":
                    "",
                })

            return bug_introducing_commits

        with concurrent.futures.ThreadPoolExecutor(
                initializer=_init,
                initargs=(repo_dir,),
                max_workers=os.cpu_count() + 1) as executor:

            def results():
                num_analyzed = 0

                bug_fixing_commits_queue = bug_fixing_commits.copy()

                # Analyze up to 500 commits at a time, to avoid the task running out of time.
                while len(bug_fixing_commits_queue) != 0 and num_analyzed != 500:
                    bug_introducing_commit_futures = []
                    # Pop from the queue (not the original list) so the loop terminates
                    # and the "done" flag below correctly reflects progress.
                    for _ in range(
                            min(500 - num_analyzed,
                                len(bug_fixing_commits_queue))):
                        bug_introducing_commit_futures.append(
                            executor.submit(find_bic,
                                            bug_fixing_commits_queue.pop()))

                    logger.info(
                        f"Analyzing a chunk of {len(bug_introducing_commit_futures)} commits"
                    )

                    for future in tqdm(
                            concurrent.futures.as_completed(
                                bug_introducing_commit_futures),
                            total=len(bug_introducing_commit_futures),
                    ):
                        result = future.result()
                        if result is not None:
                            num_analyzed += 1
                            yield from result

                with open("done", "w") as f:
                    f.write(
                        str(1 if len(bug_fixing_commits_queue) == 0 else 0))

            db.append(db_path, results())

        zstd_compress(db_path)
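
GitRepository above appears to come from pydriller; assuming pydriller 1.x, a minimal standalone sketch of the blame step used in find_bic looks like this (the repository path and revision are illustrative placeholders):

from pydriller import GitRepository

gr = GitRepository("/path/to/a/git/repo")
fix_commit = gr.get_commit("0123456789abcdef0123456789abcdef01234567")
# Maps each file touched by the fix to the set of commits that last modified the
# fixed lines, i.e. the candidate bug-introducing commits.
candidates = gr.get_commits_last_modified_lines(fix_commit)
for path, introducing_hashes in candidates.items():
    print(path, sorted(introducing_hashes))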
Example #31
def download_bugs_between(date_from, date_to, security=False, store=True):
    products = {
        "Add-on SDK",
        "Android Background Services",
        "Core",
        "Core Graveyard",
        "DevTools",
        "DevTools Graveyard",
        "External Software Affecting Firefox",
        "Firefox",
        "Firefox Graveyard",
        "Firefox Build System",
        "Firefox for Android",
        "Firefox for Android Graveyard",
        # 'Firefox for iOS',
        "Firefox Health Report",
        # 'Focus',
        # 'Hello (Loop)',
        "NSPR",
        "NSS",
        "Toolkit",
        "Toolkit Graveyard",
        "WebExtensions",
    }

    params = {
        "f1": "creation_ts",
        "o1": "greaterthan",
        "v1": date_from.strftime("%Y-%m-%d"),
        "f2": "creation_ts",
        "o2": "lessthan",
        "v2": date_to.strftime("%Y-%m-%d"),
        "product": products,
    }

    if not security:
        params["f3"] = "bug_group"
        params["o3"] = "isempty"

    params["count_only"] = 1
    r = requests.get("https://bugzilla.mozilla.org/rest/bug", params=params)
    r.raise_for_status()
    count = r.json()["bug_count"]
    del params["count_only"]

    params["limit"] = 100
    params["order"] = "bug_id"

    old_bug_ids = set(bug["id"] for bug in get_bugs())

    all_bugs = []

    with tqdm(total=count) as progress_bar:
        for offset in range(0, count, Bugzilla.BUGZILLA_CHUNK_SIZE):
            params["offset"] = offset

            new_bugs = _download(params)

            progress_bar.update(Bugzilla.BUGZILLA_CHUNK_SIZE)

            all_bugs += [bug for bug in new_bugs.values()]

            if store:
                db.append(
                    BUGS_DB,
                    (bug for bug_id, bug in new_bugs.items()
                     if bug_id not in old_bug_ids),
                )

    return all_bugs
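
A hypothetical invocation of download_bugs_between above (the dates are illustrative; store=False skips appending to BUGS_DB):

from datetime import datetime

bugs = download_bugs_between(
    datetime(2019, 1, 1), datetime(2019, 3, 1), security=False, store=False)
print(f"Retrieved {len(bugs)} bugs")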
    def generate_test_scheduling_history(self, granularity):
        push_data_path = f"push_data_{granularity}.json"
        updated = download_check_etag(
            test_scheduling.PUSH_DATA_URL.format(granularity=granularity)
        )
        if updated:
            zstd_decompress(push_data_path)
            os.remove(f"{push_data_path}.zst")
        assert os.path.exists(push_data_path), "Decompressed push data file should exist"

        # Get the commits DB.
        assert db.download(repository.COMMITS_DB)

        HISTORY_DATE_START = datetime.now() - relativedelta(
            months=TRAINING_MONTHS[granularity]
        )

        if granularity == "label":
            test_scheduling_db = test_scheduling.TEST_LABEL_SCHEDULING_DB
            past_failures_db = os.path.join(
                "data", test_scheduling.PAST_FAILURES_LABEL_DB
            )
            failing_together_db = os.path.join(
                "data", test_scheduling.FAILING_TOGETHER_LABEL_DB
            )
        elif granularity == "group":
            test_scheduling_db = test_scheduling.TEST_GROUP_SCHEDULING_DB
            past_failures_db = os.path.join(
                "data", test_scheduling.PAST_FAILURES_GROUP_DB
            )
            touched_together_db = os.path.join(
                "data", test_scheduling.TOUCHED_TOGETHER_DB
            )

        db.download(test_scheduling_db, support_files_too=True)

        last_node = None
        for revs, _ in test_scheduling.get_test_scheduling_history(granularity):
            last_node = revs[0]

        def generate_failing_together_probabilities(push_data):
            # TODO: we should consider the probabilities of `task1 failure -> task2 failure` and
            # `task2 failure -> task1 failure` separately, as they could be different.

            count_runs = collections.Counter()
            count_single_failures = collections.Counter()
            count_both_failures = collections.Counter()

            for revisions, tasks, likely_regressions, candidate_regressions in tqdm(
                push_data
            ):
                failures = set(likely_regressions + candidate_regressions)
                all_tasks = list(set(tasks) | failures)

                for task1, task2 in itertools.combinations(sorted(all_tasks), 2):
                    count_runs[(task1, task2)] += 1

                    if task1 in failures:
                        if task2 in failures:
                            count_both_failures[(task1, task2)] += 1
                        else:
                            count_single_failures[(task1, task2)] += 1
                    elif task2 in failures:
                        count_single_failures[(task1, task2)] += 1

            stats = {}

            skipped = 0

            for couple, run_count in count_runs.most_common():
                failure_count = count_both_failures[couple]
                support = failure_count / run_count

                if support < 1 / 700:
                    skipped += 1
                    continue

                if failure_count != 0:
                    confidence = failure_count / (
                        count_single_failures[couple] + failure_count
                    )
                else:
                    confidence = 0.0

                stats[couple] = (support, confidence)
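                # Illustrative numbers (not from real push data): if the pair ran
                # together in 1400 pushes, failed together in 70 and exactly one of
                # the two failed in 30, then support = 70 / 1400 = 0.05 (above the
                # 1 / 700 cut-off) and confidence = 70 / (30 + 70) = 0.7, just high
                # enough to be kept for the failing-together DB below.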

            logger.info(f"{skipped} couples skipped because their support was too low")

            logger.info("Redundancies with the highest support and confidence:")
            for couple, (support, confidence) in sorted(
                stats.items(), key=lambda k: (-k[1][1], -k[1][0])
            )[:7]:
                failure_count = count_both_failures[couple]
                run_count = count_runs[couple]
                logger.info(
                    f"{couple[0]} - {couple[1]} redundancy confidence {confidence}, support {support} ({failure_count} over {run_count})."
                )

            logger.info("Redundancies with the highest confidence and lowest support:")
            for couple, (support, confidence) in sorted(
                stats.items(), key=lambda k: (-k[1][1], k[1][0])
            )[:7]:
                failure_count = count_both_failures[couple]
                run_count = count_runs[couple]
                logger.info(
                    f"{couple[0]} - {couple[1]} redundancy confidence {confidence}, support {support} ({failure_count} over {run_count})."
                )

            failing_together = test_scheduling.get_failing_together_db()
            count_redundancies = collections.Counter()
            for couple, (support, confidence) in stats.items():
                if confidence == 1.0:
                    count_redundancies["==100%"] += 1
                if confidence > 0.9:
                    count_redundancies[">=90%"] += 1
                if confidence > 0.8:
                    count_redundancies[">=80%"] += 1
                if confidence > 0.7:
                    count_redundancies[">=70%"] += 1

                if confidence < 0.7:
                    continue

                failing_together[
                    f"{couple[0]}${couple[1]}".encode("utf-8")
                ] = struct.pack("ff", support, confidence)
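                # "ff" stores the pair as two single-precision floats; a round-trip
                # like struct.unpack("ff", struct.pack("ff", 0.05, 0.7)) gives back
                # (~0.05, ~0.7), up to float32 rounding error.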

            for percentage, count in count_redundancies.most_common():
                logger.info(f"{count} with {percentage} confidence")

            test_scheduling.close_failing_together_db()

        def generate_all_data():
            past_failures = test_scheduling.get_past_failures(granularity)

            push_num = past_failures["push_num"] if "push_num" in past_failures else 0

            # We can start once we get to the last revision we added in the previous run.
            can_start = True if last_node is None else False

            commit_map = {}
            for commit_data in tqdm(repository.get_commits()):
                if not can_start:
                    if last_node == commit_data["node"]:
                        can_start = True

                    continue

                commit_map[commit_data["node"]] = commit_data

            with open(push_data_path, "r") as f:
                push_data = json.load(f)

            logger.info(f"push data nodes: {len(push_data)}")

            if granularity == "label":
                push_data = [
                    (
                        revisions,
                        rename_tasks(push_tasks),
                        rename_tasks(possible_regressions),
                        rename_tasks(likely_regressions),
                    )
                    for revisions, push_tasks, possible_regressions, likely_regressions in push_data
                ]

            # Assume that every possible runnable was run at least once in the last 28 pushes.
            all_runnables_set = set(
                sum((push_runnables for _, push_runnables, _, _ in push_data[-28:]), [])
            )
            # Filter runnables we don't need.
            all_runnables = filter_runnables(
                list(all_runnables_set), all_runnables_set, granularity
            )
            # Keep only the filtered runnables for the subsequent filtering passes.
            all_runnables_set = set(all_runnables)
            logger.info(f"{len(all_runnables_set)} runnables run in the last 28 pushes")

            push_data = [
                (
                    revisions,
                    filter_runnables(push_tasks, all_runnables_set, granularity),
                    filter_runnables(
                        possible_regressions, all_runnables_set, granularity
                    ),
                    filter_runnables(
                        likely_regressions, all_runnables_set, granularity
                    ),
                )
                for revisions, push_tasks, possible_regressions, likely_regressions in push_data
            ]

            if granularity == "label":
                generate_failing_together_probabilities(push_data)

            # Store all runnables in the past_failures DB so it can be used in the evaluation phase.
            past_failures["all_runnables"] = all_runnables
            # XXX: Should we recreate the DB from scratch if the previous all_runnables are not the
            # same as the current ones?

            saved_nodes = set()
            skipped_no_commits = 0
            skipped_too_big_commits = 0
            skipped_no_runnables = 0

            # We can start once we get to the last revision we added in the previous run.
            can_start = True if last_node is None else False

            if granularity == "group":
                update_touched_together_gen = test_scheduling.update_touched_together()
                next(update_touched_together_gen)

            for i in tqdm(range(len(push_data))):
                (
                    revisions,
                    push_runnables,
                    possible_regressions,
                    likely_regressions,
                ) = push_data.pop(0)

                if not can_start:
                    if last_node == revisions[0]:
                        can_start = True

                    continue

                push_num += 1

                # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them.
                commits = tuple(
                    commit_map.pop(revision)
                    for revision in revisions
                    if revision in commit_map
                )
                if len(commits) == 0:
                    skipped_no_commits += 1
                    continue

                merged_commits = commit_features.merge_commits(commits)

                # XXX: For now, skip commits which are too large.
                # In the future we can either:
                #  - Improve shelve perf and go back to consider all files;
                #  - Consider only files which appear with a given frequency, like the "files" feature in commit_features;
                #  - Keep a limit of number of files.
                if len(merged_commits["files"]) > 50:
                    skipped_too_big_commits += 1
                    continue

                # If we considered all_runnables, we'd generate a huge amount of data.
                # We consider only the runnables which run in this push, and the possible and likely regressions
                # from this push. We can't consider all runnables because we can't be sure that a task that didn't
                # run on a push would have been successful.
                runnables_to_consider = list(
                    set(push_runnables + possible_regressions + likely_regressions)
                )

                if len(runnables_to_consider) == 0:
                    skipped_no_runnables += 1
                    continue

                # Sync DB every 250 pushes, so we clean up the shelve cache (we'd run out of memory otherwise!).
                if i % 250 == 0:
                    past_failures.sync()

                pushdate = dateutil.parser.parse(merged_commits["pushdate"])

                if granularity == "group":
                    update_touched_together_gen.send(commits[0]["node"])

                result = {
                    "revs": revisions,
                    "data": [],
                }
                for data in test_scheduling.generate_data(
                    past_failures,
                    merged_commits,
                    push_num,
                    runnables_to_consider,
                    possible_regressions,
                    likely_regressions,
                ):
                    if pushdate > HISTORY_DATE_START:
                        result["data"].append(data)

                if pushdate > HISTORY_DATE_START:
                    saved_nodes.add(i)
                    yield result

            if granularity == "group":
                try:
                    update_touched_together_gen.send(None)
                except StopIteration:
                    pass

            logger.info(f"saved push data nodes: {len(saved_nodes)}")
            logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
            logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
            logger.info(f"skipped {skipped_no_runnables} (no interesting runnables)")

            past_failures["push_num"] = push_num
            past_failures.close()

        db.append(test_scheduling_db, generate_all_data())

        zstd_compress(test_scheduling_db)

        with open_tar_zst(past_failures_db) as tar:
            tar.add(past_failures_db[: -len(".tar.zst")])

        if granularity == "group":
            with open_tar_zst(touched_together_db) as tar:
                tar.add(touched_together_db[: -len(".tar.zst")])

        if granularity == "label":
            with open_tar_zst(failing_together_db) as tar:
                tar.add(failing_together_db[: -len(".tar.zst")])

    def generate_test_scheduling_history(self, granularity: str,
                                         training_months: int) -> None:
        if granularity != "config_group":
            # Get the commits DB.
            assert db.download(repository.COMMITS_DB)

        HISTORY_DATE_START = datetime.now() - relativedelta(
            months=training_months)

        if granularity == "label":
            test_scheduling_db = test_scheduling.TEST_LABEL_SCHEDULING_DB
            past_failures_db = os.path.join(
                "data", test_scheduling.PAST_FAILURES_LABEL_DB)
            failing_together_db = os.path.join(
                "data", test_scheduling.FAILING_TOGETHER_LABEL_DB)
        elif granularity == "group":
            test_scheduling_db = test_scheduling.TEST_GROUP_SCHEDULING_DB
            past_failures_db = os.path.join(
                "data", test_scheduling.PAST_FAILURES_GROUP_DB)
            touched_together_db = os.path.join(
                "data", test_scheduling.TOUCHED_TOGETHER_DB)
        elif granularity == "config_group":
            test_scheduling_db = test_scheduling.TEST_CONFIG_GROUP_SCHEDULING_DB
            past_failures_db = os.path.join(
                "data", test_scheduling.PAST_FAILURES_CONFIG_GROUP_DB)
            failing_together_db = os.path.join(
                "data", test_scheduling.FAILING_TOGETHER_CONFIG_GROUP_DB)

        push_data_iter, push_data_count, all_runnables = test_scheduling.get_push_data(
            granularity)

        if granularity in ("label", "config_group"):
            test_scheduling.generate_failing_together_probabilities(
                granularity, push_data_iter(), push_data_count)

        def generate_all_data() -> Generator[Dict[str, Any], None, None]:
            past_failures = test_scheduling.get_past_failures(
                granularity, False)

            push_num = (past_failures["push_num"]
                        if "push_num" in past_failures else 0)

            commit_map = {}
            for commit_data in tqdm(repository.get_commits()):
                commit_map[commit_data["node"]] = commit_data

            # Store all runnables in the past_failures DB so it can be used in the evaluation phase.
            past_failures["all_runnables"] = all_runnables
            # XXX: Should we recreate the DB from scratch if the previous all_runnables are not the
            # same as the current ones?

            saved_nodes = set()
            skipped_no_commits = 0
            skipped_too_big_commits = 0
            skipped_no_runnables = 0

            if granularity in ("group", "config_group"):
                update_touched_together_gen = test_scheduling.update_touched_together()
                next(update_touched_together_gen)

            for i, (
                revisions,
                fix_revision,
                push_runnables,
                possible_regressions,
                likely_regressions,
            ) in enumerate(tqdm(push_data_iter(), total=push_data_count)):
                push_num += 1

                # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them.
                commits = tuple(
                    commit_map.pop(revision) for revision in revisions
                    if revision in commit_map)
                if len(commits) == 0:
                    skipped_no_commits += 1
                    continue

                # Skip wptsync commits, since they are not like normal pushes made by developers.
                if any(repository.is_wptsync(commit) for commit in commits):
                    continue

                merged_commits = commit_features.merge_commits(commits)

                # XXX: For now, skip commits which are too large.
                # In the future we can either:
                #  - Improve shelve perf and go back to consider all files;
                #  - Consider only files which appear with a given frequency, like the "files" feature in commit_features;
                #  - Keep a limit of number of files.
                if len(merged_commits["files"]) > 50:
                    skipped_too_big_commits += 1
                    continue

                # If we considered all_runnables, we'd generate a huge amount of data.
                # We consider only the runnables which run in this push, and the possible and likely regressions
                # from this push. We can't consider all runnables because we can't be sure that a task that didn't
                # run on a push would have been successful.
                runnables_to_consider = list(
                    set(push_runnables + possible_regressions +
                        likely_regressions))

                if len(runnables_to_consider) == 0:
                    skipped_no_runnables += 1
                    continue

                # Sync DB every 250 pushes, so we clean up the shelve cache (we'd run out of memory otherwise!).
                if i % 250 == 0:
                    past_failures.sync()

                pushdate = dateutil.parser.parse(merged_commits["pushdate"])

                if granularity in ("group", "config_group"):
                    update_touched_together_gen.send(commits[0]["node"])

                result_data = []
                for data in test_scheduling.generate_data(
                        granularity,
                        past_failures,
                        merged_commits,
                        push_num,
                        runnables_to_consider,
                        possible_regressions,
                        likely_regressions,
                ):
                    if pushdate > HISTORY_DATE_START:
                        result_data.append(data)

                if pushdate > HISTORY_DATE_START:
                    saved_nodes.add(i)
                    yield {
                        "revs": revisions,
                        "data": result_data,
                    }

            if granularity == "group":
                try:
                    update_touched_together_gen.send(None)
                except StopIteration:
                    pass

            logger.info(f"saved push data nodes: {len(saved_nodes)}")
            logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
            logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
            logger.info(
                f"skipped {skipped_no_runnables} (no interesting runnables)")

            past_failures["push_num"] = push_num
            past_failures.close()

        # For the config/group granularity, we are only interested in the failing together DB.
        if granularity != "config_group":
            db.append(test_scheduling_db, generate_all_data())

            zstd_compress(test_scheduling_db)
            create_tar_zst(past_failures_db)

        if granularity == "group":
            create_tar_zst(touched_together_db)

        if granularity in ("label", "config_group"):
            create_tar_zst(failing_together_db)
Example #34
    def generate_push_data(
        self, granularity: str, training_months: int, reretrieve: int
    ) -> None:
        # We only train the model on the past training_months months, but we
        # retrieve half training_months more months of data than that to compute
        # the failure statistics.
        from_months = training_months + math.floor(training_months / 2)

        # We use the actual date instead of 'today-X' aliases to avoid mozci caching
        # this query.
        from_date = datetime.utcnow() - relativedelta(months=from_months)
        to_date = datetime.utcnow() - relativedelta(days=3)

        if granularity == "label":
            push_data_db = test_scheduling.PUSH_DATA_LABEL_DB
        elif granularity == "group":
            push_data_db = test_scheduling.PUSH_DATA_GROUP_DB
        elif granularity == "config_group":
            push_data_db = test_scheduling.PUSH_DATA_CONFIG_GROUP_DB

        def cache_key(push: mozci.push.Push) -> str:
            return f"push_data.{granularity}.{push.rev}"

        def generate(
            progress_bar: tqdm,
            pushes: List[mozci.push.Push],
            futures: List[concurrent.futures.Future],
        ) -> Generator[PushResult, None, None]:
            nonlocal reretrieve
            num_cached = 0
            num_pushes = len(pushes)

            for push, future in zip(pushes, futures):
                cached = future.result()

                # Regenerating a large amount of data when we update the mozci regression detection
                # algorithm is currently pretty slow, so we only regenerate a subset of pushes whenever we
                # run.
                if cached:
                    value, mozci_version = cached

                    # Regenerate results which were generated with an older version of mozci.
                    if reretrieve > 0 and mozci_version != MOZCI_VERSION:
                        cached = None
                        reretrieve -= 1

                if cached:
                    num_cached += 1
                    value, mozci_version = cached
                    assert len(value) == 5
                    yield value
                else:
                    logger.info(f"Analyzing {push.rev} at the {granularity} level...")

                    key = cache_key(push)

                    try:
                        if granularity == "label":
                            runnables = push.label_summaries.keys()
                        elif granularity == "group":
                            runnables = push.group_summaries.keys()
                        elif granularity == "config_group":
                            runnables = push.config_group_summaries.keys()

                        value = (
                            tuple(push.revs),
                            push.backedoutby or push.bustage_fixed_by,
                            tuple(runnables),
                            tuple(push.get_possible_regressions(granularity)),
                            tuple(push.get_likely_regressions(granularity)),
                        )
                        mozci.config.cache.put(
                            key,
                            (value, MOZCI_VERSION),
                            mozci.config["cache"]["retention"],
                        )
                        assert len(value) == 5
                        yield value
                    except mozci.errors.MissingDataError:
                        logger.warning(
                            f"Tasks for push {push.rev} can't be found on ActiveData"
                        )
                    except Exception:
                        traceback.print_exc()

                progress_bar.update(1)

            logger.info(f"{num_cached} pushes were already cached out of {num_pushes}")

        def retrieve_from_cache(push):
            return mozci.config.cache.get(cache_key(push))
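
        # A cached entry, when present, is a pair (push_result, mozci_version), where
        # push_result is the 5-tuple built in generate() above: (revs, backout/bustage-fix
        # rev, runnables, possible regressions, likely regressions).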

        total_pushes = len(
            mozci.push.make_push_objects(
                from_date=from_date.strftime("%Y-%m-%d"),
                to_date=to_date.strftime("%Y-%m-%d"),
                branch="autoland",
            )
        )

        with concurrent.futures.ThreadPoolExecutor() as executor:
            with tqdm(total=total_pushes) as progress_bar:
                # Run in batches of 7 days to avoid running out of memory (given that mozci pushes
                # consume a lot of memory, and they all have references to each other through "parent"
                # and "child" links so they are basically never released while we run this).
                while from_date < to_date:
                    next_from_date = from_date + relativedelta(days=7)
                    if next_from_date > to_date:
                        next_from_date = to_date

                    pushes = mozci.push.make_push_objects(
                        from_date=from_date.strftime("%Y-%m-%d"),
                        to_date=next_from_date.strftime("%Y-%m-%d"),
                        branch="autoland",
                    )

                    futures = [
                        executor.submit(retrieve_from_cache, push) for push in pushes
                    ]

                    try:
                        db.append(push_data_db, generate(progress_bar, pushes, futures))
                    except Exception:
                        for f in futures:
                            f.cancel()

                        raise

                    from_date = next_from_date

        zstd_compress(push_data_db)
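
The 7-day batching above boils down to a simple date-windowing loop; a standalone sketch of the same pattern with illustrative dates:

from datetime import datetime

from dateutil.relativedelta import relativedelta

from_date = datetime(2020, 1, 1)
to_date = datetime(2020, 2, 1)
while from_date < to_date:
    next_from_date = min(from_date + relativedelta(days=7), to_date)
    print(from_date.strftime("%Y-%m-%d"), "->", next_from_date.strftime("%Y-%m-%d"))
    from_date = next_from_date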
Example #35
def download_bugs_between(date_from, date_to, security=False):
    products = {
        "Add-on SDK",
        "Android Background Services",
        "Core",
        "Core Graveyard",
        "DevTools",
        "DevTools Graveyard",
        "External Software Affecting Firefox",
        "Firefox",
        "Firefox Graveyard",
        "Firefox Build System",
        "Firefox for Android",
        "Firefox for Android Graveyard",
        # 'Firefox for iOS',
        "Firefox Health Report",
        # 'Focus',
        # 'Hello (Loop)',
        "NSPR",
        "NSS",
        "Toolkit",
        "Toolkit Graveyard",
        "WebExtensions",
    }

    params = {
        "f1": "creation_ts",
        "o1": "greaterthan",
        "v1": date_from.strftime("%Y-%m-%d"),
        "f2": "creation_ts",
        "o2": "lessthan",
        "v2": date_to.strftime("%Y-%m-%d"),
        "product": products,
    }

    if not security:
        params["f3"] = "bug_group"
        params["o3"] = "isempty"

    params["count_only"] = 1
    r = requests.get("https://bugzilla.mozilla.org/rest/bug", params=params)
    r.raise_for_status()
    count = r.json()["bug_count"]
    del params["count_only"]

    params["limit"] = 100
    params["order"] = "bug_id"

    old_bug_ids = set(bug["id"] for bug in get_bugs())

    all_bugs = []

    with tqdm(total=count) as progress_bar:
        for offset in range(0, count, Bugzilla.BUGZILLA_CHUNK_SIZE):
            params["offset"] = offset

            new_bugs = _download(params)

            progress_bar.update(Bugzilla.BUGZILLA_CHUNK_SIZE)

            all_bugs += [bug for bug in new_bugs.values()]

            db.append(
                BUGS_DB,
                (bug for bug_id, bug in new_bugs.items() if bug_id not in old_bug_ids),
            )

    return all_bugs