Example #1
def fetch_untriaged(args):
    import json
    from datetime import date, datetime, timedelta

    from bugbug import bugzilla

    today = date.today()
    three_months_ago = today - timedelta(days=args.days_back)

    # Set bugzilla token and download bugs
    bugzilla.set_token(args.token)
    bug_ids = bugzilla.download_bugs_between(three_months_ago, today)

    # Get untriaged bugs
    bugs = bugzilla.get_bugs()
    untriaged_bugs = []
    for bug in bugs:
        if bug["id"] not in bug_ids:
            continue

        for history in bug["history"]:
            for change in history["changes"]:
                if (change["field_name"] == "component"
                        and change["removed"] == "Untriaged"):
                    untriaged_bugs.append(bug)

    with open("bugs-{}.json".format(datetime.now().strftime("%s")), "w") as f:
        json.dump(untriaged_bugs, f)

    return untriaged_bugs
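
A minimal driver for the function above might look like the following sketch; the flag names (--token, --days-back) are assumptions for illustration, not part of the original snippet.

# Hypothetical entry point; the flag names are assumptions.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Fetch recently untriaged bugs")
    parser.add_argument("--token", required=True, help="Bugzilla API token")
    parser.add_argument("--days-back", dest="days_back", type=int, default=90)

    untriaged = fetch_untriaged(parser.parse_args())
    print(f"Found {len(untriaged)} untriaged bugs")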
Example #2
    def retrieve_bugs(self):
        bugzilla.set_token(secrets[secrets.BUGZILLA_TOKEN])

        six_months_ago = datetime.utcnow() - relativedelta(months=6)
        two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
        logger.info('Downloading bugs from {} to {}'.format(
            two_years_and_six_months_ago, six_months_ago))
        bugzilla.download_bugs_between(two_years_and_six_months_ago,
                                       six_months_ago)

        logger.info('Downloading labelled bugs')
        bug_ids = labels.get_all_bug_ids()
        bugzilla.download_bugs(bug_ids)

        # Try to re-download inconsistent bugs, up to three times.
        for i in range(3):
            bug_ids = bug_snapshot.get_inconsistencies()
            if len(bug_ids) == 0:
                break

            logger.info(
                f'Re-downloading {len(bug_ids)} bugs, as they were inconsistent'
            )
            bugzilla.delete_bugs(bug_ids)
            bugzilla.download_bugs(bug_ids)

        self.compress_file('data/bugs.json')
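
The compress_file helper is not shown in this excerpt. As a rough stand-in, a plausible xz-based implementation could look like the sketch below; the real project helper may differ (later examples use zstd_compress instead).

# Sketch of a compress_file-style helper, assuming xz output next to the input.
import lzma
import shutil

def compress_file(path: str) -> None:
    with open(path, "rb") as src, lzma.open(path + ".xz", "wb") as dst:
        shutil.copyfileobj(src, dst)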
Example #3
    def __init__(self, repo_dir: str) -> None:
        repository.clone(repo_dir)

        logger.info("Downloading commits database...")
        assert db.download(repository.COMMITS_DB, support_files_too=True)

        logger.info("Updating commits DB...")
        # Exhaust the iterator so `commit` ends up as the last commit in the
        # DB; new commits are then downloaded starting from its children.
        for commit in repository.get_commits():
            pass

        repository.download_commits(
            repo_dir,
            rev_start="children({})".format(commit["node"]),
        )

        logger.info("Downloading revisions database...")
        assert db.download(phabricator.REVISIONS_DB)

        logger.info("Downloading bugs database...")
        assert db.download(bugzilla.BUGS_DB)

        logger.info("Download commit classifications...")
        assert db.download(BUG_FIXING_COMMITS_DB)

        self.regressor_model = download_and_load_model("regressor")

        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
        phabricator.set_api_key(get_secret("PHABRICATOR_URL"),
                                get_secret("PHABRICATOR_TOKEN"))
Example #4
def fetch_untriaged(args):
    import json
    from datetime import date, datetime, timedelta

    from bugbug import bugzilla

    today = date.today()
    three_months_ago = today - timedelta(days=args.days_back)

    # Set bugzilla token and download bugs
    bugzilla.set_token(args.token)
    bug_ids = bugzilla.download_bugs_between(three_months_ago, today)

    # Get untriaged bugs
    bugs = bugzilla.get_bugs()
    untriaged_bugs = []
    for bug in bugs:
        if bug['id'] not in bug_ids:
            continue

        for history in bug['history']:
            for change in history['changes']:
                if change['field_name'] == 'component' and change['removed'] == 'Untriaged':
                    untriaged_bugs.append(bug)

    with open('bugs-{}.json'.format(datetime.now().strftime('%s')), 'w') as f:
        json.dump(untriaged_bugs, f)

    return untriaged_bugs
Example #5
    def retrieve_bugs(self):
        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

        six_months_ago = datetime.utcnow() - relativedelta(months=6)
        two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
        logger.info(
            "Downloading bugs from {} to {}".format(
                two_years_and_six_months_ago, six_months_ago
            )
        )
        bugzilla.download_bugs_between(two_years_and_six_months_ago, six_months_ago)

        logger.info("Downloading labelled bugs")
        bug_ids = labels.get_all_bug_ids()
        bugzilla.download_bugs(bug_ids)

        # Try to re-download inconsistent bugs, up to three times.
        for i in range(3):
            bug_ids = bug_snapshot.get_inconsistencies()
            if len(bug_ids) == 0:
                break

            logger.info(
                f"Re-downloading {len(bug_ids)} bugs, as they were inconsistent"
            )
            bugzilla.delete_bugs(bug_ids)
            bugzilla.download_bugs(bug_ids)

        self.compress_file("data/bugs.json")
Example #6
File: models.py Project: msk4862/bugbug
def classify_bug(model_name,
                 bug_ids,
                 bugzilla_token,
                 expiration=DEFAULT_EXPIRATION_TTL):
    # This runs in a process worker, so it is safe to set the token here.
    bug_ids_set = set(map(int, bug_ids))
    bugzilla.set_token(bugzilla_token)
    bugs = bugzilla.get(bug_ids)

    redis_url = os.environ.get("REDIS_URL", "redis://localhost/0")
    redis = Redis.from_url(redis_url)

    missing_bugs = bug_ids_set.difference(bugs.keys())

    for bug_id in missing_bugs:
        redis_key = f"result_{model_name}_{bug_id}"

        # TODO: Find a better error format
        encoded_data = json.dumps({"available": False})

        redis.set(redis_key, encoded_data)
        redis.expire(redis_key, expiration)

    if not bugs:
        return "NOK"

    # TODO: Cache the model in process memory; it's tricky because the RQ
    # worker forks before starting.
    model = load_model(model_name)

    model_extra_data = model.get_extra_data()

    # TODO: Classify could choke on a single bug, which would make the whole
    # job fail. What should we do here?
    probs = model.classify(list(bugs.values()), True)
    indexes = probs.argmax(axis=-1)
    suggestions = model.clf._le.inverse_transform(indexes)

    probs_list = probs.tolist()
    indexes_list = indexes.tolist()
    suggestions_list = suggestions.tolist()

    for i, bug_id in enumerate(bugs.keys()):
        data = {
            "prob": probs_list[i],
            "index": indexes_list[i],
            "suggestion": suggestions_list[i],
            "extra_data": model_extra_data,
        }

        encoded_data = json.dumps(data)

        redis_key = f"result_{model_name}_{bug_id}"

        redis.set(redis_key, encoded_data)
        redis.expire(redis_key, expiration)

    return "OK"
Example #7
File: models.py Project: calixteman/bugbug
def classify_bug(model_name, bug_ids, bugzilla_token):
    from bugbug_http.app import JobInfo

    # This runs in a process worker, so it is safe to set the token here.
    bug_ids_set = set(map(int, bug_ids))
    bugzilla.set_token(bugzilla_token)
    bugs = bugzilla.get(bug_ids)

    missing_bugs = bug_ids_set.difference(bugs.keys())

    for bug_id in missing_bugs:
        job = JobInfo(classify_bug, model_name, bug_id)

        # TODO: Find a better error format
        encoded_data = json.dumps({"available": False})
        setkey(job.result_key, encoded_data)

    if not bugs:
        return "NOK"

    model = get_model(model_name)

    if not model:
        LOGGER.info("Missing model %r, aborting" % model_name)
        return "NOK"

    model_extra_data = model.get_extra_data()

    # TODO: Classify could choke on a single bug, which would make the whole
    # job fail. What should we do here?
    probs = model.classify(list(bugs.values()), True)
    indexes = probs.argmax(axis=-1)
    suggestions = model.le.inverse_transform(indexes)

    probs_list = probs.tolist()
    indexes_list = indexes.tolist()
    suggestions_list = suggestions.tolist()

    for i, bug_id in enumerate(bugs.keys()):
        data = {
            "prob": probs_list[i],
            "index": indexes_list[i],
            "class": suggestions_list[i],
            "extra_data": model_extra_data,
        }

        encoded_data = json.dumps(data)

        job = JobInfo(classify_bug, model_name, bug_id)
        setkey(job.result_key, encoded_data)

        # Save the bug's last change time
        setkey(job.change_time_key,
               bugs[bug_id]["last_change_time"],
               expiration=0)

    return "OK"
Example #8
File: models.py Project: pdehaan/bugbug
def classify_bug(model_name: str, bug_ids: Sequence[int],
                 bugzilla_token: str) -> str:
    from bugbug_http.app import JobInfo

    # This runs in a process worker, so it is safe to set the token here.
    bug_ids_set = set(map(int, bug_ids))
    bugzilla.set_token(bugzilla_token)

    bugs = {}
    for i in range(0, len(bug_ids), Bugzilla.BUGZILLA_CHUNK_SIZE):
        bugs.update(bugzilla.get(bug_ids[i:(i +
                                            Bugzilla.BUGZILLA_CHUNK_SIZE)]))

    missing_bugs = bug_ids_set.difference(bugs.keys())

    for bug_id in missing_bugs:
        job = JobInfo(classify_bug, model_name, bug_id)

        # TODO: Find a better error format
        setkey(job.result_key, orjson.dumps({"available": False}))

    if not bugs:
        return "NOK"

    model = MODEL_CACHE.get(model_name)

    if not model:
        LOGGER.info("Missing model %r, aborting" % model_name)
        return "NOK"

    model_extra_data = model.get_extra_data()

    # TODO: Classify could choke on a single bug, which would make the whole
    # job fail. What should we do here?
    probs = model.classify(list(bugs.values()), True)
    indexes = probs.argmax(axis=-1)
    suggestions = model.le.inverse_transform(indexes)

    probs_list = probs.tolist()
    indexes_list = indexes.tolist()
    suggestions_list = suggestions.tolist()

    for i, bug_id in enumerate(bugs.keys()):
        data = {
            "prob": probs_list[i],
            "index": indexes_list[i],
            "class": suggestions_list[i],
            "extra_data": model_extra_data,
        }

        job = JobInfo(classify_bug, model_name, bug_id)
        setkey(job.result_key, orjson.dumps(data), compress=True)

        # Save the bug's last change time
        setkey(job.change_time_key, bugs[bug_id]["last_change_time"].encode())

    return "OK"
Example #9
    def retrieve_bugs(self):
        bugzilla.set_token(secrets[secrets.BUGZILLA_TOKEN])

        six_months_ago = datetime.utcnow() - timedelta(182)
        two_years_and_six_months_ago = six_months_ago - timedelta(365 * 2)
        bugzilla.download_bugs_between(two_years_and_six_months_ago,
                                       six_months_ago)

        bug_ids = labels.get_all_bug_ids()
        bugzilla.download_bugs(bug_ids)

        self.compress_file('data/bugs.json')
Example #10
    def go(self):
        # Download models that were trained by bugbug_train.
        with ThreadPoolExecutorResult(max_workers=3) as executor:
            f1 = executor.submit(lambda: urlretrieve(
                'https://index.taskcluster.net/v1/task/project.releng.services.project.testing.bugbug_train.latest/artifacts/public/bug.model.xz',
                'bug.model.xz'))  # noqa
            f1.add_done_callback(lambda f: self.decompress_file('bug.model'))

            f2 = executor.submit(lambda: urlretrieve(
                'https://index.taskcluster.net/v1/task/project.releng.services.project.testing.bugbug_train.latest/artifacts/public/regression.model.xz',
                'regression.model.xz'))  # noqa
            f2.add_done_callback(
                lambda f: self.decompress_file('regression.model'))

            f3 = executor.submit(lambda: urlretrieve(
                'https://index.taskcluster.net/v1/task/project.releng.services.project.testing.bugbug_train.latest/artifacts/public/tracking.model.xz',
                'tracking.model.xz'))  # noqa
            f3.add_done_callback(
                lambda f: self.decompress_file('tracking.model'))

        # Download bugs from the last week that we want to analyze.
        bugzilla.set_token(secrets[secrets.BUGZILLA_TOKEN])

        today = datetime.utcnow()
        one_week_ago = today - timedelta(7)
        bugzilla.download_bugs_between(one_week_ago, today)

        # Eval classifier for bug-vs-nonbug.
        self.eval_bug()

        # Eval classifier for regression-vs-nonregression.
        self.eval_regression()

        # Eval classifier for tracking bugs.
        self.eval_tracking()

        # Index the task in the TaskCluster index.
        self.index_service.insertTask(
            'project.releng.services.project.{}.bugbug_eval.latest'.format(
                secrets[secrets.APP_CHANNEL]),
            {
                'taskId': os.environ['TASK_ID'],
                'rank': 0,
                'data': {},
                'expires': (datetime.utcnow() +
                            timedelta(31)).strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
            })
Example #11
    def retrieve_bugs(self):
        bugzilla.set_token(secrets[secrets.BUGZILLA_TOKEN])

        six_months_ago = datetime.utcnow() - timedelta(182)
        two_years_and_six_months_ago = six_months_ago - timedelta(365 * 2)
        logger.info('Downloading bugs from {} to {}'.format(
            two_years_and_six_months_ago, six_months_ago))
        bugzilla.download_bugs_between(two_years_and_six_months_ago,
                                       six_months_ago)

        logger.info('Downloading labelled bugs')
        bug_ids = labels.get_all_bug_ids()
        bugzilla.download_bugs(bug_ids)

        self.compress_file('data/bugs.json')
Example #12
def generate_sheet(model_name, token, days, threshold):
    model_file_name = f"{model_name}model"

    assert os.path.exists(
        model_file_name
    ), f"{model_file_name} does not exist. Train the model with trainer.py first."

    model_class = get_model_class(model_name)
    model = model_class.load(model_file_name)

    today = datetime.utcnow()
    start_date = today - timedelta(days)
    bugzilla.set_token(token)
    bug_ids = bugzilla.get_ids_between(start_date, today)
    bugs = bugzilla.get(bug_ids)

    print(f"Classifying {len(bugs)} bugs...")

    rows = [["Bug", f"{model_name}(model)", model_name, "Title"]]

    for bug in bugs.values():
        p = model.classify(bug, probabilities=True)
        probability = p[0]
        if len(probability) > 2:
            index = np.argmax(probability)
            prediction = model.class_names[index]
        else:
            prediction = "y" if probability[1] >= threshold else "n"

        rows.append(
            [
                f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]}',
                prediction,
                "",
                bug["summary"],
            ]
        )

    os.makedirs("sheets", exist_ok=True)
    with open(
        os.path.join(
            "sheets",
            f'{model_name}-{datetime.utcnow().strftime("%Y-%m-%d")}-labels.csv',
        ),
        "w",
    ) as f:
        writer = csv.writer(f)
        writer.writerows(rows)
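
Invoked with placeholder values, the function above writes a dated CSV under sheets/; the model name and threshold here are illustrative only.

# Hypothetical invocation; "defect" and 0.7 are placeholder values.
import os

generate_sheet("defect", token=os.environ["BUGZILLA_TOKEN"], days=7, threshold=0.7)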
Example #13
File: app.py Project: mozilla/bugbug
def get_bugs_last_change_time(bug_ids):
    bugzilla.set_token(BUGZILLA_TOKEN)

    old_CHUNK_SIZE = Bugzilla.BUGZILLA_CHUNK_SIZE
    try:
        Bugzilla.BUGZILLA_CHUNK_SIZE = 700

        bugs = {}

        def bughandler(bug):
            bugs[bug["id"]] = bug["last_change_time"]

        Bugzilla(
            bugids=bug_ids,
            bughandler=bughandler,
            include_fields=["id", "last_change_time"],
        ).get_data().wait()
    finally:
        Bugzilla.BUGZILLA_CHUNK_SIZE = old_CHUNK_SIZE

    return bugs
    def __init__(self, repo_dir: str) -> None:
        self.risk_bands = sorted(
            (
                parse_risk_band(risk_band)
                for risk_band in get_secret("REGRESSOR_RISK_BANDS").split(";")
            ),
            key=lambda x: x[1],
        )

        repository.clone(repo_dir)

        logger.info("Downloading commits database...")
        assert db.download(repository.COMMITS_DB, support_files_too=True)

        logger.info("Updating commits DB...")
        # Exhaust the iterator so `commit` ends up as the last commit in the
        # DB; new commits are then downloaded starting from its children.
        for commit in repository.get_commits():
            pass

        repository.download_commits(
            repo_dir,
            rev_start="children({})".format(commit["node"]),
        )

        logger.info("Downloading revisions database...")
        assert db.download(phabricator.REVISIONS_DB)

        logger.info("Downloading bugs database...")
        assert db.download(bugzilla.BUGS_DB)

        logger.info("Download commit classifications...")
        assert db.download(BUG_FIXING_COMMITS_DB)

        self.regressor_model = cast(
            RegressorModel, RegressorModel.load(download_model("regressor"))
        )

        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
        phabricator.set_api_key(
            get_secret("PHABRICATOR_URL"), get_secret("PHABRICATOR_TOKEN")
        )
Example #15
def fetch_untriaged(args):
    import json
    from datetime import date, datetime, timedelta

    from bugbug import bugzilla

    # Set bugzilla token and download bugs
    bugzilla.set_token(args.token)
    bug_ids = bugzilla.get_ids_between(date.today() -
                                       timedelta(days=args.days_back))
    bugs = bugzilla.get(bug_ids)

    # Get untriaged bugs
    untriaged_bugs = []
    for bug in bugs.values():
        for history in bug["history"]:
            for change in history["changes"]:
                if (change["field_name"] == "component"
                        and change["removed"] == "Untriaged"):
                    untriaged_bugs.append(bug)

    with open("bugs-{}.json".format(datetime.now().strftime("%s")), "w") as f:
        json.dump(untriaged_bugs, f)

    return untriaged_bugs
Example #16
def generate_sheet(model_name, token):
    model_file_name = f"{model_name}model"

    assert os.path.exists(
        model_file_name
    ), f"{model_file_name} does not exist. Train the model with trainer.py first."

    model_class = get_model_class(model_name)
    model = model_class.load(model_file_name)

    today = datetime.utcnow()
    a_week_ago = today - timedelta(7)
    bugzilla.set_token(token)
    bug_ids = bugzilla.get_ids_between(a_week_ago, today)
    bugs = bugzilla.get(bug_ids)

    print(f"Classifying {len(bugs)} bugs...")

    rows = [["Bug", f"{model_name}(model)", model_name, "Title"]]

    for bug in bugs.values():
        p = model.classify(bug, probabilities=True)
        rows.append([
            f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]}',
            "y" if p[0][1] >= 0.7 else "n",
            "",
            bug["summary"],
        ])

    os.makedirs("sheets", exist_ok=True)
    with open(
            os.path.join(
                "sheets",
                f'{model_name}-{datetime.utcnow().strftime("%Y-%m-%d")}-labels.csv',
            ),
            "w",
    ) as f:
        writer = csv.writer(f)
        writer.writerows(rows)
Example #17
    def go(self):
        # Download models that were trained by bugbug_train.
        with ThreadPoolExecutorResult(max_workers=3) as executor:
            f1 = executor.submit(lambda: urlretrieve('https://index.taskcluster.net/v1/task/project.releng.services.project.testing.bugbug_train.latest/artifacts/public/bugmodel.xz', 'bugmodel.xz'))  # noqa
            f1.add_done_callback(lambda f: self.decompress_file('bugmodel'))

            f2 = executor.submit(lambda: urlretrieve('https://index.taskcluster.net/v1/task/project.releng.services.project.testing.bugbug_train.latest/artifacts/public/regressionmodel.xz', 'regressionmodel.xz'))  # noqa
            f2.add_done_callback(lambda f: self.decompress_file('regressionmodel'))

            f3 = executor.submit(lambda: urlretrieve('https://index.taskcluster.net/v1/task/project.releng.services.project.testing.bugbug_train.latest/artifacts/public/trackingmodel.xz', 'trackingmodel.xz'))  # noqa
            f3.add_done_callback(lambda f: self.decompress_file('trackingmodel'))

        # Download bugs from the last week that we want to analyze.
        bugzilla.set_token(secrets[secrets.BUGZILLA_TOKEN])

        today = datetime.utcnow()
        one_week_ago = today - timedelta(7)
        bugzilla.download_bugs_between(one_week_ago, today)

        # Eval classifier for bug-vs-nonbug.
        self.eval_bug()

        # Eval classifier for regression-vs-nonregression.
        self.eval_regression()

        # Eval classifier for tracking bugs.
        self.eval_tracking()

        # Index the task in the TaskCluster index.
        self.index_service.insertTask(
            'project.releng.services.project.{}.bugbug_eval.latest'.format(secrets[secrets.APP_CHANNEL]),
            {
                'taskId': os.environ['TASK_ID'],
                'rank': 0,
                'data': {},
                'expires': (datetime.utcnow() + timedelta(31)).strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
            }
        )
    def __init__(self, repo_dir: str) -> None:
        self.risk_bands = sorted(
            (parse_risk_band(risk_band)
             for risk_band in get_secret("REGRESSOR_RISK_BANDS").split(";")),
            key=lambda x: x[1],
        )

        repository.clone(repo_dir)

        logger.info("Downloading commits database...")
        assert db.download(repository.COMMITS_DB, support_files_too=True)

        logger.info("Updating commits DB...")
        # Exhaust the iterator so `commit` ends up as the last commit in the
        # DB; new commits are then downloaded starting from its children.
        for commit in repository.get_commits():
            pass

        repository.download_commits(
            repo_dir,
            rev_start="children({})".format(commit["node"]),
        )

        # Some commits that were already in the DB from the previous run might need
        # to be updated (e.g. coverage information).
        repository.update_commits()

        logger.info("Downloading revisions database...")
        assert db.download(phabricator.REVISIONS_DB)

        logger.info("Downloading bugs database...")
        assert db.download(bugzilla.BUGS_DB)

        logger.info("Download commit classifications...")
        assert db.download(BUG_FIXING_COMMITS_DB)

        self.regressor_model = cast(
            RegressorModel, RegressorModel.load(download_model("regressor")))

        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
        phabricator.set_api_key(get_secret("PHABRICATOR_URL"),
                                get_secret("PHABRICATOR_TOKEN"))

        self.path_to_component = repository.get_component_mapping()

        self.past_regressions_by = {}
        self.past_fixed_bugs_by = {}
        self.past_regression_blocked_bugs_by = {}
        self.past_fixed_bug_blocked_bugs_by = {}

        for dimension in ["component", "directory", "file", "function"]:
            self.past_regressions_by[dimension] = _download_past_bugs(
                PAST_REGRESSIONS_BY_URL.format(dimension=dimension))
            self.past_fixed_bugs_by[dimension] = _download_past_bugs(
                PAST_FIXED_BUGS_BY_URL.format(dimension=dimension))
            self.past_regression_blocked_bugs_by[
                dimension] = _download_past_bugs(
                    PAST_REGRESSION_BLOCKED_BUGS_BY_URL.format(
                        dimension=dimension))
            self.past_fixed_bug_blocked_bugs_by[
                dimension] = _download_past_bugs(
                    PAST_FIXED_BUG_BLOCKED_BUGS_BY_URL.format(
                        dimension=dimension))
Example #19
def main(args):
    model_file_name = "{}{}model".format(
        args.goal, "" if args.classifier == "default" else args.classifier)

    if args.goal == "component":
        if args.classifier == "default":
            model_class_name = "component"
        else:
            model_class_name = "component_nn"
    else:
        model_class_name = args.goal

    model_class = get_model_class(model_class_name)

    if args.train:
        db.download(bugzilla.BUGS_DB)
        db.download(repository.COMMITS_DB)

        historical_supported_tasks = [
            "defect",
            "bugtype",
            "defectenhancementtask",
            "regression",
        ]

        if args.goal in historical_supported_tasks:
            model = model_class(args.lemmatization, args.historical)
        elif args.goal == "duplicate":
            model = model_class(args.training_set_size, args.lemmatization)
        else:
            model = model_class(args.lemmatization)
        model.train()
    else:
        model = model_class.load(model_file_name)

    if args.classify:
        for bug in bugzilla.get_bugs():
            print(
                f'https://bugzilla.mozilla.org/show_bug.cgi?id={ bug["id"] } - { bug["summary"]} '
            )

            if model.calculate_importance:
                probas, importance = model.classify(bug,
                                                    probabilities=True,
                                                    importances=True)

                feature_names = model.get_feature_names()
                for i, (imp, index,
                        is_positive) in enumerate(importance["importances"]):
                    print(
                        f'{i + 1}. \'{feature_names[int(index)]}\' ({"+" if is_positive else "-"}{imp})'
                    )
            else:
                probas = model.classify(bug,
                                        probabilities=True,
                                        importances=False)

            if np.argmax(probas) == 1:
                print(f"Positive! {probas}")
            else:
                print(f"Negative! {probas}")
            input()

    if args.generate_sheet:
        assert (args.token is not None
                ), "A Bugzilla token should be set in order to download bugs"
        today = datetime.utcnow()
        a_week_ago = today - timedelta(7)
        bugzilla.set_token(args.token)
        bugs = bugzilla.download_bugs_between(a_week_ago, today)

        print(f"Classifying {len(bugs)} bugs...")

        rows = [["Bug", f"{args.goal}(model)", args.goal, "Title"]]

        for bug in bugs:
            p = model.classify(bug, probabilities=True)
            rows.append([
                f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]}',
                "y" if p[0][1] >= 0.7 else "n",
                "",
                bug["summary"],
            ])

        os.makedirs("sheets", exist_ok=True)
        with open(
                os.path.join(
                    "sheets",
                    f'{args.goal}-{datetime.utcnow().strftime("%Y-%m-%d")}-labels.csv',
                ),
                "w",
        ) as f:
            writer = csv.writer(f)
            writer.writerows(rows)
Example #20
File: run.py Project: mythmon/bugbug
            else:
                probas = model.classify(bug, probabilities=True, importances=False)

            if np.argmax(probas) == 1:
                print(f"Positive! {probas}")
            else:
                print(f"Negative! {probas}")
            input()

    if args.generate_sheet:
        assert (
            args.token is not None
        ), "A Bugzilla token should be set in order to download bugs"
        today = datetime.utcnow()
        a_week_ago = today - timedelta(7)
        bugzilla.set_token(args.token)
        bugs = bugzilla.download_bugs_between(a_week_ago, today)

        print(f"Classifying {len(bugs)} bugs...")

        rows = [["Bug", f"{args.goal}(model)", args.goal, "Title"]]

        for bug in bugs:
            p = model.classify(bug, probabilities=True)
            rows.append(
                [
                    f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]}',
                    "y" if p[0][1] >= 0.7 else "n",
                    "",
                    bug["summary"],
                ]
Example #21
    def retrieve_bugs(self, limit: Optional[int] = None) -> None:
        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

        db.download(bugzilla.BUGS_DB)

        # Get IDs of bugs changed since last run.
        last_modified = db.last_modified(bugzilla.BUGS_DB)
        logger.info(
            f"Retrieving IDs of bugs modified since the last run on {last_modified}"
        )
        changed_ids = set(
            bugzilla.get_ids({
                "f1": "delta_ts",
                "o1": "greaterthaneq",
                "v1": last_modified.date()
            }))
        logger.info(f"Retrieved {len(changed_ids)} IDs.")

        all_components = bugzilla.get_product_component_count(9999)

        deleted_component_ids = set(
            bug["id"] for bug in bugzilla.get_bugs() if "{}::{}".format(
                bug["product"], bug["component"]) not in all_components)
        logger.info(
            f"{len(deleted_component_ids)} bugs belonging to deleted components"
        )
        changed_ids |= deleted_component_ids

        # Get IDs of bugs between (two years and six months ago) and now.
        two_years_and_six_months_ago = datetime.utcnow() - relativedelta(
            years=2, months=6)
        logger.info(f"Retrieving bug IDs since {two_years_and_six_months_ago}")
        timespan_ids = bugzilla.get_ids_between(two_years_and_six_months_ago)
        if limit:
            timespan_ids = timespan_ids[-limit:]
        logger.info(f"Retrieved {len(timespan_ids)} IDs.")

        # Get IDs of labelled bugs.
        labelled_bug_ids = labels.get_all_bug_ids()
        if limit:
            labelled_bug_ids = labelled_bug_ids[-limit:]
        logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

        # Get the commits DB, as we need it to get the bug IDs linked to recent commits.
        # XXX: Temporarily avoid downloading the commits DB when a limit is set, to
        # avoid the integration test failing when the commits DB is bumped.
        if limit is None:
            assert db.download(repository.COMMITS_DB)

        # Get IDs of bugs linked to commits (used for some commit-based models, e.g. backout and regressor).
        start_date = datetime.now() - relativedelta(years=3)
        commit_bug_ids = list(
            set(commit["bug_id"] for commit in repository.get_commits()
                if commit["bug_id"]
                and dateutil.parser.parse(commit["pushdate"]) >= start_date))
        if limit:
            commit_bug_ids = commit_bug_ids[-limit:]
        logger.info(
            f"{len(commit_bug_ids)} bugs linked to commits to download.")

        # Get IDs of bugs which are regressions, bugs which caused regressions (useful for the regressor model),
        # and blocked bugs.
        regression_related_ids: List[int] = list(
            set(
                sum(
                    (bug["regressed_by"] + bug["regressions"] + bug["blocks"]
                     for bug in bugzilla.get_bugs()),
                    [],
                )))
        if limit:
            regression_related_ids = regression_related_ids[-limit:]
        logger.info(
            f"{len(regression_related_ids)} bugs which caused regressions fixed by commits."
        )

        # Get IDs of bugs linked to intermittent failures.
        test_failure_bug_ids = [
            item["bug_id"] for item in test_scheduling.get_failure_bugs(
                two_years_and_six_months_ago, datetime.utcnow())
        ]
        if limit:
            test_failure_bug_ids = test_failure_bug_ids[-limit:]
        logger.info(f"{len(test_failure_bug_ids)} bugs about test failures.")

        all_ids = (timespan_ids + labelled_bug_ids + commit_bug_ids +
                   regression_related_ids + test_failure_bug_ids)
        all_ids_set = set(all_ids)

        # We have to redownload bugs that were changed since the last download.
        # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled.
        bugzilla.delete_bugs(lambda bug: bug["id"] in changed_ids or bug["id"]
                             not in all_ids_set)

        new_bugs = bugzilla.download_bugs(all_ids)

        # Get regression_related_ids again (the set could have changed after downloading new bugs).
        for i in range(7):
            regression_related_ids = list(
                set(
                    sum(
                        (bug["regressed_by"] + bug["regressions"] +
                         bug["blocks"] for bug in new_bugs),
                        [],
                    )))
            logger.info(
                f"{len(regression_related_ids)} bugs which caused regressions fixed by commits."
            )
            if limit:
                regression_related_ids = regression_related_ids[-limit:]

            # If we got all bugs we needed, break.
            if set(regression_related_ids).issubset(all_ids):
                break

            new_bugs = bugzilla.download_bugs(regression_related_ids)

        # Try to re-download inconsistent bugs, up to twice.
        inconsistent_bugs = bugzilla.get_bugs(include_invalid=True)
        for i in range(2):
            # We look for inconsistencies in all bugs first; on following passes, we
            # only check bugs that were found to be inconsistent in the first pass.
            inconsistent_bugs = bug_snapshot.get_inconsistencies(
                inconsistent_bugs)
            inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

            if len(inconsistent_bug_ids) == 0:
                break

            logger.info(
                f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
            )
            bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
            bugzilla.download_bugs(inconsistent_bug_ids)

        # TODO: Figure out why some bugs come back without a history field.
        missing_history_bug_ids = {
            bug["id"]
            for bug in bugzilla.get_bugs() if "history" not in bug
        }
        bugzilla.delete_bugs(lambda bug: bug["id"] in missing_history_bug_ids)
        logger.info(
            f"Deleted {len(missing_history_bug_ids)} bugs as we couldn't retrieve their history"
        )

        zstd_compress(bugzilla.BUGS_DB)
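
Note that the delete_bugs signature differs across these snapshots of the project: older examples (e.g. #2) pass a list of bug IDs, while newer ones (e.g. #21, #22, #24) pass a predicate that is evaluated against each stored bug.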
Example #22
    def retrieve_bugs(self, limit=None):
        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

        db.download(bugzilla.BUGS_DB)

        # Get IDs of bugs changed since last run.
        last_modified = db.last_modified(bugzilla.BUGS_DB)
        logger.info(
            f"Retrieving IDs of bugs modified since the last run on {last_modified}"
        )
        changed_ids = bugzilla.get_ids(
            {"f1": "delta_ts", "o1": "greaterthaneq", "v1": last_modified.date()}
        )
        logger.info(f"Retrieved {len(changed_ids)} IDs.")

        # Get IDs of bugs between (two years and six months ago) and (six months ago).
        six_months_ago = datetime.utcnow() - relativedelta(months=6)
        two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
        logger.info(
            f"Retrieving bug IDs from {two_years_and_six_months_ago} to {six_months_ago}"
        )
        timespan_ids = bugzilla.get_ids_between(
            two_years_and_six_months_ago, six_months_ago
        )
        if limit:
            timespan_ids = timespan_ids[:limit]
        logger.info(f"Retrieved {len(timespan_ids)} IDs.")

        # Get IDs of labelled bugs.
        labelled_bug_ids = labels.get_all_bug_ids()
        if limit:
            labelled_bug_ids = labelled_bug_ids[:limit]
        logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

        # Get the commits DB, as we need it to get the bug IDs linked to recent commits.
        # XXX: Temporarily avoid downloading the commits DB when a limit is set, to
        # avoid the integration test failing when the commits DB is bumped.
        if limit is None:
            assert db.download(repository.COMMITS_DB)

        # Get IDs of bugs linked to commits (used for some commit-based models, e.g. backout and regressor).
        start_date = datetime.now() - relativedelta(years=3)
        commit_bug_ids = [
            commit["bug_id"]
            for commit in repository.get_commits()
            if commit["bug_id"]
            and dateutil.parser.parse(commit["pushdate"]) >= start_date
        ]
        if limit:
            commit_bug_ids = commit_bug_ids[-limit:]
        logger.info(f"{len(commit_bug_ids)} bugs linked to commits to download.")

        # Get IDs of bugs which are regressions and bugs which caused regressions (useful for the regressor model).
        regressed_by_bug_ids = sum(
            (bug["regressed_by"] + bug["regressions"] for bug in bugzilla.get_bugs()),
            [],
        )
        if limit:
            regressed_by_bug_ids = regressed_by_bug_ids[-limit:]
        logger.info(
            f"{len(regressed_by_bug_ids)} bugs which caused regressions fixed by commits."
        )

        all_ids = (
            timespan_ids + labelled_bug_ids + commit_bug_ids + regressed_by_bug_ids
        )
        all_ids_set = set(all_ids)

        # We have to redownload bugs that were changed since the last download.
        # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled.
        bugzilla.delete_bugs(
            lambda bug: bug["id"] in changed_ids or bug["id"] not in all_ids_set
        )

        bugzilla.download_bugs(all_ids)

        # Get regressed_by_bug_ids again (the set could have changed after downloading new bugs).
        regressed_by_bug_ids = sum(
            (bug["regressed_by"] + bug["regressions"] for bug in bugzilla.get_bugs()),
            [],
        )
        logger.info(
            f"{len(regressed_by_bug_ids)} bugs which caused regressions fixed by commits."
        )

        bugzilla.download_bugs(regressed_by_bug_ids)

        # Try to re-download inconsistent bugs, up to three times.
        inconsistent_bugs = bugzilla.get_bugs(include_invalid=True)
        for i in range(3):
            # We look for inconsistencies in all bugs first; on following passes, we
            # only check bugs that were found to be inconsistent in the first pass.
            inconsistent_bugs = bug_snapshot.get_inconsistencies(inconsistent_bugs)
            inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

            if len(inconsistent_bug_ids) == 0:
                break

            logger.info(
                f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
            )
            bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
            bugzilla.download_bugs(inconsistent_bug_ids)

        zstd_compress("data/bugs.json")
Example #23
                probas = model.classify(bug,
                                        probabilities=True,
                                        importances=False)

            if np.argmax(probas) == 1:
                print(f"Positive! {probas}")
            else:
                print(f"Negative! {probas}")
            input()

    if args.generate_sheet:
        assert (args.token is not None
                ), "A Bugzilla token should be set in order to download bugs"
        today = datetime.utcnow()
        a_week_ago = today - timedelta(7)
        bugzilla.set_token(args.token)
        bugs = bugzilla.download_bugs_between(a_week_ago, today)

        print(f"Classifying {len(bugs)} bugs...")

        rows = [["Bug", f"{args.goal}(model)", args.goal, "Title"]]

        for bug in bugs:
            p = model.classify(bug, probabilities=True)
            rows.append([
                f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]}',
                "y" if p[0][1] >= 0.7 else "n",
                "",
                bug["summary"],
            ])
Example #24
    def retrieve_bugs(self):
        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

        db.download_version(bugzilla.BUGS_DB)
        if not db.is_old_version(bugzilla.BUGS_DB):
            db.download(bugzilla.BUGS_DB)

        # Get IDs of bugs changed since last run.
        last_modified = db.last_modified(bugzilla.BUGS_DB)
        logger.info(
            f"Retrieving IDs of bugs modified since the last run on {last_modified}"
        )
        changed_ids = bugzilla.get_ids({
            "f1": "delta_ts",
            "o1": "greaterthaneq",
            "v1": last_modified.date()
        })
        logger.info(f"Retrieved {len(changed_ids)} IDs.")

        # Get IDs of bugs between (two years and six months ago) and (six months ago).
        six_months_ago = datetime.utcnow() - relativedelta(months=6)
        two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
        logger.info(
            f"Retrieving bug IDs from {two_years_and_six_months_ago} to {six_months_ago}"
        )
        timespan_ids = bugzilla.get_ids_between(two_years_and_six_months_ago,
                                                six_months_ago)
        logger.info(f"Retrieved {len(timespan_ids)} IDs.")

        # Get IDs of labelled bugs.
        labelled_bug_ids = labels.get_all_bug_ids()
        logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

        all_ids = set(timespan_ids + labelled_bug_ids)

        # We have to redownload bugs that were changed since the last download.
        # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled.
        bugzilla.delete_bugs(
            lambda bug: bug["id"] in changed_ids or bug["id"] not in all_ids)

        bugzilla.download_bugs(timespan_ids + labelled_bug_ids)

        # Try to re-download inconsistent bugs, up to three times.
        inconsistent_bugs = bugzilla.get_bugs()
        for i in range(3):
            # We look for inconsistencies in all bugs first; on following passes, we
            # only check bugs that were found to be inconsistent in the first pass.
            inconsistent_bugs = bug_snapshot.get_inconsistencies(
                inconsistent_bugs)
            inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

            if len(inconsistent_bug_ids) == 0:
                break

            logger.info(
                f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
            )
            bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
            bugzilla.download_bugs(inconsistent_bug_ids)

        self.compress_file("data/bugs.json")
Example #25
File: app.py Project: rhcu/bugbug
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

import logging
import os

from flask import Flask, current_app, jsonify, request

from bugbug import bugzilla

from .models import load_model

API_TOKEN = "X-Api-Key"

application = Flask(__name__)

bugzilla.set_token(os.environ.get("BUGBUG_BUGZILLA_TOKEN"))

logging.basicConfig(level=logging.INFO)
LOGGER = logging.getLogger()


def get_model(model_name):
    attribute = f"bugbug_model_{model_name}"

    if not hasattr(current_app, attribute):
        # Load the model lazily and cache it on the application object.
        setattr(current_app, attribute, load_model(model_name))

    return getattr(current_app, attribute)
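
Built on this cache, a route might classify a single bug on demand; the endpoint path and response shape below are assumptions for illustration, not part of the original module.

# Hypothetical endpoint using the cached model; path and payload are assumed.
@application.route("/<model_name>/predict/<int:bug_id>")
def predict(model_name, bug_id):
    model = get_model(model_name)
    bugs = bugzilla.get([bug_id])
    if not bugs:
        return jsonify({"available": False}), 404
    probs = model.classify(list(bugs.values()), True)
    return jsonify({"prob": probs.tolist()[0]})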