Example #1
def main(args):
    similarity_model = (
        similarity.download_and_load_similarity_model(args.similaritymodel)
        if args.similaritymodel
        else None
    )
    duplicate_model = DuplicateModel.load("duplicatemodel")
    try:
        with open("duplicate_test_bugs.json", "r") as f:
            test_bugs = json.load(f)
    except FileNotFoundError:
        test_bug_ids = bugzilla.get_ids_between(
            datetime.now() - timedelta(days=21), datetime.now()
        )
        test_bugs = bugzilla.get(test_bug_ids)
        test_bugs = [
            bug
            for bug in test_bugs.values()
            if bug["creator"] not in REPORTERS_TO_IGNORE
        ]
        with open("duplicate_test_bugs.json", "w") as f:
            json.dump(test_bugs, f)

    with open("duplicate_predictions.csv", "w") as csvfile:
        spamwriter = csv.writer(csvfile)

        spamwriter.writerow(
            ["bug 1 ID", "bug 1 summary", "bug 2 ID", "bug 2 summary", "prediction"]
        )
        if similarity_model:
            bug_tuples = []
            for test_bug in test_bugs:
                similar_bug_ids = similarity_model.get_similar_bugs(test_bug)
                similar_bugs = bugzilla.get(similar_bug_ids)
                bug_tuples += [
                    (test_bug, similar_bug) for similar_bug in similar_bugs.values()
                ]
        else:
            # combinations() returns a lazy iterator; materialize it so it can
            # be iterated again after classify() consumes it below.
            bug_tuples = list(combinations(test_bugs, 2))

        probs = duplicate_model.classify(bug_tuples, probabilities=True)

        # similarity_model can be None here (the combinations() path), so fall
        # back to a fixed cutoff; 0.8 is an assumed default, not model-derived.
        threshold = (
            similarity_model.confidence_threshold if similarity_model else 0.8
        )

        for bug_tuple, prob in zip(bug_tuples, probs):
            if prob[1] > threshold:
                spamwriter.writerow(
                    [
                        f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug_tuple[0]["id"]}',
                        bug_tuple[0]["summary"],
                        f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug_tuple[1]["id"]}',
                        bug_tuple[1]["summary"],
                        prob[1],
                    ]
                )
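
Example #1 expects an `args` namespace carrying a `similaritymodel` attribute. A minimal driver sketch, assuming the flag name simply mirrors that attribute (the description and help texts are ours):

import argparse

# Hypothetical wiring for Example #1's main(args); only the attribute name
# args.similaritymodel is taken from the example itself.
parser = argparse.ArgumentParser(description="Find candidate duplicate bugs")
parser.add_argument(
    "--similaritymodel",
    default=None,
    help="Similarity model to narrow candidate pairs; omit to compare all pairs",
)
main(parser.parse_args())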
Example #2
def main(args):

    model_file_name = f"{similarity.model_name_to_class[args.algorithm].__name__.lower()}.similaritymodel"

    if not os.path.exists(model_file_name):
        logger.info(f"{model_file_name} does not exist. Downloading the model...")
        try:
            download_check_etag(URL.format(model_file_name))
        except requests.HTTPError:
            logger.error(
                "A pre-trained model is not available, you will need to train it yourself using the trainer script"
            )
            raise SystemExit(1)

        zstd_decompress(model_file_name)
        assert os.path.exists(model_file_name), "Decompressed file doesn't exist"

    model = similarity.model_name_to_class[args.algorithm].load(model_file_name)

    # bugzilla.get() returns {bug_id: bug}; pass the requested bug itself.
    bug_ids = model.get_similar_bugs(bugzilla.get(args.bug_id)[args.bug_id])

    bugs = {}
    for bug in bugzilla.get_bugs():
        if bug["id"] in bug_ids or bug["id"] == args.bug_id:
            bugs[bug["id"]] = bug

    print("{}: {}".format(args.bug_id, bugs[args.bug_id]["summary"]))
    for bug_id in bug_ids:
        print("{}: {}".format(bug_id, bugs[bug_id]["summary"]))
Example #3
def classify_bugs(model_name, classifier, bug_id):
    if classifier != "default":
        assert (
            model_name in MODELS_WITH_TYPE
        ), f"{classifier} is not a valid classifier type for {model_name}"

        model_file_name = f"{model_name}{classifier}model"
        model_name = f"{model_name}_{classifier}"
    else:
        model_file_name = f"{model_name}model"

    if not os.path.exists(model_file_name):
        logger.info(f"{model_file_name} does not exist. Downloading the model...")
        try:
            download_check_etag(
                f"https://index.taskcluster.net/v1/task/project.relman.bugbug.train_{model_name}.latest/artifacts/public/{model_file_name}.zst",
                f"{model_file_name}.zst",
            )
        except requests.HTTPError:
            logger.error(
                "A pre-trained model is not available, you will need to train it yourself using the trainer script"
            )
            raise SystemExit(1)

        zstd_decompress(model_file_name)
        assert os.path.exists(model_file_name), "Decompressed file doesn't exist"

    model_class = get_model_class(model_name)
    model = model_class.load(model_file_name)

    if bug_id:
        bugs = bugzilla.get(bug_id).values()
        assert bugs, f"A bug with a bug id of {bug_id} was not found"
    else:
        bugs = bugzilla.get_bugs()

    for bug in bugs:
        print(
            f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]} - {bug["summary"]}'
        )

        if model.calculate_importance:
            probas, importance = model.classify(
                bug, probabilities=True, importances=True
            )

            model.print_feature_importances(
                importance["importances"], class_probabilities=probas
            )

            with open("importance.html", "w") as f:
                f.write(importance["html"])
        else:
            probas = model.classify(bug, probabilities=True, importances=False)

        if np.argmax(probas) == 1:
            print(f"Positive! {probas}")
        else:
            print(f"Negative! {probas}")
        # Pause so each bug's result can be reviewed before moving on.
        input()
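
The Positive/Negative branch above leans on the shape `classify` returns: one row of class probabilities per bug. A self-contained numpy sketch of that indexing, assuming a binary model and a single bug:

import numpy as np

# Assumed shape for one bug from a binary model: (1, 2).
probas = np.array([[0.3, 0.7]])

# For a single row, argmax over the flattened array equals the argmax of the
# row itself, so 1 selects the positive class.
assert np.argmax(probas) == 1
print(probas[0][1])  # positive-class probability: 0.7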
Example #4
def classify_bug(model_name,
                 bug_ids,
                 bugzilla_token,
                 expiration=DEFAULT_EXPIRATION_TTL):
    # This should be called in a process worker so it should be safe to set
    # the token here
    bug_ids_set = set(map(int, bug_ids))
    bugzilla.set_token(bugzilla_token)
    bugs = bugzilla.get(bug_ids)

    redis_url = os.environ.get("REDIS_URL", "redis://localhost/0")
    redis = Redis.from_url(redis_url)

    missing_bugs = bug_ids_set.difference(bugs.keys())

    for bug_id in missing_bugs:
        redis_key = f"result_{model_name}_{bug_id}"

        # TODO: Find a better error format
        encoded_data = json.dumps({"available": False})

        redis.set(redis_key, encoded_data)
        redis.expire(redis_key, expiration)

    if not bugs:
        return "NOK"

    # TODO: Cache the model in the process memory, it's quite hard as the RQ
    # worker is forking before starting
    model = load_model(model_name)

    model_extra_data = model.get_extra_data()

    # TODO: classify() could choke on a single bug, which would make the whole
    # job fail. What should we do here?
    probs = model.classify(list(bugs.values()), True)  # True -> probabilities
    indexes = probs.argmax(axis=-1)
    suggestions = model.clf._le.inverse_transform(indexes)

    probs_list = probs.tolist()
    indexes_list = indexes.tolist()
    suggestions_list = suggestions.tolist()

    for i, bug_id in enumerate(bugs.keys()):
        data = {
            "prob": probs_list[i],
            "index": indexes_list[i],
            "suggestion": suggestions_list[i],
            "extra_data": model_extra_data,
        }

        encoded_data = json.dumps(data)

        redis_key = f"result_{model_name}_{bug_id}"

        redis.set(redis_key, encoded_data)
        redis.expire(redis_key, expiration)

    return "OK"
Example #5
def main(args):
    model_class = similarity.model_name_to_class[args.algorithm]
    model = model_class.load(f"{model_class.__name__.lower()}.similaritymodel")

    bug_id = int(args.bug_id)
    # bugzilla.get() returns {bug_id: bug}; look up the one bug requested.
    print(model.get_similar_bugs(bugzilla.get(bug_id)[bug_id]))
Example #6
def classify_bug(model_name, bug_ids, bugzilla_token):
    from bugbug_http.app import JobInfo

    # This should be called in a process worker so it should be safe to set
    # the token here
    bug_ids_set = set(map(int, bug_ids))
    bugzilla.set_token(bugzilla_token)
    bugs = bugzilla.get(bug_ids)

    missing_bugs = bug_ids_set.difference(bugs.keys())

    for bug_id in missing_bugs:
        job = JobInfo(classify_bug, model_name, bug_id)

        # TODO: Find a better error format
        encoded_data = json.dumps({"available": False})
        setkey(job.result_key, encoded_data)

    if not bugs:
        return "NOK"

    model = get_model(model_name)

    if not model:
        LOGGER.info("Missing model %r, aborting", model_name)
        return "NOK"

    model_extra_data = model.get_extra_data()

    # TODO: classify() could choke on a single bug, which would make the whole
    # job fail. What should we do here?
    probs = model.classify(list(bugs.values()), True)  # True -> probabilities
    indexes = probs.argmax(axis=-1)
    suggestions = model.le.inverse_transform(indexes)

    probs_list = probs.tolist()
    indexes_list = indexes.tolist()
    suggestions_list = suggestions.tolist()

    for i, bug_id in enumerate(bugs.keys()):
        data = {
            "prob": probs_list[i],
            "index": indexes_list[i],
            "class": suggestions_list[i],
            "extra_data": model_extra_data,
        }

        encoded_data = json.dumps(data)

        job = JobInfo(classify_bug, model_name, bug_id)
        setkey(job.result_key, encoded_data)

        # Save the bug's last change time.
        setkey(job.change_time_key, bugs[bug_id]["last_change_time"], expiration=0)

    return "OK"
Example #7
def classify_bug(model_name: str, bug_ids: Sequence[int],
                 bugzilla_token: str) -> str:
    from bugbug_http.app import JobInfo

    # This should be called in a process worker so it should be safe to set
    # the token here
    bug_ids_set = set(map(int, bug_ids))
    bugzilla.set_token(bugzilla_token)

    # Fetch in chunks of BUGZILLA_CHUNK_SIZE ids to keep each request bounded.
    bugs = {}
    for i in range(0, len(bug_ids), Bugzilla.BUGZILLA_CHUNK_SIZE):
        bugs.update(bugzilla.get(bug_ids[i : i + Bugzilla.BUGZILLA_CHUNK_SIZE]))

    missing_bugs = bug_ids_set.difference(bugs.keys())

    for bug_id in missing_bugs:
        job = JobInfo(classify_bug, model_name, bug_id)

        # TODO: Find a better error format
        setkey(job.result_key, orjson.dumps({"available": False}))

    if not bugs:
        return "NOK"

    model = MODEL_CACHE.get(model_name)

    if not model:
        LOGGER.info("Missing model %r, aborting", model_name)
        return "NOK"

    model_extra_data = model.get_extra_data()

    # TODO: classify() could choke on a single bug, which would make the whole
    # job fail. What should we do here?
    probs = model.classify(list(bugs.values()), True)  # True -> probabilities
    indexes = probs.argmax(axis=-1)
    suggestions = model.le.inverse_transform(indexes)

    probs_list = probs.tolist()
    indexes_list = indexes.tolist()
    suggestions_list = suggestions.tolist()

    for i, bug_id in enumerate(bugs.keys()):
        data = {
            "prob": probs_list[i],
            "index": indexes_list[i],
            "class": suggestions_list[i],
            "extra_data": model_extra_data,
        }

        job = JobInfo(classify_bug, model_name, bug_id)
        setkey(job.result_key, orjson.dumps(data), compress=True)

        # Save the bug last change
        setkey(job.change_time_key, bugs[bug_id]["last_change_time"].encode())

    return "OK"
Example #8
def classify_bugs(model_name: str, classifier: str, bug_id: int) -> None:
    if classifier != "default":
        assert (
            model_name in MODELS_WITH_TYPE
        ), f"{classifier} is not a valid classifier type for {model_name}"

        model_file_name = f"{model_name}{classifier}model"
        model_name = f"{model_name}_{classifier}"
    else:
        model_file_name = f"{model_name}model"

    if not os.path.exists(model_file_name):
        logger.info(f"{model_file_name} does not exist. Downloading the model...")
        try:
            download_model(model_name)
        except requests.HTTPError:
            logger.error(
                "A pre-trained model is not available, you will need to train it yourself using the trainer script"
            )
            raise SystemExit(1)

    model_class = get_model_class(model_name)
    model = model_class.load(model_file_name)

    if bug_id:
        bugs = bugzilla.get(bug_id).values()
        assert bugs, f"A bug with a bug id of {bug_id} was not found"
    else:
        assert db.download(bugzilla.BUGS_DB)
        bugs = bugzilla.get_bugs()

    for bug in bugs:
        print(
            f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]} - {bug["summary"]}'
        )

        if model.calculate_importance:
            probas, importance = model.classify(bug,
                                                probabilities=True,
                                                importances=True)

            model.print_feature_importances(importance["importances"],
                                            class_probabilities=probas)
        else:
            probas = model.classify(bug, probabilities=True, importances=False)

        probability = probas[0]  # classify() returns one probability row per bug
        pred_index = np.argmax(probability)
        if len(probability) > 2:
            pred_class = model.le.inverse_transform([pred_index])[0]
        else:
            pred_class = "Positive" if pred_index == 1 else "Negative"
        print(f"{pred_class} {probability}")
        input()
Example #9
def generate_sheet(model_name, token, days, threshold):
    model_file_name = f"{model_name}model"

    assert os.path.exists(
        model_file_name
    ), f"{model_file_name} does not exist. Train the model with trainer.py first."

    model_class = get_model_class(model_name)
    model = model_class.load(model_file_name)

    today = datetime.utcnow()
    start_date = today - timedelta(days)
    bugzilla.set_token(token)
    bug_ids = bugzilla.get_ids_between(start_date, today)
    bugs = bugzilla.get(bug_ids)

    print(f"Classifying {len(bugs)} bugs...")

    rows = [["Bug", f"{model_name}(model)", model_name, "Title"]]

    for bug in bugs.values():
        p = model.classify(bug, probabilities=True)
        probability = p[0]  # one row of class probabilities per bug
        if len(probability) > 2:
            index = np.argmax(probability)
            prediction = model.class_names[index]
        else:
            prediction = "y" if probability[1] >= threshold else "n"

        rows.append(
            [
                f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]}',
                prediction,
                "",
                bug["summary"],
            ]
        )

    os.makedirs("sheets", exist_ok=True)
    with open(
        os.path.join(
            "sheets",
            f'{model_name}-{datetime.utcnow().strftime("%Y-%m-%d")}-labels.csv',
        ),
        "w",
    ) as f:
        writer = csv.writer(f)
        writer.writerows(rows)
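
A minimal invocation sketch for `generate_sheet`; the model name and threshold are illustrative values, and the environment variable holding the API key is our assumption:

import os

# Hypothetical call: "defect" is one plausible model name, 0.7 a common cutoff.
generate_sheet(
    "defect",
    os.environ["BUGZILLA_TOKEN"],  # assumed env var with a Bugzilla API key
    days=7,
    threshold=0.7,
)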
Example #10
def fetch_untriaged(args):
    from bugbug import bugzilla

    # Set bugzilla token and download bugs
    bugzilla.set_token(args.token)
    bug_ids = bugzilla.get_ids_between(date.today() - timedelta(days=args.days_back))
    bugs = bugzilla.get(bug_ids)

    # Get untriaged bugs: any bug whose history shows the component field
    # moving out of "Untriaged". any() guarantees each bug is appended once,
    # even when several history entries match.
    untriaged_bugs = [
        bug
        for bug in bugs.values()
        if any(
            change["field_name"] == "component" and change["removed"] == "Untriaged"
            for history in bug["history"]
            for change in history["changes"]
        )
    ]

    with open("bugs-{}.json".format(datetime.now().strftime("%s")), "w") as f:
        json.dump(untriaged_bugs, f)

    return untriaged_bugs
Example #11
def generate_sheet(model_name, token):
    model_file_name = f"{model_name}model"

    assert os.path.exists(
        model_file_name
    ), f"{model_file_name} does not exist. Train the model with trainer.py first."

    model_class = get_model_class(model_name)
    model = model_class.load(model_file_name)

    today = datetime.utcnow()
    a_week_ago = today - timedelta(7)
    bugzilla.set_token(token)
    bug_ids = bugzilla.get_ids_between(a_week_ago, today)
    bugs = bugzilla.get(bug_ids)

    print(f"Classifying {len(bugs)} bugs...")

    rows = [["Bug", f"{model_name}(model)", model_name, "Title"]]

    for bug in bugs.values():
        p = model.classify(bug, probabilities=True)
        rows.append([
            f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]}',
            "y" if p[0][1] >= 0.7 else "n",
            "",
            bug["summary"],
        ])

    os.makedirs("sheets", exist_ok=True)
    with open(
            os.path.join(
                "sheets",
                f'{model_name}-{datetime.utcnow().strftime("%Y-%m-%d")}-labels.csv',
            ),
            "w",
    ) as f:
        writer = csv.writer(f)
        writer.writerows(rows)
Example #12
# Imports the snippet uses but did not show.
import csv
import itertools
import json
from datetime import datetime, timedelta

from bugbug import bugzilla
from bugbug.models.duplicate import DuplicateModel

m = DuplicateModel.load("duplicatemodel")

REPORTERS_TO_IGNORE = {"*****@*****.**", "*****@*****.**"}


try:
    with open("duplicate_test_bugs.json", "r") as f:
        test_bugs = json.load(f)
except FileNotFoundError:
    test_bug_ids = bugzilla.get_ids_between(
        datetime.now() - timedelta(days=21), datetime.now()
    )
    test_bugs = bugzilla.get(test_bug_ids)
    test_bugs = [
        bug for bug in test_bugs.values() if bug["creator"] not in REPORTERS_TO_IGNORE
    ]
    with open("duplicate_test_bugs.json", "w") as f:
        json.dump(test_bugs, f)

bug_tuples = list(itertools.combinations(test_bugs, 2))

# Warning: Classifying all the test bugs takes a while
probs = m.classify(bug_tuples, probabilities=True)

with open("duplicate_predictions.csv", "w") as csvfile:
    spamwriter = csv.writer(csvfile)

    spamwriter.writerow(
        ["bug 1 ID", "bug 1 summary", "bug 2 ID", "bug 2 summary", "prediction"]
    )

    # The original snippet is truncated here; the loop below follows the same
    # row-writing pattern as Example #1, with an assumed 0.8 confidence cutoff.
    for bug_tuple, prob in zip(bug_tuples, probs):
        if prob[1] > 0.8:
            spamwriter.writerow(
                [
                    f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug_tuple[0]["id"]}',
                    bug_tuple[0]["summary"],
                    f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug_tuple[1]["id"]}',
                    bug_tuple[1]["summary"],
                    prob[1],
                ]
            )
Example #13
def main(args):
    model_file_name = "{}{}model".format(
        args.goal, "" if args.classifier == "default" else args.classifier
    )

    if args.goal == "component":
        if args.classifier == "default":
            model_class_name = "component"
        else:
            model_class_name = "component_nn"
    else:
        model_class_name = args.goal

    model_class = get_model_class(model_class_name)

    if args.train:
        db.download(bugzilla.BUGS_DB)
        db.download(repository.COMMITS_DB)

        historical_supported_tasks = [
            "defect",
            "bugtype",
            "defectenhancementtask",
            "regression",
        ]

        if args.goal in historical_supported_tasks:
            model = model_class(args.lemmatization, args.historical)
        elif args.goal == "duplicate":
            model = model_class(args.training_set_size, args.lemmatization)
        else:
            model = model_class(args.lemmatization)
        model.train()
    else:
        model = model_class.load(model_file_name)

    if args.classify:
        for bug in bugzilla.get_bugs():
            print(
                f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]} - {bug["summary"]}'
            )

            if model.calculate_importance:
                probas, importance = model.classify(
                    bug, probabilities=True, importances=True
                )

                feature_names = model.get_human_readable_feature_names()
                # Rename the loop variable so it does not shadow the
                # importance dict returned by classify().
                for i, (value, index, is_positive) in enumerate(
                    importance["importances"]
                ):
                    print(
                        f"{i + 1}. '{feature_names[int(index)]}' "
                        f'({"+" if is_positive else "-"}{value})'
                    )
            else:
                probas = model.classify(bug, probabilities=True, importances=False)

            if np.argmax(probas) == 1:
                print(f"Positive! {probas}")
            else:
                print(f"Negative! {probas}")
            input()

    if args.generate_sheet:
        assert (
            args.token is not None
        ), "A Bugzilla token should be set in order to download bugs"
        today = datetime.utcnow()
        a_week_ago = today - timedelta(7)
        bugzilla.set_token(args.token)
        bug_ids = bugzilla.get_ids_between(a_week_ago, today)
        bugs = bugzilla.get(bug_ids)

        print(f"Classifying {len(bugs)} bugs...")

        rows = [["Bug", f"{args.goal}(model)", args.goal, "Title"]]

        for bug in bugs.values():
            p = model.classify(bug, probabilities=True)
            rows.append(
                [
                    f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]}',
                    "y" if p[0][1] >= 0.7 else "n",
                    "",
                    bug["summary"],
                ]
            )

        os.makedirs("sheets", exist_ok=True)
        with open(
            os.path.join(
                "sheets",
                f'{args.goal}-{datetime.utcnow().strftime("%Y-%m-%d")}-labels.csv',
            ),
            "w",
        ) as f:
            writer = csv.writer(f)
            writer.writerows(rows)