def main(args):
    similarity_model = (
        similarity.download_and_load_similarity_model(args.similaritymodel)
        if args.similaritymodel
        else None
    )
    duplicate_model = DuplicateModel.load("duplicatemodel")

    try:
        with open("duplicate_test_bugs.json", "r") as f:
            test_bugs = json.load(f)
    except FileNotFoundError:
        test_bug_ids = bugzilla.get_ids_between(
            datetime.now() - timedelta(days=21), datetime.now()
        )
        test_bugs = bugzilla.get(test_bug_ids)
        test_bugs = [
            bug
            for bug in test_bugs.values()
            if bug["creator"] not in REPORTERS_TO_IGNORE
        ]
        with open("duplicate_test_bugs.json", "w") as f:
            json.dump(test_bugs, f)

    with open("duplicate_predictions.csv", "w") as csvfile:
        spamwriter = csv.writer(csvfile)
        spamwriter.writerow(
            ["bug 1 ID", "bug 1 summary", "bug 2 ID", "bug 2 summary", "prediction"]
        )

        if similarity_model:
            # Only score pairs the similarity model considers close, instead
            # of every possible pair.
            bug_tuples = []
            for test_bug in test_bugs:
                similar_bug_ids = similarity_model.get_similar_bugs(test_bug)
                similar_bugs = bugzilla.get(similar_bug_ids)
                bug_tuples += [
                    (test_bug, similar_bug) for similar_bug in similar_bugs.values()
                ]
            confidence_threshold = similarity_model.confidence_threshold
        else:
            # Materialize the generator: classify() consumes it, and the pairs
            # are iterated again when writing the CSV.
            bug_tuples = list(combinations(test_bugs, 2))
            # Fallback threshold when no similarity model is used (an assumed
            # default; the original code only defined a threshold on the
            # similarity model and would crash here without one).
            confidence_threshold = 0.8

        probs = duplicate_model.classify(bug_tuples, probabilities=True)

        for bug_tuple, prob in zip(bug_tuples, probs):
            if prob[1] > confidence_threshold:
                spamwriter.writerow(
                    [
                        f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug_tuple[0]["id"]}',
                        bug_tuple[0]["summary"],
                        f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug_tuple[1]["id"]}',
                        bug_tuple[1]["summary"],
                        prob[1],
                    ]
                )
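
# Example invocation from a REPL, assuming the module-level names used above
# (REPORTERS_TO_IGNORE, similarity, DuplicateModel) are in scope; Namespace
# stands in for the parsed CLI arguments, and the model name is illustrative.
from argparse import Namespace

main(Namespace(similaritymodel=None))  # exhaustive pairwise comparison
main(Namespace(similaritymodel="neighbors_tfidf"))  # similarity-filtered pairs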
def main(args):
    model_file_name = f"{similarity.model_name_to_class[args.algorithm].__name__.lower()}.similaritymodel"

    if not os.path.exists(model_file_name):
        logger.info(f"{model_file_name} does not exist. Downloading the model....")
        try:
            download_check_etag(URL.format(model_file_name))
        except requests.HTTPError:
            logger.error(
                "A pre-trained model is not available, you will need to train it yourself using the trainer script"
            )
            raise SystemExit(1)

        zstd_decompress(model_file_name)
        assert os.path.exists(model_file_name), "Decompressed file doesn't exist"

    model = similarity.model_name_to_class[args.algorithm].load(model_file_name)

    bug_ids = model.get_similar_bugs(bugzilla.get(args.bug_id)[args.bug_id])

    bugs = {}
    for bug in bugzilla.get_bugs():
        if bug["id"] in bug_ids or bug["id"] == args.bug_id:
            bugs[bug["id"]] = bug

    print("{}: {}".format(args.bug_id, bugs[args.bug_id]["summary"]))
    for bug_id in bug_ids:
        print("{}: {}".format(bug_id, bugs[bug_id]["summary"]))
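
# A minimal sketch of how this main() might be invoked; the argument names
# match the attributes it reads (args.algorithm, args.bug_id), with choices
# taken from similarity.model_name_to_class. This is an assumption, not the
# script's actual parser.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Find bugs similar to a given bug")
    parser.add_argument("algorithm", choices=list(similarity.model_name_to_class.keys()))
    parser.add_argument("bug_id", type=int)
    main(parser.parse_args())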
def classify_bugs(model_name, classifier, bug_id):
    if classifier != "default":
        assert (
            model_name in MODELS_WITH_TYPE
        ), f"{classifier} is not a valid classifier type for {model_name}"

        model_file_name = f"{model_name}{classifier}model"
        model_name = f"{model_name}_{classifier}"
    else:
        model_file_name = f"{model_name}model"

    if not os.path.exists(model_file_name):
        logger.info(f"{model_file_name} does not exist. Downloading the model....")
        try:
            download_check_etag(
                f"https://index.taskcluster.net/v1/task/project.relman.bugbug.train_{model_name}.latest/artifacts/public/{model_file_name}.zst",
                f"{model_file_name}.zst",
            )
        except requests.HTTPError:
            logger.error(
                "A pre-trained model is not available, you will need to train it yourself using the trainer script"
            )
            raise SystemExit(1)

        zstd_decompress(model_file_name)
        assert os.path.exists(model_file_name), "Decompressed file doesn't exist"

    model_class = get_model_class(model_name)
    model = model_class.load(model_file_name)

    if bug_id:
        bugs = bugzilla.get(bug_id).values()
        assert bugs, f"A bug with a bug id of {bug_id} was not found"
    else:
        bugs = bugzilla.get_bugs()

    for bug in bugs:
        print(
            f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]} - {bug["summary"]} '
        )

        if model.calculate_importance:
            probas, importance = model.classify(
                bug, probabilities=True, importances=True
            )

            model.print_feature_importances(
                importance["importances"], class_probabilities=probas
            )

            with open("importance.html", "w") as f:
                f.write(importance["html"])
        else:
            probas = model.classify(bug, probabilities=True, importances=False)

        if np.argmax(probas) == 1:
            print(f"Positive! {probas}")
        else:
            print(f"Negative! {probas}")

        input()
def classify_bug(model_name, bug_ids, bugzilla_token, expiration=DEFAULT_EXPIRATION_TTL):
    # This should be called in a process worker so it should be safe to set
    # the token here
    bug_ids_set = set(map(int, bug_ids))
    bugzilla.set_token(bugzilla_token)
    bugs = bugzilla.get(bug_ids)

    redis_url = os.environ.get("REDIS_URL", "redis://localhost/0")
    redis = Redis.from_url(redis_url)

    missing_bugs = bug_ids_set.difference(bugs.keys())

    for bug_id in missing_bugs:
        redis_key = f"result_{model_name}_{bug_id}"

        # TODO: Find a better error format
        encoded_data = json.dumps({"available": False})

        redis.set(redis_key, encoded_data)
        redis.expire(redis_key, expiration)

    if not bugs:
        return "NOK"

    # TODO: Cache the model in the process memory, it's quite hard as the RQ
    # worker is forking before starting
    model = load_model(model_name)
    model_extra_data = model.get_extra_data()

    # TODO: Classify could choke on a single bug, which would make the whole
    # job fail. What should we do here?
    probs = model.classify(list(bugs.values()), True)
    indexes = probs.argmax(axis=-1)
    suggestions = model.clf._le.inverse_transform(indexes)

    probs_list = probs.tolist()
    indexes_list = indexes.tolist()
    suggestions_list = suggestions.tolist()

    for i, bug_id in enumerate(bugs.keys()):
        data = {
            "prob": probs_list[i],
            "index": indexes_list[i],
            "suggestion": suggestions_list[i],
            "extra_data": model_extra_data,
        }

        encoded_data = json.dumps(data)

        redis_key = f"result_{model_name}_{bug_id}"
        redis.set(redis_key, encoded_data)
        redis.expire(redis_key, expiration)

    return "OK"
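
# A hedged sketch of how this worker function is driven: the TODOs above
# mention an RQ worker, so presumably it is enqueued on an RQ queue and the
# caller later reads the result from the same Redis key the worker writes.
# The queue setup and the example model/bug id are assumptions.
import os

from redis import Redis
from rq import Queue

redis = Redis.from_url(os.environ.get("REDIS_URL", "redis://localhost/0"))
queue = Queue(connection=redis)
queue.enqueue(classify_bug, "defect", ["1550000"], "<bugzilla token>")

# ...once a worker has processed the job:
result = redis.get("result_defect_1550000")  # JSON written by classify_bug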
def main(args):
    model = similarity.model_name_to_class[args.algorithm].load(
        f"{similarity.model_name_to_class[args.algorithm].__name__.lower()}.similaritymodel"
    )

    print(model.get_similar_bugs(bugzilla.get(int(args.bug_id))[int(args.bug_id)]))
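
# For context: bugzilla.get() returns a dict mapping bug id -> bug data,
# which is why the code above indexes the result with the same id it passed
# in. A toy illustration of that shape (the data is made up):
bugs = {1550000: {"id": 1550000, "summary": "Crash in nsFoo::Bar"}}
bug = bugs[1550000]  # the single bug that was requested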
def classify_bug(model_name, bug_ids, bugzilla_token):
    from bugbug_http.app import JobInfo

    # This should be called in a process worker so it should be safe to set
    # the token here
    bug_ids_set = set(map(int, bug_ids))
    bugzilla.set_token(bugzilla_token)

    bugs = bugzilla.get(bug_ids)

    missing_bugs = bug_ids_set.difference(bugs.keys())

    for bug_id in missing_bugs:
        job = JobInfo(classify_bug, model_name, bug_id)

        # TODO: Find a better error format
        encoded_data = json.dumps({"available": False})
        setkey(job.result_key, encoded_data)

    if not bugs:
        return "NOK"

    model = get_model(model_name)

    if not model:
        LOGGER.info("Missing model %r, aborting" % model_name)
        return "NOK"

    model_extra_data = model.get_extra_data()

    # TODO: Classify could choke on a single bug, which would make the whole
    # job fail. What should we do here?
    probs = model.classify(list(bugs.values()), True)
    indexes = probs.argmax(axis=-1)
    suggestions = model.le.inverse_transform(indexes)

    probs_list = probs.tolist()
    indexes_list = indexes.tolist()
    suggestions_list = suggestions.tolist()

    for i, bug_id in enumerate(bugs.keys()):
        data = {
            "prob": probs_list[i],
            "index": indexes_list[i],
            "class": suggestions_list[i],
            "extra_data": model_extra_data,
        }

        encoded_data = json.dumps(data)

        job = JobInfo(classify_bug, model_name, bug_id)
        setkey(job.result_key, encoded_data)

        # Save the bug's last change time
        setkey(job.change_time_key, bugs[bug_id]["last_change_time"], expiration=0)

    return "OK"
def classify_bug(model_name: str, bug_ids: Sequence[int], bugzilla_token: str) -> str:
    from bugbug_http.app import JobInfo

    # This should be called in a process worker so it should be safe to set
    # the token here
    bug_ids_set = set(map(int, bug_ids))
    bugzilla.set_token(bugzilla_token)

    # Fetch the bugs in chunks to stay within Bugzilla's request size limits.
    bugs = {}
    for i in range(0, len(bug_ids), Bugzilla.BUGZILLA_CHUNK_SIZE):
        bugs.update(bugzilla.get(bug_ids[i : i + Bugzilla.BUGZILLA_CHUNK_SIZE]))

    missing_bugs = bug_ids_set.difference(bugs.keys())

    for bug_id in missing_bugs:
        job = JobInfo(classify_bug, model_name, bug_id)

        # TODO: Find a better error format
        setkey(job.result_key, orjson.dumps({"available": False}))

    if not bugs:
        return "NOK"

    model = MODEL_CACHE.get(model_name)

    if not model:
        LOGGER.info("Missing model %r, aborting" % model_name)
        return "NOK"

    model_extra_data = model.get_extra_data()

    # TODO: Classify could choke on a single bug, which would make the whole
    # job fail. What should we do here?
    probs = model.classify(list(bugs.values()), True)
    indexes = probs.argmax(axis=-1)
    suggestions = model.le.inverse_transform(indexes)

    probs_list = probs.tolist()
    indexes_list = indexes.tolist()
    suggestions_list = suggestions.tolist()

    for i, bug_id in enumerate(bugs.keys()):
        data = {
            "prob": probs_list[i],
            "index": indexes_list[i],
            "class": suggestions_list[i],
            "extra_data": model_extra_data,
        }

        job = JobInfo(classify_bug, model_name, bug_id)
        setkey(job.result_key, orjson.dumps(data), compress=True)

        # Save the bug's last change time
        setkey(job.change_time_key, bugs[bug_id]["last_change_time"].encode())

    return "OK"
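
# The chunked fetch above exists because Bugzilla limits how many ids can be
# requested at once. The same pattern as a standalone helper, for clarity:
def chunked(seq, size):
    # Yield successive slices of `seq` containing at most `size` items.
    for i in range(0, len(seq), size):
        yield seq[i : i + size]

# e.g.:
# bugs = {}
# for batch in chunked(bug_ids, Bugzilla.BUGZILLA_CHUNK_SIZE):
#     bugs.update(bugzilla.get(batch))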
def classify_bugs(model_name: str, classifier: str, bug_id: int) -> None:
    if classifier != "default":
        assert (
            model_name in MODELS_WITH_TYPE
        ), f"{classifier} is not a valid classifier type for {model_name}"

        model_file_name = f"{model_name}{classifier}model"
        model_name = f"{model_name}_{classifier}"
    else:
        model_file_name = f"{model_name}model"

    if not os.path.exists(model_file_name):
        logger.info(f"{model_file_name} does not exist. Downloading the model....")
        try:
            download_model(model_name)
        except requests.HTTPError:
            logger.error(
                "A pre-trained model is not available, you will need to train it yourself using the trainer script"
            )
            raise SystemExit(1)

    model_class = get_model_class(model_name)
    model = model_class.load(model_file_name)

    if bug_id:
        bugs = bugzilla.get(bug_id).values()
        assert bugs, f"A bug with a bug id of {bug_id} was not found"
    else:
        assert db.download(bugzilla.BUGS_DB)
        bugs = bugzilla.get_bugs()

    for bug in bugs:
        print(
            f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]} - {bug["summary"]} '
        )

        if model.calculate_importance:
            probas, importance = model.classify(
                bug, probabilities=True, importances=True
            )

            model.print_feature_importances(
                importance["importances"], class_probabilities=probas
            )
        else:
            probas = model.classify(bug, probabilities=True, importances=False)

        probability = probas[0]
        pred_index = np.argmax(probability)
        if len(probability) > 2:
            pred_class = model.le.inverse_transform([pred_index])[0]
        else:
            pred_class = "Positive" if pred_index == 1 else "Negative"
        print(f"{pred_class} {probability}")
        input()
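
# Worked example of the prediction logic above, with made-up probabilities:
# a binary model returns two probabilities, and argmax index 1 means the
# positive class; multiclass models (> 2 classes) instead map the argmax
# index back to a class name via the label encoder.
import numpy as np

probability = [0.2, 0.8]
pred_index = np.argmax(probability)  # -> 1
print("Positive" if pred_index == 1 else "Negative")  # -> Positive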
def generate_sheet(model_name, token, days, threshold):
    model_file_name = f"{model_name}model"

    assert os.path.exists(
        model_file_name
    ), f"{model_file_name} does not exist. Train the model with trainer.py first."

    model_class = get_model_class(model_name)
    model = model_class.load(model_file_name)

    today = datetime.utcnow()
    start_date = today - timedelta(days)

    bugzilla.set_token(token)
    bug_ids = bugzilla.get_ids_between(start_date, today)
    bugs = bugzilla.get(bug_ids)

    print(f"Classifying {len(bugs)} bugs...")

    rows = [["Bug", f"{model_name}(model)", model_name, "Title"]]

    for bug in bugs.values():
        p = model.classify(bug, probabilities=True)
        probability = p[0]
        if len(probability) > 2:
            index = np.argmax(probability)
            prediction = model.class_names[index]
        else:
            prediction = "y" if probability[1] >= threshold else "n"

        rows.append(
            [
                f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]}',
                prediction,
                "",
                bug["summary"],
            ]
        )

    os.makedirs("sheets", exist_ok=True)
    with open(
        os.path.join(
            "sheets",
            f'{model_name}-{datetime.utcnow().strftime("%Y-%m-%d")}-labels.csv',
        ),
        "w",
    ) as f:
        writer = csv.writer(f)
        writer.writerows(rows)
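
# Hypothetical call; the model name, token, and threshold are placeholders.
# This would classify the last week of bugs with the defect model and write
# sheets/defect-<date>-labels.csv.
generate_sheet("defect", token="<your Bugzilla API key>", days=7, threshold=0.7)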
def fetch_untriaged(args):
    from bugbug import bugzilla

    # Set bugzilla token and download bugs
    bugzilla.set_token(args.token)
    bug_ids = bugzilla.get_ids_between(date.today() - timedelta(days=args.days_back))
    bugs = bugzilla.get(bug_ids)

    # Get untriaged bugs
    untriaged_bugs = []
    for bug in bugs.values():
        for history in bug["history"]:
            for change in history["changes"]:
                if (
                    change["field_name"] == "component"
                    and change["removed"] == "Untriaged"
                ):
                    untriaged_bugs.append(bug)

    with open("bugs-{}.json".format(datetime.now().strftime("%s")), "w") as f:
        json.dump(untriaged_bugs, f)

    return untriaged_bugs
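
# For reference, the shape of the Bugzilla history records the loop above
# walks (only the fields it touches are shown; the values are made up):
example_bug = {
    "history": [
        {
            "changes": [
                {"field_name": "component", "removed": "Untriaged", "added": "General"}
            ]
        }
    ]
}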
def generate_sheet(model_name, token):
    model_file_name = f"{model_name}model"

    assert os.path.exists(
        model_file_name
    ), f"{model_file_name} does not exist. Train the model with trainer.py first."

    model_class = get_model_class(model_name)
    model = model_class.load(model_file_name)

    today = datetime.utcnow()
    a_week_ago = today - timedelta(7)

    bugzilla.set_token(token)
    bug_ids = bugzilla.get_ids_between(a_week_ago, today)
    bugs = bugzilla.get(bug_ids)

    print(f"Classifying {len(bugs)} bugs...")

    rows = [["Bug", f"{model_name}(model)", model_name, "Title"]]

    for bug in bugs.values():
        p = model.classify(bug, probabilities=True)
        rows.append(
            [
                f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]}',
                "y" if p[0][1] >= 0.7 else "n",
                "",
                bug["summary"],
            ]
        )

    os.makedirs("sheets", exist_ok=True)
    with open(
        os.path.join(
            "sheets",
            f'{model_name}-{datetime.utcnow().strftime("%Y-%m-%d")}-labels.csv',
        ),
        "w",
    ) as f:
        writer = csv.writer(f)
        writer.writerows(rows)
import csv
import itertools
import json
from datetime import datetime, timedelta

from bugbug import bugzilla
from bugbug.models.duplicate import DuplicateModel

m = DuplicateModel.load("duplicatemodel")

REPORTERS_TO_IGNORE = {"*****@*****.**", "*****@*****.**"}

try:
    with open("duplicate_test_bugs.json", "r") as f:
        test_bugs = json.load(f)
except FileNotFoundError:
    test_bug_ids = bugzilla.get_ids_between(
        datetime.now() - timedelta(days=21), datetime.now()
    )
    test_bugs = bugzilla.get(test_bug_ids)
    test_bugs = [
        bug for bug in test_bugs.values() if bug["creator"] not in REPORTERS_TO_IGNORE
    ]
    with open("duplicate_test_bugs.json", "w") as f:
        json.dump(test_bugs, f)

bug_tuples = list(itertools.combinations(test_bugs, 2))

# Warning: Classifying all the test bugs takes a while
probs = m.classify(bug_tuples, probabilities=True)

with open("duplicate_predictions.csv", "w") as csvfile:
    spamwriter = csv.writer(csvfile)
    spamwriter.writerow(
        ["bug 1 ID", "bug 1 summary", "bug 2 ID", "bug 2 summary", "prediction"]
    )

    for bug_tuple, prob in zip(bug_tuples, probs):
        spamwriter.writerow(
            [
                f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug_tuple[0]["id"]}',
                bug_tuple[0]["summary"],
                f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug_tuple[1]["id"]}',
                bug_tuple[1]["summary"],
                prob[1],
            ]
        )
def main(args):
    model_file_name = "{}{}model".format(
        args.goal, "" if args.classifier == "default" else args.classifier
    )

    if args.goal == "component":
        if args.classifier == "default":
            model_class_name = "component"
        else:
            model_class_name = "component_nn"
    else:
        model_class_name = args.goal

    model_class = get_model_class(model_class_name)

    if args.train:
        db.download(bugzilla.BUGS_DB)
        db.download(repository.COMMITS_DB)

        historical_supported_tasks = [
            "defect",
            "bugtype",
            "defectenhancementtask",
            "regression",
        ]

        if args.goal in historical_supported_tasks:
            model = model_class(args.lemmatization, args.historical)
        elif args.goal == "duplicate":
            model = model_class(args.training_set_size, args.lemmatization)
        else:
            model = model_class(args.lemmatization)
        model.train()
    else:
        model = model_class.load(model_file_name)

    if args.classify:
        for bug in bugzilla.get_bugs():
            print(
                f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]} - {bug["summary"]} '
            )

            if model.calculate_importance:
                probas, importance = model.classify(
                    bug, probabilities=True, importances=True
                )

                feature_names = model.get_human_readable_feature_names()

                # Note: the importance value is unpacked as `value` to avoid
                # shadowing the `importance` dict being iterated.
                for i, (value, index, is_positive) in enumerate(
                    importance["importances"]
                ):
                    print(
                        f'{i + 1}. \'{feature_names[int(index)]}\' ({"+" if is_positive else "-"}{value})'
                    )
            else:
                probas = model.classify(bug, probabilities=True, importances=False)

            if np.argmax(probas) == 1:
                print(f"Positive! {probas}")
            else:
                print(f"Negative! {probas}")

            input()

    if args.generate_sheet:
        assert (
            args.token is not None
        ), "A Bugzilla token should be set in order to download bugs"
        today = datetime.utcnow()
        a_week_ago = today - timedelta(7)
        bugzilla.set_token(args.token)

        bug_ids = bugzilla.get_ids_between(a_week_ago, today)
        bugs = bugzilla.get(bug_ids)

        print(f"Classifying {len(bugs)} bugs...")

        rows = [["Bug", f"{args.goal}(model)", args.goal, "Title"]]

        for bug in bugs.values():
            p = model.classify(bug, probabilities=True)
            rows.append(
                [
                    f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]}',
                    "y" if p[0][1] >= 0.7 else "n",
                    "",
                    bug["summary"],
                ]
            )

        os.makedirs("sheets", exist_ok=True)
        with open(
            os.path.join(
                "sheets",
                f'{args.goal}-{datetime.utcnow().strftime("%Y-%m-%d")}-labels.csv',
            ),
            "w",
        ) as f:
            writer = csv.writer(f)
            writer.writerows(rows)
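
# Worked example of the model-file naming above: with goal "component" and
# the non-default classifier "nn", the file is "componentnnmodel" while the
# class looked up is "component_nn"; with the default classifier the suffix
# is empty.
assert "{}{}model".format("component", "nn") == "componentnnmodel"
assert "{}{}model".format("defect", "") == "defectmodel"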