def fetch_untriaged(args):
    from bugbug import bugzilla

    today = date.today()
    three_months_ago = today - timedelta(days=args.days_back)

    # Set bugzilla token and download bugs
    bugzilla.set_token(args.token)
    bug_ids = bugzilla.download_bugs_between(three_months_ago, today)

    # Get untriaged bugs
    bugs = bugzilla.get_bugs()
    untriaged_bugs = []
    for bug in bugs:
        if bug["id"] not in bug_ids:
            continue

        for history in bug["history"]:
            for change in history["changes"]:
                if (
                    change["field_name"] == "component"
                    and change["removed"] == "Untriaged"
                ):
                    untriaged_bugs.append(bug)

    with open("bugs-{}.json".format(datetime.now().strftime("%s")), "w") as f:
        json.dump(untriaged_bugs, f)

    return untriaged_bugs
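# A minimal invocation sketch for fetch_untriaged above, assuming it is wired
# to an argparse-based CLI. The flag names and the default below are
# illustrative assumptions, not confirmed by the surrounding code.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Fetch bugs that moved out of the Untriaged component"
    )
    parser.add_argument("--token", required=True, help="Bugzilla API token")
    parser.add_argument("--days-back", dest="days_back", type=int, default=90)
    untriaged = fetch_untriaged(parser.parse_args())
    print(f"Found {len(untriaged)} untriaged bugs")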
def retrieve_bugs(self):
    bugzilla.set_token(secrets[secrets.BUGZILLA_TOKEN])

    six_months_ago = datetime.utcnow() - relativedelta(months=6)
    two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
    logger.info('Downloading bugs from {} to {}'.format(
        two_years_and_six_months_ago, six_months_ago))
    bugzilla.download_bugs_between(two_years_and_six_months_ago, six_months_ago)

    logger.info('Downloading labelled bugs')
    bug_ids = labels.get_all_bug_ids()
    bugzilla.download_bugs(bug_ids)

    # Try to re-download inconsistent bugs, up to three times.
    for i in range(3):
        bug_ids = bug_snapshot.get_inconsistencies()
        if len(bug_ids) == 0:
            break

        logger.info(
            f'Re-downloading {len(bug_ids)} bugs, as they were inconsistent'
        )
        bugzilla.delete_bugs(bug_ids)
        bugzilla.download_bugs(bug_ids)

    self.compress_file('data/bugs.json')
def __init__(self, repo_dir: str) -> None:
    repository.clone(repo_dir)

    logger.info("Downloading commits database...")
    assert db.download(repository.COMMITS_DB, support_files_too=True)

    logger.info("Updating commits DB...")
    # Iterate to the last commit in the DB, so we only download newer ones.
    for commit in repository.get_commits():
        pass

    repository.download_commits(
        repo_dir,
        rev_start="children({})".format(commit["node"]),
    )

    logger.info("Downloading revisions database...")
    assert db.download(phabricator.REVISIONS_DB)

    logger.info("Downloading bugs database...")
    assert db.download(bugzilla.BUGS_DB)

    logger.info("Downloading commit classifications...")
    assert db.download(BUG_FIXING_COMMITS_DB)

    self.regressor_model = download_and_load_model("regressor")

    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
    phabricator.set_api_key(
        get_secret("PHABRICATOR_URL"), get_secret("PHABRICATOR_TOKEN")
    )
def fetch_untriaged(args):
    from bugbug import bugzilla

    today = date.today()
    three_months_ago = today - timedelta(days=args.days_back)

    # Set bugzilla token and download bugs
    bugzilla.set_token(args.token)
    bug_ids = bugzilla.download_bugs_between(three_months_ago, today)

    # Get untriaged bugs
    bugs = bugzilla.get_bugs()
    untriaged_bugs = []
    for bug in bugs:
        if bug['id'] not in bug_ids:
            continue

        for history in bug['history']:
            for change in history['changes']:
                if change['field_name'] == 'component' and change['removed'] == 'Untriaged':
                    untriaged_bugs.append(bug)

    with open('bugs-{}.json'.format(datetime.now().strftime('%s')), 'w') as f:
        json.dump(untriaged_bugs, f)

    return untriaged_bugs
def retrieve_bugs(self):
    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

    six_months_ago = datetime.utcnow() - relativedelta(months=6)
    two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
    logger.info(
        "Downloading bugs from {} to {}".format(
            two_years_and_six_months_ago, six_months_ago
        )
    )
    bugzilla.download_bugs_between(two_years_and_six_months_ago, six_months_ago)

    logger.info("Downloading labelled bugs")
    bug_ids = labels.get_all_bug_ids()
    bugzilla.download_bugs(bug_ids)

    # Try to re-download inconsistent bugs, up to three times.
    for i in range(3):
        bug_ids = bug_snapshot.get_inconsistencies()
        if len(bug_ids) == 0:
            break

        logger.info(
            f"Re-downloading {len(bug_ids)} bugs, as they were inconsistent"
        )
        bugzilla.delete_bugs(bug_ids)
        bugzilla.download_bugs(bug_ids)

    self.compress_file("data/bugs.json")
def classify_bug(
    model_name, bug_ids, bugzilla_token, expiration=DEFAULT_EXPIRATION_TTL
):
    # This should be called in a process worker, so it should be safe to set
    # the token here.
    bug_ids_set = set(map(int, bug_ids))
    bugzilla.set_token(bugzilla_token)
    bugs = bugzilla.get(bug_ids)

    redis_url = os.environ.get("REDIS_URL", "redis://localhost/0")
    redis = Redis.from_url(redis_url)

    missing_bugs = bug_ids_set.difference(bugs.keys())
    for bug_id in missing_bugs:
        redis_key = f"result_{model_name}_{bug_id}"

        # TODO: Find a better error format.
        encoded_data = json.dumps({"available": False})
        redis.set(redis_key, encoded_data)
        redis.expire(redis_key, expiration)

    if not bugs:
        return "NOK"

    # TODO: Cache the model in the process memory; it's quite hard as the RQ
    # worker forks before starting.
    model = load_model(model_name)
    model_extra_data = model.get_extra_data()

    # TODO: Classification could choke on a single bug, which would make the
    # whole job fail. What should we do here?
    probs = model.classify(list(bugs.values()), True)
    indexes = probs.argmax(axis=-1)
    suggestions = model.clf._le.inverse_transform(indexes)

    probs_list = probs.tolist()
    indexes_list = indexes.tolist()
    suggestions_list = suggestions.tolist()

    for i, bug_id in enumerate(bugs.keys()):
        data = {
            "prob": probs_list[i],
            "index": indexes_list[i],
            "suggestion": suggestions_list[i],
            "extra_data": model_extra_data,
        }

        encoded_data = json.dumps(data)
        redis_key = f"result_{model_name}_{bug_id}"
        redis.set(redis_key, encoded_data)
        redis.expire(redis_key, expiration)

    return "OK"
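# Sketch of reading a classification result back out of Redis, mirroring the
# key convention used by classify_bug above. The helper name is hypothetical;
# the REDIS_URL default matches the one in the function.
import json
import os

from redis import Redis


def read_result(model_name, bug_id):
    # Connect using the same URL convention as the worker.
    redis = Redis.from_url(os.environ.get("REDIS_URL", "redis://localhost/0"))
    raw = redis.get(f"result_{model_name}_{bug_id}")
    # The key may have expired or never been set; return None in that case.
    return json.loads(raw) if raw is not None else None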
def classify_bug(model_name, bug_ids, bugzilla_token):
    from bugbug_http.app import JobInfo

    # This should be called in a process worker, so it should be safe to set
    # the token here.
    bug_ids_set = set(map(int, bug_ids))
    bugzilla.set_token(bugzilla_token)
    bugs = bugzilla.get(bug_ids)

    missing_bugs = bug_ids_set.difference(bugs.keys())
    for bug_id in missing_bugs:
        job = JobInfo(classify_bug, model_name, bug_id)

        # TODO: Find a better error format.
        encoded_data = json.dumps({"available": False})
        setkey(job.result_key, encoded_data)

    if not bugs:
        return "NOK"

    model = get_model(model_name)
    if not model:
        LOGGER.info("Missing model %r, aborting" % model_name)
        return "NOK"

    model_extra_data = model.get_extra_data()

    # TODO: Classification could choke on a single bug, which would make the
    # whole job fail. What should we do here?
    probs = model.classify(list(bugs.values()), True)
    indexes = probs.argmax(axis=-1)
    suggestions = model.le.inverse_transform(indexes)

    probs_list = probs.tolist()
    indexes_list = indexes.tolist()
    suggestions_list = suggestions.tolist()

    for i, bug_id in enumerate(bugs.keys()):
        data = {
            "prob": probs_list[i],
            "index": indexes_list[i],
            "class": suggestions_list[i],
            "extra_data": model_extra_data,
        }

        encoded_data = json.dumps(data)
        job = JobInfo(classify_bug, model_name, bug_id)
        setkey(job.result_key, encoded_data)

        # Save the bug's last change time.
        setkey(job.change_time_key, bugs[bug_id]["last_change_time"], expiration=0)

    return "OK"
def classify_bug(model_name: str, bug_ids: Sequence[int], bugzilla_token: str) -> str:
    from bugbug_http.app import JobInfo

    # This should be called in a process worker, so it should be safe to set
    # the token here.
    bug_ids_set = set(map(int, bug_ids))
    bugzilla.set_token(bugzilla_token)

    # Fetch the bugs in chunks, to stay within Bugzilla's request limits.
    bugs = {}
    for i in range(0, len(bug_ids), Bugzilla.BUGZILLA_CHUNK_SIZE):
        bugs.update(bugzilla.get(bug_ids[i:(i + Bugzilla.BUGZILLA_CHUNK_SIZE)]))

    missing_bugs = bug_ids_set.difference(bugs.keys())
    for bug_id in missing_bugs:
        job = JobInfo(classify_bug, model_name, bug_id)

        # TODO: Find a better error format.
        setkey(job.result_key, orjson.dumps({"available": False}))

    if not bugs:
        return "NOK"

    model = MODEL_CACHE.get(model_name)
    if not model:
        LOGGER.info("Missing model %r, aborting" % model_name)
        return "NOK"

    model_extra_data = model.get_extra_data()

    # TODO: Classification could choke on a single bug, which would make the
    # whole job fail. What should we do here?
    probs = model.classify(list(bugs.values()), True)
    indexes = probs.argmax(axis=-1)
    suggestions = model.le.inverse_transform(indexes)

    probs_list = probs.tolist()
    indexes_list = indexes.tolist()
    suggestions_list = suggestions.tolist()

    for i, bug_id in enumerate(bugs.keys()):
        data = {
            "prob": probs_list[i],
            "index": indexes_list[i],
            "class": suggestions_list[i],
            "extra_data": model_extra_data,
        }

        job = JobInfo(classify_bug, model_name, bug_id)
        setkey(job.result_key, orjson.dumps(data), compress=True)

        # Save the bug's last change time.
        setkey(job.change_time_key, bugs[bug_id]["last_change_time"].encode())

    return "OK"
def retrieve_bugs(self):
    bugzilla.set_token(secrets[secrets.BUGZILLA_TOKEN])

    six_months_ago = datetime.utcnow() - timedelta(182)
    # 730 days, to match the variable name (the original used 365, i.e. one year).
    two_years_and_six_months_ago = six_months_ago - timedelta(730)
    bugzilla.download_bugs_between(two_years_and_six_months_ago, six_months_ago)

    bug_ids = labels.get_all_bug_ids()
    bugzilla.download_bugs(bug_ids)

    self.compress_file('data/bugs.json')
def go(self):
    # Download models that were trained by bugbug_train.
    with ThreadPoolExecutorResult(max_workers=3) as executor:
        f1 = executor.submit(lambda: urlretrieve(
            'https://index.taskcluster.net/v1/task/project.releng.services.project.testing.bugbug_train.latest/artifacts/public/bug.model.xz',
            'bug.model.xz'))  # noqa
        f1.add_done_callback(lambda f: self.decompress_file('bug.model'))

        f2 = executor.submit(lambda: urlretrieve(
            'https://index.taskcluster.net/v1/task/project.releng.services.project.testing.bugbug_train.latest/artifacts/public/regression.model.xz',
            'regression.model.xz'))  # noqa
        f2.add_done_callback(
            lambda f: self.decompress_file('regression.model'))

        f3 = executor.submit(lambda: urlretrieve(
            'https://index.taskcluster.net/v1/task/project.releng.services.project.testing.bugbug_train.latest/artifacts/public/tracking.model.xz',
            'tracking.model.xz'))  # noqa
        f3.add_done_callback(
            lambda f: self.decompress_file('tracking.model'))

    # Download bugs from the last week that we want to analyze.
    bugzilla.set_token(secrets[secrets.BUGZILLA_TOKEN])
    today = datetime.utcnow()
    one_week_ago = today - timedelta(7)
    bugzilla.download_bugs_between(one_week_ago, today)

    # Eval classifier for bug-vs-nonbug.
    self.eval_bug()

    # Eval classifier for regression-vs-nonregression.
    self.eval_regression()

    # Eval classifier for tracking bugs.
    self.eval_tracking()

    # Index the task in the TaskCluster index.
    self.index_service.insertTask(
        'project.releng.services.project.{}.bugbug_eval.latest'.format(
            secrets[secrets.APP_CHANNEL]),
        {
            'taskId': os.environ['TASK_ID'],
            'rank': 0,
            'data': {},
            'expires': (datetime.utcnow() + timedelta(31)).strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
        })
def retrieve_bugs(self):
    bugzilla.set_token(secrets[secrets.BUGZILLA_TOKEN])

    six_months_ago = datetime.utcnow() - timedelta(182)
    # 730 days, to match the variable name (the original used 365, i.e. one year).
    two_years_and_six_months_ago = six_months_ago - timedelta(730)
    logger.info('Downloading bugs from {} to {}'.format(
        two_years_and_six_months_ago, six_months_ago))
    bugzilla.download_bugs_between(two_years_and_six_months_ago, six_months_ago)

    logger.info('Downloading labelled bugs')
    bug_ids = labels.get_all_bug_ids()
    bugzilla.download_bugs(bug_ids)

    self.compress_file('data/bugs.json')
def generate_sheet(model_name, token, days, threshold):
    model_file_name = f"{model_name}model"

    assert os.path.exists(
        model_file_name
    ), f"{model_file_name} does not exist. Train the model with trainer.py first."

    model_class = get_model_class(model_name)
    model = model_class.load(model_file_name)

    today = datetime.utcnow()
    start_date = today - timedelta(days)
    bugzilla.set_token(token)
    bug_ids = bugzilla.get_ids_between(start_date, today)
    bugs = bugzilla.get(bug_ids)

    print(f"Classifying {len(bugs)} bugs...")

    rows = [["Bug", f"{model_name}(model)", model_name, "Title"]]

    for bug in bugs.values():
        p = model.classify(bug, probabilities=True)
        probability = p[0]
        if len(probability) > 2:
            index = np.argmax(probability)
            prediction = model.class_names[index]
        else:
            prediction = "y" if probability[1] >= threshold else "n"

        rows.append(
            [
                f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]}',
                prediction,
                "",
                bug["summary"],
            ]
        )

    os.makedirs("sheets", exist_ok=True)
    with open(
        os.path.join(
            "sheets",
            f'{model_name}-{datetime.utcnow().strftime("%Y-%m-%d")}-labels.csv',
        ),
        "w",
    ) as f:
        writer = csv.writer(f)
        writer.writerows(rows)
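# Example invocation of generate_sheet above. The "regression" model name and
# the BUGZILLA_TOKEN environment variable are assumptions for illustration; a
# trained "regressionmodel" file must exist in the working directory.
generate_sheet("regression", token=os.environ["BUGZILLA_TOKEN"], days=7, threshold=0.7)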
def get_bugs_last_change_time(bug_ids):
    bugzilla.set_token(BUGZILLA_TOKEN)

    old_CHUNK_SIZE = Bugzilla.BUGZILLA_CHUNK_SIZE
    try:
        Bugzilla.BUGZILLA_CHUNK_SIZE = 700

        bugs = {}

        def bughandler(bug):
            bugs[bug["id"]] = bug["last_change_time"]

        Bugzilla(
            bugids=bug_ids,
            bughandler=bughandler,
            include_fields=["id", "last_change_time"],
        ).get_data().wait()
    finally:
        Bugzilla.BUGZILLA_CHUNK_SIZE = old_CHUNK_SIZE

    return bugs
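# Usage sketch for get_bugs_last_change_time above, assuming BUGZILLA_TOKEN is
# defined at module level as the function expects. The bug IDs are arbitrary
# examples.
if __name__ == "__main__":
    for bug_id, last_change in get_bugs_last_change_time([1094814, 1042096]).items():
        print(bug_id, last_change)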
def __init__(self, repo_dir: str) -> None:
    self.risk_bands = sorted(
        (
            parse_risk_band(risk_band)
            for risk_band in get_secret("REGRESSOR_RISK_BANDS").split(";")
        ),
        key=lambda x: x[1],
    )

    repository.clone(repo_dir)

    logger.info("Downloading commits database...")
    assert db.download(repository.COMMITS_DB, support_files_too=True)

    logger.info("Updating commits DB...")
    # Iterate to the last commit in the DB, so we only download newer ones.
    for commit in repository.get_commits():
        pass

    repository.download_commits(
        repo_dir,
        rev_start="children({})".format(commit["node"]),
    )

    logger.info("Downloading revisions database...")
    assert db.download(phabricator.REVISIONS_DB)

    logger.info("Downloading bugs database...")
    assert db.download(bugzilla.BUGS_DB)

    logger.info("Downloading commit classifications...")
    assert db.download(BUG_FIXING_COMMITS_DB)

    self.regressor_model = cast(
        RegressorModel, RegressorModel.load(download_model("regressor"))
    )

    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
    phabricator.set_api_key(
        get_secret("PHABRICATOR_URL"), get_secret("PHABRICATOR_TOKEN")
    )
def fetch_untriaged(args):
    from bugbug import bugzilla

    # Set bugzilla token and download bugs
    bugzilla.set_token(args.token)
    bug_ids = bugzilla.get_ids_between(date.today() - timedelta(days=args.days_back))
    bugs = bugzilla.get(bug_ids)

    # Get untriaged bugs
    untriaged_bugs = []
    for bug in bugs.values():
        for history in bug["history"]:
            for change in history["changes"]:
                if (
                    change["field_name"] == "component"
                    and change["removed"] == "Untriaged"
                ):
                    untriaged_bugs.append(bug)

    with open("bugs-{}.json".format(datetime.now().strftime("%s")), "w") as f:
        json.dump(untriaged_bugs, f)

    return untriaged_bugs
def generate_sheet(model_name, token):
    model_file_name = f"{model_name}model"

    assert os.path.exists(
        model_file_name
    ), f"{model_file_name} does not exist. Train the model with trainer.py first."

    model_class = get_model_class(model_name)
    model = model_class.load(model_file_name)

    today = datetime.utcnow()
    a_week_ago = today - timedelta(7)
    bugzilla.set_token(token)
    bug_ids = bugzilla.get_ids_between(a_week_ago, today)
    bugs = bugzilla.get(bug_ids)

    print(f"Classifying {len(bugs)} bugs...")

    rows = [["Bug", f"{model_name}(model)", model_name, "Title"]]

    for bug in bugs.values():
        p = model.classify(bug, probabilities=True)
        rows.append([
            f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]}',
            "y" if p[0][1] >= 0.7 else "n",
            "",
            bug["summary"],
        ])

    os.makedirs("sheets", exist_ok=True)
    with open(
        os.path.join(
            "sheets",
            f'{model_name}-{datetime.utcnow().strftime("%Y-%m-%d")}-labels.csv',
        ),
        "w",
    ) as f:
        writer = csv.writer(f)
        writer.writerows(rows)
def go(self):
    # Download models that were trained by bugbug_train.
    with ThreadPoolExecutorResult(max_workers=3) as executor:
        f1 = executor.submit(lambda: urlretrieve('https://index.taskcluster.net/v1/task/project.releng.services.project.testing.bugbug_train.latest/artifacts/public/bugmodel.xz', 'bugmodel.xz'))  # noqa
        f1.add_done_callback(lambda f: self.decompress_file('bugmodel'))

        f2 = executor.submit(lambda: urlretrieve('https://index.taskcluster.net/v1/task/project.releng.services.project.testing.bugbug_train.latest/artifacts/public/regressionmodel.xz', 'regressionmodel.xz'))  # noqa
        f2.add_done_callback(lambda f: self.decompress_file('regressionmodel'))

        f3 = executor.submit(lambda: urlretrieve('https://index.taskcluster.net/v1/task/project.releng.services.project.testing.bugbug_train.latest/artifacts/public/trackingmodel.xz', 'trackingmodel.xz'))  # noqa
        f3.add_done_callback(lambda f: self.decompress_file('trackingmodel'))

    # Download bugs from the last week that we want to analyze.
    bugzilla.set_token(secrets[secrets.BUGZILLA_TOKEN])
    today = datetime.utcnow()
    one_week_ago = today - timedelta(7)
    bugzilla.download_bugs_between(one_week_ago, today)

    # Eval classifier for bug-vs-nonbug.
    self.eval_bug()

    # Eval classifier for regression-vs-nonregression.
    self.eval_regression()

    # Eval classifier for tracking bugs.
    self.eval_tracking()

    # Index the task in the TaskCluster index.
    self.index_service.insertTask(
        'project.releng.services.project.{}.bugbug_eval.latest'.format(secrets[secrets.APP_CHANNEL]),
        {
            'taskId': os.environ['TASK_ID'],
            'rank': 0,
            'data': {},
            'expires': (datetime.utcnow() + timedelta(31)).strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
        }
    )
def __init__(self, repo_dir: str) -> None:
    self.risk_bands = sorted(
        (
            parse_risk_band(risk_band)
            for risk_band in get_secret("REGRESSOR_RISK_BANDS").split(";")
        ),
        key=lambda x: x[1],
    )

    repository.clone(repo_dir)

    logger.info("Downloading commits database...")
    assert db.download(repository.COMMITS_DB, support_files_too=True)

    logger.info("Updating commits DB...")
    # Iterate to the last commit in the DB, so we only download newer ones.
    for commit in repository.get_commits():
        pass

    repository.download_commits(
        repo_dir,
        rev_start="children({})".format(commit["node"]),
    )

    # Some commits that were already in the DB from the previous run might need
    # to be updated (e.g. coverage information).
    repository.update_commits()

    logger.info("Downloading revisions database...")
    assert db.download(phabricator.REVISIONS_DB)

    logger.info("Downloading bugs database...")
    assert db.download(bugzilla.BUGS_DB)

    logger.info("Downloading commit classifications...")
    assert db.download(BUG_FIXING_COMMITS_DB)

    self.regressor_model = cast(
        RegressorModel, RegressorModel.load(download_model("regressor"))
    )

    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
    phabricator.set_api_key(
        get_secret("PHABRICATOR_URL"), get_secret("PHABRICATOR_TOKEN")
    )

    self.path_to_component = repository.get_component_mapping()

    self.past_regressions_by = {}
    self.past_fixed_bugs_by = {}
    self.past_regression_blocked_bugs_by = {}
    self.past_fixed_bug_blocked_bugs_by = {}

    for dimension in ["component", "directory", "file", "function"]:
        self.past_regressions_by[dimension] = _download_past_bugs(
            PAST_REGRESSIONS_BY_URL.format(dimension=dimension)
        )
        self.past_fixed_bugs_by[dimension] = _download_past_bugs(
            PAST_FIXED_BUGS_BY_URL.format(dimension=dimension)
        )
        self.past_regression_blocked_bugs_by[dimension] = _download_past_bugs(
            PAST_REGRESSION_BLOCKED_BUGS_BY_URL.format(dimension=dimension)
        )
        self.past_fixed_bug_blocked_bugs_by[dimension] = _download_past_bugs(
            PAST_FIXED_BUG_BLOCKED_BUGS_BY_URL.format(dimension=dimension)
        )
def main(args):
    model_file_name = "{}{}model".format(
        args.goal, "" if args.classifier == "default" else args.classifier
    )

    if args.goal == "component":
        if args.classifier == "default":
            model_class_name = "component"
        else:
            model_class_name = "component_nn"
    else:
        model_class_name = args.goal

    model_class = get_model_class(model_class_name)

    if args.train:
        db.download(bugzilla.BUGS_DB)
        db.download(repository.COMMITS_DB)

        historical_supported_tasks = [
            "defect",
            "bugtype",
            "defectenhancementtask",
            "regression",
        ]

        if args.goal in historical_supported_tasks:
            model = model_class(args.lemmatization, args.historical)
        elif args.goal == "duplicate":
            model = model_class(args.training_set_size, args.lemmatization)
        else:
            model = model_class(args.lemmatization)
        model.train()
    else:
        model = model_class.load(model_file_name)

    if args.classify:
        for bug in bugzilla.get_bugs():
            print(
                f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]} - {bug["summary"]}'
            )

            if model.calculate_importance:
                probas, importance = model.classify(
                    bug, probabilities=True, importances=True
                )

                feature_names = model.get_feature_names()
                for i, (importance, index, is_positive) in enumerate(
                    importance["importances"]
                ):
                    print(
                        f'{i + 1}. \'{feature_names[int(index)]}\' ({"+" if (is_positive) else "-"}{importance})'
                    )
            else:
                probas = model.classify(bug, probabilities=True, importances=False)

            if np.argmax(probas) == 1:
                print(f"Positive! {probas}")
            else:
                print(f"Negative! {probas}")

            input()

    if args.generate_sheet:
        assert (
            args.token is not None
        ), "A Bugzilla token should be set in order to download bugs"

        today = datetime.utcnow()
        a_week_ago = today - timedelta(7)
        bugzilla.set_token(args.token)
        bugs = bugzilla.download_bugs_between(a_week_ago, today)

        print(f"Classifying {len(bugs)} bugs...")

        rows = [["Bug", f"{args.goal}(model)", args.goal, "Title"]]

        for bug in bugs:
            p = model.classify(bug, probabilities=True)
            rows.append([
                f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]}',
                "y" if p[0][1] >= 0.7 else "n",
                "",
                bug["summary"],
            ])

        os.makedirs("sheets", exist_ok=True)
        with open(
            os.path.join(
                "sheets",
                f'{args.goal}-{datetime.utcnow().strftime("%Y-%m-%d")}-labels.csv',
            ),
            "w",
        ) as f:
            writer = csv.writer(f)
            writer.writerows(rows)
else:
    probas = model.classify(bug, probabilities=True, importances=False)

if np.argmax(probas) == 1:
    print(f"Positive! {probas}")
else:
    print(f"Negative! {probas}")

input()

if args.generate_sheet:
    assert (
        args.token is not None
    ), "A Bugzilla token should be set in order to download bugs"

    today = datetime.utcnow()
    a_week_ago = today - timedelta(7)
    bugzilla.set_token(args.token)
    bugs = bugzilla.download_bugs_between(a_week_ago, today)

    print(f"Classifying {len(bugs)} bugs...")

    rows = [["Bug", f"{args.goal}(model)", args.goal, "Title"]]

    for bug in bugs:
        p = model.classify(bug, probabilities=True)
        rows.append(
            [
                f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]}',
                "y" if p[0][1] >= 0.7 else "n",
                "",
                bug["summary"],
            ]
        )
def retrieve_bugs(self, limit: int = None) -> None:
    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

    db.download(bugzilla.BUGS_DB)

    # Get IDs of bugs changed since last run.
    last_modified = db.last_modified(bugzilla.BUGS_DB)
    logger.info(
        f"Retrieving IDs of bugs modified since the last run on {last_modified}"
    )
    changed_ids = set(
        bugzilla.get_ids(
            {"f1": "delta_ts", "o1": "greaterthaneq", "v1": last_modified.date()}
        )
    )
    logger.info(f"Retrieved {len(changed_ids)} IDs.")

    all_components = bugzilla.get_product_component_count(9999)

    deleted_component_ids = set(
        bug["id"]
        for bug in bugzilla.get_bugs()
        if "{}::{}".format(bug["product"], bug["component"]) not in all_components
    )
    logger.info(f"{len(deleted_component_ids)} bugs belonging to deleted components")
    changed_ids |= deleted_component_ids

    # Get IDs of bugs between (two years and six months ago) and now.
    two_years_and_six_months_ago = datetime.utcnow() - relativedelta(
        years=2, months=6
    )
    logger.info(f"Retrieving bug IDs since {two_years_and_six_months_ago}")
    timespan_ids = bugzilla.get_ids_between(two_years_and_six_months_ago)
    if limit:
        timespan_ids = timespan_ids[-limit:]
    logger.info(f"Retrieved {len(timespan_ids)} IDs.")

    # Get IDs of labelled bugs.
    labelled_bug_ids = labels.get_all_bug_ids()
    if limit:
        labelled_bug_ids = labelled_bug_ids[-limit:]
    logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

    # Get the commits DB, as we need it to get the bug IDs linked to recent commits.
    # XXX: Temporarily avoid downloading the commits DB when a limit is set, to
    # avoid the integration test failing when the commits DB is bumped.
    if limit is None:
        assert db.download(repository.COMMITS_DB)

    # Get IDs of bugs linked to commits (used for some commit-based models, e.g. backout and regressor).
    start_date = datetime.now() - relativedelta(years=3)
    commit_bug_ids = list(
        set(
            commit["bug_id"]
            for commit in repository.get_commits()
            if commit["bug_id"]
            and dateutil.parser.parse(commit["pushdate"]) >= start_date
        )
    )
    if limit:
        commit_bug_ids = commit_bug_ids[-limit:]
    logger.info(f"{len(commit_bug_ids)} bugs linked to commits to download.")

    # Get IDs of bugs which are regressions, bugs which caused regressions
    # (useful for the regressor model), and blocked bugs.
    regression_related_ids: List[int] = list(
        set(
            sum(
                (
                    bug["regressed_by"] + bug["regressions"] + bug["blocks"]
                    for bug in bugzilla.get_bugs()
                ),
                [],
            )
        )
    )
    if limit:
        regression_related_ids = regression_related_ids[-limit:]
    logger.info(
        f"{len(regression_related_ids)} bugs which caused regressions fixed by commits."
    )

    # Get IDs of bugs linked to intermittent failures.
    test_failure_bug_ids = [
        item["bug_id"]
        for item in test_scheduling.get_failure_bugs(
            two_years_and_six_months_ago, datetime.utcnow()
        )
    ]
    if limit:
        test_failure_bug_ids = test_failure_bug_ids[-limit:]
    logger.info(f"{len(test_failure_bug_ids)} bugs about test failures.")

    all_ids = (
        timespan_ids
        + labelled_bug_ids
        + commit_bug_ids
        + regression_related_ids
        + test_failure_bug_ids
    )
    all_ids_set = set(all_ids)

    # We have to redownload bugs that were changed since the last download.
    # We can remove from the DB the bugs that are outside of the considered
    # timespan and are not labelled.
    bugzilla.delete_bugs(
        lambda bug: bug["id"] in changed_ids or bug["id"] not in all_ids_set
    )

    new_bugs = bugzilla.download_bugs(all_ids)

    # Get regression_related_ids again (the set could have changed after downloading new bugs).
    for i in range(7):
        regression_related_ids = list(
            set(
                sum(
                    (
                        bug["regressed_by"] + bug["regressions"] + bug["blocks"]
                        for bug in new_bugs
                    ),
                    [],
                )
            )
        )
        logger.info(
            f"{len(regression_related_ids)} bugs which caused regressions fixed by commits."
        )
        if limit:
            regression_related_ids = regression_related_ids[-limit:]

        # If we got all the bugs we needed, break.
        if set(regression_related_ids).issubset(all_ids):
            break

        new_bugs = bugzilla.download_bugs(regression_related_ids)

    # Try to re-download inconsistent bugs, up to twice.
    inconsistent_bugs = bugzilla.get_bugs(include_invalid=True)
    for i in range(2):
        # We look for inconsistencies in all bugs first; then, on following passes,
        # we only look for inconsistencies in bugs that were found to be
        # inconsistent in the first pass.
        inconsistent_bugs = bug_snapshot.get_inconsistencies(inconsistent_bugs)
        inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

        if len(inconsistent_bug_ids) == 0:
            break

        logger.info(
            f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
        )
        bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
        bugzilla.download_bugs(inconsistent_bug_ids)

    # TODO: Figure out why.
    missing_history_bug_ids = {
        bug["id"] for bug in bugzilla.get_bugs() if "history" not in bug
    }
    bugzilla.delete_bugs(lambda bug: bug["id"] in missing_history_bug_ids)
    logger.info(
        f"Deleted {len(missing_history_bug_ids)} bugs as we couldn't retrieve their history"
    )

    zstd_compress(bugzilla.BUGS_DB)
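# Aside: the sum(generator, []) pattern used above to build the regression-related
# ID list simply flattens lists of bug IDs into one list. A tiny self-contained
# illustration with made-up values (set() removes duplicates; order is arbitrary):
lists_of_ids = [[1, 2], [2, 3], []]
flat_unique_ids = list(set(sum(lists_of_ids, [])))  # e.g. [1, 2, 3]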
def retrieve_bugs(self, limit=None):
    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

    db.download(bugzilla.BUGS_DB)

    # Get IDs of bugs changed since last run.
    last_modified = db.last_modified(bugzilla.BUGS_DB)
    logger.info(
        f"Retrieving IDs of bugs modified since the last run on {last_modified}"
    )
    changed_ids = bugzilla.get_ids(
        {"f1": "delta_ts", "o1": "greaterthaneq", "v1": last_modified.date()}
    )
    logger.info(f"Retrieved {len(changed_ids)} IDs.")

    # Get IDs of bugs between (two years and six months ago) and (six months ago).
    six_months_ago = datetime.utcnow() - relativedelta(months=6)
    two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
    logger.info(
        f"Retrieving bug IDs from {two_years_and_six_months_ago} to {six_months_ago}"
    )
    timespan_ids = bugzilla.get_ids_between(
        two_years_and_six_months_ago, six_months_ago
    )
    if limit:
        timespan_ids = timespan_ids[:limit]
    logger.info(f"Retrieved {len(timespan_ids)} IDs.")

    # Get IDs of labelled bugs.
    labelled_bug_ids = labels.get_all_bug_ids()
    if limit:
        labelled_bug_ids = labelled_bug_ids[:limit]
    logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

    # Get the commits DB, as we need it to get the bug IDs linked to recent commits.
    # XXX: Temporarily avoid downloading the commits DB when a limit is set, to
    # avoid the integration test failing when the commits DB is bumped.
    if limit is None:
        assert db.download(repository.COMMITS_DB)

    # Get IDs of bugs linked to commits (used for some commit-based models, e.g. backout and regressor).
    start_date = datetime.now() - relativedelta(years=3)
    commit_bug_ids = [
        commit["bug_id"]
        for commit in repository.get_commits()
        if commit["bug_id"] and dateutil.parser.parse(commit["pushdate"]) >= start_date
    ]
    if limit:
        commit_bug_ids = commit_bug_ids[-limit:]
    logger.info(f"{len(commit_bug_ids)} bugs linked to commits to download.")

    # Get IDs of bugs which are regressions and bugs which caused regressions
    # (useful for the regressor model).
    regressed_by_bug_ids = sum(
        (bug["regressed_by"] + bug["regressions"] for bug in bugzilla.get_bugs()),
        [],
    )
    if limit:
        regressed_by_bug_ids = regressed_by_bug_ids[-limit:]
    logger.info(
        f"{len(regressed_by_bug_ids)} bugs which caused regressions fixed by commits."
    )

    all_ids = timespan_ids + labelled_bug_ids + commit_bug_ids + regressed_by_bug_ids
    all_ids_set = set(all_ids)

    # We have to redownload bugs that were changed since the last download.
    # We can remove from the DB the bugs that are outside of the considered
    # timespan and are not labelled.
    bugzilla.delete_bugs(
        lambda bug: bug["id"] in changed_ids or bug["id"] not in all_ids_set
    )

    bugzilla.download_bugs(all_ids)

    # Get regressed_by_bug_ids again (the set could have changed after downloading new bugs).
    regressed_by_bug_ids = sum(
        (bug["regressed_by"] + bug["regressions"] for bug in bugzilla.get_bugs()),
        [],
    )
    logger.info(
        f"{len(regressed_by_bug_ids)} bugs which caused regressions fixed by commits."
    )

    bugzilla.download_bugs(regressed_by_bug_ids)

    # Try to re-download inconsistent bugs, up to three times.
    inconsistent_bugs = bugzilla.get_bugs(include_invalid=True)
    for i in range(3):
        # We look for inconsistencies in all bugs first; then, on following passes,
        # we only look for inconsistencies in bugs that were found to be
        # inconsistent in the first pass.
        inconsistent_bugs = bug_snapshot.get_inconsistencies(inconsistent_bugs)
        inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

        if len(inconsistent_bug_ids) == 0:
            break

        logger.info(
            f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
        )
        bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
        bugzilla.download_bugs(inconsistent_bug_ids)

    zstd_compress("data/bugs.json")
probas = model.classify(bug, probabilities=True, importances=False)

if np.argmax(probas) == 1:
    print(f"Positive! {probas}")
else:
    print(f"Negative! {probas}")

input()

if args.generate_sheet:
    assert (
        args.token is not None
    ), "A Bugzilla token should be set in order to download bugs"

    today = datetime.utcnow()
    a_week_ago = today - timedelta(7)
    bugzilla.set_token(args.token)
    bugs = bugzilla.download_bugs_between(a_week_ago, today)

    print(f"Classifying {len(bugs)} bugs...")

    rows = [["Bug", f"{args.goal}(model)", args.goal, "Title"]]

    for bug in bugs:
        p = model.classify(bug, probabilities=True)
        rows.append([
            f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]}',
            "y" if p[0][1] >= 0.7 else "n",
            "",
            bug["summary"],
        ])
def retrieve_bugs(self):
    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

    db.download_version(bugzilla.BUGS_DB)
    if not db.is_old_version(bugzilla.BUGS_DB):
        db.download(bugzilla.BUGS_DB)

    # Get IDs of bugs changed since last run.
    last_modified = db.last_modified(bugzilla.BUGS_DB)
    logger.info(
        f"Retrieving IDs of bugs modified since the last run on {last_modified}"
    )
    changed_ids = bugzilla.get_ids(
        {"f1": "delta_ts", "o1": "greaterthaneq", "v1": last_modified.date()}
    )
    logger.info(f"Retrieved {len(changed_ids)} IDs.")

    # Get IDs of bugs between (two years and six months ago) and (six months ago).
    six_months_ago = datetime.utcnow() - relativedelta(months=6)
    two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
    logger.info(
        f"Retrieving bug IDs from {two_years_and_six_months_ago} to {six_months_ago}"
    )
    timespan_ids = bugzilla.get_ids_between(
        two_years_and_six_months_ago, six_months_ago
    )
    logger.info(f"Retrieved {len(timespan_ids)} IDs.")

    # Get IDs of labelled bugs.
    labelled_bug_ids = labels.get_all_bug_ids()
    logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

    all_ids = set(timespan_ids + labelled_bug_ids)

    # We have to redownload bugs that were changed since the last download.
    # We can remove from the DB the bugs that are outside of the considered
    # timespan and are not labelled.
    bugzilla.delete_bugs(
        lambda bug: bug["id"] in changed_ids or bug["id"] not in all_ids
    )

    bugzilla.download_bugs(timespan_ids + labelled_bug_ids)

    # Try to re-download inconsistent bugs, up to three times.
    inconsistent_bugs = bugzilla.get_bugs()
    for i in range(3):
        # We look for inconsistencies in all bugs first; then, on following passes,
        # we only look for inconsistencies in bugs that were found to be
        # inconsistent in the first pass.
        inconsistent_bugs = bug_snapshot.get_inconsistencies(inconsistent_bugs)
        inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

        if len(inconsistent_bug_ids) == 0:
            break

        logger.info(
            f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
        )
        bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
        bugzilla.download_bugs(inconsistent_bug_ids)

    self.compress_file("data/bugs.json")
# You can obtain one at http://mozilla.org/MPL/2.0/.

import logging
import os

from flask import Flask, current_app, jsonify, request

from bugbug import bugzilla

from .models import load_model

API_TOKEN = "X-Api-Key"

application = Flask(__name__)

bugzilla.set_token(os.environ.get("BUGBUG_BUGZILLA_TOKEN"))

logging.basicConfig(level=logging.INFO)
LOGGER = logging.getLogger()


def get_model(model):
    attribute = f"bugbug_model_{model}"

    if not hasattr(current_app, attribute):
        # Load the model and cache it on the application object.
        model = load_model(model)
        setattr(current_app, attribute, model)

    return getattr(current_app, attribute, model)
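# A hypothetical endpoint built on get_model above, to show how the cached
# model would be reached from a request handler. The route and response shape
# are assumptions for illustration, not part of the original application.
@application.route("/<model_name>/ping")
def ping(model_name):
    # First call loads the model; later calls reuse the cached copy.
    get_model(model_name)
    return jsonify({"model": model_name, "loaded": True})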