def check(self):
    success = super().check()

    # Get the number of bugs per full component to speed up the check.
    bugs_number = get_product_component_count()

    # Check 1: the most meaningful product components still have at least
    # one bug. If this check fails, it could mean that:
    # - a component has been renamed / removed;
    # - a component is no longer used by developers.
    for product, component in self.meaningful_product_components:
        full_comp = f"{product}::{component}"
        if full_comp not in bugs_number:
            print(
                f"Component {component!r} of product {product!r} doesn't exist, failure"
            )
            success = False
        elif bugs_number[full_comp] <= 0:
            print(
                f"Component {component!r} of product {product!r} has 0 bugs or less in it, failure"
            )
            success = False

    # Check 2: conflated components in CONFLATED_COMPONENTS match at least
    # one component which has more than 0 bugs.
    for conflated_component in self.CONFLATED_COMPONENTS:
        matching_components = [
            full_comp
            for full_comp in bugs_number
            if full_comp.startswith(conflated_component)
        ]
        if not matching_components:
            print(f"{conflated_component} doesn't match any component")
            success = False
            continue

        matching_components_values = [
            bugs_number[full_comp]
            for full_comp in matching_components
            if bugs_number[full_comp] > 0
        ]
        if not matching_components_values:
            print(
                f"{conflated_component} should match at least one component with more than 0 bugs"
            )
            success = False

    # Check 3: the values of CONFLATED_COMPONENTS_MAPPING still exist as
    # components and have more than 0 bugs.
    for full_comp in self.CONFLATED_COMPONENTS_MAPPING.values():
        if full_comp not in bugs_number:
            print(
                f"{full_comp} from conflated component mapping doesn't exist, failure"
            )
            success = False
        elif bugs_number[full_comp] <= 0:
            print(
                f"{full_comp} from conflated component mapping has less than 1 bug, failure"
            )
            success = False

    # Check 4: conflated components in CONFLATED_COMPONENTS either exist
    # as components or are in CONFLATED_COMPONENTS_MAPPING.
    for conflated_component in self.CONFLATED_COMPONENTS:
        in_mapping = conflated_component in self.CONFLATED_COMPONENTS_MAPPING
        matching_components = [
            full_comp
            for full_comp in bugs_number
            if full_comp.startswith(conflated_component)
        ]
        if not (matching_components or in_mapping):
            print(f"It should be possible to map {conflated_component}")
            success = False

    # Check 5: there is no component with many bugs that is not in
    # meaningful_product_components.

    # Recompute the meaningful components.
    def generate_meaningful_tuples():
        for full_comp, count in bugs_number.items():
            product, component = full_comp.split("::", 1)
            if not self.is_meaningful(product, component):
                continue

            if count > 0:
                for _ in range(count):
                    yield (product, component)

    meaningful_product_components = self.get_meaningful_product_components(
        generate_meaningful_tuples(), threshold_ratio=10
    )
    if not meaningful_product_components.issubset(
        self.meaningful_product_components
    ):
        print("Meaningful product components mismatch")
        new_meaningful_product_components = meaningful_product_components.difference(
            self.meaningful_product_components
        )
        print(
            f"New meaningful product components {new_meaningful_product_components!r}"
        )
        success = False

    return success
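
# A minimal sketch, with made-up data, of the mapping shape that check() above
# assumes get_product_component_count() returns: "Product::Component" keys
# mapped to bug counts. The product/component names here are hypothetical and
# only illustrate how check 2's plain prefix matching behaves.
bugs_number = {
    "Core::DOM: Core & HTML": 1530,
    "Core::Graphics: WebRender": 412,
    "Firefox::General": 987,
}

# A conflated component such as "Core::Graphics" matches every full component
# whose name starts with that string.
conflated_component = "Core::Graphics"
matching = [fc for fc in bugs_number if fc.startswith(conflated_component)]
assert matching == ["Core::Graphics: WebRender"]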
def retrieve_bugs(self, limit: int = None) -> None:
    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

    db.download(bugzilla.BUGS_DB)

    # Get IDs of bugs changed since the last run.
    last_modified = db.last_modified(bugzilla.BUGS_DB)
    logger.info(
        f"Retrieving IDs of bugs modified since the last run on {last_modified}"
    )
    changed_ids = set(
        bugzilla.get_ids(
            {"f1": "delta_ts", "o1": "greaterthaneq", "v1": last_modified.date()}
        )
    )
    logger.info(f"Retrieved {len(changed_ids)} IDs.")

    all_components = bugzilla.get_product_component_count(9999)

    deleted_component_ids = set(
        bug["id"]
        for bug in bugzilla.get_bugs()
        if "{}::{}".format(bug["product"], bug["component"]) not in all_components
    )
    logger.info(
        f"{len(deleted_component_ids)} bugs belonging to deleted components"
    )
    changed_ids |= deleted_component_ids

    # Get IDs of bugs between (two years and six months ago) and now.
    two_years_and_six_months_ago = datetime.utcnow() - relativedelta(
        years=2, months=6
    )
    logger.info(f"Retrieving bug IDs since {two_years_and_six_months_ago}")
    timespan_ids = bugzilla.get_ids_between(two_years_and_six_months_ago)
    if limit:
        timespan_ids = timespan_ids[-limit:]
    logger.info(f"Retrieved {len(timespan_ids)} IDs.")

    # Get IDs of labelled bugs.
    labelled_bug_ids = labels.get_all_bug_ids()
    if limit:
        labelled_bug_ids = labelled_bug_ids[-limit:]
    logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

    # Get the commits DB, as we need it to get the bug IDs linked to recent commits.
    # XXX: Temporarily avoid downloading the commits DB when a limit is set, to
    # avoid the integration test failing when the commits DB is bumped.
    if limit is None:
        assert db.download(repository.COMMITS_DB)

    # Get IDs of bugs linked to commits (used for some commit-based models,
    # e.g. backout and regressor).
    start_date = datetime.now() - relativedelta(years=3)
    commit_bug_ids = list(
        set(
            commit["bug_id"]
            for commit in repository.get_commits()
            if commit["bug_id"]
            and dateutil.parser.parse(commit["pushdate"]) >= start_date
        )
    )
    if limit:
        commit_bug_ids = commit_bug_ids[-limit:]
    logger.info(f"{len(commit_bug_ids)} bugs linked to commits to download.")

    # Get IDs of bugs which are regressions, bugs which caused regressions
    # (useful for the regressor model), and blocked bugs.
    regression_related_ids: List[int] = list(
        set(
            sum(
                (
                    bug["regressed_by"] + bug["regressions"] + bug["blocks"]
                    for bug in bugzilla.get_bugs()
                ),
                [],
            )
        )
    )
    if limit:
        regression_related_ids = regression_related_ids[-limit:]
    logger.info(
        f"{len(regression_related_ids)} bugs which caused regressions fixed by commits."
    )

    # Get IDs of bugs linked to intermittent failures.
    test_failure_bug_ids = [
        item["bug_id"]
        for item in test_scheduling.get_failure_bugs(
            two_years_and_six_months_ago, datetime.utcnow()
        )
    ]
    if limit:
        test_failure_bug_ids = test_failure_bug_ids[-limit:]
    logger.info(f"{len(test_failure_bug_ids)} bugs about test failures.")

    all_ids = (
        timespan_ids
        + labelled_bug_ids
        + commit_bug_ids
        + regression_related_ids
        + test_failure_bug_ids
    )
    all_ids_set = set(all_ids)

    # We have to redownload bugs that were changed since the last download.
    # We can remove from the DB the bugs that are outside of the considered
    # timespan and are not labelled.
    bugzilla.delete_bugs(
        lambda bug: bug["id"] in changed_ids or bug["id"] not in all_ids_set
    )

    new_bugs = bugzilla.download_bugs(all_ids)

    # Get regression_related_ids again (the set could have changed after
    # downloading new bugs).
    for _ in range(7):
        regression_related_ids = list(
            set(
                sum(
                    (
                        bug["regressed_by"] + bug["regressions"] + bug["blocks"]
                        for bug in new_bugs
                    ),
                    [],
                )
            )
        )
        logger.info(
            f"{len(regression_related_ids)} bugs which caused regressions fixed by commits."
        )
        if limit:
            regression_related_ids = regression_related_ids[-limit:]

        # If we got all the bugs we needed, break.
        if set(regression_related_ids).issubset(all_ids):
            break

        new_bugs = bugzilla.download_bugs(regression_related_ids)

    # Try to re-download inconsistent bugs, up to twice.
    inconsistent_bugs = bugzilla.get_bugs(include_invalid=True)
    for _ in range(2):
        # We look for inconsistencies in all bugs first; on the following
        # passes, we only look for inconsistencies in bugs that were found
        # to be inconsistent in the previous pass.
        inconsistent_bugs = bug_snapshot.get_inconsistencies(inconsistent_bugs)
        inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

        if len(inconsistent_bug_ids) == 0:
            break

        logger.info(
            f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
        )
        bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
        bugzilla.download_bugs(inconsistent_bug_ids)

    # Some bugs come back without their history; delete them from the DB.
    # TODO: Figure out why.
    missing_history_bug_ids = {
        bug["id"] for bug in bugzilla.get_bugs() if "history" not in bug
    }
    bugzilla.delete_bugs(lambda bug: bug["id"] in missing_history_bug_ids)
    logger.info(
        f"Deleted {len(missing_history_bug_ids)} bugs as we couldn't retrieve their history"
    )

    zstd_compress(bugzilla.BUGS_DB)
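
# A self-contained sketch of the bounded fixed-point pattern used above for
# regression-related bugs: fetch a batch, recompute the IDs it references, and
# stop once no new IDs appear (or after a fixed number of rounds). This is an
# illustration of the pattern, not the exact retrieve_bugs() logic; the
# fetch_referenced_ids callable is hypothetical, standing in for
# bugzilla.download_bugs plus the regressed_by/regressions/blocks extraction.
def fetch_closure(initial_ids: set, fetch_referenced_ids, max_rounds: int = 7) -> set:
    known_ids = set(initial_ids)
    pending = set(initial_ids)
    for _ in range(max_rounds):
        # Fetch the current batch and collect the IDs it references.
        referenced = fetch_referenced_ids(pending)
        # Only IDs we haven't seen yet need another round.
        pending = referenced - known_ids
        if not pending:
            break
        known_ids |= pending
    return known_ids

# Example with an in-memory dependency graph standing in for Bugzilla.
graph = {1: {2}, 2: {3}, 3: set()}
closure = fetch_closure({1}, lambda ids: set().union(*(graph[i] for i in ids)))
assert closure == {1, 2, 3}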