def __init__(self, lemmatization=False):
    """Initialize the model: bug feature extractors, text cleanup,
    extraction pipeline, and XGBoost classifier.

    Args:
        lemmatization: forwarded to BugModel; controls lemmatization in the
            text vectorizers.
    """
    BugModel.__init__(self, lemmatization, commit_data=True)

    self.cross_validation_enabled = False

    # Balance the training set by undersampling the majority class.
    self.sampler = RandomUnderSampler(random_state=0)

    extractors = [
        bug_features.has_str(),
        bug_features.has_regression_range(),
        bug_features.severity(),
        bug_features.keywords({"dev-doc-needed", "dev-doc-complete"}),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.product(),
        bug_features.component(),
        bug_features.commit_added(),
        bug_features.commit_deleted(),
        bug_features.commit_types(),
    ]

    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    # The extractor rolls bug data back (rollback_when=self.rollback) and
    # also pulls in commit data.
    bug_extractor = bug_features.BugExtractor(
        extractors,
        cleaners,
        rollback=True,
        rollback_when=self.rollback,
        commit_data=True,
    )
    union = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("title", self.text_vectorizer(), "title"),
            ("comments", self.text_vectorizer(), "comments"),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [("bug_extractor", bug_extractor), ("union", union)]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False):
    """Initialize the model: issue feature extractors, text cleanup,
    extraction pipeline, and XGBoost classifier.

    Args:
        lemmatization: forwarded to IssueModel.
    """
    IssueModel.__init__(self, lemmatization)

    extractors = [
        issue_features.comment_count(),
    ]

    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    issue_extractor = issue_features.IssueExtractor(extractors, cleaners)
    union = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("title", self.text_vectorizer(min_df=0.0001), "title"),
            (
                "first_comment",
                self.text_vectorizer(min_df=0.0001),
                "first_comment",
            ),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [("issue_extractor", issue_extractor), ("union", union)]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False, historical=False):
    """Initialize the multi-label bug model (one-vs-rest XGBoost).

    Args:
        lemmatization: forwarded to BugModel.
        historical: accepted for interface compatibility; not referenced in
            this initializer.
    """
    BugModel.__init__(self, lemmatization)

    self.calculate_importance = False

    extractors = [
        bug_features.has_str(),
        bug_features.severity(),
        # Ignore keywords that would make the ML completely skewed
        # (we are going to use them as 100% rules in the evaluation phase).
        bug_features.keywords(set(KEYWORD_DICT.keys())),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.blocked_bugs_number(),
        bug_features.ever_affected(),
        bug_features.affected_then_unaffected(),
        bug_features.product(),
        bug_features.component(),
    ]

    cleaners = [
        feature_cleanup.url(),
        feature_cleanup.fileref(),
        feature_cleanup.synonyms(),
    ]

    bug_extractor = bug_features.BugExtractor(extractors, cleaners)
    union = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("title", self.text_vectorizer(min_df=0.001), "title"),
            (
                "first_comment",
                self.text_vectorizer(min_df=0.001),
                "first_comment",
            ),
            (
                "comments",
                self.text_vectorizer(min_df=0.001),
                "comments",
            ),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [("bug_extractor", bug_extractor), ("union", union)]
    )

    # One binary XGBoost classifier per label.
    self.clf = OneVsRestClassifier(
        xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
    )
def __init__(self, lemmatization=False):
    """Initialize the model with rollback bug features and an XGBoost
    classifier.

    Args:
        lemmatization: forwarded to BugModel.
    """
    BugModel.__init__(self, lemmatization)

    # Oversample the minority class near the decision boundary.
    self.sampler = BorderlineSMOTE(random_state=0)
    self.calculate_importance = False

    extractors = [
        bug_features.has_str(),
        bug_features.has_regression_range(),
        bug_features.severity(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.whiteboard(),
        bug_features.product(),
        # TODO: We would like to use the component at the time of filing too,
        # but we can't because the rollback script doesn't support changes to
        # components yet.
        # bug_features.component(),
        bug_features.num_words_title(),
        bug_features.num_words_comments(),
        bug_features.keywords(),
        bug_features.priority(),
        bug_features.version(),
        bug_features.target_milestone(),
        bug_features.has_attachment(),
        bug_features.platform(),
        bug_features.op_sys(),
    ]

    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    bug_extractor = bug_features.BugExtractor(extractors, cleaners, rollback=True)
    union = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("title", self.text_vectorizer(min_df=0.0001), "title"),
            (
                "comments",
                self.text_vectorizer(min_df=0.0001),
                "comments",
            ),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [("bug_extractor", bug_extractor), ("union", union)]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization: bool = False) -> None:
    """Initialize the commit model: feature extractors, text cleanup,
    extraction pipeline, and XGBoost classifier.

    Args:
        lemmatization: forwarded to CommitModel.
    """
    CommitModel.__init__(self, lemmatization)

    self.calculate_importance = False

    # Bug data is required in addition to the default commit training DBs.
    self.training_dbs += [bugzilla.BUGS_DB]

    # Balance the training set by undersampling the majority class.
    self.sampler = RandomUnderSampler(random_state=0)

    extractors = [
        commit_features.source_code_file_size(),
        commit_features.other_file_size(),
        commit_features.test_file_size(),
        commit_features.source_code_added(),
        commit_features.other_added(),
        commit_features.test_added(),
        commit_features.source_code_deleted(),
        commit_features.other_deleted(),
        commit_features.test_deleted(),
        commit_features.reviewers_num(),
        commit_features.types(),
        commit_features.files(),
        commit_features.components(),
        commit_features.components_modified_num(),
        commit_features.directories(),
        commit_features.directories_modified_num(),
        commit_features.source_code_files_modified_num(),
        commit_features.other_files_modified_num(),
        commit_features.test_files_modified_num(),
        commit_features.functions_touched_num(),
        commit_features.functions_touched_size(),
        commit_features.source_code_file_metrics(),
    ]

    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    commit_extractor = commit_features.CommitExtractor(extractors, cleaners)
    union = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("desc", self.text_vectorizer(min_df=0.0001), "desc"),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [("commit_extractor", commit_extractor), ("union", union)]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, training_size=14000, lemmatization=False, cleanup_urls=True):
    """Initialize the bug-couple model.

    Args:
        training_size: total number of training couples; half duplicates, a
            quarter dup/non-dup, a quarter non-dup/non-dup.
        lemmatization: forwarded to BugCoupleModel.
        cleanup_urls: when True, also strip URLs during text cleanup.
    """
    self.num_duplicates = training_size // 2
    self.num_nondups_nondups = self.num_dup_nondups = training_size // 4

    BugCoupleModel.__init__(self, lemmatization)

    self.calculate_importance = False

    extractors = [
        bug_features.is_same_product(),
        bug_features.is_same_component(),
        bug_features.is_same_platform(),
        bug_features.is_same_version(),
        bug_features.is_same_os(),
        bug_features.is_same_target_milestone(),
        bug_features.is_first_affected_same(),
        bug_features.couple_common_words_comments(),
        bug_features.couple_delta_creation_date(),
        bug_features.couple_common_keywords(),
        bug_features.couple_common_whiteboard_keywords(),
        bug_features.couple_common_words_summary(),
    ]

    cleaners = [
        feature_cleanup.responses(),
        feature_cleanup.hex(),
        feature_cleanup.dll(),
        feature_cleanup.fileref(),
        feature_cleanup.synonyms(),
        feature_cleanup.crash(),
    ]
    if cleanup_urls:
        cleaners.append(feature_cleanup.url())

    bug_extractor = bug_features.BugExtractor(extractors, cleaners, rollback=True)
    union = ColumnTransformer(
        [
            ("text", self.text_vectorizer(), "text"),
            ("couple_data", DictVectorizer(), "couple_data"),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [("bug_extractor", bug_extractor), ("union", union)]
    )

    self.clf = XGBClassifier(n_jobs=utils.get_physical_cpu_count())
def __init__(self, lemmatization=False):
    """Initialize the model with rollback bug features, an XGBoost
    classifier, and the inverse conflated-components mapping.

    Args:
        lemmatization: forwarded to BugModel.
    """
    BugModel.__init__(self, lemmatization)

    self.cross_validation_enabled = False
    self.calculate_importance = False

    extractors = [
        bug_features.has_str(),
        bug_features.severity(),
        bug_features.keywords(),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
    ]

    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    bug_extractor = bug_features.BugExtractor(extractors, cleaners, rollback=True)
    union = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("title", self.text_vectorizer(min_df=0.0001), "title"),
            (
                "comments",
                self.text_vectorizer(min_df=0.0001),
                "comments",
            ),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [("bug_extractor", bug_extractor), ("union", union)]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
    self.clf.set_params(predictor="cpu_predictor")

    # Reverse lookup: conflated component -> original key.
    self.CONFLATED_COMPONENTS_INVERSE_MAPPING = {
        conflated: original
        for original, conflated in self.CONFLATED_COMPONENTS_MAPPING.items()
    }
def __init__(self, lemmatization=False):
    """Initialize the commit model using only dict features (no text
    columns) and an XGBoost classifier.

    Args:
        lemmatization: forwarded to CommitModel.
    """
    CommitModel.__init__(self, lemmatization)

    # Test scheduling labels are needed in addition to the commit DBs.
    self.training_dbs.append(test_scheduling.TEST_LABEL_SCHEDULING_DB)

    # Balance the training set by undersampling the majority class.
    self.sampler = RandomUnderSampler(random_state=0)

    extractors = [
        commit_features.source_code_file_size(),
        commit_features.other_file_size(),
        commit_features.test_file_size(),
        commit_features.source_code_added(),
        commit_features.other_added(),
        commit_features.test_added(),
        commit_features.source_code_deleted(),
        commit_features.other_deleted(),
        commit_features.test_deleted(),
        # commit_features.author_experience(),
        # commit_features.reviewer_experience(),
        commit_features.reviewers_num(),
        # commit_features.component_touched_prev(),
        # commit_features.directory_touched_prev(),
        # commit_features.file_touched_prev(),
        commit_features.types(),
        commit_features.files(),
        commit_features.components(),
        commit_features.components_modified_num(),
        commit_features.directories(),
        commit_features.directories_modified_num(),
        commit_features.source_code_files_modified_num(),
        commit_features.other_files_modified_num(),
        commit_features.test_files_modified_num(),
    ]

    # No text cleanup is applied: only the dict features are vectorized.
    commit_extractor = commit_features.CommitExtractor(extractors, [])
    union = ColumnTransformer([("data", DictVectorizer(), "data")])
    self.extraction_pipeline = Pipeline(
        [("commit_extractor", commit_extractor), ("union", union)]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False):
    """Initialize the model: bug feature extractors, text cleanup,
    extraction pipeline, and XGBoost classifier.

    Args:
        lemmatization: forwarded to BugModel.
    """
    BugModel.__init__(self, lemmatization)

    # Balance the training set by undersampling the majority class.
    self.sampler = RandomUnderSampler(random_state=0)

    extractors = [
        bug_features.has_regression_range(),
        bug_features.severity(),
        bug_features.keywords({"stepswanted"}),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
    ]

    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    bug_extractor = bug_features.BugExtractor(extractors, cleaners)
    union = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("title", self.text_vectorizer(), "title"),
            ("comments", self.text_vectorizer(), "comments"),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [("bug_extractor", bug_extractor), ("union", union)]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False):
    """Initialize the webcompat issue model: text-only features (no dict
    features), extraction pipeline, and XGBoost classifier.

    Args:
        lemmatization: forwarded to IssueModel.
    """
    IssueModel.__init__(
        self, owner="webcompat", repo="web-bugs", lemmatization=lemmatization
    )

    self.calculate_importance = False

    # No dict-style feature extractors: only title and first comment text
    # are vectorized below.
    extractors = []

    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    issue_extractor = issue_features.IssueExtractor(
        extractors, cleaners, rollback=True
    )
    union = ColumnTransformer(
        [
            ("title", self.text_vectorizer(min_df=0.0001), "title"),
            (
                "first_comment",
                self.text_vectorizer(min_df=0.0001),
                "first_comment",
            ),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [("issue_extractor", issue_extractor), ("union", union)]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False, bug_data=False):
    """Initialize the commit model, optionally augmenting commit features
    with bug features.

    Args:
        lemmatization: forwarded to CommitModel.
        bug_data: when True, bug-level features are appended to the commit
            features (and CommitModel is told to load bug data).
    """
    CommitModel.__init__(self, lemmatization, bug_data)

    self.calculate_importance = False

    # Balance the training set by undersampling the majority class.
    self.sampler = RandomUnderSampler(random_state=0)

    extractors = [
        commit_features.source_code_files_modified_num(),
        commit_features.other_files_modified_num(),
        commit_features.test_files_modified_num(),
        commit_features.source_code_file_size(),
        commit_features.other_file_size(),
        commit_features.test_file_size(),
        commit_features.source_code_added(),
        commit_features.other_added(),
        commit_features.test_added(),
        commit_features.source_code_deleted(),
        commit_features.other_deleted(),
        commit_features.test_deleted(),
        commit_features.author_experience(),
        commit_features.reviewer_experience(),
        commit_features.reviewers_num(),
        commit_features.component_touched_prev(),
        commit_features.directory_touched_prev(),
        commit_features.file_touched_prev(),
        commit_features.types(),
        commit_features.components(),
        commit_features.directories(),
        commit_features.files(),
    ]

    if bug_data:
        extractors += [
            bug_features.product(),
            bug_features.component(),
            bug_features.severity(),
            bug_features.priority(),
            bug_features.has_crash_signature(),
            bug_features.has_regression_range(),
            bug_features.whiteboard(),
            bug_features.keywords(),
            bug_features.number_of_bug_dependencies(),
            bug_features.blocked_bugs_number(),
        ]

    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    commit_extractor = commit_features.CommitExtractor(extractors, cleaners)
    union = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("desc", self.text_vectorizer(), "desc"),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [("commit_extractor", commit_extractor), ("union", union)]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False, interpretable=False):
    """Initialize the commit model; in interpretable mode only dict
    features are used (no free-text description column).

    Args:
        lemmatization: forwarded to CommitModel.
        interpretable: when True, skip the "desc" text column so the model
            uses only named dict features.
    """
    CommitModel.__init__(self, lemmatization)

    self.training_dbs.append(BUG_INTRODUCING_COMMITS_DB)
    self.store_dataset = True

    # Balance the training set by undersampling the majority class.
    self.sampler = RandomUnderSampler(random_state=0)

    extractors = [
        commit_features.source_code_file_size(),
        commit_features.other_file_size(),
        commit_features.test_file_size(),
        commit_features.source_code_added(),
        commit_features.other_added(),
        commit_features.test_added(),
        commit_features.source_code_deleted(),
        commit_features.other_deleted(),
        commit_features.test_deleted(),
        commit_features.author_experience(),
        commit_features.reviewer_experience(),
        commit_features.reviewers_num(),
        commit_features.component_touched_prev(),
        commit_features.directory_touched_prev(),
        commit_features.file_touched_prev(),
        commit_features.types(),
        commit_features.files(),
        commit_features.components(),
        commit_features.components_modified_num(),
        commit_features.directories(),
        commit_features.directories_modified_num(),
        commit_features.source_code_files_modified_num(),
        commit_features.other_files_modified_num(),
        commit_features.test_files_modified_num(),
        commit_features.functions_touched_num(),
        commit_features.functions_touched_size(),
        commit_features.source_code_file_metrics(),
    ]

    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    columns = [("data", DictVectorizer(), "data")]
    if not interpretable:
        columns.append(("desc", self.text_vectorizer(min_df=0.0001), "desc"))

    commit_extractor = commit_features.CommitExtractor(extractors, cleaners)
    self.extraction_pipeline = Pipeline(
        [
            ("commit_extractor", commit_extractor),
            ("union", ColumnTransformer(columns)),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False, granularity="label", failures_skip=None):
    """Initialize the test selection model for a given scheduling granularity.

    Args:
        lemmatization: forwarded to Model.
        granularity: one of "label", "group" or "config_group"; selects the
            training/eval DBs and the feature set.
        failures_skip: stored as-is; not referenced in this initializer.
    """
    Model.__init__(self, lemmatization)

    self.granularity = granularity
    self.failures_skip = failures_skip

    self.training_dbs = [repository.COMMITS_DB]
    self.eval_dbs[repository.COMMITS_DB] = (
        repository.COMMITS_DB,
        repository.COMMIT_EXPERIENCES_DB,
    )

    # Pick the scheduling DBs matching the requested granularity.
    if granularity == "label":
        self.training_dbs.append(test_scheduling.TEST_LABEL_SCHEDULING_DB)
        self.eval_dbs[test_scheduling.TEST_LABEL_SCHEDULING_DB] = (
            test_scheduling.PAST_FAILURES_LABEL_DB,
            test_scheduling.FAILING_TOGETHER_LABEL_DB,
        )
    elif granularity == "group":
        self.training_dbs.append(test_scheduling.TEST_GROUP_SCHEDULING_DB)
        self.eval_dbs[test_scheduling.TEST_GROUP_SCHEDULING_DB] = (
            test_scheduling.PAST_FAILURES_GROUP_DB,
            test_scheduling.TOUCHED_TOGETHER_DB,
            test_scheduling.FAILING_TOGETHER_CONFIG_GROUP_DB,
        )
    elif granularity == "config_group":
        self.training_dbs.append(test_scheduling.TEST_CONFIG_GROUP_SCHEDULING_DB)
        self.eval_dbs[test_scheduling.TEST_CONFIG_GROUP_SCHEDULING_DB] = (
            test_scheduling.PAST_FAILURES_CONFIG_GROUP_DB,
            test_scheduling.TOUCHED_TOGETHER_DB,
        )

    self.cross_validation_enabled = False
    self.entire_dataset_training = True

    # Balance the training set by undersampling the majority class.
    self.sampler = RandomUnderSampler(random_state=0)

    extractors = [
        test_scheduling_features.prev_failures(),
    ]
    if granularity == "label":
        extractors += [
            test_scheduling_features.platform(),
            # test_scheduling_features.chunk(),
            test_scheduling_features.suite(),
        ]
    elif granularity in ("group", "config_group"):
        extractors += [
            test_scheduling_features.path_distance(),
            test_scheduling_features.common_path_components(),
            test_scheduling_features.touched_together(),
        ]

    # Only dict features are used; no text cleanup is applied.
    commit_extractor = commit_features.CommitExtractor(extractors, [])
    union = ColumnTransformer([("data", DictVectorizer(), "data")])
    self.extraction_pipeline = Pipeline(
        [("commit_extractor", commit_extractor), ("union", union)]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
    self.clf.set_params(predictor="cpu_predictor")
def evaluation(self) -> None:
    """Evaluate the test selection model on held-out pushes.

    Regenerates the failing-together DB restricted to training pushes (to
    avoid leaking test data), runs test selection on every test push, then
    reports scheduling statistics for a grid of confidence thresholds,
    caps, minimums and reduction settings.
    """
    # Get a test set of pushes on which to test the model.
    pushes, train_push_len = self.get_pushes(False)

    # To evaluate the model with reductions enabled, we need to regenerate the failing together DB, using
    # only failure data from the training pushes (otherwise, we'd leak training information into the test
    # set).
    print("Generate failing together DB (restricted to training pushes)")
    push_data_iter, push_data_count, _ = test_scheduling.get_push_data(
        "label" if self.granularity == "label" else "config_group"
    )
    test_scheduling.generate_failing_together_probabilities(
        "label" if self.granularity == "label" else "config_group",
        push_data_iter(),
        push_data_count,
        pushes[train_push_len - 1]["revs"][0],
    )

    test_pushes_list = pushes[train_push_len:]

    all_tasks = reduce(
        lambda x, y: x | y,
        (
            set(push["failures"]) | set(push["passes"])
            for push in test_pushes_list[-28:]
        ),
    )

    all_revs = set(sum((push["revs"] for push in test_pushes_list), []))

    test_pushes_failures = sum(
        1 for push in test_pushes_list if len(push["failures"]) > 0
    )

    test_pushes = {push["revs"][0]: push for push in test_pushes_list}

    if self.granularity == "group":
        for (
            revisions,
            fix_revision,
            push_runnables,
            possible_regressions,
            likely_regressions,
        ) in tqdm(push_data_iter(), total=push_data_count):
            if revisions[0] not in test_pushes:
                continue

            test_pushes[revisions[0]]["config_group_failures"] = (
                possible_regressions + likely_regressions
            )

    print(
        f"Testing on {len(test_pushes)} ({test_pushes_failures} with failures) out of {len(pushes)}. {len(all_tasks)} schedulable tasks."
    )

    del pushes

    commit_map = get_commit_map(all_revs)

    past_failures_data = test_scheduling.get_past_failures(self.granularity, True)
    last_push_num = past_failures_data["push_num"]
    past_failures_data.close()

    # Select tests for all the pushes in the test set.
    for i, push in enumerate(tqdm(test_pushes.values())):
        commits = tuple(
            commit_map.pop(revision)
            for revision in push["revs"]
            if revision in commit_map
        )
        if len(commits) == 0:
            push["all_possibly_selected"] = {}
            continue

        push_num = last_push_num - (len(test_pushes) - (i + 1))

        # Note: we subtract 100 to the push number to make sure we don't use
        # past failure data for the push itself.
        # The number 100 comes from the fact that in the past failure data
        # generation we store past failures in batches of 100 pushes.
        push["all_possibly_selected"] = self.select_tests(
            commits, 0.5, push_num - 100
        )

    def do_eval(
        executor: concurrent.futures.ProcessPoolExecutor,
        confidence_threshold: float,
        reduction: Optional[float],
        cap: Optional[int],
        minimum: Optional[int],
    ) -> None:
        # Run the scheduling transforms for each push in parallel, then
        # aggregate per-push statistics into one summary message.
        futures: Dict[concurrent.futures.Future, Dict[str, Any]] = {}
        for push in test_pushes.values():
            futures[
                executor.submit(
                    eval_apply_transforms,
                    self,
                    push,
                    confidence_threshold,
                    reduction,
                    cap,
                    minimum,
                )
            ] = push

        for future in concurrent.futures.as_completed(futures):
            exc = future.exception()
            if exc is not None:
                print(
                    "Exception {} while running {}".format(
                        exc, futures[future]["revs"][0]
                    )
                )
                for f in futures:
                    f.cancel()

            push = futures[future]
            selected, group_configs = future.result()

            if reduction is not None and self.granularity == "group":
                push["number_configs"] = len(
                    set(
                        sum(
                            group_configs.values(),
                            [],
                        )
                    )
                )
                selected_config_groups = set(
                    (config, group)
                    for group, configs in group_configs.items()
                    for config in configs
                )
                caught_config_groups = selected_config_groups & set(
                    push["config_group_failures"]
                )
                push["caught_one_config_group"] = (
                    len(caught_config_groups) > 0
                    if len(push["config_group_failures"]) != 0
                    else None
                )
                push["caught_percentage_config_group"] = (
                    len(caught_config_groups)
                    / len(push["config_group_failures"])
                    if len(push["config_group_failures"]) != 0
                    else None
                )

            caught = selected & set(push["failures"])

            push["number_scheduled"] = len(selected)
            push["caught_one"] = (
                len(caught) > 0 if len(push["failures"]) != 0 else None
            )
            # Bug fix: this used to be wrapped in a 1-tuple (a stray trailing
            # comma), which is always truthy and skewed the
            # "caught one or some didn't run" statistic below. Store the
            # plain boolean instead.
            push["some_didnt_run"] = not selected.issubset(
                set(push["passes"]) | set(push["failures"])
            )
            push["caught_percentage"] = (
                len(caught) / len(push["failures"])
                if len(push["failures"]) != 0
                else None
            )

        min_scheduled = min(
            result["number_scheduled"] for result in test_pushes.values()
        )
        max_scheduled = max(
            result["number_scheduled"] for result in test_pushes.values()
        )
        average_scheduled = statistics.mean(
            result["number_scheduled"] for result in test_pushes.values()
        )
        num_failing_pushes = sum(
            1 for result in test_pushes.values() if result["caught_one"] is not None
        )
        num_caught_one = sum(
            1 for result in test_pushes.values() if result["caught_one"]
        )
        num_caught_one_or_some_didnt_run = sum(
            1
            for result in test_pushes.values()
            if result["caught_one"]
            or (result["caught_one"] is not None and result["some_didnt_run"])
        )
        percentage_caught_one = 100 * num_caught_one / num_failing_pushes
        percentage_caught_one_or_some_didnt_run = (
            100 * num_caught_one_or_some_didnt_run / num_failing_pushes
        )
        average_caught_percentage = 100 * statistics.mean(
            result["caught_percentage"]
            for result in test_pushes.values()
            if result["caught_percentage"] is not None
        )

        reduction_str = (
            f"enabled at {reduction * 100}%" if reduction is not None else "disabled"
        )

        message = f"For confidence threshold {confidence_threshold}, with reduction {reduction_str}, cap at {cap}, and minimum at {minimum}: scheduled {average_scheduled} tasks on average (min {min_scheduled}, max {max_scheduled}). In {percentage_caught_one}% of pushes we caught at least one failure ({percentage_caught_one_or_some_didnt_run}% ignoring misses when some of our selected tasks didn't run). On average, we caught {average_caught_percentage}% of all seen failures."

        if reduction is not None and self.granularity == "group":
            average_configs = statistics.mean(
                result["number_configs"] for result in test_pushes.values()
            )
            median_configs = statistics.median(
                result["number_configs"] for result in test_pushes.values()
            )
            message += f" On average, we selected {average_configs} configs (a median of {median_configs} configs)."

            num_caught_one_config_group = sum(
                1
                for result in test_pushes.values()
                if result["caught_one_config_group"]
            )
            percentage_caught_one_config_group = (
                100 * num_caught_one_config_group / num_failing_pushes
            )
            average_caught_percentage_config_group = 100 * statistics.mean(
                result["caught_percentage_config_group"]
                for result in test_pushes.values()
                if result["caught_percentage_config_group"] is not None
            )
            message += f" In {percentage_caught_one_config_group}% of pushes we caught at least one config/group failure. On average, we caught {average_caught_percentage_config_group}% of all seen config/group failures."

        print(message)

    with concurrent.futures.ProcessPoolExecutor(
        max_workers=utils.get_physical_cpu_count()
    ) as executor:
        scenarios = [
            (None, None, None),
            (10, None, None),
            (None, 300, None),
            (None, None, 0.9),
            (None, None, 1.0),
        ]
        for minimum, cap, reduction in scenarios:
            # Pre-generate equivalence sets, so when we run the config selection in multiple processes
            # we don't risk concurrent writes to the equivalence sets file.
            if reduction is not None and self.granularity == "group":
                self._get_equivalence_sets(reduction)

            for confidence_threshold in [0.5, 0.7, 0.8, 0.85, 0.9, 0.95]:
                do_eval(executor, confidence_threshold, reduction, cap, minimum)
def __init__(
    self,
    lemmatization: bool = False,
    interpretable: bool = True,
    use_finder: bool = False,
    exclude_finder: bool = True,
    finder_regressions_only: bool = False,
) -> None:
    """Initialize the commit model with bug-introducing commit data.

    Args:
        lemmatization: forwarded to CommitModel.
        interpretable: when True, skip the "desc" text column so only
            named dict features are used.
        use_finder: use finder results (mutually exclusive with
            exclude_finder).
        exclude_finder: exclude finder results (mutually exclusive with
            use_finder).
        finder_regressions_only: when True, also load the bug-fixing
            commits DB.
    """
    CommitModel.__init__(self, lemmatization)

    self.training_dbs += [BUG_INTRODUCING_COMMITS_DB, bugzilla.BUGS_DB]
    if finder_regressions_only:
        self.training_dbs.append(BUG_FIXING_COMMITS_DB)

    self.store_dataset = True

    # Balance the training set by undersampling the majority class.
    self.sampler = RandomUnderSampler(random_state=0)

    self.use_finder = use_finder
    self.exclude_finder = exclude_finder
    # Exactly one of the two finder options must be enabled.
    assert use_finder ^ exclude_finder, "Using both use_finder and exclude_finder option does not make a lot of sense"
    self.finder_regressions_only = finder_regressions_only

    extractors = [
        commit_features.source_code_file_size(),
        commit_features.other_file_size(),
        commit_features.test_file_size(),
        commit_features.source_code_added(),
        commit_features.other_added(),
        commit_features.test_added(),
        commit_features.source_code_deleted(),
        commit_features.other_deleted(),
        commit_features.test_deleted(),
        commit_features.author_experience(),
        commit_features.reviewer_experience(),
        commit_features.reviewers_num(),
        commit_features.component_touched_prev(),
        commit_features.directory_touched_prev(),
        commit_features.file_touched_prev(),
        commit_features.types(),
        commit_features.files(),
        commit_features.components(),
        commit_features.components_modified_num(),
        commit_features.directories(),
        commit_features.directories_modified_num(),
        commit_features.source_code_files_modified_num(),
        commit_features.other_files_modified_num(),
        commit_features.test_files_modified_num(),
        commit_features.functions_touched_num(),
        commit_features.functions_touched_size(),
        commit_features.source_code_file_metrics(),
    ]

    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    columns = [("data", DictVectorizer(), "data")]
    if not interpretable:
        columns.append(("desc", self.text_vectorizer(min_df=0.0001), "desc"))

    commit_extractor = commit_features.CommitExtractor(extractors, cleaners)
    self.extraction_pipeline = Pipeline(
        [
            ("commit_extractor", commit_extractor),
            ("union", ColumnTransformer(columns)),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False, historical=False, rca_subcategories_enabled=False):
    """Initialize the RCA multi-label bug model (one-vs-rest XGBoost).

    Args:
        lemmatization: forwarded to BugModel.
        historical: accepted for interface compatibility; not referenced in
            this initializer.
        rca_subcategories_enabled: when True, classify into subcategories
            in addition to the main RCA categories.
    """
    BugModel.__init__(self, lemmatization)

    self.calculate_importance = False
    self.rca_subcategories_enabled = rca_subcategories_enabled

    # should we consider only the main category or all sub categories
    if rca_subcategories_enabled:
        self.RCA_TYPES = RCA_SUBCATEGORIES + RCA_CATEGORIES
    else:
        self.RCA_TYPES = RCA_CATEGORIES

    self.RCA_LIST = sorted(set(self.RCA_TYPES))

    extractors = [
        bug_features.has_str(),
        bug_features.severity(),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        # Ignore whiteboards that would make the ML completely skewed
        # bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.blocked_bugs_number(),
        bug_features.ever_affected(),
        bug_features.affected_then_unaffected(),
        bug_features.product(),
        bug_features.component(),
    ]

    cleaners = [
        feature_cleanup.url(),
        feature_cleanup.fileref(),
        feature_cleanup.synonyms(),
    ]

    bug_extractor = bug_features.BugExtractor(extractors, cleaners)
    union = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("title", self.text_vectorizer(min_df=0.001), "title"),
            (
                "first_comment",
                self.text_vectorizer(min_df=0.001),
                "first_comment",
            ),
            (
                "comments",
                self.text_vectorizer(min_df=0.001),
                "comments",
            ),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [("bug_extractor", bug_extractor), ("union", union)]
    )

    # One binary XGBoost classifier per RCA label.
    self.clf = OneVsRestClassifier(
        xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
    )