def test_CommitExtractor():
    """Exercise CommitExtractor construction with unique and repeated inputs."""
    # Distinct feature extractors and distinct cleanup functions: this call
    # sits outside any ``pytest.raises`` block, so it must not raise.
    CommitExtractor(
        [reviewers_num(), author_experience()],
        [fileref(), url()],
    )

    # The same cleanup function listed twice must trigger an AssertionError.
    with pytest.raises(AssertionError):
        features = [reviewers_num(), author_experience()]
        repeated_cleanups = [fileref(), fileref()]
        CommitExtractor(features, repeated_cleanups)

    # The same feature extractor listed twice must trigger an AssertionError.
    with pytest.raises(AssertionError):
        repeated_features = [author_experience(), author_experience()]
        cleanups = [fileref(), url()]
        CommitExtractor(repeated_features, cleanups)
def __init__(self, lemmatization=False):
    """Build the issue extraction pipeline and the XGBoost classifier.

    Args:
        lemmatization: Passed through unchanged to ``IssueModel.__init__``.
    """
    IssueModel.__init__(self, lemmatization)

    extractors = [issue_features.comment_count()]
    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    # Stage 1: turn raw issues into feature columns.
    issue_stage = issue_features.IssueExtractor(extractors, cleaners)
    # Stage 2: vectorize each column with its own transformer.
    union_stage = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("title", self.text_vectorizer(min_df=0.0001), "title"),
            (
                "first_comment",
                self.text_vectorizer(min_df=0.0001),
                "first_comment",
            ),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [
            ("issue_extractor", issue_stage),
            ("union", union_stage),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False):
    """Build a text-only bug-couple pipeline backed by a linear SVC.

    Args:
        lemmatization: Passed through unchanged to ``BugCoupleModel.__init__``.
    """
    BugCoupleModel.__init__(self, lemmatization)

    self.calculate_importance = False

    cleaners = [
        feature_cleanup.responses(),
        feature_cleanup.hex(),
        feature_cleanup.dll(),
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
        feature_cleanup.crash(),
    ]

    # No feature extractors are configured here: only the cleaned text is used.
    bug_stage = bug_features.BugExtractor([], cleaners)
    union_stage = ColumnTransformer([("text", self.text_vectorizer(), "text")])
    self.extraction_pipeline = Pipeline(
        [
            ("bug_extractor", bug_stage),
            ("union", union_stage),
        ]
    )

    self.clf = LinearSVCWithLabelEncoding(LinearSVC())
def __init__(self, lemmatization=False):
    """Build a bug pipeline that also uses commit data, plus the classifier.

    Args:
        lemmatization: Passed through unchanged to ``BugModel.__init__``.
    """
    BugModel.__init__(self, lemmatization, commit_data=True)

    self.cross_validation_enabled = False
    self.sampler = RandomUnderSampler(random_state=0)

    extractors = [
        bug_features.has_str(),
        bug_features.has_regression_range(),
        bug_features.severity(),
        bug_features.keywords({"dev-doc-needed", "dev-doc-complete"}),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.product(),
        bug_features.component(),
        bug_features.commit_added(),
        bug_features.commit_deleted(),
        bug_features.commit_types(),
    ]
    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    # The extractor is asked to roll bugs back, using self.rollback as the
    # ``rollback_when`` callback, and to include commit data.
    bug_stage = bug_features.BugExtractor(
        extractors,
        cleaners,
        rollback=True,
        rollback_when=self.rollback,
        commit_data=True,
    )
    union_stage = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("title", self.text_vectorizer(), "title"),
            ("comments", self.text_vectorizer(), "comments"),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [
            ("bug_extractor", bug_stage),
            ("union", union_stage),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False, interpretable=False):
    """Build the commit extraction pipeline and the XGBoost classifier.

    Args:
        lemmatization: Passed through unchanged to ``CommitModel.__init__``.
        interpretable: When false, the commit description ("desc") is
            vectorized as text in addition to the "data" column; when true,
            only the "data" column is used.
    """
    CommitModel.__init__(self, lemmatization)

    self.required_dbs.append(BUG_INTRODUCING_COMMITS_DB)
    self.store_dataset = True
    self.sampler = RandomUnderSampler(random_state=0)

    extractors = [
        commit_features.source_code_file_size(),
        commit_features.other_file_size(),
        commit_features.test_file_size(),
        commit_features.source_code_added(),
        commit_features.other_added(),
        commit_features.test_added(),
        commit_features.source_code_deleted(),
        commit_features.other_deleted(),
        commit_features.test_deleted(),
        commit_features.author_experience(),
        commit_features.reviewer_experience(),
        commit_features.reviewers_num(),
        commit_features.component_touched_prev(),
        commit_features.directory_touched_prev(),
        commit_features.file_touched_prev(),
        commit_features.types(),
        commit_features.files(),
        commit_features.components(),
        commit_features.components_modified_num(),
        commit_features.directories(),
        commit_features.directories_modified_num(),
        commit_features.source_code_files_modified_num(),
        commit_features.other_files_modified_num(),
        commit_features.test_files_modified_num(),
        commit_features.functions_touched_num(),
        commit_features.functions_touched_size(),
    ]
    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    columns = [("data", DictVectorizer(), "data")]
    if not interpretable:
        desc_column = ("desc", self.text_vectorizer(min_df=0.0001), "desc")
        columns.append(desc_column)

    commit_stage = commit_features.CommitExtractor(extractors, cleaners)
    union_stage = ColumnTransformer(columns)
    self.extraction_pipeline = Pipeline(
        [
            ("commit_extractor", commit_stage),
            ("union", union_stage),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False, historical=False):
    """Build the bug pipeline and a one-vs-rest XGBoost classifier.

    Args:
        lemmatization: Passed through unchanged to ``BugModel.__init__``.
        historical: Accepted but not read anywhere in this constructor.
    """
    BugModel.__init__(self, lemmatization)

    self.calculate_importance = False

    extractors = [
        bug_features.has_str(),
        bug_features.severity(),
        # The keywords in KEYWORD_DICT would skew the model completely; they
        # are ignored here and applied as 100% rules in the evaluation phase.
        bug_features.keywords(set(KEYWORD_DICT.keys())),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.title(),
        bug_features.blocked_bugs_number(),
        bug_features.ever_affected(),
        bug_features.affected_then_unaffected(),
        bug_features.product(),
        bug_features.component(),
    ]
    cleaners = [
        feature_cleanup.url(),
        feature_cleanup.fileref(),
        feature_cleanup.synonyms(),
    ]

    bug_stage = bug_features.BugExtractor(extractors, cleaners)
    union_stage = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("title", self.text_vectorizer(min_df=0.001), "title"),
            (
                "first_comment",
                self.text_vectorizer(min_df=0.001),
                "first_comment",
            ),
            ("comments", self.text_vectorizer(min_df=0.001), "comments"),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [
            ("bug_extractor", bug_stage),
            ("union", union_stage),
        ]
    )

    self.clf = OneVsRestClassifier(xgboost.XGBClassifier(n_jobs=16))
def __init__(self, lemmatization=False):
    """Build a rollback-based bug pipeline and the XGBoost classifier.

    Args:
        lemmatization: Passed through unchanged to ``BugModel.__init__``.
    """
    BugModel.__init__(self, lemmatization)

    self.sampler = BorderlineSMOTE(random_state=0)
    self.calculate_importance = False

    extractors = [
        bug_features.has_str(),
        bug_features.has_regression_range(),
        bug_features.severity(),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.product(),
        # TODO: bug_features.component() is deliberately left out. We would
        # like to use the component at the time of filing too, but the
        # rollback script doesn't support changes to components yet.
        bug_features.num_words_title(),
        bug_features.num_words_comments(),
        bug_features.keywords(),
    ]
    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    bug_stage = bug_features.BugExtractor(extractors, cleaners, rollback=True)
    union_stage = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("title", self.text_vectorizer(min_df=0.0001), "title"),
            ("comments", self.text_vectorizer(min_df=0.0001), "comments"),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [
            ("bug_extractor", bug_stage),
            ("union", union_stage),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False):
    """Build a bug pipeline with commit features and the XGBoost classifier.

    Args:
        lemmatization: Passed through unchanged to ``BugModel.__init__``.
    """
    BugModel.__init__(self, lemmatization)

    self.sampler = RandomUnderSampler(random_state=0)

    extractors = [
        bug_features.has_str(),
        bug_features.has_regression_range(),
        bug_features.severity(),
        bug_features.keywords({"dev-doc-needed", "dev-doc-complete"}),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.title(),
        bug_features.product(),
        bug_features.component(),
        bug_features.commit_added(),
        bug_features.commit_deleted(),
        bug_features.commit_types(),
    ]
    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    # Rollback is enabled with self.rollback as the ``rollback_when``
    # callback, and commit data is requested from the extractor.
    bug_stage = bug_features.BugExtractor(
        extractors,
        cleaners,
        rollback=True,
        rollback_when=self.rollback,
        commit_data=True,
    )
    union_stage = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("title", self.text_vectorizer(), "title"),
            ("comments", self.text_vectorizer(), "comments"),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [
            ("bug_extractor", bug_stage),
            ("union", union_stage),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False):
    """Build the bug pipeline, classifier and inverse component mapping.

    Args:
        lemmatization: Passed through unchanged to ``BugModel.__init__``.
    """
    BugModel.__init__(self, lemmatization)

    self.cross_validation_enabled = False
    self.calculate_importance = False

    extractors = [
        bug_features.has_str(),
        bug_features.severity(),
        bug_features.keywords(),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.title(),
    ]
    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    bug_stage = bug_features.BugExtractor(extractors, cleaners, rollback=True)
    union_stage = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("title", self.text_vectorizer(min_df=0.0001), "title"),
            ("comments", self.text_vectorizer(min_df=0.0001), "comments"),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [
            ("bug_extractor", bug_stage),
            ("union", union_stage),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")

    # Swap keys and values of CONFLATED_COMPONENTS_MAPPING so lookups can go
    # in the opposite direction.
    self.CONFLATED_COMPONENTS_INVERSE_MAPPING = {
        conflated: original
        for original, conflated in self.CONFLATED_COMPONENTS_MAPPING.items()
    }
def __init__(self, lemmatization=False):
    """Build the bug pipeline, classifier and inverse component mapping.

    Args:
        lemmatization: Passed through unchanged to ``BugModel.__init__``.
    """
    BugModel.__init__(self, lemmatization)

    self.cross_validation_enabled = False
    self.calculate_importance = False

    extractors = [
        bug_features.has_str(),
        bug_features.severity(),
        bug_features.keywords(),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
    ]
    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    bug_stage = bug_features.BugExtractor(extractors, cleaners, rollback=True)
    union_stage = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("title", self.text_vectorizer(min_df=0.0001), "title"),
            ("comments", self.text_vectorizer(min_df=0.0001), "comments"),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [
            ("bug_extractor", bug_stage),
            ("union", union_stage),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")

    # Swap keys and values of CONFLATED_COMPONENTS_MAPPING so lookups can go
    # in the opposite direction.
    self.CONFLATED_COMPONENTS_INVERSE_MAPPING = {
        conflated: original
        for original, conflated in self.CONFLATED_COMPONENTS_MAPPING.items()
    }
def __init__(self, lemmatization: bool = False) -> None:
    """Build the commit extraction pipeline and the XGBoost classifier.

    Args:
        lemmatization: Passed through unchanged to ``CommitModel.__init__``.
    """
    CommitModel.__init__(self, lemmatization)

    self.calculate_importance = False
    # Register the bugs database in addition to the inherited training DBs.
    self.training_dbs += [bugzilla.BUGS_DB]
    self.sampler = RandomUnderSampler(random_state=0)

    extractors = [
        commit_features.source_code_file_size(),
        commit_features.other_file_size(),
        commit_features.test_file_size(),
        commit_features.source_code_added(),
        commit_features.other_added(),
        commit_features.test_added(),
        commit_features.source_code_deleted(),
        commit_features.other_deleted(),
        commit_features.test_deleted(),
        commit_features.reviewers_num(),
        commit_features.types(),
        commit_features.files(),
        commit_features.components(),
        commit_features.components_modified_num(),
        commit_features.directories(),
        commit_features.directories_modified_num(),
        commit_features.source_code_files_modified_num(),
        commit_features.other_files_modified_num(),
        commit_features.test_files_modified_num(),
        commit_features.functions_touched_num(),
        commit_features.functions_touched_size(),
        commit_features.source_code_file_metrics(),
    ]
    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    commit_stage = commit_features.CommitExtractor(extractors, cleaners)
    union_stage = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("desc", self.text_vectorizer(min_df=0.0001), "desc"),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [
            ("commit_extractor", commit_stage),
            ("union", union_stage),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False):
    """Build the commit extraction pipeline and the XGBoost classifier.

    Args:
        lemmatization: Passed through unchanged to ``CommitModel.__init__``.
    """
    CommitModel.__init__(self, lemmatization)

    self.calculate_importance = False
    self.sampler = RandomUnderSampler(random_state=0)

    extractors = [
        commit_features.file_size(),
        commit_features.test_added(),
        commit_features.added(),
        commit_features.deleted(),
        commit_features.test_deleted(),
        commit_features.author_experience(),
        commit_features.reviewer_experience(),
        commit_features.reviewers_num(),
        commit_features.component_touched_prev(),
        commit_features.directory_touched_prev(),
        commit_features.file_touched_prev(),
        commit_features.types(),
        commit_features.components(),
        commit_features.components_modified_num(),
        commit_features.directories(),
        commit_features.directories_modified_num(),
        commit_features.files(),
        commit_features.files_modified_num(),
    ]
    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    commit_stage = commit_features.CommitExtractor(extractors, cleaners)
    union_stage = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("desc", self.text_vectorizer(), "desc"),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [
            ("commit_extractor", commit_stage),
            ("union", union_stage),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, training_size=14000, lemmatization=False, cleanup_urls=True):
    """Build the bug-couple pipeline and the XGBoost classifier.

    Args:
        training_size: Total sample budget. Half of it is stored as
            ``num_duplicates`` and a quarter each as ``num_nondups_nondups``
            and ``num_dup_nondups`` (integer division).
        lemmatization: Passed through unchanged to ``BugCoupleModel.__init__``.
        cleanup_urls: When true, URL cleanup is appended after the other
            cleanup functions.
    """
    # The sample counts are set before the base-class initializer runs.
    self.num_duplicates = training_size // 2
    quarter = training_size // 4
    self.num_nondups_nondups = quarter
    self.num_dup_nondups = quarter

    BugCoupleModel.__init__(self, lemmatization)

    self.calculate_importance = False

    extractors = [
        bug_features.is_same_product(),
        bug_features.is_same_component(),
        bug_features.is_same_platform(),
        bug_features.is_same_version(),
        bug_features.is_same_os(),
        bug_features.is_same_target_milestone(),
        bug_features.is_first_affected_same(),
        bug_features.couple_common_words_comments(),
        bug_features.couple_delta_creation_date(),
        bug_features.couple_common_keywords(),
        bug_features.couple_common_whiteboard_keywords(),
        bug_features.couple_common_words_summary(),
    ]
    cleaners = [
        feature_cleanup.responses(),
        feature_cleanup.hex(),
        feature_cleanup.dll(),
        feature_cleanup.fileref(),
        feature_cleanup.synonyms(),
        feature_cleanup.crash(),
    ]
    if cleanup_urls:
        cleaners.append(feature_cleanup.url())

    bug_stage = bug_features.BugExtractor(extractors, cleaners, rollback=True)
    union_stage = ColumnTransformer(
        [
            ("text", self.text_vectorizer(), "text"),
            ("couple_data", DictVectorizer(), "couple_data"),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [
            ("bug_extractor", bug_stage),
            ("union", union_stage),
        ]
    )

    self.clf = XGBClassifier(n_jobs=utils.get_physical_cpu_count())
def __init__(self, cleanup_urls=True):
    """Store the list of text cleanup functions on the instance.

    Args:
        cleanup_urls: When true, URL cleanup is appended after the other
            cleanup functions.
    """
    cleaners = [
        feature_cleanup.responses(),
        feature_cleanup.hex(),
        feature_cleanup.dll(),
        feature_cleanup.fileref(),
        feature_cleanup.synonyms(),
        feature_cleanup.crash(),
    ]
    self.cleanup_functions = cleaners

    if cleanup_urls:
        self.cleanup_functions.append(feature_cleanup.url())
def __init__(self, lemmatization=False):
    """Build a rollback-based bug pipeline and the XGBoost classifier.

    Args:
        lemmatization: Passed through unchanged to ``BugModel.__init__``.
    """
    BugModel.__init__(self, lemmatization)

    self.sampler = RandomUnderSampler(random_state=0)
    self.calculate_importance = False

    extractors = [
        bug_features.has_str(),
        bug_features.has_regression_range(),
        bug_features.severity(),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.product(),
        bug_features.component(),
        bug_features.num_words_title(),
        bug_features.num_words_comments(),
        bug_features.keywords(),
    ]
    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    bug_stage = bug_features.BugExtractor(extractors, cleaners, rollback=True)
    union_stage = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("title", self.text_vectorizer(), "title"),
            ("comments", self.text_vectorizer(), "comments"),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [
            ("bug_extractor", bug_stage),
            ("union", union_stage),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False):
    """Build the commit extraction pipeline and the XGBoost classifier.

    Args:
        lemmatization: Passed through unchanged to ``CommitModel.__init__``.
    """
    CommitModel.__init__(self, lemmatization)

    self.calculate_importance = False
    self.sampler = RandomUnderSampler(random_state=0)

    extractors = [
        commit_features.files_modified_num(),
        commit_features.test_added(),
        commit_features.added(),
        commit_features.deleted(),
        commit_features.test_deleted(),
        commit_features.author_experience(),
        commit_features.author_experience_90_days(),
        commit_features.reviewer_experience(),
        commit_features.reviewer_experience_90_days(),
        commit_features.components_touched_prev(),
        commit_features.components_touched_prev_90_days(),
        commit_features.files_touched_prev(),
        commit_features.files_touched_prev_90_days(),
        commit_features.types(),
        commit_features.components(),
        commit_features.number_of_reviewers(),
    ]
    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    commit_stage = commit_features.CommitExtractor(extractors, cleaners)
    union_stage = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("desc", self.text_vectorizer(), "desc"),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [
            ("commit_extractor", commit_stage),
            ("union", union_stage),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, cleanup_urls=True, nltk_tokenizer=False, confidence_threshold=0.8):
    """Store the cleanup functions and the tokenizer/threshold settings.

    Args:
        cleanup_urls: When true, URL cleanup is appended after the other
            cleanup functions.
        nltk_tokenizer: Stored as-is on ``self.nltk_tokenizer``.
        confidence_threshold: Stored as-is on ``self.confidence_threshold``.
    """
    cleaners = [
        feature_cleanup.responses(),
        feature_cleanup.hex(),
        feature_cleanup.dll(),
        feature_cleanup.fileref(),
        feature_cleanup.synonyms(),
        feature_cleanup.crash(),
    ]
    self.cleanup_functions = cleaners

    if cleanup_urls:
        self.cleanup_functions.append(feature_cleanup.url())

    self.nltk_tokenizer = nltk_tokenizer
    self.confidence_threshold = confidence_threshold
def __init__(self, lemmatization=False):
    """Build the bug extraction pipeline and the XGBoost classifier.

    Args:
        lemmatization: Passed through unchanged to ``BugModel.__init__``.
    """
    BugModel.__init__(self, lemmatization)

    self.sampler = RandomUnderSampler(random_state=0)

    extractors = [
        bug_features.has_regression_range(),
        bug_features.severity(),
        bug_features.keywords({"stepswanted"}),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
    ]
    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    bug_stage = bug_features.BugExtractor(extractors, cleaners)
    union_stage = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("title", self.text_vectorizer(), "title"),
            ("comments", self.text_vectorizer(), "comments"),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [
            ("bug_extractor", bug_stage),
            ("union", union_stage),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
    self.clf.set_params(predictor="cpu_predictor")
def test_url():
    """Check the output of the URL cleanup on a few sample sentences."""
    # Each case pairs an input sentence with the text expected after cleanup.
    cases = [
        (
            "This code lies in https://github.com/marco-c/bugbug",
            "This code lies in __URL__",
        ),
        (
            "Another url can be https://hg.mozilla.org/camino/ or https://google.com",
            "Another url can be __CODE_REFERENCE_URL__ or __URL__",
        ),
        (
            "Third example is https://searchfox.org and http://hg.mozilla.org",
            "Third example is __CODE_REFERENCE_URL__ and __CODE_REFERENCE_URL__",
        ),
        (
            "More generic links can be https://github.com/marco-c/bugbug , https://hg.mozilla.org/try/ and https://searchfox.org",
            "More generic links can be __URL__ , __CODE_REFERENCE_URL__ and __CODE_REFERENCE_URL__",
        ),
    ]

    for raw, expected in cases:
        # A fresh cleanup callable is created for every case.
        cleaner = feature_cleanup.url()
        assert cleaner(raw) == expected
def __init__(self, training_size=14000, lemmatization=False, cleanup_urls=True):
    """Build the bug-couple pipeline backed by a linear SVC.

    Args:
        training_size: Total sample budget. Half of it is stored as
            ``num_duplicates`` and a quarter each as ``num_nondups_nondups``
            and ``num_dup_nondups`` (integer division).
        lemmatization: Passed through unchanged to ``BugCoupleModel.__init__``.
        cleanup_urls: When true, URL cleanup is appended after the other
            cleanup functions.
    """
    # The sample counts are set before the base-class initializer runs.
    self.num_duplicates = training_size // 2
    quarter = training_size // 4
    self.num_nondups_nondups = quarter
    self.num_dup_nondups = quarter

    BugCoupleModel.__init__(self, lemmatization)

    self.calculate_importance = False

    extractors = [bug_features.is_same_product()]
    cleaners = [
        feature_cleanup.responses(),
        feature_cleanup.hex(),
        feature_cleanup.dll(),
        feature_cleanup.fileref(),
        feature_cleanup.synonyms(),
        feature_cleanup.crash(),
    ]
    if cleanup_urls:
        cleaners.append(feature_cleanup.url())

    bug_stage = bug_features.BugExtractor(extractors, cleaners)
    # Only the "text" column is vectorized.
    union_stage = ColumnTransformer([("text", self.text_vectorizer(), "text")])
    self.extraction_pipeline = Pipeline(
        [
            ("bug_extractor", bug_stage),
            ("union", union_stage),
        ]
    )

    self.clf = LinearSVCWithLabelEncoding(LinearSVC())
def __init__(self, lemmatization=False):
    """Build the webcompat/web-bugs issue pipeline and XGBoost classifier.

    Args:
        lemmatization: Passed through to ``IssueModel.__init__`` by keyword.
    """
    IssueModel.__init__(
        self,
        owner="webcompat",
        repo="web-bugs",
        lemmatization=lemmatization,
    )

    self.calculate_importance = False

    # No feature extractors: only the cleaned text columns are used.
    extractors = []
    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    issue_stage = issue_features.IssueExtractor(extractors, cleaners, rollback=True)
    union_stage = ColumnTransformer(
        [
            ("title", self.text_vectorizer(min_df=0.0001), "title"),
            (
                "first_comment",
                self.text_vectorizer(min_df=0.0001),
                "first_comment",
            ),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [
            ("issue_extractor", issue_stage),
            ("union", union_stage),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
    self.clf.set_params(predictor="cpu_predictor")
def __init__(
    self,
    cleanup_urls=True,
    nltk_tokenizer=False,
    confidence_threshold=0.8,
    end_to_end=False,
):
    """Store cleanup functions, settings and the optional duplicate model.

    Args:
        cleanup_urls: When true, URL cleanup is appended after the other
            cleanup functions.
        nltk_tokenizer: Stored as-is on ``self.nltk_tokenizer``.
        confidence_threshold: Stored as-is on ``self.confidence_threshold``.
        end_to_end: When true, ``DuplicateModel.load("duplicatemodel")`` is
            called and its result kept on ``self.duplicatemodel``; otherwise
            that attribute is ``None``.
    """
    cleaners = [
        feature_cleanup.responses(),
        feature_cleanup.hex(),
        feature_cleanup.dll(),
        feature_cleanup.fileref(),
        feature_cleanup.synonyms(),
        feature_cleanup.crash(),
    ]
    self.cleanup_functions = cleaners

    if cleanup_urls:
        self.cleanup_functions.append(feature_cleanup.url())

    self.nltk_tokenizer = nltk_tokenizer
    self.confidence_threshold = confidence_threshold

    if end_to_end:
        self.duplicatemodel = DuplicateModel.load("duplicatemodel")
    else:
        self.duplicatemodel = None
def __init__(self, lemmatization=False, historical=False, rca_subcategories_enabled=False):
    """Build the bug pipeline and a one-vs-rest XGBoost classifier.

    Args:
        lemmatization: Passed through unchanged to ``BugModel.__init__``.
        historical: Accepted but not read anywhere in this constructor.
        rca_subcategories_enabled: When true, ``RCA_TYPES`` is
            ``RCA_SUBCATEGORIES + RCA_CATEGORIES``; otherwise it is
            ``RCA_CATEGORIES`` alone.
    """
    BugModel.__init__(self, lemmatization)

    self.calculate_importance = False
    self.rca_subcategories_enabled = rca_subcategories_enabled

    # Should we consider only the main categories or all sub-categories too?
    if rca_subcategories_enabled:
        self.RCA_TYPES = RCA_SUBCATEGORIES + RCA_CATEGORIES
    else:
        self.RCA_TYPES = RCA_CATEGORIES
    # De-duplicated and sorted view of the selected types.
    self.RCA_LIST = sorted(set(self.RCA_TYPES))

    extractors = [
        bug_features.has_str(),
        bug_features.severity(),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        # bug_features.whiteboard() is deliberately left out: whiteboards
        # would make the ML completely skewed.
        bug_features.patches(),
        bug_features.landings(),
        bug_features.blocked_bugs_number(),
        bug_features.ever_affected(),
        bug_features.affected_then_unaffected(),
        bug_features.product(),
        bug_features.component(),
    ]
    cleaners = [
        feature_cleanup.url(),
        feature_cleanup.fileref(),
        feature_cleanup.synonyms(),
    ]

    bug_stage = bug_features.BugExtractor(extractors, cleaners)
    union_stage = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("title", self.text_vectorizer(min_df=0.001), "title"),
            (
                "first_comment",
                self.text_vectorizer(min_df=0.001),
                "first_comment",
            ),
            ("comments", self.text_vectorizer(min_df=0.001), "comments"),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [
            ("bug_extractor", bug_stage),
            ("union", union_stage),
        ]
    )

    self.clf = OneVsRestClassifier(xgboost.XGBClassifier(n_jobs=16))
def __init__(self, lemmatization=False, historical=False):
    """Build the bug pipeline and a one-vs-rest XGBoost classifier.

    Args:
        lemmatization: Passed through unchanged to ``BugModel.__init__``.
        historical: Accepted but not read anywhere in this constructor.
    """
    BugModel.__init__(self, lemmatization)

    self.sampler = BorderlineSMOTE(random_state=0)

    extractors = [
        bug_features.has_str(),
        bug_features.severity(),
        # The keywords in keyword_dict would skew the model completely; they
        # are ignored here and applied as 100% rules in the evaluation phase.
        bug_features.keywords(set(keyword_dict.keys())),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.title(),
        bug_features.blocked_bugs_number(),
        bug_features.ever_affected(),
        bug_features.affected_then_unaffected(),
        bug_features.product(),
        bug_features.component(),
    ]
    cleaners = [
        feature_cleanup.url(),
        feature_cleanup.fileref(),
        feature_cleanup.synonyms(),
    ]

    bug_stage = bug_features.BugExtractor(extractors, cleaners)
    union_stage = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("title", self.text_vectorizer(min_df=0.001), "title"),
            (
                "first_comment",
                self.text_vectorizer(min_df=0.001),
                "first_comment",
            ),
            ("comments", self.text_vectorizer(min_df=0.001), "comments"),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [
            ("bug_extractor", bug_stage),
            ("union", union_stage),
        ]
    )

    self.clf = OneVsRestClassifier(xgboost.XGBClassifier(n_jobs=16))
def __init__(self, lemmatization=False, bug_data=False):
    """Build the commit extraction pipeline and the XGBoost classifier.

    Args:
        lemmatization: Passed through unchanged to ``CommitModel.__init__``.
        bug_data: Passed through to ``CommitModel.__init__``; when true, a
            set of bug-level feature extractors is added after the
            commit-level ones.
    """
    CommitModel.__init__(self, lemmatization, bug_data)

    self.calculate_importance = False
    self.sampler = RandomUnderSampler(random_state=0)

    extractors = [
        commit_features.source_code_files_modified_num(),
        commit_features.other_files_modified_num(),
        commit_features.test_files_modified_num(),
        commit_features.source_code_file_size(),
        commit_features.other_file_size(),
        commit_features.test_file_size(),
        commit_features.source_code_added(),
        commit_features.other_added(),
        commit_features.test_added(),
        commit_features.source_code_deleted(),
        commit_features.other_deleted(),
        commit_features.test_deleted(),
        commit_features.author_experience(),
        commit_features.reviewer_experience(),
        commit_features.reviewers_num(),
        commit_features.component_touched_prev(),
        commit_features.directory_touched_prev(),
        commit_features.file_touched_prev(),
        commit_features.types(),
        commit_features.components(),
        commit_features.directories(),
        commit_features.files(),
    ]

    if bug_data:
        extractors.extend(
            [
                bug_features.product(),
                bug_features.component(),
                bug_features.severity(),
                bug_features.priority(),
                bug_features.has_crash_signature(),
                bug_features.has_regression_range(),
                bug_features.whiteboard(),
                bug_features.keywords(),
                bug_features.number_of_bug_dependencies(),
                bug_features.blocked_bugs_number(),
            ]
        )

    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    commit_stage = commit_features.CommitExtractor(extractors, cleaners)
    union_stage = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("desc", self.text_vectorizer(), "desc"),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [
            ("commit_extractor", commit_stage),
            ("union", union_stage),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False):
    """Build a rollback-based bug pipeline and the XGBoost classifier.

    Args:
        lemmatization: Passed through unchanged to ``BugModel.__init__``.
    """
    BugModel.__init__(self, lemmatization)

    self.calculate_importance = False
    self.sampler = InstanceHardnessThreshold(random_state=0)

    extractors = [
        bug_features.has_str(),
        bug_features.has_regression_range(),
        bug_features.severity(),
        bug_features.keywords(),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.title(),
        bug_features.product(),
        bug_features.component(),
        bug_features.is_mozillian(),
        bug_features.bug_reporter(),
        bug_features.blocked_bugs_number(),
        bug_features.priority(),
        bug_features.has_cve_in_alias(),
        bug_features.comment_count(),
        bug_features.comment_length(),
        bug_features.reporter_experience(),
        bug_features.number_of_bug_dependencies(),
    ]
    cleaners = [
        feature_cleanup.url(),
        feature_cleanup.fileref(),
        feature_cleanup.hex(),
        feature_cleanup.dll(),
        feature_cleanup.synonyms(),
        feature_cleanup.crash(),
    ]

    # Rollback is enabled with self.rollback as the ``rollback_when`` callback.
    bug_stage = bug_features.BugExtractor(
        extractors,
        cleaners,
        rollback=True,
        rollback_when=self.rollback,
    )
    union_stage = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("title", self.text_vectorizer(min_df=0.0001), "title"),
            ("comments", self.text_vectorizer(min_df=0.0001), "comments"),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [
            ("bug_extractor", bug_stage),
            ("union", union_stage),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")
def test_BugExtractor():
    """Exercise BugExtractor construction with unique and repeated inputs."""
    # Distinct feature extractors and distinct cleanup functions: this call
    # sits outside any ``pytest.raises`` block, so it must not raise.
    BugExtractor(
        [has_str(), has_url()],
        [fileref(), url()],
    )

    # The same feature extractor listed twice must trigger an AssertionError.
    with pytest.raises(AssertionError):
        repeated_features = [has_str(), has_str()]
        cleanups = [fileref(), url()]
        BugExtractor(repeated_features, cleanups)

    # The same cleanup function listed twice must trigger an AssertionError.
    with pytest.raises(AssertionError):
        features = [has_str(), has_url()]
        repeated_cleanups = [fileref(), fileref()]
        BugExtractor(features, repeated_cleanups)
def __init__(self, lemmatization=False, historical=False):
    """Build the bug extraction pipeline and the XGBoost classifier.

    Args:
        lemmatization: Passed through unchanged to ``BugModel.__init__``.
        historical: When true, the ``had_severity_enhancement`` feature
            extractor is appended to the list.
    """
    BugModel.__init__(self, lemmatization)

    self.sampler = BorderlineSMOTE(random_state=0)

    extractors = [
        bug_features.has_str(),
        bug_features.severity(),
        # These keywords would skew the model completely; they are ignored
        # here and applied as 100% rules in the evaluation phase.
        bug_features.keywords({"regression", "talos-regression", "feature"}),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.title(),
        bug_features.blocked_bugs_number(),
        bug_features.ever_affected(),
        bug_features.affected_then_unaffected(),
        bug_features.product(),
        bug_features.component(),
    ]
    if historical:
        extractors.append(bug_features.had_severity_enhancement())

    cleaners = [
        feature_cleanup.url(),
        feature_cleanup.fileref(),
        feature_cleanup.synonyms(),
    ]

    bug_stage = bug_features.BugExtractor(extractors, cleaners)
    union_stage = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("title", self.text_vectorizer(min_df=0.001), "title"),
            (
                "first_comment",
                self.text_vectorizer(min_df=0.001),
                "first_comment",
            ),
            ("comments", self.text_vectorizer(min_df=0.001), "comments"),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [
            ("bug_extractor", bug_stage),
            ("union", union_stage),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")
from nltk.stem.porter import PorterStemmer except ImportError: raise ImportError(OPT_MSG_MISSING) nltk.download("stopwords") REPORTERS_TO_IGNORE = { "*****@*****.**", "*****@*****.**" } cleanup_functions = [ feature_cleanup.responses(), feature_cleanup.hex(), feature_cleanup.dll(), feature_cleanup.fileref(), feature_cleanup.url(), feature_cleanup.synonyms(), feature_cleanup.crash(), ] # A map from bug ID to its duplicate IDs duplicates = defaultdict(set) all_ids = set(bug["id"] for bug in bugzilla.get_bugs() if bug["creator"] not in REPORTERS_TO_IGNORE and "dupeme" not in bug["keywords"]) for bug in bugzilla.get_bugs(): dupes = [entry for entry in bug["duplicates"] if entry in all_ids] if bug["dupe_of"] in all_ids: dupes.append(bug["dupe_of"])
def __init__(self, lemmatization=False):
    """Build a rollback-based bug pipeline and the XGBoost classifier.

    Args:
        lemmatization: Passed through unchanged to ``BugModel.__init__``.
    """
    BugModel.__init__(self, lemmatization)

    self.sampler = InstanceHardnessThreshold(random_state=0)

    extractors = [
        bug_features.has_str(),
        bug_features.has_regression_range(),
        bug_features.severity(),
        bug_features.keywords(),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.title(),
        bug_features.product(),
        bug_features.component(),
        bug_features.is_mozillian(),
        bug_features.bug_reporter(),
        bug_features.blocked_bugs_number(),
        bug_features.priority(),
        bug_features.has_cve_in_alias(),
        bug_features.comment_count(),
        bug_features.comment_length(),
        bug_features.reporter_experience(),
        bug_features.number_of_bug_dependencies(),
    ]
    cleaners = [
        feature_cleanup.url(),
        feature_cleanup.fileref(),
        feature_cleanup.hex(),
        feature_cleanup.dll(),
        feature_cleanup.synonyms(),
        feature_cleanup.crash(),
    ]

    # Rollback is enabled with self.rollback as the ``rollback_when`` callback.
    bug_stage = bug_features.BugExtractor(
        extractors,
        cleaners,
        rollback=True,
        rollback_when=self.rollback,
    )
    union_stage = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("title", self.text_vectorizer(min_df=0.0001), "title"),
            ("comments", self.text_vectorizer(min_df=0.0001), "comments"),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [
            ("bug_extractor", bug_stage),
            ("union", union_stage),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")
def __init__(
    self,
    lemmatization: bool = False,
    interpretable: bool = True,
    use_finder: bool = False,
    exclude_finder: bool = True,
    finder_regressions_only: bool = False,
) -> None:
    """Build the commit extraction pipeline and the XGBoost classifier.

    Args:
        lemmatization: Passed through unchanged to ``CommitModel.__init__``.
        interpretable: When false, the commit description ("desc") is
            vectorized as text in addition to the "data" column; when true,
            only the "data" column is used.
        use_finder: Stored on ``self.use_finder``.
        exclude_finder: Stored on ``self.exclude_finder``.
        finder_regressions_only: Stored on ``self.finder_regressions_only``;
            when true, ``BUG_FIXING_COMMITS_DB`` is also added to the
            training databases.

    Raises:
        AssertionError: If ``use_finder ^ exclude_finder`` is falsy.
    """
    CommitModel.__init__(self, lemmatization)

    # Register the extra databases on top of the inherited training DBs.
    self.training_dbs += [BUG_INTRODUCING_COMMITS_DB, bugzilla.BUGS_DB]
    if finder_regressions_only:
        self.training_dbs.append(BUG_FIXING_COMMITS_DB)

    self.store_dataset = True
    self.sampler = RandomUnderSampler(random_state=0)

    self.use_finder = use_finder
    self.exclude_finder = exclude_finder
    # NOTE(review): the XOR also rejects the case where both flags are
    # False, which the message below does not mention.
    assert (
        use_finder ^ exclude_finder
    ), "Using both use_finder and exclude_finder option does not make a lot of sense"
    self.finder_regressions_only = finder_regressions_only

    extractors = [
        commit_features.source_code_file_size(),
        commit_features.other_file_size(),
        commit_features.test_file_size(),
        commit_features.source_code_added(),
        commit_features.other_added(),
        commit_features.test_added(),
        commit_features.source_code_deleted(),
        commit_features.other_deleted(),
        commit_features.test_deleted(),
        commit_features.author_experience(),
        commit_features.reviewer_experience(),
        commit_features.reviewers_num(),
        commit_features.component_touched_prev(),
        commit_features.directory_touched_prev(),
        commit_features.file_touched_prev(),
        commit_features.types(),
        commit_features.files(),
        commit_features.components(),
        commit_features.components_modified_num(),
        commit_features.directories(),
        commit_features.directories_modified_num(),
        commit_features.source_code_files_modified_num(),
        commit_features.other_files_modified_num(),
        commit_features.test_files_modified_num(),
        commit_features.functions_touched_num(),
        commit_features.functions_touched_size(),
        commit_features.source_code_file_metrics(),
    ]
    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    columns = [("data", DictVectorizer(), "data")]
    if not interpretable:
        desc_column = ("desc", self.text_vectorizer(min_df=0.0001), "desc")
        columns.append(desc_column)

    commit_stage = commit_features.CommitExtractor(extractors, cleaners)
    union_stage = ColumnTransformer(columns)
    self.extraction_pipeline = Pipeline(
        [
            ("commit_extractor", commit_stage),
            ("union", union_stage),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
    self.clf.set_params(predictor="cpu_predictor")