def __init__(self, lemmatization=False):
    """Build a text-only bug-couple model: cleanup + vectorized text fed to a linear SVC."""
    BugCoupleModel.__init__(self, lemmatization)

    self.calculate_importance = False

    # Normalizers applied to the raw comment text before vectorization.
    text_cleaners = [
        feature_cleanup.responses(),
        feature_cleanup.hex(),
        feature_cleanup.dll(),
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
        feature_cleanup.crash(),
    ]

    # No structured feature extractors: this model relies on text alone.
    extractor = bug_features.BugExtractor([], text_cleaners)
    vectorize = ColumnTransformer([("text", self.text_vectorizer(), "text")])

    self.extraction_pipeline = Pipeline(
        steps=[
            ("bug_extractor", extractor),
            ("union", vectorize),
        ]
    )

    self.clf = LinearSVCWithLabelEncoding(LinearSVC())
def test_dll():
    """The dll cleaner masks unknown module names but leaves whitelisted Mozilla libraries intact."""
    cases = [
        (
            "Crashing thread: 0 scdetour.dll scdetour.dll@0x2dd77",
            "Crashing thread: 0 __DLL_NAME__ __DLL_NAME__@0x2dd77",
        ),
        (
            # Known Mozilla library: left untouched.
            "Crash in libxul.so@0x287ad36 | libxul.so@0x270c062",
            "Crash in libxul.so@0x287ad36 | libxul.so@0x270c062",
        ),
        (
            "Crash in libsystem_pthread.dylib@0x14fc",
            "Crash in __DLL_NAME__@0x14fc",
        ),
        (
            "Crash in liblgpllibs.so@0x14fc exmpl.so@0xask ",
            "Crash in liblgpllibs.so@0x14fc __DLL_NAME__@0xask ",
        ),
        (
            "Crash in lgpllibs.dll@0x14fc exmpl.dll@0xask ",
            "Crash in lgpllibs.dll@0x14fc __DLL_NAME__@0xask ",
        ),
        (
            "Crash in libmozglue.dylib@0x14fc exmpl.dylib@0xask ",
            "Crash in libmozglue.dylib@0x14fc __DLL_NAME__@0xask ",
        ),
    ]

    cleaner = feature_cleanup.dll()
    for raw, expected in cases:
        assert cleaner(raw) == expected
def __init__(self, training_size=14000, lemmatization=False, cleanup_urls=True):
    """Duplicate-detection model combining couple features and text, trained with XGBoost.

    The training set is split half duplicates, a quarter dup/non-dup pairs,
    and a quarter non-dup/non-dup pairs.
    """
    self.num_duplicates = training_size // 2
    self.num_nondups_nondups = self.num_dup_nondups = training_size // 4

    BugCoupleModel.__init__(self, lemmatization)

    self.calculate_importance = False

    # Pairwise similarity signals computed on each bug couple.
    couple_features = [
        bug_features.is_same_product(),
        bug_features.is_same_component(),
        bug_features.is_same_platform(),
        bug_features.is_same_version(),
        bug_features.is_same_os(),
        bug_features.is_same_target_milestone(),
        bug_features.is_first_affected_same(),
        bug_features.couple_common_words_comments(),
        bug_features.couple_delta_creation_date(),
        bug_features.couple_common_keywords(),
        bug_features.couple_common_whiteboard_keywords(),
        bug_features.couple_common_words_summary(),
    ]

    text_cleaners = [
        feature_cleanup.responses(),
        feature_cleanup.hex(),
        feature_cleanup.dll(),
        feature_cleanup.fileref(),
        feature_cleanup.synonyms(),
        feature_cleanup.crash(),
    ]
    if cleanup_urls:
        text_cleaners.append(feature_cleanup.url())

    self.extraction_pipeline = Pipeline(
        steps=[
            (
                "bug_extractor",
                bug_features.BugExtractor(
                    couple_features, text_cleaners, rollback=True
                ),
            ),
            (
                "union",
                ColumnTransformer(
                    [
                        ("text", self.text_vectorizer(), "text"),
                        ("couple_data", DictVectorizer(), "couple_data"),
                    ]
                ),
            ),
        ]
    )

    self.clf = XGBClassifier(n_jobs=utils.get_physical_cpu_count())
def __init__(self, cleanup_urls=True):
    """Assemble the text cleanup pipeline, optionally masking URLs as well."""
    cleaners = [
        feature_cleanup.responses(),
        feature_cleanup.hex(),
        feature_cleanup.dll(),
        feature_cleanup.fileref(),
        feature_cleanup.synonyms(),
        feature_cleanup.crash(),
    ]
    if cleanup_urls:
        cleaners.append(feature_cleanup.url())

    self.cleanup_functions = cleaners
def __init__(self, cleanup_urls=True, nltk_tokenizer=False, confidence_threshold=0.8):
    """Store the cleanup pipeline, tokenizer choice, and prediction confidence cutoff."""
    cleaners = [
        feature_cleanup.responses(),
        feature_cleanup.hex(),
        feature_cleanup.dll(),
        feature_cleanup.fileref(),
        feature_cleanup.synonyms(),
        feature_cleanup.crash(),
    ]
    if cleanup_urls:
        cleaners.append(feature_cleanup.url())

    self.cleanup_functions = cleaners
    self.nltk_tokenizer = nltk_tokenizer
    self.confidence_threshold = confidence_threshold
def __init__(self, training_size=14000, lemmatization=False, cleanup_urls=True):
    """Lightweight duplicate-detection model: text plus a single product feature, linear SVC.

    The training set is split half duplicates, a quarter dup/non-dup pairs,
    and a quarter non-dup/non-dup pairs.
    """
    self.num_duplicates = training_size // 2
    self.num_nondups_nondups = self.num_dup_nondups = training_size // 4

    BugCoupleModel.__init__(self, lemmatization)

    self.calculate_importance = False

    # Only one structured signal; the model otherwise relies on text.
    couple_features = [bug_features.is_same_product()]

    text_cleaners = [
        feature_cleanup.responses(),
        feature_cleanup.hex(),
        feature_cleanup.dll(),
        feature_cleanup.fileref(),
        feature_cleanup.synonyms(),
        feature_cleanup.crash(),
    ]
    if cleanup_urls:
        text_cleaners.append(feature_cleanup.url())

    self.extraction_pipeline = Pipeline(
        steps=[
            (
                "bug_extractor",
                bug_features.BugExtractor(couple_features, text_cleaners),
            ),
            (
                "union",
                ColumnTransformer([("text", self.text_vectorizer(), "text")]),
            ),
        ]
    )

    self.clf = LinearSVCWithLabelEncoding(LinearSVC())
def __init__(
    self,
    cleanup_urls=True,
    nltk_tokenizer=False,
    confidence_threshold=0.8,
    end_to_end=False,
):
    """Store cleanup/tokenizer settings and, in end-to-end mode, a trained duplicate model."""
    cleaners = [
        feature_cleanup.responses(),
        feature_cleanup.hex(),
        feature_cleanup.dll(),
        feature_cleanup.fileref(),
        feature_cleanup.synonyms(),
        feature_cleanup.crash(),
    ]
    if cleanup_urls:
        cleaners.append(feature_cleanup.url())

    self.cleanup_functions = cleaners
    self.nltk_tokenizer = nltk_tokenizer
    self.confidence_threshold = confidence_threshold

    # Only load the (potentially large) duplicate model when running end to end.
    if end_to_end:
        self.duplicatemodel = DuplicateModel.load("duplicatemodel")
    else:
        self.duplicatemodel = None
def __init__(self, lemmatization=False):
    """Bug classification model over structured bug features plus title/comment text.

    Uses instance-hardness undersampling and an XGBoost classifier; feature
    extraction rolls bugs back via ``self.rollback`` before extracting.
    """
    BugModel.__init__(self, lemmatization)

    self.calculate_importance = False

    self.sampler = InstanceHardnessThreshold(random_state=0)

    feature_extractors = [
        bug_features.has_str(),
        bug_features.has_regression_range(),
        bug_features.severity(),
        bug_features.keywords(),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.title(),
        bug_features.product(),
        bug_features.component(),
        bug_features.is_mozillian(),
        bug_features.bug_reporter(),
        bug_features.blocked_bugs_number(),
        bug_features.priority(),
        bug_features.has_cve_in_alias(),
        bug_features.comment_count(),
        bug_features.comment_length(),
        bug_features.reporter_experience(),
        bug_features.number_of_bug_dependencies(),
    ]

    cleanup_functions = [
        feature_cleanup.url(),
        feature_cleanup.fileref(),
        feature_cleanup.hex(),
        feature_cleanup.dll(),
        feature_cleanup.synonyms(),
        feature_cleanup.crash(),
    ]

    self.extraction_pipeline = Pipeline(
        [
            (
                "bug_extractor",
                bug_features.BugExtractor(
                    feature_extractors,
                    cleanup_functions,
                    rollback=True,
                    rollback_when=self.rollback,
                ),
            ),
            (
                "union",
                ColumnTransformer(
                    [
                        ("data", DictVectorizer(), "data"),
                        ("title", self.text_vectorizer(min_df=0.0001), "title"),
                        (
                            "comments",
                            self.text_vectorizer(min_df=0.0001),
                            "comments",
                        ),
                    ]
                ),
            ),
        ]
    )

    # Size the thread pool to the machine rather than a hard-coded 16,
    # consistent with the other models in this file.
    self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
    self.clf.set_params(predictor="cpu_predictor")
from gensim.corpora import Dictionary from nltk.corpus import stopwords from nltk.stem.porter import PorterStemmer except ImportError: raise ImportError(OPT_MSG_MISSING) nltk.download("stopwords") REPORTERS_TO_IGNORE = { "*****@*****.**", "*****@*****.**" } cleanup_functions = [ feature_cleanup.responses(), feature_cleanup.hex(), feature_cleanup.dll(), feature_cleanup.fileref(), feature_cleanup.url(), feature_cleanup.synonyms(), feature_cleanup.crash(), ] # A map from bug ID to its duplicate IDs duplicates = defaultdict(set) all_ids = set(bug["id"] for bug in bugzilla.get_bugs() if bug["creator"] not in REPORTERS_TO_IGNORE and "dupeme" not in bug["keywords"]) for bug in bugzilla.get_bugs(): dupes = [entry for entry in bug["duplicates"] if entry in all_ids] if bug["dupe_of"] in all_ids:
def __init__(self, lemmatization=False):
    """Bug classification model over structured bug features plus title/comment text.

    Uses instance-hardness undersampling and an XGBoost classifier; feature
    extraction rolls bugs back via ``self.rollback`` before extracting.
    """
    BugModel.__init__(self, lemmatization)

    self.sampler = InstanceHardnessThreshold(random_state=0)

    feature_extractors = [
        bug_features.has_str(),
        bug_features.has_regression_range(),
        bug_features.severity(),
        bug_features.keywords(),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.title(),
        bug_features.product(),
        bug_features.component(),
        bug_features.is_mozillian(),
        bug_features.bug_reporter(),
        bug_features.blocked_bugs_number(),
        bug_features.priority(),
        bug_features.has_cve_in_alias(),
        bug_features.comment_count(),
        bug_features.comment_length(),
        bug_features.reporter_experience(),
        bug_features.number_of_bug_dependencies(),
    ]

    cleanup_functions = [
        feature_cleanup.url(),
        feature_cleanup.fileref(),
        feature_cleanup.hex(),
        feature_cleanup.dll(),
        feature_cleanup.synonyms(),
        feature_cleanup.crash(),
    ]

    self.extraction_pipeline = Pipeline(
        [
            (
                "bug_extractor",
                bug_features.BugExtractor(
                    feature_extractors,
                    cleanup_functions,
                    rollback=True,
                    rollback_when=self.rollback,
                ),
            ),
            (
                "union",
                ColumnTransformer(
                    [
                        ("data", DictVectorizer(), "data"),
                        ("title", self.text_vectorizer(min_df=0.0001), "title"),
                        (
                            "comments",
                            self.text_vectorizer(min_df=0.0001),
                            "comments",
                        ),
                    ]
                ),
            ),
        ]
    )

    # Size the thread pool to the machine rather than a hard-coded 16,
    # consistent with the other models in this file.
    self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
    self.clf.set_params(predictor="cpu_predictor")