def __init__(self, lemmatization=False):
    """Configure features, text cleanup, the extraction pipeline and the classifier.

    Commit data is requested from the base model because commit-level
    features (added/deleted lines, types) are part of the feature set.
    """
    BugModel.__init__(self, lemmatization, commit_data=True)

    self.cross_validation_enabled = False
    self.sampler = RandomUnderSampler(random_state=0)

    # Structured bug/commit features fed to the DictVectorizer.
    extractors = [
        bug_features.has_str(),
        bug_features.has_regression_range(),
        bug_features.severity(),
        bug_features.keywords({"dev-doc-needed", "dev-doc-complete"}),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.product(),
        bug_features.component(),
        bug_features.commit_added(),
        bug_features.commit_deleted(),
        bug_features.commit_types(),
    ]

    # Text normalization applied to title/comments before vectorization.
    cleanups = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    extractor = bug_features.BugExtractor(
        extractors,
        cleanups,
        rollback=True,
        rollback_when=self.rollback,
        commit_data=True,
    )
    vectorization = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("title", self.text_vectorizer(), "title"),
            ("comments", self.text_vectorizer(), "comments"),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [("bug_extractor", extractor), ("union", vectorization)]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False):
    """Text-only bug-couple model with a linear SVM classifier."""
    BugCoupleModel.__init__(self, lemmatization)

    self.calculate_importance = False

    # Text normalization applied before vectorization.
    cleanups = [
        feature_cleanup.responses(),
        feature_cleanup.hex(),
        feature_cleanup.dll(),
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
        feature_cleanup.crash(),
    ]

    # No structured feature extractors: the model relies on text alone.
    extractor = bug_features.BugExtractor([], cleanups)
    vectorization = ColumnTransformer([("text", self.text_vectorizer(), "text")])
    self.extraction_pipeline = Pipeline(
        [("bug_extractor", extractor), ("union", vectorization)]
    )

    self.clf = LinearSVCWithLabelEncoding(LinearSVC())
def __init__(self, lemmatization=False):
    """Bug model using over-sampling (BorderlineSMOTE) and an XGBoost classifier."""
    BugModel.__init__(self, lemmatization)
    self.sampler = BorderlineSMOTE(random_state=0)
    self.calculate_importance = False

    # Structured bug features fed to the DictVectorizer.
    extractors = [
        bug_features.has_str(),
        bug_features.has_regression_range(),
        bug_features.severity(),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.product(),
        # TODO: We would like to use the component at the time of filing too,
        # but we can't because the rollback script doesn't support changes to
        # components yet.
        # bug_features.component(),
        bug_features.num_words_title(),
        bug_features.num_words_comments(),
        bug_features.keywords(),
    ]

    # Text normalization applied to title/comments before vectorization.
    cleanups = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    extractor = bug_features.BugExtractor(extractors, cleanups, rollback=True)
    vectorization = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("title", self.text_vectorizer(min_df=0.0001), "title"),
            ("comments", self.text_vectorizer(min_df=0.0001), "comments"),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [("bug_extractor", extractor), ("union", vectorization)]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False, historical=False):
    """Multi-label bug model (OneVsRest over XGBoost).

    The keywords already encoding the target labels (KEYWORD_DICT) are
    excluded from the feature set so the model cannot trivially cheat.
    `historical` is currently unused here — TODO confirm it is intentional.
    """
    BugModel.__init__(self, lemmatization)
    self.calculate_importance = False
    # Structured bug features fed to the DictVectorizer.
    feature_extractors = [
        bug_features.has_str(),
        bug_features.severity(),
        # Ignore keywords that would make the ML completely skewed
        # (we are going to use them as 100% rules in the evaluation phase).
        bug_features.keywords(set(KEYWORD_DICT.keys())),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.title(),
        bug_features.blocked_bugs_number(),
        bug_features.ever_affected(),
        bug_features.affected_then_unaffected(),
        bug_features.product(),
        bug_features.component(),
    ]
    # Text normalization applied to title/comments before vectorization.
    cleanup_functions = [
        feature_cleanup.url(),
        feature_cleanup.fileref(),
        feature_cleanup.synonyms(),
    ]
    self.extraction_pipeline = Pipeline([
        (
            "bug_extractor",
            bug_features.BugExtractor(feature_extractors, cleanup_functions),
        ),
        (
            "union",
            ColumnTransformer([
                ("data", DictVectorizer(), "data"),
                ("title", self.text_vectorizer(min_df=0.001), "title"),
                (
                    "first_comment",
                    self.text_vectorizer(min_df=0.001),
                    "first_comment",
                ),
                (
                    "comments",
                    self.text_vectorizer(min_df=0.001),
                    "comments",
                ),
            ]),
        ),
    ])
    # One binary XGBoost classifier per label.
    self.clf = OneVsRestClassifier(xgboost.XGBClassifier(n_jobs=16))
def __init__(self, lemmatization=False):
    """Component model: pipeline config plus the inverse conflation mapping."""
    BugModel.__init__(self, lemmatization)

    self.cross_validation_enabled = False
    self.calculate_importance = False

    # Structured bug features fed to the DictVectorizer.
    extractors = [
        bug_features.has_str(),
        bug_features.severity(),
        bug_features.keywords(),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
    ]

    # Text normalization applied to title/comments before vectorization.
    cleanups = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    extractor = bug_features.BugExtractor(extractors, cleanups, rollback=True)
    vectorization = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("title", self.text_vectorizer(min_df=0.0001), "title"),
            ("comments", self.text_vectorizer(min_df=0.0001), "comments"),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [("bug_extractor", extractor), ("union", vectorization)]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")

    # Invert the conflation mapping so conflated components can be expanded
    # back to their original names.
    self.CONFLATED_COMPONENTS_INVERSE_MAPPING = {
        conflated: original
        for original, conflated in self.CONFLATED_COMPONENTS_MAPPING.items()
    }
def __init__(self, training_size=14000, lemmatization=False, cleanup_urls=True):
    """Duplicate-bug couple model mixing text and couple-level features.

    Half of the training set is duplicate pairs; the remaining half is split
    evenly between nondup/nondup and dup/nondup pairs.
    """
    self.num_duplicates = training_size // 2
    self.num_nondups_nondups = self.num_dup_nondups = training_size // 4

    BugCoupleModel.__init__(self, lemmatization)

    self.calculate_importance = False

    # Pairwise similarity features between the two bugs of a couple.
    extractors = [
        bug_features.is_same_product(),
        bug_features.is_same_component(),
        bug_features.is_same_platform(),
        bug_features.is_same_version(),
        bug_features.is_same_os(),
        bug_features.is_same_target_milestone(),
        bug_features.is_first_affected_same(),
        bug_features.couple_common_words_comments(),
        bug_features.couple_delta_creation_date(),
        bug_features.couple_common_keywords(),
        bug_features.couple_common_whiteboard_keywords(),
        bug_features.couple_common_words_summary(),
    ]

    # Text normalization; URL stripping is optional for this model.
    cleanups = [
        feature_cleanup.responses(),
        feature_cleanup.hex(),
        feature_cleanup.dll(),
        feature_cleanup.fileref(),
        feature_cleanup.synonyms(),
        feature_cleanup.crash(),
    ]
    if cleanup_urls:
        cleanups.append(feature_cleanup.url())

    extractor = bug_features.BugExtractor(extractors, cleanups, rollback=True)
    vectorization = ColumnTransformer(
        [
            ("text", self.text_vectorizer(), "text"),
            ("couple_data", DictVectorizer(), "couple_data"),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [("bug_extractor", extractor), ("union", vectorization)]
    )

    self.clf = XGBClassifier(n_jobs=utils.get_physical_cpu_count())
def __init__(self, lemmatization=False):
    """Bug model using InstanceHardnessThreshold under-sampling and XGBoost.

    Uses the older `bug_features.cleanup_*` function-reference API for text
    cleanup (newer models use `feature_cleanup.*()` instances).
    """
    Model.__init__(self, lemmatization)
    self.sampler = InstanceHardnessThreshold(random_state=0)
    # Structured bug features fed to the DictVectorizer, including
    # reporter-related signals (is_mozillian, reporter_experience, ...).
    feature_extractors = [
        bug_features.has_str(),
        bug_features.has_regression_range(),
        bug_features.severity(),
        bug_features.keywords(),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.title(),
        bug_features.product(),
        bug_features.component(),
        bug_features.is_mozillian(),
        bug_features.bug_reporter(),
        bug_features.blocked_bugs_number(),
        bug_features.priority(),
        bug_features.has_cve_in_alias(),
        bug_features.comment_count(),
        bug_features.comment_length(),
        bug_features.reporter_experience(),
        bug_features.number_of_bug_dependencies()
    ]
    # Text normalization applied to title/comments before vectorization.
    cleanup_functions = [
        bug_features.cleanup_url,
        bug_features.cleanup_fileref,
        bug_features.cleanup_hex,
        bug_features.cleanup_dll,
        bug_features.cleanup_synonyms,
        bug_features.cleanup_crash,
    ]
    # Roll bugs back to the state selected by self.rollback before extraction.
    self.extraction_pipeline = Pipeline([
        ('bug_extractor', bug_features.BugExtractor(
            feature_extractors,
            cleanup_functions,
            rollback=True,
            rollback_when=self.rollback)),
        ('union', ColumnTransformer([
            ('data', DictVectorizer(), 'data'),
            ('title', self.text_vectorizer(min_df=0.0001), 'title'),
            ('comments', self.text_vectorizer(min_df=0.0001), 'comments'),
        ])),
    ])
    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor='cpu_predictor')
def __init__(self, lemmatization=False):
    """Bug model with random under-sampling and an XGBoost classifier."""
    BugModel.__init__(self, lemmatization)

    self.sampler = RandomUnderSampler(random_state=0)
    self.calculate_importance = False

    # Structured bug features fed to the DictVectorizer.
    extractors = [
        bug_features.has_str(),
        bug_features.has_regression_range(),
        bug_features.severity(),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.product(),
        bug_features.component(),
        bug_features.num_words_title(),
        bug_features.num_words_comments(),
        bug_features.keywords(),
    ]

    # Text normalization applied to title/comments before vectorization.
    cleanups = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    extractor = bug_features.BugExtractor(extractors, cleanups, rollback=True)
    vectorization = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("title", self.text_vectorizer(), "title"),
            ("comments", self.text_vectorizer(), "comments"),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [("bug_extractor", extractor), ("union", vectorization)]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False):
    """Bug model keyed on the 'qawanted' keyword, with random under-sampling."""
    Model.__init__(self, lemmatization)

    self.sampler = RandomUnderSampler(random_state=0)

    # Structured bug features fed to the DictVectorizer; 'qawanted' is the
    # only keyword considered.
    extractors = [
        bug_features.has_str(),
        bug_features.has_regression_range(),
        bug_features.severity(),
        bug_features.keywords({"qawanted"}),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.title(),
    ]

    # Text normalization (older function-reference cleanup API).
    cleanups = [
        bug_features.cleanup_fileref,
        bug_features.cleanup_url,
        bug_features.cleanup_synonyms,
    ]

    extractor = bug_features.BugExtractor(
        extractors,
        cleanups,
        rollback=True,
        rollback_when=self.rollback,
    )
    vectorization = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("title", self.text_vectorizer(), "title"),
            ("comments", self.text_vectorizer(), "comments"),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [("bug_extractor", extractor), ("union", vectorization)]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False):
    """Regression-detection bug model with BorderlineSMOTE over-sampling.

    Keywords that directly label regressions are excluded from the features
    and applied later as hard rules.
    """
    Model.__init__(self, lemmatization)
    self.sampler = BorderlineSMOTE(random_state=0)
    # Structured bug features fed to the DictVectorizer.
    feature_extractors = [
        bug_features.has_str(),
        bug_features.severity(),
        # Ignore keywords that would make the ML completely skewed
        # (we are going to use them as 100% rules in the evaluation phase).
        bug_features.keywords(
            {'regression', 'talos-regression', 'feature'}),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.title(),
        bug_features.blocked_bugs_number(),
        bug_features.ever_affected(),
        bug_features.affected_then_unaffected(),
        bug_features.product(),
        bug_features.component(),
    ]
    # Text normalization (older function-reference cleanup API).
    cleanup_functions = [
        bug_features.cleanup_url,
        bug_features.cleanup_fileref,
        bug_features.cleanup_synonyms,
    ]
    self.extraction_pipeline = Pipeline([
        ('bug_extractor',
         bug_features.BugExtractor(feature_extractors, cleanup_functions)),
        ('union', ColumnTransformer([
            ('data', DictVectorizer(), 'data'),
            ('title', self.text_vectorizer(min_df=0.001), 'title'),
            ('first_comment',
             self.text_vectorizer(min_df=0.001), 'first_comment'),
            ('comments', self.text_vectorizer(min_df=0.001), 'comments'),
        ])),
    ])
    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor='cpu_predictor')
def __init__(self, lemmatization=False):
    """Dev-doc-needed bug model; uses commit data and keyword features.

    Only the 'dev-doc-needed'/'dev-doc-complete' keywords are considered.
    """
    Model.__init__(self, lemmatization)
    # Structured bug/commit features fed to the DictVectorizer.
    feature_extractors = [
        bug_features.has_str(),
        bug_features.has_regression_range(),
        bug_features.severity(),
        bug_features.keywords({'dev-doc-needed', 'dev-doc-complete'}),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.title(),
        bug_features.product(),
        bug_features.component(),
        bug_features.commit_added(),
        bug_features.commit_deleted(),
        bug_features.commit_types(),
    ]
    # Text normalization (older function-reference cleanup API).
    cleanup_functions = [
        bug_features.cleanup_fileref,
        bug_features.cleanup_url,
        bug_features.cleanup_synonyms,
    ]
    # Roll bugs back to the state selected by self.rollback, and include
    # commit data during extraction.
    self.extraction_pipeline = Pipeline([
        ('bug_extractor', bug_features.BugExtractor(
            feature_extractors,
            cleanup_functions,
            rollback=True,
            rollback_when=self.rollback,
            commit_data=True)),
        ('union', ColumnTransformer([
            ('data', DictVectorizer(), 'data'),
            ('title', self.text_vectorizer(stop_words='english'), 'title'),
            ('comments',
             self.text_vectorizer(stop_words='english'), 'comments'),
        ])),
    ])
    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor='cpu_predictor')
def __init__(self, lemmatization=False):
    """Bug model trained on title and first comment only.

    The 'data' and full-'comments' transformers are disabled until bug
    snapshotting is supported (issue #5).
    """
    Model.__init__(self, lemmatization)
    self.undersampling_enabled = False
    self.cross_validation_enabled = False
    # Structured bug features (currently unused by the ColumnTransformer —
    # see the TODOs below).
    feature_extractors = [
        bug_features.has_str(),
        bug_features.severity(),
        bug_features.keywords(),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.title(),
    ]
    # Text normalization (older function-reference cleanup API).
    cleanup_functions = [
        bug_features.cleanup_fileref,
        bug_features.cleanup_url,
        bug_features.cleanup_synonyms,
    ]
    self.extraction_pipeline = Pipeline([
        ('bug_extractor',
         bug_features.BugExtractor(feature_extractors, cleanup_functions)),
        (
            'union',
            ColumnTransformer([
                # TODO: Re-enable when we'll support bug snapshotting (#5).
                # ('data', DictVectorizer(), 'data'),
                ('title',
                 self.text_vectorizer(stop_words='english'), 'title'),
                # TODO: Re-enable when we'll support bug snapshotting (#5).
                # ('comments', self.text_vectorizer(stop_words='english'), 'comments'),
                ('first_comment',
                 self.text_vectorizer(stop_words='english'), 'first_comment'),
            ])),
    ])
    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor='cpu_predictor')
def __init__(self, lemmatization=False):
    """Bug model keyed on the 'stepswanted' keyword, with random under-sampling."""
    BugModel.__init__(self, lemmatization)

    self.sampler = RandomUnderSampler(random_state=0)

    # Structured bug features; 'stepswanted' is the only keyword considered.
    extractors = [
        bug_features.has_regression_range(),
        bug_features.severity(),
        bug_features.keywords({"stepswanted"}),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
    ]

    # Text normalization applied to title/comments before vectorization.
    cleanups = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    extractor = bug_features.BugExtractor(extractors, cleanups)
    vectorization = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("title", self.text_vectorizer(), "title"),
            ("comments", self.text_vectorizer(), "comments"),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [("bug_extractor", extractor), ("union", vectorization)]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False):
    """Bug model with rollback-based extraction; CV and importance disabled."""
    Model.__init__(self, lemmatization)
    self.cross_validation_enabled = False
    self.calculate_importance = False
    # Structured bug features fed to the DictVectorizer.
    feature_extractors = [
        bug_features.has_str(),
        bug_features.severity(),
        bug_features.keywords(),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.title(),
    ]
    # Text normalization (older function-reference cleanup API).
    cleanup_functions = [
        bug_features.cleanup_fileref,
        bug_features.cleanup_url,
        bug_features.cleanup_synonyms,
    ]
    self.extraction_pipeline = Pipeline([
        ('bug_extractor', bug_features.BugExtractor(
            feature_extractors, cleanup_functions, rollback=True)),
        ('union', ColumnTransformer([
            ('data', DictVectorizer(), 'data'),
            ('title', self.text_vectorizer(min_df=0.0001), 'title'),
            ('comments', self.text_vectorizer(min_df=0.0001), 'comments'),
        ])),
    ])
    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor='cpu_predictor')
def __init__(self, lemmatization=False):
    """Bug model with random under-sampling and default text vectorizers."""
    Model.__init__(self, lemmatization)
    self.sampler = RandomUnderSampler(random_state=0)
    # Structured bug features fed to the DictVectorizer.
    feature_extractors = [
        bug_features.has_str(),
        bug_features.has_regression_range(),
        bug_features.severity(),
        bug_features.keywords(),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.title(),
    ]
    # Text normalization (older function-reference cleanup API).
    cleanup_functions = [
        bug_features.cleanup_fileref,
        bug_features.cleanup_url,
        bug_features.cleanup_synonyms,
    ]
    self.extraction_pipeline = Pipeline([
        ('bug_extractor',
         bug_features.BugExtractor(feature_extractors, cleanup_functions)),
        ('union', ColumnTransformer([
            ('data', DictVectorizer(), 'data'),
            ('title', self.text_vectorizer(), 'title'),
            ('comments', self.text_vectorizer(), 'comments'),
        ])),
    ])
    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor='cpu_predictor')
def __init__(self, training_size=14000, lemmatization=False, cleanup_urls=True):
    """Duplicate-bug couple model driven almost entirely by text.

    Half of the training set is duplicate pairs; the remaining half is split
    evenly between nondup/nondup and dup/nondup pairs.
    """
    self.num_duplicates = training_size // 2
    self.num_nondups_nondups = self.num_dup_nondups = training_size // 4

    BugCoupleModel.__init__(self, lemmatization)

    self.calculate_importance = False

    # Only one structured feature; the rest is the vectorized text column.
    extractors = [bug_features.is_same_product()]

    # Text normalization; URL stripping is optional for this model.
    cleanups = [
        feature_cleanup.responses(),
        feature_cleanup.hex(),
        feature_cleanup.dll(),
        feature_cleanup.fileref(),
        feature_cleanup.synonyms(),
        feature_cleanup.crash(),
    ]
    if cleanup_urls:
        cleanups.append(feature_cleanup.url())

    extractor = bug_features.BugExtractor(extractors, cleanups)
    vectorization = ColumnTransformer([("text", self.text_vectorizer(), "text")])
    self.extraction_pipeline = Pipeline(
        [("bug_extractor", extractor), ("union", vectorization)]
    )

    self.clf = LinearSVCWithLabelEncoding(LinearSVC())
def __init__(self, lemmatization=False, historical=False, rca_subcategories_enabled=False):
    """Root-cause-analysis multi-label model (OneVsRest over XGBoost).

    `rca_subcategories_enabled` widens the label set from main RCA
    categories to categories plus subcategories.
    `historical` is currently unused here — TODO confirm it is intentional.
    """
    BugModel.__init__(self, lemmatization)
    self.calculate_importance = False
    self.rca_subcategories_enabled = rca_subcategories_enabled
    # should we consider only the main category or all sub categories
    self.RCA_TYPES = (
        RCA_SUBCATEGORIES + RCA_CATEGORIES
        if rca_subcategories_enabled
        else RCA_CATEGORIES
    )
    # Deduplicated, deterministic label ordering.
    self.RCA_LIST = sorted(set(self.RCA_TYPES))
    # Structured bug features fed to the DictVectorizer.
    feature_extractors = [
        bug_features.has_str(),
        bug_features.severity(),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        # Ignore whiteboards that would make the ML completely skewed
        # bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.blocked_bugs_number(),
        bug_features.ever_affected(),
        bug_features.affected_then_unaffected(),
        bug_features.product(),
        bug_features.component(),
    ]
    # Text normalization applied to title/comments before vectorization.
    cleanup_functions = [
        feature_cleanup.url(),
        feature_cleanup.fileref(),
        feature_cleanup.synonyms(),
    ]
    self.extraction_pipeline = Pipeline([
        (
            "bug_extractor",
            bug_features.BugExtractor(feature_extractors, cleanup_functions),
        ),
        (
            "union",
            ColumnTransformer([
                ("data", DictVectorizer(), "data"),
                ("title", self.text_vectorizer(min_df=0.001), "title"),
                (
                    "first_comment",
                    self.text_vectorizer(min_df=0.001),
                    "first_comment",
                ),
                (
                    "comments",
                    self.text_vectorizer(min_df=0.001),
                    "comments",
                ),
            ]),
        ),
    ])
    # One binary XGBoost classifier per RCA label.
    self.clf = OneVsRestClassifier(xgboost.XGBClassifier(n_jobs=16))
def __init__(self, lemmatization=False):
    """Bug model with reporter-related features and hardness-based sampling."""
    BugModel.__init__(self, lemmatization)
    self.calculate_importance = False
    self.sampler = InstanceHardnessThreshold(random_state=0)
    # Structured bug features fed to the DictVectorizer, including
    # reporter-related signals (is_mozillian, reporter_experience, ...).
    feature_extractors = [
        bug_features.has_str(),
        bug_features.has_regression_range(),
        bug_features.severity(),
        bug_features.keywords(),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.title(),
        bug_features.product(),
        bug_features.component(),
        bug_features.is_mozillian(),
        bug_features.bug_reporter(),
        bug_features.blocked_bugs_number(),
        bug_features.priority(),
        bug_features.has_cve_in_alias(),
        bug_features.comment_count(),
        bug_features.comment_length(),
        bug_features.reporter_experience(),
        bug_features.number_of_bug_dependencies(),
    ]
    # Text normalization applied to title/comments before vectorization.
    cleanup_functions = [
        feature_cleanup.url(),
        feature_cleanup.fileref(),
        feature_cleanup.hex(),
        feature_cleanup.dll(),
        feature_cleanup.synonyms(),
        feature_cleanup.crash(),
    ]
    # Roll bugs back to the state selected by self.rollback before extraction.
    self.extraction_pipeline = Pipeline([
        (
            "bug_extractor",
            bug_features.BugExtractor(
                feature_extractors,
                cleanup_functions,
                rollback=True,
                rollback_when=self.rollback,
            ),
        ),
        (
            "union",
            ColumnTransformer([
                ("data", DictVectorizer(), "data"),
                ("title", self.text_vectorizer(min_df=0.0001), "title"),
                (
                    "comments",
                    self.text_vectorizer(min_df=0.0001),
                    "comments",
                ),
            ]),
        ),
    ])
    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False, historical=False):
    """Regression-detection bug model with BorderlineSMOTE over-sampling.

    Keywords that directly label regressions are excluded from the features
    and applied later as hard rules. With `historical=True`, a feature
    tracking past severity enhancement is added.
    """
    BugModel.__init__(self, lemmatization)
    self.sampler = BorderlineSMOTE(random_state=0)
    # Structured bug features fed to the DictVectorizer.
    feature_extractors = [
        bug_features.has_str(),
        bug_features.severity(),
        # Ignore keywords that would make the ML completely skewed
        # (we are going to use them as 100% rules in the evaluation phase).
        bug_features.keywords(
            {"regression", "talos-regression", "feature"}),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.title(),
        bug_features.blocked_bugs_number(),
        bug_features.ever_affected(),
        bug_features.affected_then_unaffected(),
        bug_features.product(),
        bug_features.component(),
    ]
    if historical:
        feature_extractors.append(bug_features.had_severity_enhancement())
    # Text normalization applied to title/comments before vectorization.
    cleanup_functions = [
        feature_cleanup.url(),
        feature_cleanup.fileref(),
        feature_cleanup.synonyms(),
    ]
    self.extraction_pipeline = Pipeline([
        (
            "bug_extractor",
            bug_features.BugExtractor(feature_extractors, cleanup_functions),
        ),
        (
            "union",
            ColumnTransformer([
                ("data", DictVectorizer(), "data"),
                ("title", self.text_vectorizer(min_df=0.001), "title"),
                (
                    "first_comment",
                    self.text_vectorizer(min_df=0.001),
                    "first_comment",
                ),
                (
                    "comments",
                    self.text_vectorizer(min_df=0.001),
                    "comments",
                ),
            ]),
        ),
    ])
    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, *args, **kwargs):
    """Neural-network component model: a soft-voting ensemble of three
    ComponentNNClassifier instances over text sequences, TF-IDF features and
    categorical embeddings.
    """
    super().__init__(*args, **kwargs)

    # Sequence lengths / vocabulary sizes / embedding sizes for the title
    # ("short desc") and first comment ("long desc") inputs.
    self.short_desc_maxlen = 20
    self.short_desc_vocab_size = 25000
    self.short_desc_emb_sz = 300
    self.long_desc_maxlen = 100
    self.long_desc_vocab_size = 25000
    self.long_desc_emb_sz = 300
    # NN training is expensive; skip cross-validation.
    self.cross_validation_enabled = False

    # Three hand-tuned hyper-parameter sets, one per ensemble member.
    # Do not alter these values casually — they were tuned together.
    self.params = [
        {
            "short_desc_emb_dropout_rate": 0.2,
            "short_desc_encoded_gru_units": 256,
            "short_desc_encoded_gru_dropout": 0.45,
            "short_desc_encoded_recurrent_dropout": 0.5,
            "long_desc_emb_dropout_rate": 0.25,
            "long_desc_encoded_gru_units": 256,
            "long_desc_encoded_dropout": 0.5,
            "long_desc_encoded_recurrent_dropout": 0.55,
            "rep_platform_emb_input_dim": 14,
            "rep_platform_emb_output_dim": 25,
            "rep_platform_emb_spatial_dropout_rate": 0.1,
            "rep_platform_emb_dropout_rate": 0.45,
            "op_sys_emb_input_dim": 48,
            "op_sys_emb_output_dim": 50,
            "op_sys_emb_spatial_dropout_rate": 0.1,
            "op_sys_emb_dropout_rate": 0.45,
            "reporter_emb_input_dim": 46544,
            "reporter_emb_output_dim": 100,
            "reporter_emb_spatial_dropout_rate": 0.15,
            "reporter_emb_dropout_rate": 0.5,
            "tfidf_word_dense_units": 600,
            "tfidf_word_dropout_rate": 0.5,
            "tfidf_char_inp_dense_unit": 500,
            "tfidf_char_inp_dropout_rate": 0.5,
            "x_dense_unit": 2000,
            "x_dropout_rate": 0.6,
        },
        {
            "short_desc_emb_dropout_rate": 0.2,
            "short_desc_encoded_gru_units": 250,
            "short_desc_encoded_gru_dropout": 0.45,
            "short_desc_encoded_recurrent_dropout": 0.45,
            "long_desc_emb_dropout_rate": 0.25,
            "long_desc_encoded_gru_units": 250,
            "long_desc_encoded_dropout": 0.45,
            "long_desc_encoded_recurrent_dropout": 0.45,
            "rep_platform_emb_input_dim": 14,
            "rep_platform_emb_output_dim": 30,
            "rep_platform_emb_spatial_dropout_rate": 0.1,
            "rep_platform_emb_dropout_rate": 0.4,
            "op_sys_emb_input_dim": 48,
            "op_sys_emb_output_dim": 55,
            "op_sys_emb_spatial_dropout_rate": 0.1,
            "op_sys_emb_dropout_rate": 0.4,
            "reporter_emb_input_dim": 46544,
            "reporter_emb_output_dim": 110,
            "reporter_emb_spatial_dropout_rate": 0.15,
            "reporter_emb_dropout_rate": 0.45,
            "tfidf_word_dense_units": 610,
            "tfidf_word_dropout_rate": 0.45,
            "tfidf_char_inp_dense_unit": 510,
            "tfidf_char_inp_dropout_rate": 0.5,
            "x_dense_unit": 1970,
            "x_dropout_rate": 0.5,
        },
        {
            "short_desc_emb_dropout_rate": 0.2,
            "short_desc_encoded_gru_units": 266,
            "short_desc_encoded_gru_dropout": 0.45,
            "short_desc_encoded_recurrent_dropout": 0.45,
            "long_desc_emb_dropout_rate": 0.25,
            "long_desc_encoded_gru_units": 266,
            "long_desc_encoded_dropout": 0.45,
            "long_desc_encoded_recurrent_dropout": 0.55,
            "rep_platform_emb_input_dim": 14,
            "rep_platform_emb_output_dim": 35,
            "rep_platform_emb_spatial_dropout_rate": 0.1,
            "rep_platform_emb_dropout_rate": 0.45,
            "op_sys_emb_input_dim": 48,
            "op_sys_emb_output_dim": 60,
            "op_sys_emb_spatial_dropout_rate": 0.1,
            "op_sys_emb_dropout_rate": 0.45,
            "reporter_emb_input_dim": 46544,
            "reporter_emb_output_dim": 120,
            "reporter_emb_spatial_dropout_rate": 0.15,
            "reporter_emb_dropout_rate": 0.45,
            "tfidf_word_dense_units": 620,
            "tfidf_word_dropout_rate": 0.5,
            "tfidf_char_inp_dense_unit": 520,
            "tfidf_char_inp_dropout_rate": 0.45,
            "x_dense_unit": 1950,
            "x_dropout_rate": 0.5,
        },
    ]

    # Categorical inputs for the embedding branches.
    feature_extractors = [
        bug_features.bug_reporter(),
        bug_features.platform(),
        bug_features.op_sys(),
    ]

    cleanup_functions = []

    self.extraction_pipeline = Pipeline([
        (
            "bug_extractor",
            bug_features.BugExtractor(feature_extractors, cleanup_functions),
        ),
        (
            "union",
            StructuredColumnTransformer([
                (
                    "platform",
                    make_pipeline(DictExtractor("platform"), OrdinalEncoder()),
                    "data",
                ),
                (
                    "op_sys",
                    make_pipeline(DictExtractor("op_sys"), OrdinalEncoder()),
                    "data",
                ),
                (
                    # MissingOrdinalEncoder: reporters unseen at training
                    # time need handling at predict time.
                    "bug_reporter",
                    make_pipeline(
                        DictExtractor("bug_reporter"),
                        MissingOrdinalEncoder(),
                    ),
                    "data",
                ),
                (
                    "title_sequence",
                    KerasTextToSequences(self.short_desc_maxlen, self.short_desc_vocab_size),
                    "title",
                ),
                (
                    "first_comment_sequence",
                    KerasTextToSequences(self.long_desc_maxlen, self.long_desc_vocab_size),
                    "first_comment",
                ),
                (
                    "title_char_tfidf",
                    TfidfVectorizer(
                        strip_accents="unicode",
                        analyzer="char",
                        stop_words="english",
                        ngram_range=(2, 4),
                        max_features=25000,
                        sublinear_tf=True,
                    ),
                    "title",
                ),
                (
                    "title_word_tfidf",
                    TfidfVectorizer(
                        strip_accents="unicode",
                        min_df=0.0001,
                        max_df=0.1,
                        analyzer="word",
                        token_pattern=r"\w{1,}",
                        stop_words="english",
                        ngram_range=(2, 4),
                        max_features=30000,
                        sublinear_tf=True,
                    ),
                    "title",
                ),
            ]),
        ),
    ])

    # Shared constructor arguments for every ensemble member; only "params"
    # varies between members.
    kwargs = {
        "short_desc_maxlen": self.short_desc_maxlen,
        "short_desc_vocab_size": self.short_desc_vocab_size,
        "short_desc_emb_sz": self.short_desc_emb_sz,
        "long_desc_maxlen": self.long_desc_maxlen,
        "long_desc_vocab_size": self.long_desc_vocab_size,
        "long_desc_emb_sz": self.long_desc_emb_sz,
    }

    estimators = []
    for i, params in enumerate(self.params):
        kwargs["params"] = params
        estimator = ComponentNNClassifier(**kwargs)
        estimators.append(("model_{}".format(i), estimator))

    # Equal-weight soft voting across the three networks.
    self.clf = VotingClassifier(estimators=estimators, voting="soft", weights=[1, 1, 1])
def __init__(self, *args, **kwargs):
    """Neural-network component model: a single ComponentNNClassifier over
    text sequences, TF-IDF features and categorical embeddings.
    """
    super().__init__(*args, **kwargs)

    # Sequence lengths / vocabulary sizes / embedding sizes for the title
    # ("short desc") and first comment ("long desc") inputs.
    self.short_desc_maxlen = 20
    self.short_desc_vocab_size = 25000
    self.short_desc_emb_sz = 300
    self.long_desc_maxlen = 100
    self.long_desc_vocab_size = 25000
    self.long_desc_emb_sz = 300
    # NN training is expensive; skip cross-validation.
    self.cross_validation_enabled = False

    # Categorical inputs for the embedding branches.
    feature_extractors = [
        bug_features.bug_reporter(),
        bug_features.platform(),
        bug_features.op_sys()
    ]

    cleanup_functions = []

    self.extraction_pipeline = Pipeline([
        ('bug_extractor',
         bug_features.BugExtractor(feature_extractors, cleanup_functions)),
        ('union', StructuredColumnTransformer([
            ('platform',
             make_pipeline(DictExtractor('platform'), OrdinalEncoder()),
             'data'),
            ('op_sys',
             make_pipeline(DictExtractor('op_sys'), OrdinalEncoder()),
             'data'),
            # NOTE(review): plain OrdinalEncoder here, while the ensemble
            # variant uses MissingOrdinalEncoder for bug_reporter —
            # confirm unseen reporters at predict time are handled.
            ('bug_reporter',
             make_pipeline(DictExtractor('bug_reporter'), OrdinalEncoder()),
             'data'),
            ('title_sequence',
             KerasTextToSequences(self.short_desc_maxlen, self.short_desc_vocab_size),
             'title'),
            ('first_comment_sequence',
             KerasTextToSequences(self.long_desc_maxlen, self.long_desc_vocab_size),
             'first_comment'),
            ('title_char_tfidf',
             TfidfVectorizer(strip_accents='unicode',
                             analyzer='char',
                             stop_words='english',
                             ngram_range=(2, 4),
                             max_features=25000,
                             sublinear_tf=True),
             'title'),
            ('title_word_tfidf',
             TfidfVectorizer(strip_accents='unicode',
                             min_df=0.0001,
                             max_df=0.1,
                             analyzer='word',
                             token_pattern=r'\w{1,}',
                             stop_words='english',
                             ngram_range=(2, 4),
                             max_features=30000,
                             sublinear_tf=True),
             'title')
        ])),
    ])

    # Constructor arguments for the network.
    kwargs = {
        'short_desc_maxlen': self.short_desc_maxlen,
        'short_desc_vocab_size': self.short_desc_vocab_size,
        'short_desc_emb_sz': self.short_desc_emb_sz,
        'long_desc_maxlen': self.long_desc_maxlen,
        'long_desc_vocab_size': self.long_desc_vocab_size,
        'long_desc_emb_sz': self.long_desc_emb_sz
    }

    self.clf = ComponentNNClassifier(**kwargs)
def __init__(self, *args, **kwargs):
    """Neural-network component model: a soft-voting ensemble of three
    ComponentNNClassifier instances over text sequences, TF-IDF features and
    categorical embeddings.
    """
    super().__init__(*args, **kwargs)

    # Sequence lengths / vocabulary sizes / embedding sizes for the title
    # ("short desc") and first comment ("long desc") inputs.
    self.short_desc_maxlen = 20
    self.short_desc_vocab_size = 25000
    self.short_desc_emb_sz = 300
    self.long_desc_maxlen = 100
    self.long_desc_vocab_size = 25000
    self.long_desc_emb_sz = 300
    # NN training is expensive; skip cross-validation.
    self.cross_validation_enabled = False

    # Three hand-tuned hyper-parameter sets, one per ensemble member.
    # Do not alter these values casually — they were tuned together.
    self.params = [{
        'short_desc_emb_dropout_rate': 0.2,
        'short_desc_encoded_gru_units': 256,
        'short_desc_encoded_gru_dropout': 0.45,
        'short_desc_encoded_recurrent_dropout': 0.5,
        'long_desc_emb_dropout_rate': 0.25,
        'long_desc_encoded_gru_units': 256,
        'long_desc_encoded_dropout': 0.5,
        'long_desc_encoded_recurrent_dropout': 0.55,
        'rep_platform_emb_input_dim': 14,
        'rep_platform_emb_output_dim': 25,
        'rep_platform_emb_spatial_dropout_rate': 0.1,
        'rep_platform_emb_dropout_rate': 0.45,
        'op_sys_emb_input_dim': 48,
        'op_sys_emb_output_dim': 50,
        'op_sys_emb_spatial_dropout_rate': 0.1,
        'op_sys_emb_dropout_rate': 0.45,
        'reporter_emb_input_dim': 46544,
        'reporter_emb_output_dim': 100,
        'reporter_emb_spatial_dropout_rate': 0.15,
        'reporter_emb_dropout_rate': 0.5,
        'tfidf_word_dense_units': 600,
        'tfidf_word_dropout_rate': 0.5,
        'tfidf_char_inp_dense_unit': 500,
        'tfidf_char_inp_dropout_rate': 0.5,
        'x_dense_unit': 2000,
        'x_dropout_rate': 0.6,
    }, {
        'short_desc_emb_dropout_rate': 0.2,
        'short_desc_encoded_gru_units': 250,
        'short_desc_encoded_gru_dropout': 0.45,
        'short_desc_encoded_recurrent_dropout': 0.45,
        'long_desc_emb_dropout_rate': 0.25,
        'long_desc_encoded_gru_units': 250,
        'long_desc_encoded_dropout': 0.45,
        'long_desc_encoded_recurrent_dropout': 0.45,
        'rep_platform_emb_input_dim': 14,
        'rep_platform_emb_output_dim': 30,
        'rep_platform_emb_spatial_dropout_rate': 0.1,
        'rep_platform_emb_dropout_rate': 0.4,
        'op_sys_emb_input_dim': 48,
        'op_sys_emb_output_dim': 55,
        'op_sys_emb_spatial_dropout_rate': 0.1,
        'op_sys_emb_dropout_rate': 0.4,
        'reporter_emb_input_dim': 46544,
        'reporter_emb_output_dim': 110,
        'reporter_emb_spatial_dropout_rate': 0.15,
        'reporter_emb_dropout_rate': 0.45,
        'tfidf_word_dense_units': 610,
        'tfidf_word_dropout_rate': 0.45,
        'tfidf_char_inp_dense_unit': 510,
        'tfidf_char_inp_dropout_rate': 0.5,
        'x_dense_unit': 1970,
        'x_dropout_rate': 0.5,
    }, {
        'short_desc_emb_dropout_rate': 0.2,
        'short_desc_encoded_gru_units': 266,
        'short_desc_encoded_gru_dropout': 0.45,
        'short_desc_encoded_recurrent_dropout': 0.45,
        'long_desc_emb_dropout_rate': 0.25,
        'long_desc_encoded_gru_units': 266,
        'long_desc_encoded_dropout': 0.45,
        'long_desc_encoded_recurrent_dropout': 0.55,
        'rep_platform_emb_input_dim': 14,
        'rep_platform_emb_output_dim': 35,
        'rep_platform_emb_spatial_dropout_rate': 0.1,
        'rep_platform_emb_dropout_rate': 0.45,
        'op_sys_emb_input_dim': 48,
        'op_sys_emb_output_dim': 60,
        'op_sys_emb_spatial_dropout_rate': 0.1,
        'op_sys_emb_dropout_rate': 0.45,
        'reporter_emb_input_dim': 46544,
        'reporter_emb_output_dim': 120,
        'reporter_emb_spatial_dropout_rate': 0.15,
        'reporter_emb_dropout_rate': 0.45,
        'tfidf_word_dense_units': 620,
        'tfidf_word_dropout_rate': 0.5,
        'tfidf_char_inp_dense_unit': 520,
        'tfidf_char_inp_dropout_rate': 0.45,
        'x_dense_unit': 1950,
        'x_dropout_rate': 0.5,
    }]

    # Categorical inputs for the embedding branches.
    feature_extractors = [
        bug_features.bug_reporter(),
        bug_features.platform(),
        bug_features.op_sys()
    ]

    cleanup_functions = []

    self.extraction_pipeline = Pipeline([
        ('bug_extractor',
         bug_features.BugExtractor(feature_extractors, cleanup_functions)),
        ('union', StructuredColumnTransformer([
            ('platform',
             make_pipeline(DictExtractor('platform'), OrdinalEncoder()),
             'data'),
            ('op_sys',
             make_pipeline(DictExtractor('op_sys'), OrdinalEncoder()),
             'data'),
            # MissingOrdinalEncoder: reporters unseen at training time need
            # handling at predict time.
            ('bug_reporter',
             make_pipeline(DictExtractor('bug_reporter'), MissingOrdinalEncoder()),
             'data'),
            ('title_sequence',
             KerasTextToSequences(self.short_desc_maxlen, self.short_desc_vocab_size),
             'title'),
            ('first_comment_sequence',
             KerasTextToSequences(self.long_desc_maxlen, self.long_desc_vocab_size),
             'first_comment'),
            ('title_char_tfidf',
             TfidfVectorizer(strip_accents='unicode',
                             analyzer='char',
                             stop_words='english',
                             ngram_range=(2, 4),
                             max_features=25000,
                             sublinear_tf=True),
             'title'),
            ('title_word_tfidf',
             TfidfVectorizer(strip_accents='unicode',
                             min_df=0.0001,
                             max_df=0.1,
                             analyzer='word',
                             token_pattern=r'\w{1,}',
                             stop_words='english',
                             ngram_range=(2, 4),
                             max_features=30000,
                             sublinear_tf=True),
             'title')
        ])),
    ])

    # Shared constructor arguments for every ensemble member; only 'params'
    # varies between members.
    kwargs = {
        'short_desc_maxlen': self.short_desc_maxlen,
        'short_desc_vocab_size': self.short_desc_vocab_size,
        'short_desc_emb_sz': self.short_desc_emb_sz,
        'long_desc_maxlen': self.long_desc_maxlen,
        'long_desc_vocab_size': self.long_desc_vocab_size,
        'long_desc_emb_sz': self.long_desc_emb_sz
    }

    estimators = []
    for i, params in enumerate(self.params):
        kwargs['params'] = params
        estimator = ComponentNNClassifier(**kwargs)
        estimators.append(('model_{}'.format(i), estimator))

    # Equal-weight soft voting across the three networks.
    self.clf = VotingClassifier(estimators=estimators, voting='soft', weights=[1, 1, 1])