Esempio n. 1
0
    def __init__(self, lemmatization=False):
        """Configure bug feature extraction, text cleanup and the XGBoost classifier."""
        BugModel.__init__(self, lemmatization, commit_data=True)

        self.cross_validation_enabled = False

        # Rebalance classes by randomly dropping majority-class samples.
        self.sampler = RandomUnderSampler(random_state=0)

        extractors = [
            bug_features.has_str(),
            bug_features.has_regression_range(),
            bug_features.severity(),
            bug_features.keywords({"dev-doc-needed", "dev-doc-complete"}),
            bug_features.is_coverity_issue(),
            bug_features.has_crash_signature(),
            bug_features.has_url(),
            bug_features.has_w3c_url(),
            bug_features.has_github_url(),
            bug_features.whiteboard(),
            bug_features.patches(),
            bug_features.landings(),
            bug_features.product(),
            bug_features.component(),
            bug_features.commit_added(),
            bug_features.commit_deleted(),
            bug_features.commit_types(),
        ]

        cleaners = [
            feature_cleanup.fileref(),
            feature_cleanup.url(),
            feature_cleanup.synonyms(),
        ]

        extractor = bug_features.BugExtractor(
            extractors,
            cleaners,
            rollback=True,
            rollback_when=self.rollback,
            commit_data=True,
        )
        # Vectorize the dict features and the two free-text columns separately.
        union = ColumnTransformer(
            [
                ("data", DictVectorizer(), "data"),
                ("title", self.text_vectorizer(), "title"),
                ("comments", self.text_vectorizer(), "comments"),
            ]
        )
        self.extraction_pipeline = Pipeline(
            [("bug_extractor", extractor), ("union", union)]
        )

        self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
        self.clf.set_params(predictor="cpu_predictor")
Esempio n. 2
0
    def __init__(self, lemmatization=False):
        """Configure a text-only pipeline feeding a linear SVC over bug couples."""
        BugCoupleModel.__init__(self, lemmatization)

        self.calculate_importance = False

        # No structured features here; only the raw text column is used.
        cleaners = [
            feature_cleanup.responses(),
            feature_cleanup.hex(),
            feature_cleanup.dll(),
            feature_cleanup.fileref(),
            feature_cleanup.url(),
            feature_cleanup.synonyms(),
            feature_cleanup.crash(),
        ]

        steps = [
            ("bug_extractor", bug_features.BugExtractor([], cleaners)),
            (
                "union",
                ColumnTransformer([("text", self.text_vectorizer(), "text")]),
            ),
        ]
        self.extraction_pipeline = Pipeline(steps)

        self.clf = LinearSVCWithLabelEncoding(LinearSVC())
Esempio n. 3
0
    def __init__(self, lemmatization=False):
        """Configure rollback-based feature extraction and the XGBoost classifier."""
        BugModel.__init__(self, lemmatization)

        # Oversample the minority class near the decision boundary.
        self.sampler = BorderlineSMOTE(random_state=0)
        self.calculate_importance = False

        extractors = [
            bug_features.has_str(),
            bug_features.has_regression_range(),
            bug_features.severity(),
            bug_features.is_coverity_issue(),
            bug_features.has_crash_signature(),
            bug_features.has_url(),
            bug_features.has_w3c_url(),
            bug_features.has_github_url(),
            bug_features.whiteboard(),
            bug_features.product(),
            # TODO: We would like to use the component at the time of filing too,
            # but we can't because the rollback script doesn't support changes to
            # components yet.
            # bug_features.component(),
            bug_features.num_words_title(),
            bug_features.num_words_comments(),
            bug_features.keywords(),
        ]

        cleaners = [
            feature_cleanup.fileref(),
            feature_cleanup.url(),
            feature_cleanup.synonyms(),
        ]

        extractor = bug_features.BugExtractor(extractors, cleaners, rollback=True)
        union = ColumnTransformer(
            [
                ("data", DictVectorizer(), "data"),
                ("title", self.text_vectorizer(min_df=0.0001), "title"),
                ("comments", self.text_vectorizer(min_df=0.0001), "comments"),
            ]
        )
        self.extraction_pipeline = Pipeline(
            [("bug_extractor", extractor), ("union", union)]
        )

        self.clf = xgboost.XGBClassifier(n_jobs=16)
        self.clf.set_params(predictor="cpu_predictor")
Esempio n. 4
0
    def __init__(self, lemmatization=False, historical=False):
        """Configure feature extraction and a one-vs-rest XGBoost classifier."""
        BugModel.__init__(self, lemmatization)

        self.calculate_importance = False

        extractors = [
            bug_features.has_str(),
            bug_features.severity(),
            # Ignore keywords that would make the ML completely skewed
            # (we are going to use them as 100% rules in the evaluation phase).
            bug_features.keywords(set(KEYWORD_DICT.keys())),
            bug_features.is_coverity_issue(),
            bug_features.has_crash_signature(),
            bug_features.has_url(),
            bug_features.has_w3c_url(),
            bug_features.has_github_url(),
            bug_features.whiteboard(),
            bug_features.patches(),
            bug_features.landings(),
            bug_features.title(),
            bug_features.blocked_bugs_number(),
            bug_features.ever_affected(),
            bug_features.affected_then_unaffected(),
            bug_features.product(),
            bug_features.component(),
        ]

        cleaners = [
            feature_cleanup.url(),
            feature_cleanup.fileref(),
            feature_cleanup.synonyms(),
        ]

        extractor = bug_features.BugExtractor(extractors, cleaners)
        # Title, first comment and full comments get their own vectorizers.
        union = ColumnTransformer(
            [
                ("data", DictVectorizer(), "data"),
                ("title", self.text_vectorizer(min_df=0.001), "title"),
                ("first_comment", self.text_vectorizer(min_df=0.001), "first_comment"),
                ("comments", self.text_vectorizer(min_df=0.001), "comments"),
            ]
        )
        self.extraction_pipeline = Pipeline(
            [("bug_extractor", extractor), ("union", union)]
        )

        self.clf = OneVsRestClassifier(xgboost.XGBClassifier(n_jobs=16))
Esempio n. 5
0
    def __init__(self, lemmatization=False):
        """Configure rollback-based extraction, the classifier and the inverse component mapping."""
        BugModel.__init__(self, lemmatization)

        self.cross_validation_enabled = False
        self.calculate_importance = False

        extractors = [
            bug_features.has_str(),
            bug_features.severity(),
            bug_features.keywords(),
            bug_features.is_coverity_issue(),
            bug_features.has_crash_signature(),
            bug_features.has_url(),
            bug_features.has_w3c_url(),
            bug_features.has_github_url(),
            bug_features.whiteboard(),
            bug_features.patches(),
            bug_features.landings(),
        ]

        cleaners = [
            feature_cleanup.fileref(),
            feature_cleanup.url(),
            feature_cleanup.synonyms(),
        ]

        extractor = bug_features.BugExtractor(extractors, cleaners, rollback=True)
        union = ColumnTransformer(
            [
                ("data", DictVectorizer(), "data"),
                ("title", self.text_vectorizer(min_df=0.0001), "title"),
                ("comments", self.text_vectorizer(min_df=0.0001), "comments"),
            ]
        )
        self.extraction_pipeline = Pipeline(
            [("bug_extractor", extractor), ("union", union)]
        )

        self.clf = xgboost.XGBClassifier(n_jobs=16)
        self.clf.set_params(predictor="cpu_predictor")

        # Allow looking up the conflated component from its mapped value.
        self.CONFLATED_COMPONENTS_INVERSE_MAPPING = {
            v: k for k, v in self.CONFLATED_COMPONENTS_MAPPING.items()
        }
Esempio n. 6
0
    def __init__(self, training_size=14000, lemmatization=False, cleanup_urls=True):
        """Configure couple features, cleanup steps and the XGBoost classifier.

        The training set is split half duplicates, and a quarter each of
        dup/non-dup and non-dup/non-dup pairs.
        """
        self.num_duplicates = training_size // 2
        self.num_nondups_nondups = self.num_dup_nondups = training_size // 4

        BugCoupleModel.__init__(self, lemmatization)

        self.calculate_importance = False

        extractors = [
            bug_features.is_same_product(),
            bug_features.is_same_component(),
            bug_features.is_same_platform(),
            bug_features.is_same_version(),
            bug_features.is_same_os(),
            bug_features.is_same_target_milestone(),
            bug_features.is_first_affected_same(),
            bug_features.couple_common_words_comments(),
            bug_features.couple_delta_creation_date(),
            bug_features.couple_common_keywords(),
            bug_features.couple_common_whiteboard_keywords(),
            bug_features.couple_common_words_summary(),
        ]

        cleaners = [
            feature_cleanup.responses(),
            feature_cleanup.hex(),
            feature_cleanup.dll(),
            feature_cleanup.fileref(),
            feature_cleanup.synonyms(),
            feature_cleanup.crash(),
        ]

        # URL stripping is optional so callers can keep URLs in the text.
        if cleanup_urls:
            cleaners.append(feature_cleanup.url())

        extractor = bug_features.BugExtractor(extractors, cleaners, rollback=True)
        union = ColumnTransformer(
            [
                ("text", self.text_vectorizer(), "text"),
                ("couple_data", DictVectorizer(), "couple_data"),
            ]
        )
        self.extraction_pipeline = Pipeline(
            [("bug_extractor", extractor), ("union", union)]
        )

        self.clf = XGBClassifier(n_jobs=utils.get_physical_cpu_count())
Esempio n. 7
0
    def __init__(self, lemmatization=False):
        """Configure rollback-based extraction with reporter features and XGBoost."""
        Model.__init__(self, lemmatization)

        # Undersample by removing the hardest-to-classify majority samples.
        self.sampler = InstanceHardnessThreshold(random_state=0)

        extractors = [
            bug_features.has_str(),
            bug_features.has_regression_range(),
            bug_features.severity(),
            bug_features.keywords(),
            bug_features.is_coverity_issue(),
            bug_features.has_crash_signature(),
            bug_features.has_url(),
            bug_features.has_w3c_url(),
            bug_features.has_github_url(),
            bug_features.whiteboard(),
            bug_features.patches(),
            bug_features.landings(),
            bug_features.title(),
            bug_features.product(),
            bug_features.component(),
            bug_features.is_mozillian(),
            bug_features.bug_reporter(),
            bug_features.blocked_bugs_number(),
            bug_features.priority(),
            bug_features.has_cve_in_alias(),
            bug_features.comment_count(),
            bug_features.comment_length(),
            bug_features.reporter_experience(),
            bug_features.number_of_bug_dependencies(),
        ]

        cleaners = [
            bug_features.cleanup_url,
            bug_features.cleanup_fileref,
            bug_features.cleanup_hex,
            bug_features.cleanup_dll,
            bug_features.cleanup_synonyms,
            bug_features.cleanup_crash,
        ]

        extractor = bug_features.BugExtractor(
            extractors, cleaners, rollback=True, rollback_when=self.rollback
        )
        union = ColumnTransformer(
            [
                ('data', DictVectorizer(), 'data'),
                ('title', self.text_vectorizer(min_df=0.0001), 'title'),
                ('comments', self.text_vectorizer(min_df=0.0001), 'comments'),
            ]
        )
        self.extraction_pipeline = Pipeline(
            [('bug_extractor', extractor), ('union', union)]
        )

        self.clf = xgboost.XGBClassifier(n_jobs=16)
        self.clf.set_params(predictor='cpu_predictor')
Esempio n. 8
0
    def __init__(self, lemmatization=False):
        """Configure rollback-based extraction, undersampling and XGBoost."""
        BugModel.__init__(self, lemmatization)

        self.sampler = RandomUnderSampler(random_state=0)
        self.calculate_importance = False

        extractors = [
            bug_features.has_str(),
            bug_features.has_regression_range(),
            bug_features.severity(),
            bug_features.is_coverity_issue(),
            bug_features.has_crash_signature(),
            bug_features.has_url(),
            bug_features.has_w3c_url(),
            bug_features.has_github_url(),
            bug_features.whiteboard(),
            bug_features.patches(),
            bug_features.landings(),
            bug_features.product(),
            bug_features.component(),
            bug_features.num_words_title(),
            bug_features.num_words_comments(),
            bug_features.keywords(),
        ]

        cleaners = [
            feature_cleanup.fileref(),
            feature_cleanup.url(),
            feature_cleanup.synonyms(),
        ]

        extractor = bug_features.BugExtractor(extractors, cleaners, rollback=True)
        union = ColumnTransformer(
            [
                ("data", DictVectorizer(), "data"),
                ("title", self.text_vectorizer(), "title"),
                ("comments", self.text_vectorizer(), "comments"),
            ]
        )
        self.extraction_pipeline = Pipeline(
            [("bug_extractor", extractor), ("union", union)]
        )

        self.clf = xgboost.XGBClassifier(n_jobs=16)
        self.clf.set_params(predictor="cpu_predictor")
Esempio n. 9
0
    def __init__(self, lemmatization=False):
        """Configure qawanted-keyword feature extraction and XGBoost."""
        Model.__init__(self, lemmatization)

        self.sampler = RandomUnderSampler(random_state=0)

        extractors = [
            bug_features.has_str(),
            bug_features.has_regression_range(),
            bug_features.severity(),
            bug_features.keywords({"qawanted"}),
            bug_features.is_coverity_issue(),
            bug_features.has_crash_signature(),
            bug_features.has_url(),
            bug_features.has_w3c_url(),
            bug_features.has_github_url(),
            bug_features.whiteboard(),
            bug_features.patches(),
            bug_features.landings(),
            bug_features.title(),
        ]

        cleaners = [
            bug_features.cleanup_fileref,
            bug_features.cleanup_url,
            bug_features.cleanup_synonyms,
        ]

        extractor = bug_features.BugExtractor(
            extractors,
            cleaners,
            rollback=True,
            rollback_when=self.rollback,
        )
        union = ColumnTransformer(
            [
                ("data", DictVectorizer(), "data"),
                ("title", self.text_vectorizer(), "title"),
                ("comments", self.text_vectorizer(), "comments"),
            ]
        )
        self.extraction_pipeline = Pipeline(
            [("bug_extractor", extractor), ("union", union)]
        )

        self.clf = xgboost.XGBClassifier(n_jobs=16)
        self.clf.set_params(predictor="cpu_predictor")
Esempio n. 10
0
    def __init__(self, lemmatization=False):
        """Configure regression-related feature extraction and XGBoost."""
        Model.__init__(self, lemmatization)

        self.sampler = BorderlineSMOTE(random_state=0)

        extractors = [
            bug_features.has_str(),
            bug_features.severity(),
            # Ignore keywords that would make the ML completely skewed
            # (we are going to use them as 100% rules in the evaluation phase).
            bug_features.keywords({'regression', 'talos-regression', 'feature'}),
            bug_features.is_coverity_issue(),
            bug_features.has_crash_signature(),
            bug_features.has_url(),
            bug_features.has_w3c_url(),
            bug_features.has_github_url(),
            bug_features.whiteboard(),
            bug_features.patches(),
            bug_features.landings(),
            bug_features.title(),
            bug_features.blocked_bugs_number(),
            bug_features.ever_affected(),
            bug_features.affected_then_unaffected(),
            bug_features.product(),
            bug_features.component(),
        ]

        cleaners = [
            bug_features.cleanup_url,
            bug_features.cleanup_fileref,
            bug_features.cleanup_synonyms,
        ]

        extractor = bug_features.BugExtractor(extractors, cleaners)
        union = ColumnTransformer(
            [
                ('data', DictVectorizer(), 'data'),
                ('title', self.text_vectorizer(min_df=0.001), 'title'),
                ('first_comment', self.text_vectorizer(min_df=0.001), 'first_comment'),
                ('comments', self.text_vectorizer(min_df=0.001), 'comments'),
            ]
        )
        self.extraction_pipeline = Pipeline(
            [('bug_extractor', extractor), ('union', union)]
        )

        self.clf = xgboost.XGBClassifier(n_jobs=16)
        self.clf.set_params(predictor='cpu_predictor')
Esempio n. 11
0
    def __init__(self, lemmatization=False):
        """Configure dev-doc keyword extraction with commit data and XGBoost."""
        Model.__init__(self, lemmatization)

        extractors = [
            bug_features.has_str(),
            bug_features.has_regression_range(),
            bug_features.severity(),
            bug_features.keywords({'dev-doc-needed', 'dev-doc-complete'}),
            bug_features.is_coverity_issue(),
            bug_features.has_crash_signature(),
            bug_features.has_url(),
            bug_features.has_w3c_url(),
            bug_features.has_github_url(),
            bug_features.whiteboard(),
            bug_features.patches(),
            bug_features.landings(),
            bug_features.title(),
            bug_features.product(),
            bug_features.component(),
            bug_features.commit_added(),
            bug_features.commit_deleted(),
            bug_features.commit_types(),
        ]

        cleaners = [
            bug_features.cleanup_fileref,
            bug_features.cleanup_url,
            bug_features.cleanup_synonyms,
        ]

        extractor = bug_features.BugExtractor(
            extractors,
            cleaners,
            rollback=True,
            rollback_when=self.rollback,
            commit_data=True,
        )
        union = ColumnTransformer(
            [
                ('data', DictVectorizer(), 'data'),
                ('title', self.text_vectorizer(stop_words='english'), 'title'),
                ('comments', self.text_vectorizer(stop_words='english'), 'comments'),
            ]
        )
        self.extraction_pipeline = Pipeline(
            [('bug_extractor', extractor), ('union', union)]
        )

        self.clf = xgboost.XGBClassifier(n_jobs=16)
        self.clf.set_params(predictor='cpu_predictor')
Esempio n. 12
0
    def __init__(self, lemmatization=False):
        """Configure text-based feature extraction (title + first comment) and XGBoost."""
        Model.__init__(self, lemmatization)

        self.undersampling_enabled = False
        self.cross_validation_enabled = False

        extractors = [
            bug_features.has_str(),
            bug_features.severity(),
            bug_features.keywords(),
            bug_features.is_coverity_issue(),
            bug_features.has_crash_signature(),
            bug_features.has_url(),
            bug_features.has_w3c_url(),
            bug_features.has_github_url(),
            bug_features.whiteboard(),
            bug_features.patches(),
            bug_features.landings(),
            bug_features.title(),
        ]

        cleaners = [
            bug_features.cleanup_fileref,
            bug_features.cleanup_url,
            bug_features.cleanup_synonyms,
        ]

        extractor = bug_features.BugExtractor(extractors, cleaners)
        union = ColumnTransformer(
            [
                # TODO: Re-enable when we'll support bug snapshotting (#5).
                # ('data', DictVectorizer(), 'data'),
                ('title', self.text_vectorizer(stop_words='english'), 'title'),
                # TODO: Re-enable when we'll support bug snapshotting (#5).
                # ('comments', self.text_vectorizer(stop_words='english'), 'comments'),
                ('first_comment', self.text_vectorizer(stop_words='english'), 'first_comment'),
            ]
        )
        self.extraction_pipeline = Pipeline(
            [('bug_extractor', extractor), ('union', union)]
        )

        self.clf = xgboost.XGBClassifier(n_jobs=16)
        self.clf.set_params(predictor='cpu_predictor')
Esempio n. 13
0
    def __init__(self, lemmatization=False):
        """Configure stepswanted-keyword feature extraction and XGBoost."""
        BugModel.__init__(self, lemmatization)

        self.sampler = RandomUnderSampler(random_state=0)

        extractors = [
            bug_features.has_regression_range(),
            bug_features.severity(),
            bug_features.keywords({"stepswanted"}),
            bug_features.is_coverity_issue(),
            bug_features.has_crash_signature(),
            bug_features.has_url(),
            bug_features.has_w3c_url(),
            bug_features.has_github_url(),
            bug_features.whiteboard(),
            bug_features.patches(),
            bug_features.landings(),
        ]

        cleaners = [
            feature_cleanup.fileref(),
            feature_cleanup.url(),
            feature_cleanup.synonyms(),
        ]

        extractor = bug_features.BugExtractor(extractors, cleaners)
        union = ColumnTransformer(
            [
                ("data", DictVectorizer(), "data"),
                ("title", self.text_vectorizer(), "title"),
                ("comments", self.text_vectorizer(), "comments"),
            ]
        )
        self.extraction_pipeline = Pipeline(
            [("bug_extractor", extractor), ("union", union)]
        )

        self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
        self.clf.set_params(predictor="cpu_predictor")
Esempio n. 14
0
    def __init__(self, lemmatization=False):
        """Configure rollback-based feature extraction and XGBoost."""
        Model.__init__(self, lemmatization)

        self.cross_validation_enabled = False
        self.calculate_importance = False

        extractors = [
            bug_features.has_str(),
            bug_features.severity(),
            bug_features.keywords(),
            bug_features.is_coverity_issue(),
            bug_features.has_crash_signature(),
            bug_features.has_url(),
            bug_features.has_w3c_url(),
            bug_features.has_github_url(),
            bug_features.whiteboard(),
            bug_features.patches(),
            bug_features.landings(),
            bug_features.title(),
        ]

        cleaners = [
            bug_features.cleanup_fileref,
            bug_features.cleanup_url,
            bug_features.cleanup_synonyms,
        ]

        extractor = bug_features.BugExtractor(extractors, cleaners, rollback=True)
        union = ColumnTransformer(
            [
                ('data', DictVectorizer(), 'data'),
                ('title', self.text_vectorizer(min_df=0.0001), 'title'),
                ('comments', self.text_vectorizer(min_df=0.0001), 'comments'),
            ]
        )
        self.extraction_pipeline = Pipeline(
            [('bug_extractor', extractor), ('union', union)]
        )

        self.clf = xgboost.XGBClassifier(n_jobs=16)
        self.clf.set_params(predictor='cpu_predictor')
Esempio n. 15
0
    def __init__(self, lemmatization=False):
        """Configure bug feature extraction, undersampling and XGBoost."""
        Model.__init__(self, lemmatization)

        self.sampler = RandomUnderSampler(random_state=0)

        extractors = [
            bug_features.has_str(),
            bug_features.has_regression_range(),
            bug_features.severity(),
            bug_features.keywords(),
            bug_features.is_coverity_issue(),
            bug_features.has_crash_signature(),
            bug_features.has_url(),
            bug_features.has_w3c_url(),
            bug_features.has_github_url(),
            bug_features.whiteboard(),
            bug_features.patches(),
            bug_features.landings(),
            bug_features.title(),
        ]

        cleaners = [
            bug_features.cleanup_fileref,
            bug_features.cleanup_url,
            bug_features.cleanup_synonyms,
        ]

        extractor = bug_features.BugExtractor(extractors, cleaners)
        union = ColumnTransformer(
            [
                ('data', DictVectorizer(), 'data'),
                ('title', self.text_vectorizer(), 'title'),
                ('comments', self.text_vectorizer(), 'comments'),
            ]
        )
        self.extraction_pipeline = Pipeline(
            [('bug_extractor', extractor), ('union', union)]
        )

        self.clf = xgboost.XGBClassifier(n_jobs=16)
        self.clf.set_params(predictor='cpu_predictor')
Esempio n. 16
0
    def __init__(self, training_size=14000, lemmatization=False, cleanup_urls=True):
        """Configure a minimal (same-product + text) pipeline for bug couples.

        The training set is split half duplicates, and a quarter each of
        dup/non-dup and non-dup/non-dup pairs.
        """
        self.num_duplicates = training_size // 2
        self.num_nondups_nondups = self.num_dup_nondups = training_size // 4

        BugCoupleModel.__init__(self, lemmatization)

        self.calculate_importance = False

        extractors = [bug_features.is_same_product()]

        cleaners = [
            feature_cleanup.responses(),
            feature_cleanup.hex(),
            feature_cleanup.dll(),
            feature_cleanup.fileref(),
            feature_cleanup.synonyms(),
            feature_cleanup.crash(),
        ]

        # URL stripping is optional so callers can keep URLs in the text.
        if cleanup_urls:
            cleaners.append(feature_cleanup.url())

        extractor = bug_features.BugExtractor(extractors, cleaners)
        union = ColumnTransformer([("text", self.text_vectorizer(), "text")])
        self.extraction_pipeline = Pipeline(
            [("bug_extractor", extractor), ("union", union)]
        )

        self.clf = LinearSVCWithLabelEncoding(LinearSVC())
Esempio n. 17
0
    def __init__(self, lemmatization=False, historical=False, rca_subcategories_enabled=False):
        """Configure RCA category list, feature extraction and a one-vs-rest classifier."""
        BugModel.__init__(self, lemmatization)

        self.calculate_importance = False
        self.rca_subcategories_enabled = rca_subcategories_enabled

        # Should we consider only the main category or all sub categories?
        if rca_subcategories_enabled:
            self.RCA_TYPES = RCA_SUBCATEGORIES + RCA_CATEGORIES
        else:
            self.RCA_TYPES = RCA_CATEGORIES

        self.RCA_LIST = sorted(set(self.RCA_TYPES))

        extractors = [
            bug_features.has_str(),
            bug_features.severity(),
            bug_features.is_coverity_issue(),
            bug_features.has_crash_signature(),
            bug_features.has_url(),
            bug_features.has_w3c_url(),
            bug_features.has_github_url(),
            # Ignore whiteboards that would make the ML completely skewed
            # bug_features.whiteboard(),
            bug_features.patches(),
            bug_features.landings(),
            bug_features.blocked_bugs_number(),
            bug_features.ever_affected(),
            bug_features.affected_then_unaffected(),
            bug_features.product(),
            bug_features.component(),
        ]

        cleaners = [
            feature_cleanup.url(),
            feature_cleanup.fileref(),
            feature_cleanup.synonyms(),
        ]

        extractor = bug_features.BugExtractor(extractors, cleaners)
        union = ColumnTransformer(
            [
                ("data", DictVectorizer(), "data"),
                ("title", self.text_vectorizer(min_df=0.001), "title"),
                ("first_comment", self.text_vectorizer(min_df=0.001), "first_comment"),
                ("comments", self.text_vectorizer(min_df=0.001), "comments"),
            ]
        )
        self.extraction_pipeline = Pipeline(
            [("bug_extractor", extractor), ("union", union)]
        )

        self.clf = OneVsRestClassifier(xgboost.XGBClassifier(n_jobs=16))
Esempio n. 18
0
    def __init__(self, lemmatization=False):
        """Configure rollback-based extraction with reporter features and XGBoost."""
        BugModel.__init__(self, lemmatization)

        self.calculate_importance = False

        # Undersample by removing the hardest-to-classify majority samples.
        self.sampler = InstanceHardnessThreshold(random_state=0)

        extractors = [
            bug_features.has_str(),
            bug_features.has_regression_range(),
            bug_features.severity(),
            bug_features.keywords(),
            bug_features.is_coverity_issue(),
            bug_features.has_crash_signature(),
            bug_features.has_url(),
            bug_features.has_w3c_url(),
            bug_features.has_github_url(),
            bug_features.whiteboard(),
            bug_features.patches(),
            bug_features.landings(),
            bug_features.title(),
            bug_features.product(),
            bug_features.component(),
            bug_features.is_mozillian(),
            bug_features.bug_reporter(),
            bug_features.blocked_bugs_number(),
            bug_features.priority(),
            bug_features.has_cve_in_alias(),
            bug_features.comment_count(),
            bug_features.comment_length(),
            bug_features.reporter_experience(),
            bug_features.number_of_bug_dependencies(),
        ]

        cleaners = [
            feature_cleanup.url(),
            feature_cleanup.fileref(),
            feature_cleanup.hex(),
            feature_cleanup.dll(),
            feature_cleanup.synonyms(),
            feature_cleanup.crash(),
        ]

        extractor = bug_features.BugExtractor(
            extractors,
            cleaners,
            rollback=True,
            rollback_when=self.rollback,
        )
        union = ColumnTransformer(
            [
                ("data", DictVectorizer(), "data"),
                ("title", self.text_vectorizer(min_df=0.0001), "title"),
                ("comments", self.text_vectorizer(min_df=0.0001), "comments"),
            ]
        )
        self.extraction_pipeline = Pipeline(
            [("bug_extractor", extractor), ("union", union)]
        )

        self.clf = xgboost.XGBClassifier(n_jobs=16)
        self.clf.set_params(predictor="cpu_predictor")
# --- Example 19 (scraped-snippet separator) ---
    def __init__(self, lemmatization=False, historical=False):
        """Build the feature-extraction pipeline and classifier.

        Args:
            lemmatization: if True, lemmatize text before vectorizing
                (forwarded to BugModel).
            historical: if True, also extract a feature based on the bug's
                past severity/enhancement changes.
        """
        BugModel.__init__(self, lemmatization)

        # Oversample the minority class near the decision boundary.
        self.sampler = BorderlineSMOTE(random_state=0)

        feature_extractors = [
            bug_features.has_str(),
            bug_features.severity(),
            # Ignore keywords that would make the ML completely skewed
            # (we are going to use them as 100% rules in the evaluation phase).
            bug_features.keywords(
                {"regression", "talos-regression", "feature"}),
            bug_features.is_coverity_issue(),
            bug_features.has_crash_signature(),
            bug_features.has_url(),
            bug_features.has_w3c_url(),
            bug_features.has_github_url(),
            bug_features.whiteboard(),
            bug_features.patches(),
            bug_features.landings(),
            bug_features.title(),
            bug_features.blocked_bugs_number(),
            bug_features.ever_affected(),
            bug_features.affected_then_unaffected(),
            bug_features.product(),
            bug_features.component(),
        ]

        if historical:
            feature_extractors.append(bug_features.had_severity_enhancement())

        # Text normalization applied before vectorization.
        cleanup_functions = [
            feature_cleanup.url(),
            feature_cleanup.fileref(),
            feature_cleanup.synonyms(),
        ]

        self.extraction_pipeline = Pipeline([
            (
                "bug_extractor",
                bug_features.BugExtractor(feature_extractors,
                                          cleanup_functions),
            ),
            (
                "union",
                ColumnTransformer([
                    ("data", DictVectorizer(), "data"),
                    ("title", self.text_vectorizer(min_df=0.001), "title"),
                    (
                        "first_comment",
                        self.text_vectorizer(min_df=0.001),
                        "first_comment",
                    ),
                    (
                        "comments",
                        self.text_vectorizer(min_df=0.001),
                        "comments",
                    ),
                ]),
            ),
        ])

        # Use one job per physical core instead of a hard-coded thread count,
        # consistent with the other models in this file.
        self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
        self.clf.set_params(predictor="cpu_predictor")
# --- Example 20 (scraped-snippet separator) ---
    def __init__(self, *args, **kwargs):
        """Build an ensemble of three neural-network component classifiers.

        Three ComponentNNClassifier instances are trained with slightly
        different hyperparameters and combined via soft voting.
        """
        super().__init__(*args, **kwargs)

        # Sequence length / vocabulary / embedding sizes for the bug title
        # ("short desc") and the first comment ("long desc").
        self.short_desc_maxlen = 20
        self.short_desc_vocab_size = 25000
        self.short_desc_emb_sz = 300
        self.long_desc_maxlen = 100
        self.long_desc_vocab_size = 25000
        self.long_desc_emb_sz = 300
        # NN training is slow; skip cross-validation.
        self.cross_validation_enabled = False

        # One hyperparameter dict per ensemble member; the three differ only
        # in GRU sizes, dense-layer widths and dropout rates.
        # NOTE(review): the *_emb_input_dim values (14, 48, 46544) presumably
        # match the cardinality of platform / op_sys / reporter in the
        # training data — confirm before retraining on a different dataset.
        self.params = [
            {
                "short_desc_emb_dropout_rate": 0.2,
                "short_desc_encoded_gru_units": 256,
                "short_desc_encoded_gru_dropout": 0.45,
                "short_desc_encoded_recurrent_dropout": 0.5,
                "long_desc_emb_dropout_rate": 0.25,
                "long_desc_encoded_gru_units": 256,
                "long_desc_encoded_dropout": 0.5,
                "long_desc_encoded_recurrent_dropout": 0.55,
                "rep_platform_emb_input_dim": 14,
                "rep_platform_emb_output_dim": 25,
                "rep_platform_emb_spatial_dropout_rate": 0.1,
                "rep_platform_emb_dropout_rate": 0.45,
                "op_sys_emb_input_dim": 48,
                "op_sys_emb_output_dim": 50,
                "op_sys_emb_spatial_dropout_rate": 0.1,
                "op_sys_emb_dropout_rate": 0.45,
                "reporter_emb_input_dim": 46544,
                "reporter_emb_output_dim": 100,
                "reporter_emb_spatial_dropout_rate": 0.15,
                "reporter_emb_dropout_rate": 0.5,
                "tfidf_word_dense_units": 600,
                "tfidf_word_dropout_rate": 0.5,
                "tfidf_char_inp_dense_unit": 500,
                "tfidf_char_inp_dropout_rate": 0.5,
                "x_dense_unit": 2000,
                "x_dropout_rate": 0.6,
            },
            {
                "short_desc_emb_dropout_rate": 0.2,
                "short_desc_encoded_gru_units": 250,
                "short_desc_encoded_gru_dropout": 0.45,
                "short_desc_encoded_recurrent_dropout": 0.45,
                "long_desc_emb_dropout_rate": 0.25,
                "long_desc_encoded_gru_units": 250,
                "long_desc_encoded_dropout": 0.45,
                "long_desc_encoded_recurrent_dropout": 0.45,
                "rep_platform_emb_input_dim": 14,
                "rep_platform_emb_output_dim": 30,
                "rep_platform_emb_spatial_dropout_rate": 0.1,
                "rep_platform_emb_dropout_rate": 0.4,
                "op_sys_emb_input_dim": 48,
                "op_sys_emb_output_dim": 55,
                "op_sys_emb_spatial_dropout_rate": 0.1,
                "op_sys_emb_dropout_rate": 0.4,
                "reporter_emb_input_dim": 46544,
                "reporter_emb_output_dim": 110,
                "reporter_emb_spatial_dropout_rate": 0.15,
                "reporter_emb_dropout_rate": 0.45,
                "tfidf_word_dense_units": 610,
                "tfidf_word_dropout_rate": 0.45,
                "tfidf_char_inp_dense_unit": 510,
                "tfidf_char_inp_dropout_rate": 0.5,
                "x_dense_unit": 1970,
                "x_dropout_rate": 0.5,
            },
            {
                "short_desc_emb_dropout_rate": 0.2,
                "short_desc_encoded_gru_units": 266,
                "short_desc_encoded_gru_dropout": 0.45,
                "short_desc_encoded_recurrent_dropout": 0.45,
                "long_desc_emb_dropout_rate": 0.25,
                "long_desc_encoded_gru_units": 266,
                "long_desc_encoded_dropout": 0.45,
                "long_desc_encoded_recurrent_dropout": 0.55,
                "rep_platform_emb_input_dim": 14,
                "rep_platform_emb_output_dim": 35,
                "rep_platform_emb_spatial_dropout_rate": 0.1,
                "rep_platform_emb_dropout_rate": 0.45,
                "op_sys_emb_input_dim": 48,
                "op_sys_emb_output_dim": 60,
                "op_sys_emb_spatial_dropout_rate": 0.1,
                "op_sys_emb_dropout_rate": 0.45,
                "reporter_emb_input_dim": 46544,
                "reporter_emb_output_dim": 120,
                "reporter_emb_spatial_dropout_rate": 0.15,
                "reporter_emb_dropout_rate": 0.45,
                "tfidf_word_dense_units": 620,
                "tfidf_word_dropout_rate": 0.5,
                "tfidf_char_inp_dense_unit": 520,
                "tfidf_char_inp_dropout_rate": 0.45,
                "x_dense_unit": 1950,
                "x_dropout_rate": 0.5,
            },
        ]

        # Categorical metadata features fed to the embeddings.
        feature_extractors = [
            bug_features.bug_reporter(),
            bug_features.platform(),
            bug_features.op_sys(),
        ]

        # No text cleanup: raw title/comment text goes to the vectorizers.
        cleanup_functions = []

        self.extraction_pipeline = Pipeline([
            (
                "bug_extractor",
                bug_features.BugExtractor(feature_extractors,
                                          cleanup_functions),
            ),
            (
                "union",
                # StructuredColumnTransformer keeps each branch's output
                # separately addressable for the multi-input network.
                StructuredColumnTransformer([
                    (
                        "platform",
                        make_pipeline(DictExtractor("platform"),
                                      OrdinalEncoder()),
                        "data",
                    ),
                    (
                        "op_sys",
                        make_pipeline(DictExtractor("op_sys"),
                                      OrdinalEncoder()),
                        "data",
                    ),
                    (
                        "bug_reporter",
                        # Reporter may be unseen at predict time, hence the
                        # missing-value-aware encoder.
                        make_pipeline(
                            DictExtractor("bug_reporter"),
                            MissingOrdinalEncoder(),
                        ),
                        "data",
                    ),
                    (
                        "title_sequence",
                        KerasTextToSequences(self.short_desc_maxlen,
                                             self.short_desc_vocab_size),
                        "title",
                    ),
                    (
                        "first_comment_sequence",
                        KerasTextToSequences(self.long_desc_maxlen,
                                             self.long_desc_vocab_size),
                        "first_comment",
                    ),
                    (
                        "title_char_tfidf",
                        TfidfVectorizer(
                            strip_accents="unicode",
                            analyzer="char",
                            stop_words="english",
                            ngram_range=(2, 4),
                            max_features=25000,
                            sublinear_tf=True,
                        ),
                        "title",
                    ),
                    (
                        "title_word_tfidf",
                        TfidfVectorizer(
                            strip_accents="unicode",
                            min_df=0.0001,
                            max_df=0.1,
                            analyzer="word",
                            token_pattern=r"\w{1,}",
                            stop_words="english",
                            ngram_range=(2, 4),
                            max_features=30000,
                            sublinear_tf=True,
                        ),
                        "title",
                    ),
                ]),
            ),
        ])

        # Shared constructor arguments for every ensemble member.
        kwargs = {
            "short_desc_maxlen": self.short_desc_maxlen,
            "short_desc_vocab_size": self.short_desc_vocab_size,
            "short_desc_emb_sz": self.short_desc_emb_sz,
            "long_desc_maxlen": self.long_desc_maxlen,
            "long_desc_vocab_size": self.long_desc_vocab_size,
            "long_desc_emb_sz": self.long_desc_emb_sz,
        }

        # Build one classifier per hyperparameter set and ensemble them
        # with equal-weight soft voting.
        estimators = []
        for i, params in enumerate(self.params):
            kwargs["params"] = params
            estimator = ComponentNNClassifier(**kwargs)
            estimators.append(("model_{}".format(i), estimator))

        self.clf = VotingClassifier(estimators=estimators,
                                    voting="soft",
                                    weights=[1, 1, 1])
# --- Example 21 (scraped-snippet separator) ---
    def __init__(self, *args, **kwargs):
        """Build a single component-classification neural network.

        Unlike the voting-ensemble variant, this constructs exactly one
        ComponentNNClassifier with the default hyperparameters.
        """
        super().__init__(*args, **kwargs)

        # Sequence length / vocabulary / embedding sizes for the bug title
        # ("short desc") and the first comment ("long desc").
        self.short_desc_maxlen = 20
        self.short_desc_vocab_size = 25000
        self.short_desc_emb_sz = 300
        self.long_desc_maxlen = 100
        self.long_desc_vocab_size = 25000
        self.long_desc_emb_sz = 300
        # NN training is slow; skip cross-validation.
        self.cross_validation_enabled = False

        # Categorical metadata features fed to the embeddings.
        feature_extractors = [
            bug_features.bug_reporter(),
            bug_features.platform(),
            bug_features.op_sys(),
        ]

        # No text cleanup: raw title/comment text goes to the vectorizers.
        cleanup_functions = []

        self.extraction_pipeline = Pipeline(
            [
                (
                    "bug_extractor",
                    bug_features.BugExtractor(
                        feature_extractors, cleanup_functions
                    ),
                ),
                (
                    "union",
                    StructuredColumnTransformer(
                        [
                            (
                                "platform",
                                make_pipeline(
                                    DictExtractor("platform"), OrdinalEncoder()
                                ),
                                "data",
                            ),
                            (
                                "op_sys",
                                make_pipeline(
                                    DictExtractor("op_sys"), OrdinalEncoder()
                                ),
                                "data",
                            ),
                            (
                                # NOTE(review): plain OrdinalEncoder here,
                                # while the sibling models use
                                # MissingOrdinalEncoder for bug_reporter —
                                # confirm whether that is intentional.
                                "bug_reporter",
                                make_pipeline(
                                    DictExtractor("bug_reporter"),
                                    OrdinalEncoder(),
                                ),
                                "data",
                            ),
                            (
                                "title_sequence",
                                KerasTextToSequences(
                                    self.short_desc_maxlen,
                                    self.short_desc_vocab_size,
                                ),
                                "title",
                            ),
                            (
                                "first_comment_sequence",
                                KerasTextToSequences(
                                    self.long_desc_maxlen,
                                    self.long_desc_vocab_size,
                                ),
                                "first_comment",
                            ),
                            (
                                "title_char_tfidf",
                                TfidfVectorizer(
                                    strip_accents="unicode",
                                    analyzer="char",
                                    stop_words="english",
                                    ngram_range=(2, 4),
                                    max_features=25000,
                                    sublinear_tf=True,
                                ),
                                "title",
                            ),
                            (
                                "title_word_tfidf",
                                TfidfVectorizer(
                                    strip_accents="unicode",
                                    min_df=0.0001,
                                    max_df=0.1,
                                    analyzer="word",
                                    token_pattern=r"\w{1,}",
                                    stop_words="english",
                                    ngram_range=(2, 4),
                                    max_features=30000,
                                    sublinear_tf=True,
                                ),
                                "title",
                            ),
                        ]
                    ),
                ),
            ]
        )

        # Constructor arguments for the network (renamed from the original
        # local `kwargs` to avoid shadowing the **kwargs parameter).
        nn_kwargs = {
            "short_desc_maxlen": self.short_desc_maxlen,
            "short_desc_vocab_size": self.short_desc_vocab_size,
            "short_desc_emb_sz": self.short_desc_emb_sz,
            "long_desc_maxlen": self.long_desc_maxlen,
            "long_desc_vocab_size": self.long_desc_vocab_size,
            "long_desc_emb_sz": self.long_desc_emb_sz,
        }
        self.clf = ComponentNNClassifier(**nn_kwargs)
# --- Example 22 (scraped-snippet separator) ---
    def __init__(self, *args, **kwargs):
        """Build an ensemble of three neural-network component classifiers.

        Three ComponentNNClassifier instances with slightly different
        hyperparameters are combined through equal-weight soft voting.
        """
        super().__init__(*args, **kwargs)

        # Sequence length / vocabulary / embedding sizes for the bug title
        # ("short desc") and the first comment ("long desc").
        self.short_desc_maxlen = 20
        self.short_desc_vocab_size = 25000
        self.short_desc_emb_sz = 300
        self.long_desc_maxlen = 100
        self.long_desc_vocab_size = 25000
        self.long_desc_emb_sz = 300
        # NN training is slow; skip cross-validation.
        self.cross_validation_enabled = False

        # One hyperparameter dict per ensemble member; they differ only in
        # GRU sizes, dense-layer widths and dropout rates.
        self.params = [
            {
                "short_desc_emb_dropout_rate": 0.2,
                "short_desc_encoded_gru_units": 256,
                "short_desc_encoded_gru_dropout": 0.45,
                "short_desc_encoded_recurrent_dropout": 0.5,
                "long_desc_emb_dropout_rate": 0.25,
                "long_desc_encoded_gru_units": 256,
                "long_desc_encoded_dropout": 0.5,
                "long_desc_encoded_recurrent_dropout": 0.55,
                "rep_platform_emb_input_dim": 14,
                "rep_platform_emb_output_dim": 25,
                "rep_platform_emb_spatial_dropout_rate": 0.1,
                "rep_platform_emb_dropout_rate": 0.45,
                "op_sys_emb_input_dim": 48,
                "op_sys_emb_output_dim": 50,
                "op_sys_emb_spatial_dropout_rate": 0.1,
                "op_sys_emb_dropout_rate": 0.45,
                "reporter_emb_input_dim": 46544,
                "reporter_emb_output_dim": 100,
                "reporter_emb_spatial_dropout_rate": 0.15,
                "reporter_emb_dropout_rate": 0.5,
                "tfidf_word_dense_units": 600,
                "tfidf_word_dropout_rate": 0.5,
                "tfidf_char_inp_dense_unit": 500,
                "tfidf_char_inp_dropout_rate": 0.5,
                "x_dense_unit": 2000,
                "x_dropout_rate": 0.6,
            },
            {
                "short_desc_emb_dropout_rate": 0.2,
                "short_desc_encoded_gru_units": 250,
                "short_desc_encoded_gru_dropout": 0.45,
                "short_desc_encoded_recurrent_dropout": 0.45,
                "long_desc_emb_dropout_rate": 0.25,
                "long_desc_encoded_gru_units": 250,
                "long_desc_encoded_dropout": 0.45,
                "long_desc_encoded_recurrent_dropout": 0.45,
                "rep_platform_emb_input_dim": 14,
                "rep_platform_emb_output_dim": 30,
                "rep_platform_emb_spatial_dropout_rate": 0.1,
                "rep_platform_emb_dropout_rate": 0.4,
                "op_sys_emb_input_dim": 48,
                "op_sys_emb_output_dim": 55,
                "op_sys_emb_spatial_dropout_rate": 0.1,
                "op_sys_emb_dropout_rate": 0.4,
                "reporter_emb_input_dim": 46544,
                "reporter_emb_output_dim": 110,
                "reporter_emb_spatial_dropout_rate": 0.15,
                "reporter_emb_dropout_rate": 0.45,
                "tfidf_word_dense_units": 610,
                "tfidf_word_dropout_rate": 0.45,
                "tfidf_char_inp_dense_unit": 510,
                "tfidf_char_inp_dropout_rate": 0.5,
                "x_dense_unit": 1970,
                "x_dropout_rate": 0.5,
            },
            {
                "short_desc_emb_dropout_rate": 0.2,
                "short_desc_encoded_gru_units": 266,
                "short_desc_encoded_gru_dropout": 0.45,
                "short_desc_encoded_recurrent_dropout": 0.45,
                "long_desc_emb_dropout_rate": 0.25,
                "long_desc_encoded_gru_units": 266,
                "long_desc_encoded_dropout": 0.45,
                "long_desc_encoded_recurrent_dropout": 0.55,
                "rep_platform_emb_input_dim": 14,
                "rep_platform_emb_output_dim": 35,
                "rep_platform_emb_spatial_dropout_rate": 0.1,
                "rep_platform_emb_dropout_rate": 0.45,
                "op_sys_emb_input_dim": 48,
                "op_sys_emb_output_dim": 60,
                "op_sys_emb_spatial_dropout_rate": 0.1,
                "op_sys_emb_dropout_rate": 0.45,
                "reporter_emb_input_dim": 46544,
                "reporter_emb_output_dim": 120,
                "reporter_emb_spatial_dropout_rate": 0.15,
                "reporter_emb_dropout_rate": 0.45,
                "tfidf_word_dense_units": 620,
                "tfidf_word_dropout_rate": 0.5,
                "tfidf_char_inp_dense_unit": 520,
                "tfidf_char_inp_dropout_rate": 0.45,
                "x_dense_unit": 1950,
                "x_dropout_rate": 0.5,
            },
        ]

        # Categorical metadata features fed to the embeddings.
        feature_extractors = [
            bug_features.bug_reporter(),
            bug_features.platform(),
            bug_features.op_sys(),
        ]

        # No text cleanup: raw title/comment text goes to the vectorizers.
        cleanup_functions = []

        self.extraction_pipeline = Pipeline(
            [
                (
                    "bug_extractor",
                    bug_features.BugExtractor(
                        feature_extractors, cleanup_functions
                    ),
                ),
                (
                    "union",
                    StructuredColumnTransformer(
                        [
                            (
                                "platform",
                                make_pipeline(
                                    DictExtractor("platform"), OrdinalEncoder()
                                ),
                                "data",
                            ),
                            (
                                "op_sys",
                                make_pipeline(
                                    DictExtractor("op_sys"), OrdinalEncoder()
                                ),
                                "data",
                            ),
                            (
                                "bug_reporter",
                                # Reporter may be unseen at predict time,
                                # hence the missing-value-aware encoder.
                                make_pipeline(
                                    DictExtractor("bug_reporter"),
                                    MissingOrdinalEncoder(),
                                ),
                                "data",
                            ),
                            (
                                "title_sequence",
                                KerasTextToSequences(
                                    self.short_desc_maxlen,
                                    self.short_desc_vocab_size,
                                ),
                                "title",
                            ),
                            (
                                "first_comment_sequence",
                                KerasTextToSequences(
                                    self.long_desc_maxlen,
                                    self.long_desc_vocab_size,
                                ),
                                "first_comment",
                            ),
                            (
                                "title_char_tfidf",
                                TfidfVectorizer(
                                    strip_accents="unicode",
                                    analyzer="char",
                                    stop_words="english",
                                    ngram_range=(2, 4),
                                    max_features=25000,
                                    sublinear_tf=True,
                                ),
                                "title",
                            ),
                            (
                                "title_word_tfidf",
                                TfidfVectorizer(
                                    strip_accents="unicode",
                                    min_df=0.0001,
                                    max_df=0.1,
                                    analyzer="word",
                                    token_pattern=r"\w{1,}",
                                    stop_words="english",
                                    ngram_range=(2, 4),
                                    max_features=30000,
                                    sublinear_tf=True,
                                ),
                                "title",
                            ),
                        ]
                    ),
                ),
            ]
        )

        # Shared constructor arguments for every ensemble member (renamed
        # from the original local `kwargs` to avoid shadowing the
        # **kwargs parameter).
        nn_kwargs = {
            "short_desc_maxlen": self.short_desc_maxlen,
            "short_desc_vocab_size": self.short_desc_vocab_size,
            "short_desc_emb_sz": self.short_desc_emb_sz,
            "long_desc_maxlen": self.long_desc_maxlen,
            "long_desc_vocab_size": self.long_desc_vocab_size,
            "long_desc_emb_sz": self.long_desc_emb_sz,
        }

        # Build one classifier per hyperparameter set, then ensemble them.
        estimators = []
        for index, model_params in enumerate(self.params):
            nn_kwargs["params"] = model_params
            estimators.append(
                ("model_{}".format(index), ComponentNNClassifier(**nn_kwargs))
            )

        self.clf = VotingClassifier(
            estimators=estimators, voting="soft", weights=[1, 1, 1]
        )