Ejemplo n.º 1
0
    def __init__(self,
                 lemmatization=False,
                 granularity="label",
                 failures_skip=None):
        Model.__init__(self, lemmatization)

        self.granularity = granularity
        self.failures_skip = failures_skip

        self.training_dbs = [repository.COMMITS_DB]
        self.eval_dbs[repository.COMMITS_DB] = (
            repository.COMMITS_DB,
            repository.COMMIT_EXPERIENCES_DB,
        )
        if granularity == "label":
            self.training_dbs.append(test_scheduling.TEST_LABEL_SCHEDULING_DB)
            self.eval_dbs[test_scheduling.TEST_LABEL_SCHEDULING_DB] = (
                test_scheduling.PAST_FAILURES_LABEL_DB,
                test_scheduling.FAILING_TOGETHER_LABEL_DB,
            )
        elif granularity == "group":
            self.training_dbs.append(test_scheduling.TEST_GROUP_SCHEDULING_DB)
            self.eval_dbs[test_scheduling.TEST_GROUP_SCHEDULING_DB] = (
                test_scheduling.PAST_FAILURES_GROUP_DB,
                test_scheduling.TOUCHED_TOGETHER_DB,
            )

        self.cross_validation_enabled = False

        self.entire_dataset_training = True

        self.sampler = RandomUnderSampler(random_state=0)

        feature_extractors = [
            test_scheduling_features.prev_failures(),
        ]

        if granularity == "label":
            feature_extractors += [
                test_scheduling_features.platform(),
                # test_scheduling_features.chunk(),
                test_scheduling_features.suite(),
            ]
        elif granularity == "group":
            feature_extractors += [
                test_scheduling_features.path_distance(),
                test_scheduling_features.common_path_components(),
                test_scheduling_features.touched_together(),
            ]

        self.extraction_pipeline = Pipeline([
            (
                "commit_extractor",
                commit_features.CommitExtractor(feature_extractors, []),
            ),
            ("union", ColumnTransformer([("data", DictVectorizer(), "data")])),
        ])

        self.clf = xgboost.XGBClassifier(n_jobs=16)
        self.clf.set_params(predictor="cpu_predictor")
Ejemplo n.º 2
0
    def __init__(self, lemmatization=False, interpretable=False):
        CommitModel.__init__(self, lemmatization)

        self.required_dbs.append(BUG_INTRODUCING_COMMITS_DB)

        self.store_dataset = True
        self.sampler = RandomUnderSampler(random_state=0)

        feature_extractors = [
            commit_features.source_code_file_size(),
            commit_features.other_file_size(),
            commit_features.test_file_size(),
            commit_features.source_code_added(),
            commit_features.other_added(),
            commit_features.test_added(),
            commit_features.source_code_deleted(),
            commit_features.other_deleted(),
            commit_features.test_deleted(),
            commit_features.author_experience(),
            commit_features.reviewer_experience(),
            commit_features.reviewers_num(),
            commit_features.component_touched_prev(),
            commit_features.directory_touched_prev(),
            commit_features.file_touched_prev(),
            commit_features.types(),
            commit_features.files(),
            commit_features.components(),
            commit_features.components_modified_num(),
            commit_features.directories(),
            commit_features.directories_modified_num(),
            commit_features.source_code_files_modified_num(),
            commit_features.other_files_modified_num(),
            commit_features.test_files_modified_num(),
            commit_features.functions_touched_num(),
            commit_features.functions_touched_size(),
        ]

        cleanup_functions = [
            feature_cleanup.fileref(),
            feature_cleanup.url(),
            feature_cleanup.synonyms(),
        ]

        column_transformers = [("data", DictVectorizer(), "data")]

        if not interpretable:
            column_transformers.append(
                ("desc", self.text_vectorizer(min_df=0.0001), "desc"))

        self.extraction_pipeline = Pipeline([
            (
                "commit_extractor",
                commit_features.CommitExtractor(feature_extractors,
                                                cleanup_functions),
            ),
            ("union", ColumnTransformer(column_transformers)),
        ])

        self.clf = xgboost.XGBClassifier(n_jobs=16)
        self.clf.set_params(predictor="cpu_predictor")
Ejemplo n.º 3
0
    def __init__(self, lemmatization: bool = False) -> None:
        CommitModel.__init__(self, lemmatization)

        self.calculate_importance = False

        self.training_dbs += [bugzilla.BUGS_DB]

        self.sampler = RandomUnderSampler(random_state=0)

        feature_extractors = [
            commit_features.source_code_file_size(),
            commit_features.other_file_size(),
            commit_features.test_file_size(),
            commit_features.source_code_added(),
            commit_features.other_added(),
            commit_features.test_added(),
            commit_features.source_code_deleted(),
            commit_features.other_deleted(),
            commit_features.test_deleted(),
            commit_features.reviewers_num(),
            commit_features.types(),
            commit_features.files(),
            commit_features.components(),
            commit_features.components_modified_num(),
            commit_features.directories(),
            commit_features.directories_modified_num(),
            commit_features.source_code_files_modified_num(),
            commit_features.other_files_modified_num(),
            commit_features.test_files_modified_num(),
            commit_features.functions_touched_num(),
            commit_features.functions_touched_size(),
            commit_features.source_code_file_metrics(),
        ]

        cleanup_functions = [
            feature_cleanup.fileref(),
            feature_cleanup.url(),
            feature_cleanup.synonyms(),
        ]

        self.extraction_pipeline = Pipeline([
            (
                "commit_extractor",
                commit_features.CommitExtractor(feature_extractors,
                                                cleanup_functions),
            ),
            (
                "union",
                ColumnTransformer([
                    ("data", DictVectorizer(), "data"),
                    ("desc", self.text_vectorizer(min_df=0.0001), "desc"),
                ]),
            ),
        ])

        self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
        self.clf.set_params(predictor="cpu_predictor")
Ejemplo n.º 4
0
    def __init__(self, lemmatization=False):
        CommitModel.__init__(self, lemmatization)

        self.calculate_importance = False

        self.sampler = RandomUnderSampler(random_state=0)

        feature_extractors = [
            commit_features.file_size(),
            commit_features.test_added(),
            commit_features.added(),
            commit_features.deleted(),
            commit_features.test_deleted(),
            commit_features.author_experience(),
            commit_features.reviewer_experience(),
            commit_features.reviewers_num(),
            commit_features.component_touched_prev(),
            commit_features.directory_touched_prev(),
            commit_features.file_touched_prev(),
            commit_features.types(),
            commit_features.components(),
            commit_features.components_modified_num(),
            commit_features.directories(),
            commit_features.directories_modified_num(),
            commit_features.files(),
            commit_features.files_modified_num(),
        ]

        cleanup_functions = [
            feature_cleanup.fileref(),
            feature_cleanup.url(),
            feature_cleanup.synonyms(),
        ]

        self.extraction_pipeline = Pipeline(
            [
                (
                    "commit_extractor",
                    commit_features.CommitExtractor(
                        feature_extractors, cleanup_functions
                    ),
                ),
                (
                    "union",
                    ColumnTransformer(
                        [
                            ("data", DictVectorizer(), "data"),
                            ("desc", self.text_vectorizer(), "desc"),
                        ]
                    ),
                ),
            ]
        )

        self.clf = xgboost.XGBClassifier(n_jobs=16)
        self.clf.set_params(predictor="cpu_predictor")
Ejemplo n.º 5
0
    def __init__(self, lemmatization=False, granularity="label"):
        Model.__init__(self, lemmatization)

        self.granularity = granularity

        self.required_dbs = [repository.COMMITS_DB]
        if granularity == "label":
            self.required_dbs.append(test_scheduling.TEST_LABEL_SCHEDULING_DB)
        elif granularity == "group":
            self.required_dbs.append(test_scheduling.TEST_GROUP_SCHEDULING_DB)

        self.cross_validation_enabled = False

        self.entire_dataset_training = True

        self.sampler = RandomUnderSampler(random_state=0)

        feature_extractors = [
            commit_features.source_code_files_modified_num(),
            commit_features.other_files_modified_num(),
            commit_features.test_files_modified_num(),
            commit_features.source_code_file_size(),
            commit_features.other_file_size(),
            commit_features.test_file_size(),
            commit_features.source_code_added(),
            commit_features.other_added(),
            commit_features.test_added(),
            commit_features.source_code_deleted(),
            commit_features.other_deleted(),
            commit_features.test_deleted(),
            test_scheduling_features.name(),
            test_scheduling_features.prev_failures(),
        ]

        if granularity == "label":
            feature_extractors += [
                test_scheduling_features.platform(),
                test_scheduling_features.chunk(),
                test_scheduling_features.suite(),
            ]
        elif granularity == "group":
            feature_extractors += [
                test_scheduling_features.path_distance(),
                test_scheduling_features.common_path_components(),
            ]

        self.extraction_pipeline = Pipeline([
            (
                "commit_extractor",
                commit_features.CommitExtractor(feature_extractors, []),
            ),
            ("union", ColumnTransformer([("data", DictVectorizer(), "data")])),
        ])

        self.clf = xgboost.XGBClassifier(n_jobs=16)
        self.clf.set_params(predictor="cpu_predictor")
Ejemplo n.º 6
0
    def __init__(self, lemmatization=False):
        CommitModel.__init__(self, lemmatization)

        self.training_dbs.append(test_scheduling.TEST_LABEL_SCHEDULING_DB)

        self.sampler = RandomUnderSampler(random_state=0)

        feature_extractors = [
            commit_features.source_code_file_size(),
            commit_features.other_file_size(),
            commit_features.test_file_size(),
            commit_features.source_code_added(),
            commit_features.other_added(),
            commit_features.test_added(),
            commit_features.source_code_deleted(),
            commit_features.other_deleted(),
            commit_features.test_deleted(),
            # commit_features.author_experience(),
            # commit_features.reviewer_experience(),
            commit_features.reviewers_num(),
            # commit_features.component_touched_prev(),
            # commit_features.directory_touched_prev(),
            # commit_features.file_touched_prev(),
            commit_features.types(),
            commit_features.files(),
            commit_features.components(),
            commit_features.components_modified_num(),
            commit_features.directories(),
            commit_features.directories_modified_num(),
            commit_features.source_code_files_modified_num(),
            commit_features.other_files_modified_num(),
            commit_features.test_files_modified_num(),
        ]

        self.extraction_pipeline = Pipeline(
            [
                (
                    "commit_extractor",
                    commit_features.CommitExtractor(feature_extractors, []),
                ),
                ("union", ColumnTransformer([("data", DictVectorizer(), "data")])),
            ]
        )

        self.clf = xgboost.XGBClassifier(n_jobs=16)
        self.clf.set_params(predictor="cpu_predictor")
Ejemplo n.º 7
0
    def __init__(self, lemmatization=False):
        Model.__init__(self, lemmatization)

        self.required_dbs = [
            repository.COMMITS_DB, test_scheduling.TEST_SCHEDULING_DB
        ]

        self.calculate_importance = False
        self.cross_validation_enabled = False

        self.sampler = RandomUnderSampler(random_state=0)

        feature_extractors = [
            commit_features.source_code_files_modified_num(),
            commit_features.other_files_modified_num(),
            commit_features.test_files_modified_num(),
            commit_features.source_code_file_size(),
            commit_features.other_file_size(),
            commit_features.test_file_size(),
            commit_features.source_code_added(),
            commit_features.other_added(),
            commit_features.test_added(),
            commit_features.source_code_deleted(),
            commit_features.other_deleted(),
            commit_features.test_deleted(),
            test_scheduling_features.name(),
            test_scheduling_features.platform(),
            test_scheduling_features.chunk(),
            test_scheduling_features.suite(),
            test_scheduling_features.prev_failures(),
        ]

        self.extraction_pipeline = Pipeline([
            (
                "commit_extractor",
                commit_features.CommitExtractor(feature_extractors, []),
            ),
            ("union", ColumnTransformer([("data", DictVectorizer(), "data")])),
        ])

        self.clf = xgboost.XGBClassifier(n_jobs=16)
        self.clf.set_params(predictor="cpu_predictor")
Ejemplo n.º 8
0
    def __init__(self, lemmatization=False, bug_data=False):
        CommitModel.__init__(self, lemmatization, bug_data)

        self.calculate_importance = False

        self.sampler = RandomUnderSampler(random_state=0)

        feature_extractors = [
            commit_features.source_code_files_modified_num(),
            commit_features.other_files_modified_num(),
            commit_features.test_files_modified_num(),
            commit_features.source_code_file_size(),
            commit_features.other_file_size(),
            commit_features.test_file_size(),
            commit_features.source_code_added(),
            commit_features.other_added(),
            commit_features.test_added(),
            commit_features.source_code_deleted(),
            commit_features.other_deleted(),
            commit_features.test_deleted(),
            commit_features.author_experience(),
            commit_features.reviewer_experience(),
            commit_features.reviewers_num(),
            commit_features.component_touched_prev(),
            commit_features.directory_touched_prev(),
            commit_features.file_touched_prev(),
            commit_features.types(),
            commit_features.components(),
            commit_features.directories(),
            commit_features.files(),
        ]

        if bug_data:
            feature_extractors += [
                bug_features.product(),
                bug_features.component(),
                bug_features.severity(),
                bug_features.priority(),
                bug_features.has_crash_signature(),
                bug_features.has_regression_range(),
                bug_features.whiteboard(),
                bug_features.keywords(),
                bug_features.number_of_bug_dependencies(),
                bug_features.blocked_bugs_number(),
            ]

        cleanup_functions = [
            feature_cleanup.fileref(),
            feature_cleanup.url(),
            feature_cleanup.synonyms(),
        ]

        self.extraction_pipeline = Pipeline(
            [
                (
                    "commit_extractor",
                    commit_features.CommitExtractor(
                        feature_extractors, cleanup_functions
                    ),
                ),
                (
                    "union",
                    ColumnTransformer(
                        [
                            ("data", DictVectorizer(), "data"),
                            ("desc", self.text_vectorizer(), "desc"),
                        ]
                    ),
                ),
            ]
        )

        self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
        self.clf.set_params(predictor="cpu_predictor")
Ejemplo n.º 9
0
    def __init__(
        self,
        lemmatization: bool = False,
        interpretable: bool = True,
        use_finder: bool = False,
        exclude_finder: bool = True,
        finder_regressions_only: bool = False,
    ) -> None:
        CommitModel.__init__(self, lemmatization)

        self.training_dbs += [BUG_INTRODUCING_COMMITS_DB, bugzilla.BUGS_DB]
        if finder_regressions_only:
            self.training_dbs.append(BUG_FIXING_COMMITS_DB)

        self.store_dataset = True
        self.sampler = RandomUnderSampler(random_state=0)

        self.use_finder = use_finder
        self.exclude_finder = exclude_finder
        assert (
            use_finder ^ exclude_finder
        ), "Using both use_finder and exclude_finder option does not make a lot of sense"
        self.finder_regressions_only = finder_regressions_only

        feature_extractors = [
            commit_features.source_code_file_size(),
            commit_features.other_file_size(),
            commit_features.test_file_size(),
            commit_features.source_code_added(),
            commit_features.other_added(),
            commit_features.test_added(),
            commit_features.source_code_deleted(),
            commit_features.other_deleted(),
            commit_features.test_deleted(),
            commit_features.author_experience(),
            commit_features.reviewer_experience(),
            commit_features.reviewers_num(),
            commit_features.component_touched_prev(),
            commit_features.directory_touched_prev(),
            commit_features.file_touched_prev(),
            commit_features.types(),
            commit_features.files(),
            commit_features.components(),
            commit_features.components_modified_num(),
            commit_features.directories(),
            commit_features.directories_modified_num(),
            commit_features.source_code_files_modified_num(),
            commit_features.other_files_modified_num(),
            commit_features.test_files_modified_num(),
            commit_features.functions_touched_num(),
            commit_features.functions_touched_size(),
            commit_features.source_code_file_metrics(),
        ]

        cleanup_functions = [
            feature_cleanup.fileref(),
            feature_cleanup.url(),
            feature_cleanup.synonyms(),
        ]

        column_transformers = [("data", DictVectorizer(), "data")]

        if not interpretable:
            column_transformers.append(
                ("desc", self.text_vectorizer(min_df=0.0001), "desc")
            )

        self.extraction_pipeline = Pipeline(
            [
                (
                    "commit_extractor",
                    commit_features.CommitExtractor(
                        feature_extractors, cleanup_functions
                    ),
                ),
                ("union", ColumnTransformer(column_transformers)),
            ]
        )

        self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
        self.clf.set_params(predictor="cpu_predictor")
Ejemplo n.º 10
0
    def __init__(self,
                 lemmatization=False,
                 granularity="label",
                 use_subset=False):
        Model.__init__(self, lemmatization)

        self.granularity = granularity
        # This is useful for development purposes, it avoids using too much memory
        # by using a subset of the dataset (dropping some passing runnables).
        self.use_subset = use_subset

        self.training_dbs = [repository.COMMITS_DB]
        self.eval_dbs[repository.COMMITS_DB] = (
            repository.COMMITS_DB,
            repository.COMMIT_EXPERIENCES_DB,
        )
        if granularity == "label":
            self.training_dbs.append(test_scheduling.TEST_LABEL_SCHEDULING_DB)
            self.eval_dbs[test_scheduling.TEST_LABEL_SCHEDULING_DB] = (
                test_scheduling.PAST_FAILURES_LABEL_DB,
                test_scheduling.FAILING_TOGETHER_LABEL_DB,
            )
        elif granularity == "group":
            self.training_dbs.append(test_scheduling.TEST_GROUP_SCHEDULING_DB)
            self.eval_dbs[test_scheduling.TEST_GROUP_SCHEDULING_DB] = (
                test_scheduling.PAST_FAILURES_GROUP_DB,
                test_scheduling.TOUCHED_TOGETHER_DB,
            )

        self.cross_validation_enabled = False

        self.entire_dataset_training = True

        self.sampler = RandomUnderSampler(random_state=0)

        feature_extractors = [
            test_scheduling_features.prev_failures(),
        ]

        if granularity == "label":
            feature_extractors += [
                test_scheduling_features.platform(),
                # test_scheduling_features.chunk(),
                test_scheduling_features.suite(),
            ]
        elif granularity == "group":
            feature_extractors += [
                commit_features.source_code_files_modified_num(),
                commit_features.other_files_modified_num(),
                commit_features.test_files_modified_num(),
                commit_features.source_code_file_size(),
                commit_features.other_file_size(),
                commit_features.test_file_size(),
                commit_features.source_code_added(),
                commit_features.other_added(),
                commit_features.test_added(),
                commit_features.source_code_deleted(),
                commit_features.other_deleted(),
                commit_features.test_deleted(),
                test_scheduling_features.path_distance(),
                test_scheduling_features.common_path_components(),
                test_scheduling_features.touched_together(),
            ]

        self.extraction_pipeline = Pipeline([
            (
                "commit_extractor",
                commit_features.CommitExtractor(feature_extractors, []),
            ),
            ("union", ColumnTransformer([("data", DictVectorizer(), "data")])),
        ])

        self.clf = xgboost.XGBClassifier(n_jobs=16)
        self.clf.set_params(predictor="cpu_predictor")