def __init__(self, lemmatization=False, interpretable=False):
    """Set up the sampler, the extraction pipeline and the classifier.

    Args:
        lemmatization: Passed through unchanged to ``CommitModel.__init__``.
        interpretable: When false, the ``"desc"`` column is additionally
            vectorized with ``self.text_vectorizer(min_df=0.0001)``; when
            true only the ``"data"`` column is fed to the union step.
    """
    CommitModel.__init__(self, lemmatization)

    self.required_dbs.append(BUG_INTRODUCING_COMMITS_DB)
    self.store_dataset = True
    self.sampler = RandomUnderSampler(random_state=0)

    extractors = [
        commit_features.source_code_file_size(),
        commit_features.other_file_size(),
        commit_features.test_file_size(),
        commit_features.source_code_added(),
        commit_features.other_added(),
        commit_features.test_added(),
        commit_features.source_code_deleted(),
        commit_features.other_deleted(),
        commit_features.test_deleted(),
        commit_features.author_experience(),
        commit_features.reviewer_experience(),
        commit_features.reviewers_num(),
        commit_features.component_touched_prev(),
        commit_features.directory_touched_prev(),
        commit_features.file_touched_prev(),
        commit_features.types(),
        commit_features.files(),
        commit_features.components(),
        commit_features.components_modified_num(),
        commit_features.directories(),
        commit_features.directories_modified_num(),
        commit_features.source_code_files_modified_num(),
        commit_features.other_files_modified_num(),
        commit_features.test_files_modified_num(),
        commit_features.functions_touched_num(),
        commit_features.functions_touched_size(),
    ]
    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    # The "data" column is always vectorized; "desc" only joins the union
    # when the model is not required to be interpretable.
    columns = [("data", DictVectorizer(), "data")]
    if not interpretable:
        desc_vectorizer = self.text_vectorizer(min_df=0.0001)
        columns.append(("desc", desc_vectorizer, "desc"))

    extractor = commit_features.CommitExtractor(extractors, cleaners)
    union = ColumnTransformer(columns)
    self.extraction_pipeline = Pipeline(
        [
            ("commit_extractor", extractor),
            ("union", union),
        ]
    )

    classifier = xgboost.XGBClassifier(n_jobs=16)
    classifier.set_params(predictor="cpu_predictor")
    self.clf = classifier
def __init__(self, lemmatization: bool = False) -> None:
    """Set up the sampler, the extraction pipeline and the classifier.

    Args:
        lemmatization: Passed through unchanged to ``CommitModel.__init__``.
    """
    CommitModel.__init__(self, lemmatization)

    self.calculate_importance = False
    self.training_dbs += [bugzilla.BUGS_DB]
    self.sampler = RandomUnderSampler(random_state=0)

    extractors = [
        commit_features.source_code_file_size(),
        commit_features.other_file_size(),
        commit_features.test_file_size(),
        commit_features.source_code_added(),
        commit_features.other_added(),
        commit_features.test_added(),
        commit_features.source_code_deleted(),
        commit_features.other_deleted(),
        commit_features.test_deleted(),
        commit_features.reviewers_num(),
        commit_features.types(),
        commit_features.files(),
        commit_features.components(),
        commit_features.components_modified_num(),
        commit_features.directories(),
        commit_features.directories_modified_num(),
        commit_features.source_code_files_modified_num(),
        commit_features.other_files_modified_num(),
        commit_features.test_files_modified_num(),
        commit_features.functions_touched_num(),
        commit_features.functions_touched_size(),
        commit_features.source_code_file_metrics(),
    ]
    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    extractor = commit_features.CommitExtractor(extractors, cleaners)
    # Both the "data" and the "desc" columns are vectorized and combined.
    union = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("desc", self.text_vectorizer(min_df=0.0001), "desc"),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [
            ("commit_extractor", extractor),
            ("union", union),
        ]
    )

    worker_count = utils.get_physical_cpu_count()
    classifier = xgboost.XGBClassifier(n_jobs=worker_count)
    classifier.set_params(predictor="cpu_predictor")
    self.clf = classifier
def __init__(self, lemmatization=False):
    """Set up the sampler, the extraction pipeline and the classifier.

    Args:
        lemmatization: Passed through unchanged to ``CommitModel.__init__``.
    """
    CommitModel.__init__(self, lemmatization)

    self.calculate_importance = False
    self.sampler = RandomUnderSampler(random_state=0)

    extractors = [
        commit_features.file_size(),
        commit_features.test_added(),
        commit_features.added(),
        commit_features.deleted(),
        commit_features.test_deleted(),
        commit_features.author_experience(),
        commit_features.reviewer_experience(),
        commit_features.reviewers_num(),
        commit_features.component_touched_prev(),
        commit_features.directory_touched_prev(),
        commit_features.file_touched_prev(),
        commit_features.types(),
        commit_features.components(),
        commit_features.components_modified_num(),
        commit_features.directories(),
        commit_features.directories_modified_num(),
        commit_features.files(),
        commit_features.files_modified_num(),
    ]
    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    extractor = commit_features.CommitExtractor(extractors, cleaners)
    # Both the "data" and the "desc" columns are vectorized and combined.
    union = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("desc", self.text_vectorizer(), "desc"),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [
            ("commit_extractor", extractor),
            ("union", union),
        ]
    )

    classifier = xgboost.XGBClassifier(n_jobs=16)
    classifier.set_params(predictor="cpu_predictor")
    self.clf = classifier
def __init__(self, lemmatization=False):
    """Set up the sampler, the extraction pipeline and the classifier.

    Args:
        lemmatization: Passed through unchanged to ``CommitModel.__init__``.
    """
    CommitModel.__init__(self, lemmatization)

    self.calculate_importance = False
    self.sampler = RandomUnderSampler(random_state=0)

    extractors = [
        commit_features.files_modified_num(),
        commit_features.test_added(),
        commit_features.added(),
        commit_features.deleted(),
        commit_features.test_deleted(),
        commit_features.author_experience(),
        commit_features.author_experience_90_days(),
        commit_features.reviewer_experience(),
        commit_features.reviewer_experience_90_days(),
        commit_features.components_touched_prev(),
        commit_features.components_touched_prev_90_days(),
        commit_features.files_touched_prev(),
        commit_features.files_touched_prev_90_days(),
        commit_features.types(),
        commit_features.components(),
        commit_features.number_of_reviewers(),
    ]
    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    extractor = commit_features.CommitExtractor(extractors, cleaners)
    # Both the "data" and the "desc" columns are vectorized and combined.
    union = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("desc", self.text_vectorizer(), "desc"),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [
            ("commit_extractor", extractor),
            ("union", union),
        ]
    )

    classifier = xgboost.XGBClassifier(n_jobs=16)
    classifier.set_params(predictor="cpu_predictor")
    self.clf = classifier
def __init__(self, lemmatization=False):
    """Set up the sampler, the extraction pipeline and the classifier.

    Only the ``"data"`` column is vectorized here, and the commit extractor
    receives an empty list of cleanup functions.

    Args:
        lemmatization: Passed through unchanged to ``CommitModel.__init__``.
    """
    CommitModel.__init__(self, lemmatization)

    self.training_dbs.append(test_scheduling.TEST_LABEL_SCHEDULING_DB)
    self.sampler = RandomUnderSampler(random_state=0)

    extractors = [
        commit_features.source_code_file_size(),
        commit_features.other_file_size(),
        commit_features.test_file_size(),
        commit_features.source_code_added(),
        commit_features.other_added(),
        commit_features.test_added(),
        commit_features.source_code_deleted(),
        commit_features.other_deleted(),
        commit_features.test_deleted(),
        # commit_features.author_experience(),
        # commit_features.reviewer_experience(),
        commit_features.reviewers_num(),
        # commit_features.component_touched_prev(),
        # commit_features.directory_touched_prev(),
        # commit_features.file_touched_prev(),
        commit_features.types(),
        commit_features.files(),
        commit_features.components(),
        commit_features.components_modified_num(),
        commit_features.directories(),
        commit_features.directories_modified_num(),
        commit_features.source_code_files_modified_num(),
        commit_features.other_files_modified_num(),
        commit_features.test_files_modified_num(),
    ]

    extractor = commit_features.CommitExtractor(extractors, [])
    union = ColumnTransformer([("data", DictVectorizer(), "data")])
    self.extraction_pipeline = Pipeline(
        [
            ("commit_extractor", extractor),
            ("union", union),
        ]
    )

    classifier = xgboost.XGBClassifier(n_jobs=16)
    classifier.set_params(predictor="cpu_predictor")
    self.clf = classifier
def __init__(self, lemmatization=False, bug_data=False):
    """Set up the sampler, the extraction pipeline and the classifier.

    Args:
        lemmatization: Passed through unchanged to ``CommitModel.__init__``.
        bug_data: Passed through to ``CommitModel.__init__``; when truthy,
            the ``bug_features`` extractors are appended after the
            ``commit_features`` ones.
    """
    CommitModel.__init__(self, lemmatization, bug_data)

    self.calculate_importance = False
    self.sampler = RandomUnderSampler(random_state=0)

    extractors = [
        commit_features.source_code_files_modified_num(),
        commit_features.other_files_modified_num(),
        commit_features.test_files_modified_num(),
        commit_features.source_code_file_size(),
        commit_features.other_file_size(),
        commit_features.test_file_size(),
        commit_features.source_code_added(),
        commit_features.other_added(),
        commit_features.test_added(),
        commit_features.source_code_deleted(),
        commit_features.other_deleted(),
        commit_features.test_deleted(),
        commit_features.author_experience(),
        commit_features.reviewer_experience(),
        commit_features.reviewers_num(),
        commit_features.component_touched_prev(),
        commit_features.directory_touched_prev(),
        commit_features.file_touched_prev(),
        commit_features.types(),
        commit_features.components(),
        commit_features.directories(),
        commit_features.files(),
    ]

    if bug_data:
        extractors.extend(
            [
                bug_features.product(),
                bug_features.component(),
                bug_features.severity(),
                bug_features.priority(),
                bug_features.has_crash_signature(),
                bug_features.has_regression_range(),
                bug_features.whiteboard(),
                bug_features.keywords(),
                bug_features.number_of_bug_dependencies(),
                bug_features.blocked_bugs_number(),
            ]
        )

    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    extractor = commit_features.CommitExtractor(extractors, cleaners)
    # Both the "data" and the "desc" columns are vectorized and combined.
    union = ColumnTransformer(
        [
            ("data", DictVectorizer(), "data"),
            ("desc", self.text_vectorizer(), "desc"),
        ]
    )
    self.extraction_pipeline = Pipeline(
        [
            ("commit_extractor", extractor),
            ("union", union),
        ]
    )

    worker_count = utils.get_physical_cpu_count()
    classifier = xgboost.XGBClassifier(n_jobs=worker_count)
    classifier.set_params(predictor="cpu_predictor")
    self.clf = classifier
def __init__(
    self,
    lemmatization: bool = False,
    interpretable: bool = True,
    use_finder: bool = False,
    exclude_finder: bool = True,
    finder_regressions_only: bool = False,
) -> None:
    """Set up the databases, sampler, extraction pipeline and classifier.

    Args:
        lemmatization: Passed through unchanged to ``CommitModel.__init__``.
        interpretable: When false, the ``"desc"`` column is additionally
            vectorized with ``self.text_vectorizer(min_df=0.0001)``; when
            true only the ``"data"`` column is fed to the union step.
        use_finder: Stored on the instance as ``self.use_finder``.
        exclude_finder: Stored on the instance as ``self.exclude_finder``.
        finder_regressions_only: Stored on the instance; when truthy,
            ``BUG_FIXING_COMMITS_DB`` is also added to ``training_dbs``.

    Raises:
        AssertionError: If ``use_finder`` and ``exclude_finder`` are not
            exactly one truthy and one falsy (only when assertions are
            enabled).
    """
    CommitModel.__init__(self, lemmatization)

    self.training_dbs += [BUG_INTRODUCING_COMMITS_DB, bugzilla.BUGS_DB]
    if finder_regressions_only:
        self.training_dbs.append(BUG_FIXING_COMMITS_DB)

    self.store_dataset = True
    self.sampler = RandomUnderSampler(random_state=0)

    self.use_finder = use_finder
    self.exclude_finder = exclude_finder
    # NOTE(review): the XOR also rejects the case where both flags are
    # False, which the message below does not mention — confirm intended.
    assert (
        use_finder ^ exclude_finder
    ), "Using both use_finder and exclude_finder option does not make a lot of sense"
    self.finder_regressions_only = finder_regressions_only

    extractors = [
        commit_features.source_code_file_size(),
        commit_features.other_file_size(),
        commit_features.test_file_size(),
        commit_features.source_code_added(),
        commit_features.other_added(),
        commit_features.test_added(),
        commit_features.source_code_deleted(),
        commit_features.other_deleted(),
        commit_features.test_deleted(),
        commit_features.author_experience(),
        commit_features.reviewer_experience(),
        commit_features.reviewers_num(),
        commit_features.component_touched_prev(),
        commit_features.directory_touched_prev(),
        commit_features.file_touched_prev(),
        commit_features.types(),
        commit_features.files(),
        commit_features.components(),
        commit_features.components_modified_num(),
        commit_features.directories(),
        commit_features.directories_modified_num(),
        commit_features.source_code_files_modified_num(),
        commit_features.other_files_modified_num(),
        commit_features.test_files_modified_num(),
        commit_features.functions_touched_num(),
        commit_features.functions_touched_size(),
        commit_features.source_code_file_metrics(),
    ]
    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    # The "data" column is always vectorized; "desc" only joins the union
    # when the model is not required to be interpretable.
    columns = [("data", DictVectorizer(), "data")]
    if not interpretable:
        desc_vectorizer = self.text_vectorizer(min_df=0.0001)
        columns.append(("desc", desc_vectorizer, "desc"))

    extractor = commit_features.CommitExtractor(extractors, cleaners)
    union = ColumnTransformer(columns)
    self.extraction_pipeline = Pipeline(
        [
            ("commit_extractor", extractor),
            ("union", union),
        ]
    )

    worker_count = utils.get_physical_cpu_count()
    classifier = xgboost.XGBClassifier(n_jobs=worker_count)
    classifier.set_params(predictor="cpu_predictor")
    self.clf = classifier