def __init__(self, lemmatization=False, granularity="label", failures_skip=None):
    """Model predicting which test runnables should be scheduled for a commit.

    Args:
        lemmatization: forwarded unchanged to the base ``Model``.
        granularity: either ``"label"`` or ``"group"``; selects which
            scheduling databases and which feature extractors are used.
        failures_skip: stored on the instance for use elsewhere in the
            class (semantics not visible in this block — presumably a
            threshold on past failures; confirm against callers).
    """
    Model.__init__(self, lemmatization)

    self.granularity = granularity
    self.failures_skip = failures_skip

    self.training_dbs = [repository.COMMITS_DB]
    self.eval_dbs[repository.COMMITS_DB] = (
        repository.COMMITS_DB,
        repository.COMMIT_EXPERIENCES_DB,
    )

    if granularity == "label":
        self.training_dbs.append(test_scheduling.TEST_LABEL_SCHEDULING_DB)
        self.eval_dbs[test_scheduling.TEST_LABEL_SCHEDULING_DB] = (
            test_scheduling.PAST_FAILURES_LABEL_DB,
            test_scheduling.FAILING_TOGETHER_LABEL_DB,
        )
    elif granularity == "group":
        self.training_dbs.append(test_scheduling.TEST_GROUP_SCHEDULING_DB)
        self.eval_dbs[test_scheduling.TEST_GROUP_SCHEDULING_DB] = (
            test_scheduling.PAST_FAILURES_GROUP_DB,
            test_scheduling.TOUCHED_TOGETHER_DB,
        )

    self.cross_validation_enabled = False
    self.entire_dataset_training = True

    # Failing runnables are much rarer than passing ones, so undersample
    # the majority class deterministically.
    self.sampler = RandomUnderSampler(random_state=0)

    extractors = [test_scheduling_features.prev_failures()]

    if granularity == "label":
        extractors.append(test_scheduling_features.platform())
        # NOTE: test_scheduling_features.chunk() is intentionally not used.
        extractors.append(test_scheduling_features.suite())
    elif granularity == "group":
        extractors.append(test_scheduling_features.path_distance())
        extractors.append(test_scheduling_features.common_path_components())
        extractors.append(test_scheduling_features.touched_together())

    self.extraction_pipeline = Pipeline(
        [
            (
                "commit_extractor",
                commit_features.CommitExtractor(extractors, []),
            ),
            ("union", ColumnTransformer([("data", DictVectorizer(), "data")])),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False, granularity="label"):
    """Model predicting which test runnables should be scheduled for a commit.

    Args:
        lemmatization: forwarded unchanged to the base ``Model``.
        granularity: either ``"label"`` or ``"group"``; selects the
            scheduling database and the feature extractors used.
    """
    Model.__init__(self, lemmatization)

    self.granularity = granularity

    self.required_dbs = [repository.COMMITS_DB]
    if granularity == "label":
        self.required_dbs.append(test_scheduling.TEST_LABEL_SCHEDULING_DB)
    elif granularity == "group":
        self.required_dbs.append(test_scheduling.TEST_GROUP_SCHEDULING_DB)

    self.cross_validation_enabled = False
    self.entire_dataset_training = True

    # Failing runnables are much rarer than passing ones, so undersample
    # the majority class deterministically.
    self.sampler = RandomUnderSampler(random_state=0)

    # Commit-level size/churn features, split by file category
    # (source code / other / test).
    extractors = [
        commit_features.source_code_files_modified_num(),
        commit_features.other_files_modified_num(),
        commit_features.test_files_modified_num(),
        commit_features.source_code_file_size(),
        commit_features.other_file_size(),
        commit_features.test_file_size(),
        commit_features.source_code_added(),
        commit_features.other_added(),
        commit_features.test_added(),
        commit_features.source_code_deleted(),
        commit_features.other_deleted(),
        commit_features.test_deleted(),
    ]

    # Runnable-level features.
    extractors.append(test_scheduling_features.name())
    extractors.append(test_scheduling_features.prev_failures())

    if granularity == "label":
        extractors.append(test_scheduling_features.platform())
        extractors.append(test_scheduling_features.chunk())
        extractors.append(test_scheduling_features.suite())
    elif granularity == "group":
        extractors.append(test_scheduling_features.path_distance())
        extractors.append(test_scheduling_features.common_path_components())

    self.extraction_pipeline = Pipeline(
        [
            (
                "commit_extractor",
                commit_features.CommitExtractor(extractors, []),
            ),
            ("union", ColumnTransformer([("data", DictVectorizer(), "data")])),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")
def test_path_distance():
    """path_distance: directory distance between a test and a commit's files.

    Each case asserts the distance computed between the test manifest path
    and the closest of the commit's touched files.
    """
    pd = test_scheduling_features.path_distance()

    manifest = "dom/media/tests/mochitest.ini"

    # (files touched by the commit, expected minimum distance)
    cases = [
        (["dom/media/tests/test.js", "dom/media/anotherFile.cpp"], 0),
        (["dom/media/anotherFile.cpp"], 1),
        (["dom/media/src/aFile.cpp"], 2),
        (["dom/media/src/aFile.cpp", "dom/media/anotherFile.cpp"], 1),
        (["layout/utils/bla.cpp"], 5),
    ]

    for files, expected in cases:
        assert pd({"name": manifest}, {"files": files}) == expected
def test_path_distance():
    """path_distance: directory distance between a test and a commit's files.

    Each case asserts the distance computed between the test path and the
    closest of the commit's touched files; the last two cases check the
    asymmetric deep-path vs. shallow-path situations in both directions.
    """
    pd = test_scheduling_features.path_distance()

    # (test name, files touched by the commit, expected minimum distance)
    cases = [
        (
            "dom/media/tests/mochitest.ini",
            ["dom/media/tests/test.js", "dom/media/anotherFile.cpp"],
            0,
        ),
        ("dom/media/tests/mochitest.ini", ["dom/media/anotherFile.cpp"], 1),
        ("dom/media/tests/mochitest.ini", ["dom/media/src/aFile.cpp"], 2),
        (
            "dom/media/tests/mochitest.ini",
            ["dom/media/src/aFile.cpp", "dom/media/anotherFile.cpp"],
            1,
        ),
        ("dom/media/tests/mochitest.ini", ["layout/utils/bla.cpp"], 5),
        (
            "testing/web-platform/tests/content-security-policy/worker-src",
            ["test"],
            4,
        ),
        (
            "test",
            ["testing/web-platform/tests/content-security-policy/worker-src"],
            4,
        ),
    ]

    for name, files, expected in cases:
        assert pd({"name": name}, {"files": files}) == expected
def __init__(self, lemmatization=False, granularity="label", use_subset=False):
    """Model predicting which test runnables should be scheduled for a commit.

    Args:
        lemmatization: forwarded unchanged to the base ``Model``.
        granularity: either ``"label"`` or ``"group"``; selects which
            scheduling databases and which feature extractors are used.
        use_subset: development convenience — when True, a subset of the
            dataset is used (some passing runnables dropped) to reduce
            memory usage.
    """
    Model.__init__(self, lemmatization)

    self.granularity = granularity
    # This is useful for development purposes, it avoids using too much
    # memory by using a subset of the dataset (dropping some passing
    # runnables).
    self.use_subset = use_subset

    self.training_dbs = [repository.COMMITS_DB]
    self.eval_dbs[repository.COMMITS_DB] = (
        repository.COMMITS_DB,
        repository.COMMIT_EXPERIENCES_DB,
    )

    if granularity == "label":
        self.training_dbs.append(test_scheduling.TEST_LABEL_SCHEDULING_DB)
        self.eval_dbs[test_scheduling.TEST_LABEL_SCHEDULING_DB] = (
            test_scheduling.PAST_FAILURES_LABEL_DB,
            test_scheduling.FAILING_TOGETHER_LABEL_DB,
        )
    elif granularity == "group":
        self.training_dbs.append(test_scheduling.TEST_GROUP_SCHEDULING_DB)
        self.eval_dbs[test_scheduling.TEST_GROUP_SCHEDULING_DB] = (
            test_scheduling.PAST_FAILURES_GROUP_DB,
            test_scheduling.TOUCHED_TOGETHER_DB,
        )

    self.cross_validation_enabled = False
    self.entire_dataset_training = True

    # Failing runnables are much rarer than passing ones, so undersample
    # the majority class deterministically.
    self.sampler = RandomUnderSampler(random_state=0)

    extractors = [test_scheduling_features.prev_failures()]

    if granularity == "label":
        extractors.append(test_scheduling_features.platform())
        # NOTE: test_scheduling_features.chunk() is intentionally not used.
        extractors.append(test_scheduling_features.suite())
    elif granularity == "group":
        # Commit-level size/churn features, split by file category
        # (source code / other / test) — only used at group granularity.
        extractors += [
            commit_features.source_code_files_modified_num(),
            commit_features.other_files_modified_num(),
            commit_features.test_files_modified_num(),
            commit_features.source_code_file_size(),
            commit_features.other_file_size(),
            commit_features.test_file_size(),
            commit_features.source_code_added(),
            commit_features.other_added(),
            commit_features.test_added(),
            commit_features.source_code_deleted(),
            commit_features.other_deleted(),
            commit_features.test_deleted(),
            test_scheduling_features.path_distance(),
            test_scheduling_features.common_path_components(),
            test_scheduling_features.touched_together(),
        ]

    self.extraction_pipeline = Pipeline(
        [
            (
                "commit_extractor",
                commit_features.CommitExtractor(extractors, []),
            ),
            ("union", ColumnTransformer([("data", DictVectorizer(), "data")])),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")