def test_gaussian_model_predict(self):
    training_data = TrainingData()
    config_set = UniversalConfigSet(4, 26544)
    model = GaussianModel(config_set, training_data)
    training_sample_1 = {
        "spark.executor.memory": 11945,
        "spark.sql.shuffle.partitions": 200,
        "spark.executor.cores": 2,
        "spark.driver.memory": 1024 * 4,
        "spark.sql.autoBroadcastJoinThreshold": 10,
        "spark.sql.statistics.fallBackToHdfs": 0
    }
    training_sample_2 = {
        "spark.executor.memory": 5972,
        "spark.sql.shuffle.partitions": 300,
        "spark.executor.cores": 1,
        "spark.driver.memory": 1024 * 2,
        "spark.sql.autoBroadcastJoinThreshold": 10,
        "spark.sql.statistics.fallBackToHdfs": 0
    }
    training_sample_3 = {
        "spark.executor.memory": 11945,
        "spark.sql.shuffle.partitions": 460,
        "spark.executor.cores": 2,
        "spark.driver.memory": 1024 * 4,
        "spark.sql.autoBroadcastJoinThreshold": 10,
        "spark.sql.statistics.fallBackToHdfs": 0
    }
    training_sample_4 = {
        "spark.executor.memory": 10068,
        "spark.sql.shuffle.partitions": 1660,
        "spark.executor.cores": 1,
        "spark.driver.memory": 1024,
        "spark.sql.autoBroadcastJoinThreshold": 10,
        "spark.sql.statistics.fallBackToHdfs": 0
    }
    model.add_sample_to_train_data(training_sample_1, 131)
    model.add_sample_to_train_data(training_sample_2, 143)
    model.add_sample_to_train_data(training_sample_3, 155)
    model.add_sample_to_train_data(training_sample_4, 343)
    model.train()

    # Build a query config identical to training_sample_4, parameter by
    # parameter (parameters not present in the sample are skipped).
    config = Config()
    for param in config_set.get_params():
        if param.get_name() in training_sample_4:
            config.add_param(param, training_sample_4[param.get_name()])

    low, high = model.predict(
        model.normalizer.normalize_config(config.get_all_param_values()))
    # The query equals training_sample_4 (observed output 343), so the
    # predicted interval should bracket that value tightly.
    assert low > (343 - 1)
    assert high < (343 + 1)
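# Why the +/- 1 band above is a safe assertion: the query point is identical
# to training_sample_4, and a (near) noise-free Gaussian process posterior
# collapses onto the observed output at its own training input. A standalone
# sketch with scikit-learn (not the project's GaussianModel; the 1.96 * std
# band is an assumed stand-in for however predict() derives (low, high)):
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor

X = np.array([[0.2], [0.4], [0.6], [0.8]])   # four normalized configs
y = np.array([131.0, 143.0, 155.0, 343.0])   # their measured outputs
gp = GaussianProcessRegressor().fit(X, y)
mean, std = gp.predict(np.array([[0.8]]), return_std=True)  # a training point
low, high = mean[0] - 1.96 * std[0], mean[0] + 1.96 * std[0]
assert (343 - 1) < low and high < (343 + 1)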
def test_denormalization():
    config_set = UniversalConfigSet(4, 28 * 1024)
    normalizer = ConfigNormalizer(config_set)
    norm_configs = normalizer.get_all_possible_normalized_configs()
    denorm_config = normalizer.denormalize_config(norm_configs)
    # Denormalizing every normalized grid must recover each parameter's
    # full domain, ordering aside.
    for i, param in enumerate(config_set.get_params()):
        domain = param.get_domain()
        assert sorted(domain.get_possible_values()) == sorted(denorm_config[i])
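# A self-contained round-trip sketch of the property test_denormalization
# checks: mapping a discrete domain onto an even [0, 1] grid and back is
# lossless. The helpers are stand-ins, not ConfigNormalizer's API.
def normalize_domain(domain):
    ordered = sorted(domain, reverse=True)
    n = len(ordered)
    return {1.0 - i / (n - 1): value for i, value in enumerate(ordered)}

def denormalize_domain(mapping):
    return sorted(mapping.values())

domain = [1024, 2048, 4096, 8192]
assert denormalize_domain(normalize_domain(domain)) == sorted(domain)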
class Combiner:
    def __init__(self, num_cores, total_memory, training_data=None):
        # Avoid a mutable default argument: a shared TrainingData() default
        # instance would be silently reused across Combiner instances.
        if training_data is None:
            training_data = TrainingData()
        self.num_cores = num_cores
        self.total_memory = total_memory
        self.config_set = UniversalConfigSet(num_cores, total_memory)
        self.training_data = training_data
        if self.training_data.size() > 0:
            self.ml_model = GaussianModel(self.config_set, training_data)
            self.math_model = AbstractModel(self.config_set, training_data,
                                            num_cores, total_memory)

    def add_training_data(self, training_sample, output):
        self.training_data.add_training_data(
            self._get_training_config(training_sample), output)
        # Rebuild both models so they see the enlarged training set.
        self.ml_model = GaussianModel(self.config_set, self.training_data)
        self.math_model = AbstractModel(self.config_set, self.training_data,
                                        self.num_cores, self.total_memory)

    def _get_training_config(self, training_sample):
        conf_names_params_mapping = {
            param.get_name(): param for param in self.config_set.get_params()
        }
        training_config = Config()
        for config_name, config_value in training_sample.items():
            training_config.add_param(conf_names_params_mapping[config_name],
                                      config_value)
        return training_config

    def get_best_config(self):
        if self.training_data.size() == 0:
            raise ValueError("Training Data Not Provided")
        # A single sample is not enough for the ML model to generalize;
        # fall back to the analytical model alone.
        if self.training_data.size() == 1:
            return self.math_model.get_best_config()
        self.ml_model.train()
        sampled_configs = self.ml_model.get_sampled_configs()
        pruned_configs = self.math_model.get_pruned_config(sampled_configs)
        return self.ml_model.get_best_config_for_config_space(pruned_configs)
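# Hedged usage sketch of Combiner: with no samples get_best_config raises,
# with one it falls back to the analytical model, and with two or more it
# prunes the Gaussian model's sampled candidates through the analytical
# model. The sample dicts reuse measured runs from the tests above.
combiner = Combiner(num_cores=4, total_memory=26544)
combiner.add_training_data({
    "spark.executor.memory": 11945,
    "spark.sql.shuffle.partitions": 200,
    "spark.executor.cores": 2,
    "spark.driver.memory": 1024 * 4,
    "spark.sql.autoBroadcastJoinThreshold": 10,
    "spark.sql.statistics.fallBackToHdfs": 0
}, 131)
combiner.add_training_data({
    "spark.executor.memory": 5972,
    "spark.sql.shuffle.partitions": 300,
    "spark.executor.cores": 1,
    "spark.driver.memory": 1024 * 2,
    "spark.sql.autoBroadcastJoinThreshold": 10,
    "spark.sql.statistics.fallBackToHdfs": 0
}, 143)
best_config = combiner.get_best_config()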
def test_discretizer_for_r4_xlarge():
    config_set = UniversalConfigSet(4, 26544)
    normalizer = ConfigNormalizer(config_set)
    norm_configs = normalizer.get_all_possible_normalized_configs()
    sampler = LhsDiscreteSampler(norm_configs)
    samples = sampler.get_samples(2)
    assert max(len(grid) for grid in norm_configs) == len(samples)
    assert all(len(sample) == len(norm_configs) for sample in samples)
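# A minimal pure-Python sketch of the Latin-hypercube flavor of sampling the
# test above exercises: spread indices evenly over each discrete grid, then
# shuffle each column independently so dimensions pair up at random. This is
# illustrative, not LhsDiscreteSampler's exact contract.
import random

def lhs_discrete(grids, n_samples):
    columns = []
    for grid in grids:
        # One index per stratum of the grid, shuffled per dimension.
        step = (len(grid) - 1) / max(n_samples - 1, 1)
        indices = [round(i * step) for i in range(n_samples)]
        random.shuffle(indices)
        columns.append([grid[i] for i in indices])
    # Each sample takes the i-th entry from every shuffled column.
    return [list(row) for row in zip(*columns)]

grids = [[0.0, 0.25, 0.5, 0.75, 1.0], [0.0, 0.5, 1.0]]
samples = lhs_discrete(grids, 3)
assert len(samples) == 3 and all(len(s) == len(grids) for s in samples)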
def test_normalization():
    config_set = UniversalConfigSet(4, 28 * 1024)
    normalizer = ConfigNormalizer(config_set)
    norm_configs = normalizer.get_all_possible_normalized_configs()
    assert len(norm_configs) == 6
    assert len(norm_configs[0]) == 200
    assert len(norm_configs[1]) == 49
    assert len(norm_configs[2]) == 22
    assert len(norm_configs[3]) == 4
    assert len(norm_configs[4]) == 19
    assert len(norm_configs[5]) == 2
def test_normalization():
    config_set = UniversalConfigSet(4, 28 * 1024)
    normalizer = ConfigNormalizer(config_set)
    norm_configs = normalizer.get_all_possible_normalized_configs()
    expected_config = [
        [
            1.0, 0.9743589743589743, 0.9487179487179487, 0.9230769230769231,
            0.8974358974358975, 0.8717948717948718, 0.8461538461538461,
            0.8205128205128205, 0.7948717948717948, 0.7692307692307693,
            0.7435897435897436, 0.717948717948718, 0.6923076923076923,
            0.6666666666666666, 0.6410256410256411, 0.6153846153846154,
            0.5897435897435898, 0.5641025641025641, 0.5384615384615384,
            0.5128205128205128, 0.48717948717948717, 0.46153846153846156,
            0.4358974358974359, 0.41025641025641024, 0.38461538461538464,
            0.358974358974359, 0.3333333333333333, 0.3076923076923077,
            0.28205128205128205, 0.2564102564102564, 0.23076923076923078,
            0.20512820512820512, 0.1794871794871795, 0.15384615384615385,
            0.1282051282051282, 0.10256410256410256, 0.07692307692307693,
            0.05128205128205128, 0.02564102564102564, -0.0
        ],
        [
            1.0, 0.972972972972973, 0.945945945945946, 0.918918918918919,
            0.8918918918918919, 0.8648648648648649, 0.8378378378378379,
            0.8108108108108109, 0.7837837837837838, 0.7567567567567568,
            0.7297297297297298, 0.7027027027027027, 0.6756756756756757,
            0.6486486486486487, 0.6216216216216217, 0.5945945945945946,
            0.5675675675675675, 0.5405405405405406, 0.5135135135135136,
            0.4864864864864865, 0.4594594594594595, 0.43243243243243246,
            0.40540540540540543, 0.3783783783783784, 0.35135135135135137,
            0.32432432432432434, 0.2972972972972973, 0.2702702702702703,
            0.24324324324324326, 0.21621621621621623, 0.1891891891891892,
            0.16216216216216217, 0.13513513513513514, 0.10810810810810811,
            0.08108108108108109, 0.05405405405405406, 0.02702702702702703,
            -0.0
        ],
        [
            1.0, 0.9629629629629629, 0.9259259259259258, 0.8888888888888888,
            0.8518518518518519, 0.8148148148148148, 0.7777777777777777,
            0.7407407407407407, 0.7037037037037037, 0.6666666666666666,
            0.6296296296296295, 0.5925925925925926, 0.5555555555555556,
            0.5185185185185185, 0.48148148148148145, 0.4444444444444444,
            0.4074074074074074, 0.37037037037037035, 0.3333333333333333,
            0.2962962962962963, 0.25925925925925924, 0.2222222222222222,
            0.18518518518518517, 0.14814814814814814, 0.1111111111111111,
            0.07407407407407407, 0.037037037037037035, -0.0
        ],
        [1.0, 0.5, -0.0]
    ]
    assert norm_configs == expected_config
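# The expected lists above all follow one pattern: a domain with n discrete
# values is normalized onto an even grid from 1.0 down to 0.0 in steps of
# 1 / (n - 1) (here 40, 38, 28, and 3 values; the trailing -0.0 is float
# sign noise). A stand-in sketch, not ConfigNormalizer's implementation:
def normalized_grid(n):
    return [1.0 - i / (n - 1) for i in range(n)]

grid = normalized_grid(40)
assert len(grid) == 40
assert grid[0] == 1.0 and abs(grid[-1]) < 1e-12
assert abs(grid[1] - 38 / 39) < 1e-12   # second entry of the first list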
def test_gaussian_model_get_best_config_value(self):
    training_data = TrainingData()
    config_set = UniversalConfigSet(4, 26544)
    model = GaussianModel(config_set, training_data)
    training_sample_1 = {
        "spark.executor.memory": 11945,
        "spark.sql.shuffle.partitions": 200,
        "spark.executor.cores": 2,
        "spark.driver.memory": 1024 * 4,
        "spark.sql.autoBroadcastJoinThreshold": 10,
        "spark.sql.statistics.fallBackToHdfs": 0
    }
    training_sample_2 = {
        "spark.executor.memory": 5972,
        "spark.sql.shuffle.partitions": 300,
        "spark.executor.cores": 1,
        "spark.driver.memory": 1024 * 2,
        "spark.sql.autoBroadcastJoinThreshold": 10,
        "spark.sql.statistics.fallBackToHdfs": 0
    }
    training_sample_3 = {
        "spark.executor.memory": 11945,
        "spark.sql.shuffle.partitions": 460,
        "spark.executor.cores": 2,
        "spark.driver.memory": 1024 * 4,
        "spark.sql.autoBroadcastJoinThreshold": 10,
        "spark.sql.statistics.fallBackToHdfs": 0
    }
    training_sample_4 = {
        "spark.executor.memory": 10068,
        "spark.sql.shuffle.partitions": 1660,
        "spark.executor.cores": 1,
        "spark.driver.memory": 1024,
        "spark.sql.autoBroadcastJoinThreshold": 10,
        "spark.sql.statistics.fallBackToHdfs": 0
    }
    model.add_sample_to_train_data(training_sample_1, 131)
    model.add_sample_to_train_data(training_sample_2, 143)
    model.add_sample_to_train_data(training_sample_3, 155)
    model.add_sample_to_train_data(training_sample_4, 343)
    model.train()
    best_config = model.get_best_config()
    # Smoke check: training and best-config selection must succeed and
    # return a configuration.
    assert best_config is not None
def test_universal_config_set_exception(num_cores, memory):
    # Driven by parametrized invalid (num_cores, memory) pairs;
    # UniversalConfigSet is expected to reject them.
    UniversalConfigSet(num_cores, memory)
def test_universal_config_set(num_cores, memory, param_list):
    univ_config = UniversalConfigSet(num_cores, memory)
    assert univ_config.get_params() == param_list
@pytest.fixture
def config_set():
    return UniversalConfigSet(4, 26544)
@pytest.fixture
def gaussian_model(training_data):
    # Depends on a `training_data` fixture; returns a model trained on it.
    config_set = UniversalConfigSet(10, 1024 * 10)
    model = GaussianModel(config_set, training_data)
    model.train()
    return model
@pytest.fixture
def gaussian_model():
    # Yields an untrained model backed by an empty training set.
    training_data = TrainingData()
    config_set = UniversalConfigSet(10, 1024 * 10)
    yield GaussianModel(config_set, training_data)
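# Hedged example of consuming the fixture above: pytest injects it by name.
# The sample values are illustrative, sized for the fixture's 10-core /
# 10 GB config set; the test is a smoke check that adding a sample succeeds.
def test_add_single_sample(gaussian_model):
    gaussian_model.add_sample_to_train_data({
        "spark.executor.memory": 5972,
        "spark.sql.shuffle.partitions": 300,
        "spark.executor.cores": 1,
        "spark.driver.memory": 1024 * 2,
        "spark.sql.autoBroadcastJoinThreshold": 10,
        "spark.sql.statistics.fallBackToHdfs": 0
    }, 143)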