import argparse
import datetime

import matplotlib.pyplot as plt
import redis

# Project-local modules referenced throughout this file (import paths assumed)
import tbot_datastore
import resource_datastore
import resource_modifier
import chart_generator
from mr import MR


def plot_cumm_mr(redis_db, num_iterations, workload_config, filter_config):
    mr_count = 0
    tbot_metric = workload_config['tbot_metric']
    all_mrs = get_all_mrs(redis_db)
    mr_to_performance = {}

    for exp_index in range(num_iterations):
        current_performance = tbot_datastore.read_summary_redis(redis_db, exp_index)

        # Count the number of MRs considered by the filtering process.
        # Counted manually for now.
        if filter_config['filter_policy'] == 'pipeline':
            mr_to_performance[mr_count] = current_performance
            mr_count += 1
        elif filter_config['filter_policy'] is None:
            # No filtering phase, so no additional MRs are explored
            mr_count += 0

        # Count the number of MRs considered in the standard approach
        for mr in all_mrs:
            metric = tbot_datastore.read_redis_result(redis_db, exp_index, mr,
                                                      tbot_metric)
            if len(metric) != 0:
                mr_to_performance[mr_count] = current_performance
                mr_count += 1

    # Plot results from mr_to_performance
    get_by_mr_performance_charts(workload_config, num_iterations, mr_to_performance)


# INCOMPLETE
# Plot the results of the by-MR performance
def get_by_mr_performance_charts(workload_config, num_iterations, mr_to_performance):
    experiment_type = workload_config['type']

    # Creating general performance chart
    # NOTE: time_id is assumed to be defined at module level
    chart_directory = 'results/graphs/mr/{}/'.format(workload_config['type'] +
                                                     str(time_id))

    plt.plot(*zip(*sorted(mr_to_performance.items())))
    plt.title('{} Performance Over Cumulative MRs Explored'.format(
        workload_config['type']))
    plt.xlabel('Cumulative MRs Explored')
    plt.ylabel('Latency_99 (ms)')
    chart_name = '{}{}{}performance.png'.format(chart_directory, num_iterations,
                                                experiment_type)
    plt.savefig(chart_name, bbox_inches='tight')
    plt.clf()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--config_file",
                        help='Configuration file for Throttlebot execution')
    parser.add_argument("--reset_resources", action="store_true",
                        help="Reset all resource allocation")
    parser.add_argument("--plot_cumm_mr", type=int, default=0,
                        help="Plots the Performance vs. cumulative MRs explored")
    args = parser.parse_args()

    sys_config, workload_config, filter_config = parse_config_file(args.config_file)

    redis_host = 'localhost'
    redis_db = redis.StrictRedis(host=redis_host, port=6379, db=0)

    if args.reset_resources:
        reset_resources()
    elif args.plot_cumm_mr != 0:
        print 'Plotting up to {} iterations'.format(args.plot_cumm_mr)
        plot_cumm_mr(redis_db, args.plot_cumm_mr, workload_config, filter_config)
def print_all_steps(redis_db, total_experiments):
    print 'Steps towards improving performance'
    for experiment_count in range(total_experiments):
        mimr, action_taken, perf_improvement = tbot_datastore.read_summary_redis(
            redis_db, experiment_count)
        print 'Iteration {}, Mimr = {}, New allocation = {}, Performance Improvement = {}'.format(
            experiment_count, mimr, action_taken, perf_improvement)
def get_performance_over_time_chart(redis_db, experiment_type,
                                    experiment_iteration_count, chart_directory):
    # Creating performance over time chart
    x = []
    y = []
    for iteration in range(experiment_iteration_count + 1):
        _, _, _, _, curr_perf, elaps_time, _ = tbot_datastore.read_summary_redis(
            redis_db, iteration)
        x.append(elaps_time)
        y.append(curr_perf)

    plt.plot(x, y, drawstyle='steps-post')
    plt.title('{} Performance Over Time'.format(experiment_type))
    plt.xlabel('Elapsed Time (seconds)')
    plt.ylabel('Latency_99 (ms)')
    chart_name = '{}{}{}performance_time.png'.format(
        chart_directory, experiment_iteration_count, experiment_type)
    plt.savefig(chart_name)
    plt.clf()
def get_performance_over_mr_chart(redis_db, experiment_type,
                                  experiment_iteration_count, chart_directory):
    x = []
    y = []
    for iteration in range(experiment_iteration_count + 1):
        _, _, _, _, curr_perf, _, cumulative_mr = tbot_datastore.read_summary_redis(
            redis_db, iteration)
        x.append(cumulative_mr)
        y.append(curr_perf)

    plt.plot(x, y, drawstyle='steps-post')
    plt.title('{} Performance Over Number of MRs Stressed'.format(experiment_type))
    plt.xlabel('Number of MRs Stressed')
    plt.ylabel('Latency_99 (ms)')
    chart_name = '{}{}{}performance_mr.png'.format(
        chart_directory, experiment_iteration_count, experiment_type)
    plt.savefig(chart_name)
    plt.clf()
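
# run() below leans on mean_list() and remove_outlier(), which are defined
# elsewhere in the project. The following is a minimal sketch of their assumed
# behavior, for reference only; the outlier threshold here is illustrative and
# not necessarily what the project uses.
def mean_list(target_list):
    # Arithmetic mean of a list of samples
    return float(sum(target_list)) / len(target_list)


def remove_outlier(target_list, num_stdev=3):
    # Drop samples more than num_stdev standard deviations from the mean
    mean = mean_list(target_list)
    variance = sum((x - mean) ** 2 for x in target_list) / float(len(target_list))
    stdev = variance ** 0.5
    return [x for x in target_list if abs(x - mean) <= num_stdev * stdev]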
def run(sys_config, workload_config, filter_config, default_mr_config,
        last_completed_iter=0):
    redis_host = sys_config['redis_host']
    baseline_trials = sys_config['baseline_trials']
    experiment_trials = sys_config['trials']
    stress_weights = sys_config['stress_weights']
    stress_policy = sys_config['stress_policy']
    resource_to_stress = sys_config['stress_these_resources']
    service_to_stress = sys_config['stress_these_services']
    vm_to_stress = sys_config['stress_these_machines']
    machine_type = sys_config['machine_type']
    quilt_overhead = sys_config['quilt_overhead']
    gradient_mode = sys_config['gradient_mode']

    preferred_performance_metric = workload_config['tbot_metric']
    optimize_for_lowest = workload_config['optimize_for_lowest']

    redis_db = redis.StrictRedis(host=redis_host, port=6379, db=0)
    if last_completed_iter == 0:
        redis_db.flushall()

    '''
    # Prompt the user to make sure they want to flush the db
    ok_to_flush = raw_input("Are you sure you want to flush the results of your last experiment? Please respond with Y or N: ")
    if ok_to_flush == 'Y':
        redis_db.flushall()
    elif ok_to_flush == 'N':
        print 'OK you said it boss. Exiting...'
        exit()
    else:
        print 'Only Y and N are acceptable responses. Exiting...'
        exit()
    '''

    print '\n' * 2
    print '*' * 20
    print 'INFO: INITIALIZING RESOURCE CONFIG'
    # Initialize Redis and the cluster based on the default resource configuration
    init_cluster_capacities_r(redis_db, machine_type, quilt_overhead)
    init_service_placement_r(redis_db, default_mr_config)
    init_resource_config(redis_db, default_mr_config, machine_type)

    print '*' * 20
    print 'INFO: INSTALLING DEPENDENCIES'
    # install_dependencies(workload_config)

    # Initialize time for data charts
    time_start = datetime.datetime.now()

    print '*' * 20
    print 'INFO: RUNNING BASELINE'

    # Get the current performance -- not used for any analysis, just to benchmark progress
    current_performance = measure_baseline(workload_config, baseline_trials)
    current_performance[preferred_performance_metric] = remove_outlier(
        current_performance[preferred_performance_metric])
    current_time_stop = datetime.datetime.now()
    time_delta = current_time_stop - time_start

    print 'Current (non-analytic) performance measured: {}'.format(
        current_performance)

    # Only write the initial summary when starting from scratch
    if last_completed_iter == 0:
        tbot_datastore.write_summary_redis(
            redis_db, 0, MR('initial', 'initial', []), 0, {},
            mean_list(current_performance[preferred_performance_metric]),
            mean_list(current_performance[preferred_performance_metric]),
            time_delta.seconds, 0)

    print '============================================'
    print '\n' * 2

    # Initialize the current configurations
    # Initialize the working set of MRs to all the MRs
    mr_working_set = resource_datastore.get_all_mrs(redis_db)
    resource_datastore.write_mr_working_set(redis_db, mr_working_set, 0)
    cumulative_mr_count = 0
    experiment_count = last_completed_iter + 1

    while experiment_count < 10:
        # Calculate the analytic baseline that is used to determine MRs
        analytic_provisions = prepare_analytic_baseline(
            redis_db, sys_config, min(stress_weights))
        print 'The Analytic provisions are as follows {}'.format(
            analytic_provisions)
        for mr in analytic_provisions:
            resource_modifier.set_mr_provision(mr, analytic_provisions[mr])

        analytic_baseline = measure_runtime(workload_config, experiment_trials)
        analytic_mean = mean_list(analytic_baseline[preferred_performance_metric])
        print 'The analytic baseline is {}'.format(analytic_baseline)
        print 'The current performance is {}'.format(current_performance)
        analytic_baseline[preferred_performance_metric] = remove_outlier(
            analytic_baseline[preferred_performance_metric])

        # Get the list of MRs to stress
        mr_to_consider = apply_filtering_policy(
            redis_db, mr_working_set, experiment_count, sys_config,
            workload_config, filter_config)

        for mr in mr_to_consider:
            print '\n' * 2
            print '*' * 20
            print 'Current MR is {}'.format(mr.to_string())
            increment_to_performance = {}
            current_mr_allocation = resource_datastore.read_mr_alloc(redis_db, mr)
            print 'Current MR allocation is {}'.format(current_mr_allocation)

            for stress_weight in stress_weights:
                # Calculate the gradient schedule and provision resources accordingly
                mr_gradient_schedule = calculate_mr_gradient_schedule(
                    redis_db, [mr], sys_config, stress_weight)
                for change_mr in mr_gradient_schedule:
                    resource_modifier.set_mr_provision(
                        change_mr, mr_gradient_schedule[change_mr])

                experiment_results = measure_runtime(workload_config,
                                                     experiment_trials)

                # Write results of the experiment to Redis
                # preferred_results = remove_outlier(experiment_results[preferred_performance_metric])
                preferred_results = experiment_results[preferred_performance_metric]
                mean_result = mean_list(preferred_results)
                tbot_datastore.write_redis_ranking(
                    redis_db, experiment_count, preferred_performance_metric,
                    mean_result, mr, stress_weight)

                # Revert the gradient schedule and provision resources accordingly
                mr_revert_gradient_schedule = revert_mr_gradient_schedule(
                    redis_db, [mr], sys_config, stress_weight)
                for change_mr in mr_revert_gradient_schedule:
                    resource_modifier.set_mr_provision(
                        change_mr, mr_revert_gradient_schedule[change_mr])

                increment_to_performance[stress_weight] = experiment_results

            # Write the results of the iteration to Redis
            tbot_datastore.write_redis_results(
                redis_db, mr, increment_to_performance, experiment_count,
                preferred_performance_metric)
            print '*' * 20
            print '\n' * 2

        # Timing information for the purpose of experiments
        current_time_stop = datetime.datetime.now()
        time_delta = current_time_stop - time_start
        cumulative_mr_count += len(mr_to_consider)
        chart_generator.get_summary_mimr_charts(
            redis_db, workload_config, current_performance, mr_working_set,
            experiment_count, stress_weights, preferred_performance_metric,
            time_start)

        # Move back to the normal operating basis by removing the baseline prep stresses
        reverted_analytic_provisions = revert_analytic_baseline(redis_db, sys_config)
        for mr in reverted_analytic_provisions:
            resource_modifier.set_mr_provision(mr, reverted_analytic_provisions[mr])

        # Recover the results of the experiment from Redis
        max_stress_weight = min(stress_weights)
        mimr_list = tbot_datastore.get_top_n_mimr(
            redis_db, experiment_count, preferred_performance_metric,
            max_stress_weight, gradient_mode,
            optimize_for_lowest=optimize_for_lowest, num_results_returned=-1)

        imr_list, nimr_list = seperate_mr(
            mimr_list,
            mean_list(analytic_baseline[preferred_performance_metric]),
            optimize_for_lowest)
        if len(imr_list) == 0:
            print 'INFO: IMR list length is 0. Please choose a metric with more signal. Exiting...'
            break
        print 'INFO: IMR list is {}'.format([mr.to_string() for mr in imr_list])
        print 'INFO: NIMR list is {}'.format([mr.to_string() for mr in nimr_list])

        # Try all the MIMRs in the list until a viable improvement is determined
        # Improvement Amount
        mimr = None
        action_taken = {}

        for imr in imr_list:
            imr_improvement_percent = improve_mr_by(redis_db, imr, max_stress_weight)
            current_imr_alloc = resource_datastore.read_mr_alloc(redis_db, imr)
            new_imr_alloc = convert_percent_to_raw(imr, current_imr_alloc,
                                                   imr_improvement_percent)
            imr_improvement_proposal = new_imr_alloc - current_imr_alloc

            # If the proposed MR cannot be improved by the proposed amount, there are two options:
            # - Max out the resources to fill up the remaining resources on the machine
            # - Steal resources from NIMRs
            # Both functions will return VIABLE improvements to the IMR deployment
            nimr_diff_proposal = {}
            if check_improve_mr_viability(redis_db, imr,
                                          imr_improvement_proposal) is False:
                print 'INFO: MR {} to increase {} by {} is not viable'.format(
                    imr.to_string(), current_imr_alloc, imr_improvement_proposal)
                print 'INFO: Attempting to max out the machines resources...'
                imr_improvement_proposal = fill_out_resource(redis_db, imr)

                if imr_improvement_proposal <= 0:
                    print 'INFO: No more space to fill out resources. Stealing from NIMRs'
                    # Calculate a plan to reduce the resource provisioning of NIMRs
                    nimr_diff_proposal, imr_improvement_proposal = create_decrease_nimr_schedule(
                        redis_db, imr, nimr_list, max_stress_weight)
                    print 'INFO: Proposed NIMR {}'.format(nimr_diff_proposal)
                    print 'INFO: New IMR improvement {}'.format(
                        imr_improvement_proposal)

                    if len(nimr_diff_proposal) == 0 or imr_improvement_proposal == 0:
                        action_taken[imr] = 0
                        continue

            # Decrease the amount of resources provisioned to the NIMRs
            for nimr in nimr_diff_proposal:
                action_taken[nimr] = nimr_diff_proposal[nimr]
                new_nimr_alloc = resource_datastore.read_mr_alloc(
                    redis_db, nimr) + nimr_diff_proposal[nimr]
                print 'NIMR stealing: imposing a change of {} on {}'.format(
                    action_taken[nimr], nimr.to_string())
                finalize_mr_provision(redis_db, nimr, new_nimr_alloc)

            # Improving the resource should always be viable at this step
            if check_improve_mr_viability(redis_db, imr, imr_improvement_proposal):
                new_imr_alloc = imr_improvement_proposal + current_imr_alloc
                action_taken[imr] = imr_improvement_proposal
                finalize_mr_provision(redis_db, imr, new_imr_alloc)
                print 'Improvement Calculated: MR {} increase from {} to {}'.format(
                    imr.to_string(), current_imr_alloc, new_imr_alloc)
                mimr = imr
                break
            else:
                action_taken[imr] = 0
                print 'Improvement Calculated: MR {} failed to improve from {}'.format(
                    imr.to_string(), current_imr_alloc)
                print 'This IMR cannot be improved. Printing some debugging before exiting...'
                print 'Current MR allocation is {}'.format(current_imr_alloc)
                print 'Proposed (failed) allocation is {}, improved by {}'.format(
                    new_imr_alloc, imr_improvement_proposal)

                for deployment in imr.instances:
                    vm_ip, container = deployment
                    capacity = resource_datastore.read_machine_capacity(
                        redis_db, vm_ip)
                    consumption = resource_datastore.read_machine_consumption(
                        redis_db, vm_ip)
                    print 'Machine {} Capacity is {}, and consumption is currently {}'.format(
                        vm_ip, capacity, consumption)

        if mimr is None:
            print 'No viable improvement found'
            break

        # Compare against the baseline from the beginning of the program
        improved_performance = measure_runtime(workload_config, baseline_trials)
        # improved_performance[preferred_performance_metric] = remove_outlier(improved_performance[preferred_performance_metric])
        improved_mean = mean_list(improved_performance[preferred_performance_metric])
        previous_mean = mean_list(current_performance[preferred_performance_metric])
        performance_improvement = improved_mean - previous_mean

        # Write a summary of the experiment's iterations to Redis
        tbot_datastore.write_summary_redis(
            redis_db, experiment_count, mimr, performance_improvement,
            action_taken, analytic_mean, improved_mean, time_delta.seconds,
            cumulative_mr_count)
        current_performance = improved_performance

        # Generate the overall performance improvement chart
        chart_generator.get_summary_performance_charts(
            redis_db, workload_config, experiment_count, time_start)

        results = tbot_datastore.read_summary_redis(redis_db, experiment_count)
        print 'Results from iteration {} are {}'.format(experiment_count, results)

        # Checkpoint MR configurations and print
        current_mr_config = resource_datastore.read_all_mr_alloc(redis_db)
        print_csv_configuration(current_mr_config)

        experiment_count += 1

    print '{} experiments completed'.format(experiment_count)
    print_all_steps(redis_db, experiment_count)

    current_mr_config = resource_datastore.read_all_mr_alloc(redis_db)
    for mr in current_mr_config:
        print '{} = {}'.format(mr.to_string(), current_mr_config[mr])
    print_csv_configuration(current_mr_config)
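
# convert_percent_to_raw() and improve_mr_by() are provided elsewhere in the
# project; the improvement arithmetic in run() above assumes semantics roughly
# like the sketch below (illustrative only, hence the _sketch suffix): a stress
# or improvement weight is a percentage applied to the current raw allocation.
def _convert_percent_to_raw_sketch(mr, current_allocation, weight_percent):
    # weight_percent = 20 proposes a 20% larger allocation;
    # weight_percent = -20 proposes a 20% smaller one
    return current_allocation * (1 + weight_percent / 100.0)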
def print_all_steps(redis_db, total_experiments):
    print 'Steps towards improving performance'
    net_improvement = 0

    for experiment_count in range(1, total_experiments):
        mimr, action_taken, perf_improvement, analytic_perf, current_perf, elapsed_time, cumm_mr = tbot_datastore.read_summary_redis(
            redis_db, experiment_count)
        print 'Iteration {}, Mimr = {}, New allocation = {}, Performance Improvement = {}, Analytic Performance = {}, Performance after improvement = {}, Elapsed Time = {}, Cumulative MR = {}'.format(
            experiment_count, mimr, action_taken, perf_improvement,
            analytic_perf, current_perf, elapsed_time, cumm_mr)

        # Append results to the log file
        with open("experiment_logs.txt", "a") as myfile:
            log_msg = '{},{},{}\n'.format(experiment_count, mimr, action_taken)
            myfile.write(log_msg)

        net_improvement += float(perf_improvement)

    print 'Net Improvement: {}'.format(net_improvement)
    with open("experiment_logs.txt", "a") as myfile:
        myfile.write('net_improvement,{}\n'.format(net_improvement))
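
# A small sketch for reading experiment_logs.txt back in, matching the
# comma-separated lines written by print_all_steps() above. This helper is not
# part of the project and the name is illustrative; note that action_taken may
# itself contain commas, so only the first two fields are split out.
def load_experiment_log(path='experiment_logs.txt'):
    entries = []
    with open(path) as logfile:
        for line in logfile:
            entries.append(line.strip().split(',', 2))
    return entries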
def run(system_config, workload_config, default_mr_config):
    redis_host = system_config['redis_host']
    baseline_trials = system_config['baseline_trials']
    experiment_trials = system_config['trials']
    stress_weights = system_config['stress_weights']
    stress_policy = system_config['stress_policy']
    resource_to_stress = system_config['stress_these_resources']
    service_to_stress = system_config['stress_these_services']
    vm_to_stress = system_config['stress_these_machines']
    machine_type = system_config['machine_type']
    quilt_overhead = system_config['quilt_overhead']

    preferred_performance_metric = workload_config['tbot_metric']
    optimize_for_lowest = workload_config['optimize_for_lowest']

    redis_db = redis.StrictRedis(host=redis_host, port=6379, db=0)
    redis_db.flushall()

    # Initialize Redis and the cluster based on the default resource configuration
    init_cluster_capacities_r(redis_db, machine_type, quilt_overhead)
    init_service_placement_r(redis_db, default_mr_config)
    init_resource_config(redis_db, default_mr_config, machine_type)

    # Run the baseline experiment
    experiment_count = 0
    baseline_performance = measure_baseline(workload_config, baseline_trials)

    # Initialize the current configuration
    # Invariant: MRs are the same between iterations
    current_mr_config = resource_datastore.read_all_mr_alloc(redis_db)

    while experiment_count < 10:
        # Get a list of MRs to stress
        mr_to_stress = generate_mr_from_policy(redis_db, stress_policy)
        print mr_to_stress

        for mr in mr_to_stress:
            print 'Current MR is {}'.format(mr.to_string())
            increment_to_performance = {}
            current_mr_allocation = resource_datastore.read_mr_alloc(redis_db, mr)
            print 'Current MR allocation is {}'.format(current_mr_allocation)

            for stress_weight in stress_weights:
                new_alloc = convert_percent_to_raw(mr, current_mr_allocation,
                                                   stress_weight)
                set_mr_provision(mr, new_alloc)
                experiment_results = measure_runtime(workload_config,
                                                     experiment_trials)

                # Write results of the experiment to Redis
                mean_result = float(
                    sum(experiment_results[preferred_performance_metric])) / len(
                        experiment_results[preferred_performance_metric])
                tbot_datastore.write_redis_ranking(
                    redis_db, experiment_count, preferred_performance_metric,
                    mean_result, mr, stress_weight)

                # Remove the effect of the resource stressing by re-applying
                # the unstressed allocation
                new_alloc = convert_percent_to_raw(mr, current_mr_allocation, 0)
                set_mr_provision(mr, new_alloc)

                increment_to_performance[stress_weight] = experiment_results

            # Write the results of the iteration to Redis
            tbot_datastore.write_redis_results(
                redis_db, mr, increment_to_performance, experiment_count,
                preferred_performance_metric)

        # Recover the results of the experiment from Redis
        max_stress_weight = min(stress_weights)
        mimr_list = tbot_datastore.get_top_n_mimr(
            redis_db, experiment_count, preferred_performance_metric,
            max_stress_weight, optimize_for_lowest=optimize_for_lowest,
            num_results_returned=10)

        # Try all the MIMRs in the list until a viable improvement is determined
        # Improvement Amount
        mimr = None
        action_taken = 0

        print 'The MR improvement is {}'.format(max_stress_weight)
        for mr_score in mimr_list:
            mr, score = mr_score
            improvement_percent = improve_mr_by(redis_db, mr, max_stress_weight)
            current_mr_allocation = resource_datastore.read_mr_alloc(redis_db, mr)
            new_alloc = convert_percent_to_raw(mr, current_mr_allocation,
                                               improvement_percent)
            improvement_amount = new_alloc - current_mr_allocation
            action_taken = improvement_amount

            if check_improve_mr_viability(redis_db, mr, improvement_amount):
                set_mr_provision(mr, new_alloc)
                print 'Improvement Calculated: MR {} increase from {} to {}'.format(
                    mr.to_string(), current_mr_allocation, new_alloc)
                old_alloc = resource_datastore.read_mr_alloc(redis_db, mr)
                resource_datastore.write_mr_alloc(redis_db, mr, new_alloc)
                update_machine_consumption(redis_db, mr, new_alloc, old_alloc)
                current_mr_config = update_mr_config(redis_db, current_mr_config)
                mimr = mr
                break
            else:
                print 'Improvement Calculated: MR {} failed to improve from {} to {}'.format(
                    mr.to_string(), current_mr_allocation, new_alloc)

        if mimr is None:
            print 'No viable improvement found'
            break

        # Compare against the baseline from the beginning of the program
        improved_performance = measure_runtime(workload_config, baseline_trials)
        print improved_performance
        improved_mean = sum(
            improved_performance[preferred_performance_metric]) / float(
                len(improved_performance[preferred_performance_metric]))
        baseline_mean = sum(
            baseline_performance[preferred_performance_metric]) / float(
                len(baseline_performance[preferred_performance_metric]))
        performance_improvement = improved_mean - baseline_mean

        # Write a summary of the experiment's iterations to Redis
        tbot_datastore.write_summary_redis(redis_db, experiment_count, mimr,
                                           performance_improvement, action_taken)
        baseline_performance = improved_performance

        results = tbot_datastore.read_summary_redis(redis_db, experiment_count)
        print 'Results from iteration {} are {}'.format(experiment_count, results)
        experiment_count += 1

    # TODO: Handle false positives
    # TODO: Compare against a performance condition -- for now only run a fixed number of experiments

    print '{} experiments completed'.format(experiment_count)
    print_all_steps(redis_db, experiment_count)

    for mr in current_mr_config:
        print '{} = {}'.format(mr.to_string(), current_mr_config[mr])
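
# Example invocation of run() above (a sketch, not an entry point that exists in
# this file): parse_config_file() comes from the plotting entry point earlier,
# while build_default_mr_config() is hypothetical and stands in for however the
# default MR configuration is constructed.
#
#   sys_config, workload_config, filter_config = parse_config_file('config.ini')
#   default_mr_config = build_default_mr_config(sys_config)  # hypothetical
#   run(sys_config, workload_config, default_mr_config)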