def fill_out_resource(redis_db, imr):
    """Return the largest per-container increase of `imr` that still fits.

    For every ``(vm_ip, container)`` entry in ``imr.instances`` the machine's
    spare amount of ``imr.resource`` (capacity minus consumption, both read
    through ``resource_datastore``) is divided by the number of containers of
    this MR on that machine, as reported by ``containers_per_vm``.  The
    smallest such share over all instances is the proposal.

    A diagnostic record is appended to ``fill_out_resource_debug.txt`` for
    every instance that is inspected.

    Args:
        redis_db: Redis handle passed through to ``resource_datastore``.
        imr: MR object exposing ``instances`` and ``resource``.

    Returns:
        The proposed improvement amount.  A negative proposal is reported on
        stdout and replaced by 0.  If ``imr.instances`` is empty the initial
        ``float('inf')`` is returned unchanged.
    """
    containers_on_vm = containers_per_vm(imr)
    tightest_headroom = float('inf')

    for vm_ip, _container in imr.instances:
        used = resource_datastore.read_machine_consumption(redis_db, vm_ip)
        total = resource_datastore.read_machine_capacity(redis_db, vm_ip)

        # The free space on the machine is shared by every container of this
        # service placed there, so each container only gets an equal slice.
        headroom = (total[imr.resource] - used[imr.resource]) / float(
            containers_on_vm[vm_ip])
        tightest_headroom = min(tightest_headroom, headroom)

        debug_lines = [
            'imr is {}\n'.format(imr.resource),
            'current improvement proposal is {}\n'.format(tightest_headroom),
            'For vm ip {}, capacity {}, consumption {}, diff {}\n'.format(
                vm_ip, total, used, headroom),
        ]
        with open("fill_out_resource_debug.txt", "a") as debug_log:
            debug_log.writelines(debug_lines)

    if tightest_headroom < 0:
        print('WARNING: Improvement proposal is less than 0')
        print('Check out fill_out_resource_debug.txt to help diagnose the problem')
        # A machine is already over-committed; propose no change so the
        # caller can carry on immediately.
        tightest_headroom = 0

    return tightest_headroom
def update_machine_consumption(redis_db, mr, new_alloc, old_alloc):
    """Apply an allocation change of `mr` to the recorded machine consumption.

    For each ``(vm_ip, container_id)`` entry in ``mr.instances`` the stored
    consumption of ``mr.resource`` on that machine is read, adjusted by
    ``new_alloc - old_alloc`` and written back through ``resource_datastore``.
    A machine listed several times is therefore adjusted once per entry.

    Args:
        redis_db: Redis handle passed through to ``resource_datastore``.
        mr: MR object exposing ``instances`` and ``resource``.
        new_alloc: Allocation of the MR after the change.
        old_alloc: Allocation of the MR before the change.
    """
    for vm_ip, _container_id in mr.instances:
        recorded = resource_datastore.read_machine_consumption(redis_db, vm_ip)
        # Stored values are coerced with float() before the arithmetic.
        adjusted = float(recorded[mr.resource]) + new_alloc - old_alloc
        resource_datastore.write_machine_consumption(
            redis_db, vm_ip, {mr.resource: adjusted})
def check_improve_mr_viability(redis_db, mr, improvement_amount):
    """Tell whether every machine hosting `mr` can absorb an increase.

    This name used to be defined twice in a row; the earlier definition
    (which ignored how many containers of the MR share a machine) was
    shadowed by the later one and never ran, so only the effective version
    is kept here.

    For each ``(vm_ip, container_id)`` entry in ``mr.instances`` the current
    consumption of ``mr.resource`` on that machine plus
    ``containers_per_vm(mr)[vm_ip] * improvement_amount`` is compared against
    the machine's capacity.

    Args:
        redis_db: Redis handle passed through to ``resource_datastore``.
        mr: MR object exposing ``instances`` and ``resource``.
        improvement_amount: Per-container increase being considered.

    Returns:
        False as soon as one machine would exceed its capacity, True
        otherwise (including when ``mr.instances`` is empty).
    """
    print('Checking MR viability')

    # Indexed by vm_ip below; presumably the number of this MR's containers
    # on each machine (see the message printed next).
    improvement_multiplier = containers_per_vm(mr)
    print('The containers for this mr per vm are {}'.format(
        improvement_multiplier))

    # Check if available space on machines being tested
    for vm_ip, _container_id in mr.instances:
        machine_consumption = resource_datastore.read_machine_consumption(
            redis_db, vm_ip)
        machine_capacity = resource_datastore.read_machine_capacity(
            redis_db, vm_ip)

        # Every container of this MR on the machine grows by the same amount.
        proposed_alloc = machine_consumption[mr.resource] + (
            improvement_multiplier[vm_ip] * improvement_amount)
        if proposed_alloc > machine_capacity[mr.resource]:
            return False

    return True
def run(sys_config, workload_config, filter_config, default_mr_config,
        last_completed_iter=0):
    """Drive the iterative stress-and-improve experiment loop.

    Setup: connect to Redis (flushing it when ``last_completed_iter`` is 0),
    initialise cluster capacities, service placement and resource config,
    and measure a baseline.

    Each loop iteration then:
      1. Applies the analytic-baseline provisions and measures them.
      2. Picks the MRs to stress via ``apply_filtering_policy`` and, for each
         MR and stress weight, applies the gradient schedule, measures the
         workload, records the result and reverts the schedule.
      3. Reverts the analytic baseline, ranks the MRs and splits them into
         IMRs and NIMRs with ``seperate_mr``.
      4. Walks the IMRs in order and commits the first viable improvement:
         the proposed amount, else whatever ``fill_out_resource`` reports,
         else an amount freed by ``create_decrease_nimr_schedule``.
      5. Re-measures, writes a summary, regenerates charts and prints the
         current MR configuration.

    The loop ends when the iteration counter reaches 10, when no IMR is
    found, or when no IMR can be improved.

    Args:
        sys_config: Mapping with at least ``redis_host``, ``baseline_trials``,
            ``trials``, ``stress_weights``, ``machine_type``,
            ``quilt_overhead`` and ``gradient_mode``; it is also forwarded to
            several helpers.
        workload_config: Mapping with at least ``tbot_metric`` and
            ``optimize_for_lowest``; forwarded to the measurement helpers.
        filter_config: Forwarded to ``apply_filtering_policy``.
        default_mr_config: Initial MR configuration used to seed Redis.
        last_completed_iter: Iteration to resume after; 0 starts afresh.

    Returns:
        None.  Results are written to Redis and printed to stdout.
    """
    redis_host = sys_config['redis_host']
    baseline_trials = sys_config['baseline_trials']
    experiment_trials = sys_config['trials']
    stress_weights = sys_config['stress_weights']
    machine_type = sys_config['machine_type']
    quilt_overhead = sys_config['quilt_overhead']
    gradient_mode = sys_config['gradient_mode']
    preferred_performance_metric = workload_config['tbot_metric']
    optimize_for_lowest = workload_config['optimize_for_lowest']

    # Upper bound (exclusive) on the iteration counter.
    max_experiment_count = 10

    redis_db = redis.StrictRedis(host=redis_host, port=6379, db=0)
    if last_completed_iter == 0:
        # Fresh run: drop everything stored by a previous experiment.
        # NOTE: an interactive Y/N confirmation prompt used to guard this
        # flush; it is disabled, so the flush happens unconditionally.
        redis_db.flushall()

    print('\n' * 2)
    print('*' * 20)
    print('INFO: INITIALIZING RESOURCE CONFIG')
    # Initialize Redis and Cluster based on the default resource configuration
    init_cluster_capacities_r(redis_db, machine_type, quilt_overhead)
    init_service_placement_r(redis_db, default_mr_config)
    init_resource_config(redis_db, default_mr_config, machine_type)

    print('*' * 20)
    print('INFO: INSTALLING DEPENDENCIES')
    # NOTE: dependency installation is currently disabled:
    # install_dependencies(workload_config)

    # Initialize time for data charts
    time_start = datetime.datetime.now()

    print('*' * 20)
    print('INFO: RUNNING BASELINE')

    # Get the Current Performance -- not used for any analysis, just to
    # benchmark progress!!
    current_performance = measure_baseline(workload_config, baseline_trials)
    current_performance[preferred_performance_metric] = remove_outlier(
        current_performance[preferred_performance_metric])
    current_time_stop = datetime.datetime.now()
    time_delta = current_time_stop - time_start

    print('Current (non-analytic) performance measured: {}'.format(
        current_performance))

    # NOTE(review): the iteration-0 summary is written only when resuming
    # (last_completed_iter != 0), i.e. not after the flush of a fresh run --
    # confirm this condition is not inverted.
    if last_completed_iter != 0:
        baseline_mean = mean_list(
            current_performance[preferred_performance_metric])
        tbot_datastore.write_summary_redis(
            redis_db, 0, MR('initial', 'initial', []), 0, {},
            baseline_mean, baseline_mean, time_delta.seconds, 0)

    print('============================================')
    print('\n' * 2)

    # Initialize the current configurations
    # Initialize the working set of MRs to all the MRs
    mr_working_set = resource_datastore.get_all_mrs(redis_db)
    resource_datastore.write_mr_working_set(redis_db, mr_working_set, 0)
    cumulative_mr_count = 0
    experiment_count = last_completed_iter + 1

    while experiment_count < max_experiment_count:
        # Calculate the analytic baseline that is used to determine MRs
        analytic_provisions = prepare_analytic_baseline(
            redis_db, sys_config, min(stress_weights))
        print('The Analytic provisions are as follows {}'.format(
            analytic_provisions))
        for mr in analytic_provisions:
            resource_modifier.set_mr_provision(mr, analytic_provisions[mr])

        analytic_baseline = measure_runtime(workload_config, experiment_trials)
        # This mean is taken before outliers are removed just below; it is
        # the value later stored in the iteration summary.
        analytic_mean = mean_list(
            analytic_baseline[preferred_performance_metric])
        print('The analytic baseline is {}'.format(analytic_baseline))
        print('This current performance is {}'.format(current_performance))
        analytic_baseline[preferred_performance_metric] = remove_outlier(
            analytic_baseline[preferred_performance_metric])

        # Get a list of MRs to stress in the form of a list of MRs
        mr_to_consider = apply_filtering_policy(
            redis_db, mr_working_set, experiment_count, sys_config,
            workload_config, filter_config)

        for mr in mr_to_consider:
            print('\n' * 2)
            print('*' * 20)
            print('Current MR is {}'.format(mr.to_string()))
            increment_to_performance = {}
            current_mr_allocation = resource_datastore.read_mr_alloc(
                redis_db, mr)
            print('Current MR allocation is {}'.format(current_mr_allocation))

            for stress_weight in stress_weights:
                # Calculate Gradient Schedule and provision resources
                # accordingly
                mr_gradient_schedule = calculate_mr_gradient_schedule(
                    redis_db, [mr], sys_config, stress_weight)
                for change_mr in mr_gradient_schedule:
                    resource_modifier.set_mr_provision(
                        change_mr, mr_gradient_schedule[change_mr])

                experiment_results = measure_runtime(workload_config,
                                                     experiment_trials)

                # Write results of experiment to Redis.  The raw samples are
                # used here; remove_outlier is not applied.
                preferred_results = experiment_results[
                    preferred_performance_metric]
                mean_result = mean_list(preferred_results)
                tbot_datastore.write_redis_ranking(
                    redis_db, experiment_count, preferred_performance_metric,
                    mean_result, mr, stress_weight)

                # Revert the Gradient schedule and provision resources
                # accordingly
                mr_revert_gradient_schedule = revert_mr_gradient_schedule(
                    redis_db, [mr], sys_config, stress_weight)
                for change_mr in mr_revert_gradient_schedule:
                    resource_modifier.set_mr_provision(
                        change_mr, mr_revert_gradient_schedule[change_mr])

                increment_to_performance[stress_weight] = experiment_results

            # Write the results of the iteration to Redis
            tbot_datastore.write_redis_results(
                redis_db, mr, increment_to_performance, experiment_count,
                preferred_performance_metric)
            print('*' * 20)
            print('\n' * 2)

        # Timing Information for the purpose of experiments
        current_time_stop = datetime.datetime.now()
        time_delta = current_time_stop - time_start
        cumulative_mr_count += len(mr_to_consider)
        chart_generator.get_summary_mimr_charts(
            redis_db, workload_config, current_performance, mr_working_set,
            experiment_count, stress_weights, preferred_performance_metric,
            time_start)

        # Move back into the normal operating basis by removing the baseline
        # prep stresses
        reverted_analytic_provisions = revert_analytic_baseline(
            redis_db, sys_config)
        for mr in reverted_analytic_provisions:
            resource_modifier.set_mr_provision(
                mr, reverted_analytic_provisions[mr])

        # Recover the results of the experiment from Redis.  Note that the
        # "max" stress weight is the numerically smallest weight.
        max_stress_weight = min(stress_weights)
        mimr_list = tbot_datastore.get_top_n_mimr(
            redis_db, experiment_count, preferred_performance_metric,
            max_stress_weight, gradient_mode,
            optimize_for_lowest=optimize_for_lowest,
            num_results_returned=-1)

        imr_list, nimr_list = seperate_mr(
            mimr_list,
            mean_list(analytic_baseline[preferred_performance_metric]),
            optimize_for_lowest)
        if not imr_list:
            print('INFO: IMR list length is 0. Please choose a metric with more signal. Exiting...')
            break
        print('INFO: IMR list is {}'.format(
            [mr.to_string() for mr in imr_list]))
        print('INFO: NIMR list is {}'.format(
            [mr.to_string() for mr in nimr_list]))

        # Try all the MIMRs in the list until a viable improvement is
        # determined
        mimr = None
        action_taken = {}

        for imr in imr_list:
            imr_improvement_percent = improve_mr_by(redis_db, imr,
                                                    max_stress_weight)
            current_imr_alloc = resource_datastore.read_mr_alloc(redis_db, imr)
            new_imr_alloc = convert_percent_to_raw(imr, current_imr_alloc,
                                                   imr_improvement_percent)
            imr_improvement_proposal = new_imr_alloc - current_imr_alloc

            # If the the Proposed MR cannot be improved by the proposed
            # amount, there are two options
            # - Max out the resources to fill up the remaining resources on
            #   the machine
            # - Resource Stealing from NIMRs
            # Both functions will return VIABLE improvements to the IMR
            # deployment
            nimr_diff_proposal = {}
            if not check_improve_mr_viability(redis_db, imr,
                                              imr_improvement_proposal):
                print('INFO: MR {} to increase {} by {} is not viable'.format(
                    imr.to_string(), current_imr_alloc,
                    imr_improvement_proposal))
                print('INFO: Attempting to max out the machines resources...')
                imr_improvement_proposal = fill_out_resource(redis_db, imr)

                if imr_improvement_proposal <= 0:
                    print('INFO: No more space to fill out resources. Stealing from NIMRs')
                    # Calculate a plan to reduce the resource provisioning of
                    # NIMRs
                    nimr_diff_proposal, imr_improvement_proposal = (
                        create_decrease_nimr_schedule(
                            redis_db, imr, nimr_list, max_stress_weight))
                    print('INFO: Proposed NIMR {}'.format(nimr_diff_proposal))
                    print('INFO: New IMR improvement {}'.format(
                        imr_improvement_proposal))

                    if not nimr_diff_proposal or imr_improvement_proposal == 0:
                        # Nothing can be freed for this IMR; try the next one.
                        action_taken[imr] = 0
                        continue

            # Decrease the amount of resources provisioned to the NIMR.
            # nimr_diff_proposal is empty unless resources are being stolen.
            for nimr in nimr_diff_proposal:
                action_taken[nimr] = nimr_diff_proposal[nimr]
                new_nimr_alloc = resource_datastore.read_mr_alloc(
                    redis_db, nimr) + nimr_diff_proposal[nimr]
                print('NIMR stealing: imposing a change of {} on {}'.format(
                    action_taken[nimr], nimr.to_string()))
                finalize_mr_provision(redis_db, nimr, new_nimr_alloc)

            # The proposal may have been replaced above, so recompute the
            # target allocation before the final check; this keeps the
            # failure diagnostics below consistent with what was tested.
            new_imr_alloc = imr_improvement_proposal + current_imr_alloc

            # Improving the resource should always be viable at this step
            if check_improve_mr_viability(redis_db, imr,
                                          imr_improvement_proposal):
                action_taken[imr] = imr_improvement_proposal
                finalize_mr_provision(redis_db, imr, new_imr_alloc)
                # Report the IMR being improved (previously a loop variable
                # left over from an earlier loop was printed here).
                print('Improvement Calculated: MR {} increase from {} to {}'.format(
                    imr.to_string(), current_imr_alloc, new_imr_alloc))
                mimr = imr
                break

            action_taken[imr] = 0
            print('Improvement Calculated: MR {} failed to improve from {}'.format(
                imr.to_string(), current_imr_alloc))
            print('This IMR cannot be improved. Printing some debugging before exiting...')
            print('Current MR allocation is {}'.format(current_imr_alloc))
            print('Proposed (failed) allocation is {}, improved by {}'.format(
                new_imr_alloc, imr_improvement_proposal))
            for vm_ip, _container in imr.instances:
                capacity = resource_datastore.read_machine_capacity(
                    redis_db, vm_ip)
                consumption = resource_datastore.read_machine_consumption(
                    redis_db, vm_ip)
                print('Machine {} Capacity is {}, and consumption is currently {}'.format(
                    vm_ip, capacity, consumption))

        if mimr is None:
            print('No viable improvement found')
            break

        # Compare against the baseline at the beginning of the program.
        # Raw samples are used; remove_outlier is not applied.
        improved_performance = measure_runtime(workload_config,
                                               baseline_trials)
        improved_mean = mean_list(
            improved_performance[preferred_performance_metric])
        previous_mean = mean_list(
            current_performance[preferred_performance_metric])
        performance_improvement = improved_mean - previous_mean

        # Write a summary of the experiment's iterations to Redis
        tbot_datastore.write_summary_redis(
            redis_db, experiment_count, mimr, performance_improvement,
            action_taken, analytic_mean, improved_mean, time_delta.seconds,
            cumulative_mr_count)
        current_performance = improved_performance

        # Generating overall performance improvement
        chart_generator.get_summary_performance_charts(
            redis_db, workload_config, experiment_count, time_start)

        results = tbot_datastore.read_summary_redis(redis_db,
                                                    experiment_count)
        print('Results from iteration {} are {}'.format(
            experiment_count, results))

        # Checkpoint MR configurations and print
        current_mr_config = resource_datastore.read_all_mr_alloc(redis_db)
        print_csv_configuration(current_mr_config)

        experiment_count += 1

    # NOTE(review): experiment_count is the index of the *next* iteration at
    # this point, so the count printed here looks one too high -- confirm
    # what print_all_steps expects before changing it.
    print('{} experiments completed'.format(experiment_count))
    print_all_steps(redis_db, experiment_count)

    current_mr_config = resource_datastore.read_all_mr_alloc(redis_db)
    for mr in current_mr_config:
        print('{} = {}'.format(mr.to_string(), current_mr_config[mr]))

    print_csv_configuration(current_mr_config)