def record_serving_instances_avg_cpu_util():
    """Publish two custom CloudWatch metrics for the serving worker pool.

    Records:
      * ``num-workers-average`` -- the current number of healthy (serving)
        workers reported by ``get_serving_instances()``.
      * ``avg-cpu-util`` -- the mean of each serving worker's mean CPU
        utilization over the last 120 seconds; 0 when there are no workers
        or no datapoints yet.

    Returns None; side effect only (CloudWatch writes and a console print).
    """
    workers_ids, num_workers = get_serving_instances()
    _put_worker_pool_metric('num-workers-average', num_workers)
    print('Number of healthy instances: ' + str(num_workers))

    avg_cpu_util = 0
    if num_workers != 0:
        per_instance_means = []
        for worker_id in workers_ids:
            cpu_stats = get_single_instance_cpu_util(worker_id, 120)
            # Skip instances with no datapoints yet (e.g. freshly launched).
            if len(cpu_stats) != 0:
                per_instance_means.append(np.mean(cpu_stats))
        if len(per_instance_means) != 0:
            avg_cpu_util = np.mean(per_instance_means)
    _put_worker_pool_metric('avg-cpu-util', avg_cpu_util)


def _put_worker_pool_metric(metric_name, value):
    """Write one datapoint for the worker pool to CloudWatch.

    Shared payload for both pool-level metrics; both are attached to the
    same fixed manager-instance dimension so dashboards can query them
    under one InstanceId.

    NOTE(review): 'AWS/EC2' is a reserved AWS namespace -- PutMetricData
    normally rejects namespaces beginning with "AWS/"; a custom namespace
    is the documented approach. Also, 'Unit': 'Count' is questionable for
    a CPU-percentage metric ('Percent' would match the data). Both kept
    as-is to preserve existing dashboards/readers -- confirm before changing.
    """
    return cw.put_metric_data(
        Namespace='AWS/EC2',
        MetricData=[{
            'MetricName': metric_name,
            'Timestamp': datetime.now(),
            'Value': value,
            'Dimensions': [{
                'Name': 'InstanceId',
                'Value': 'i-078f69c8c9c0097d6'
            }],
            'StorageResolution': 60,  # high-resolution (1-minute) metric
            'Unit': 'Count'
        }])
def list_workers():
    """Render the worker-list page with per-instance details and 30-point charts.

    Lists every EC2 instance tagged ``Name=worker`` (any state), marks
    whether each one is currently in service behind the load balancer,
    and passes the last-30-datapoint worker-count and CPU-utilization
    series to the template.

    Returns the rendered ``list.html`` response.
    """
    instances = ec2.instances.filter(
        Filters=[{'Name': 'tag:Name', 'Values': ['worker']}])
    # Only the id list is needed; the pool size is derived from it below.
    inservice_instances_id, _ = get_serving_instances()

    instances_list = [
        {
            "id": instance.id,
            "public_ip_address": instance.public_ip_address,
            "instance_type": instance.instance_type,
            "availability_zone": instance.placement['AvailabilityZone'],
            "state": instance.state['Name'],
            "inservice": 'Yes' if instance.id in inservice_instances_id else 'No'
        }
        for instance in instances
    ]

    workerLabels, workerValues, workerMax = get_num_workers_30()
    cpuLabels, cpuValues, cpuMax = get_avg_cpu_utilization_30()
    return render_template('list.html',
                           instances=instances_list,
                           worker_pool_size=len(inservice_instances_id),
                           workerLabels=workerLabels,
                           workerValues=workerValues,
                           workerMax=workerMax,
                           cpuLabels=cpuLabels,
                           cpuValues=cpuValues,
                           cpuMax=cpuMax)
def auto_check_avg_cpu_utilization():
    """Periodic auto-scaling check over the instances SERVING the app.

    Reads the single AutoScalingConfig row and, when auto-scaling is on
    and no instance is pending, compares the pool's average CPU
    utilization against the configured expand/shrink thresholds:

      * above ``expand_threshold``: create ``ceil((expand_ratio - 1) * n)``
        workers, capped so the non-terminated total never exceeds 8;
      * below ``shrink_threshold``: destroy ``int(shrink_ratio * n)``
        randomly chosen workers;
      * otherwise: no action.

    Bails out early (returns None) when there is no config row, when some
    created instance is not yet in service, or when any serving worker
    has no CPU datapoints yet.
    """
    with app.app_context():
        autoScalingConfig = AutoScalingConfig.query.first()
        print("auto config: " + str(autoScalingConfig))
        if not autoScalingConfig:
            return
        if autoScalingConfig.isOn and not has_pending_instances():
            print("auto scaling on")
            # Only count instances that are actually serving the app,
            # not everything in the running state.
            _, num_workers = get_serving_instances()
            _, num_running_instances = get_running_instances()
            if num_workers != num_running_instances:
                # Some running instance is not behind the LB yet -- wait.
                return
            print('all the created instances in service now!')
            _, num_non_terminated_instances = get_non_terminated_instances()

            all_has_cpu_util, avg_cpu_util = all_instance_has_valid_cpu_util()
            if not all_has_cpu_util:
                print('newly created worker has no cpu util yet, wait!')
                return

            if avg_cpu_util > autoScalingConfig.expand_threshold:
                # Hard cap of 8 instances total (non-terminated).
                if num_non_terminated_instances >= 8:
                    print('number of instances created reaches limit !')
                    return
                to_create = int(
                    math.ceil((autoScalingConfig.expand_ratio - 1) * num_workers))
                if to_create + num_non_terminated_instances >= 8:
                    to_create = max(8 - num_non_terminated_instances, 0)
                    print("max number of workers reached! "
                          "only creating {} additional workers".format(to_create))
                print(
                    "CPU expand threshold: {} reached ---- creating {} new instances --- expand ratio: {}"
                    .format(autoScalingConfig.expand_threshold, to_create,
                            autoScalingConfig.expand_ratio))
                for i in range(to_create):
                    celery_create_worker()
            elif avg_cpu_util < autoScalingConfig.shrink_threshold:
                to_destroy = int(autoScalingConfig.shrink_ratio * num_workers)
                if to_destroy > 0:
                    print(
                        "CPU shrink threshold: {} reached ---- destroying {} instances --- shrink ratio: {}"
                        .format(autoScalingConfig.shrink_threshold, to_destroy,
                                autoScalingConfig.shrink_ratio))
                    random_destroy_worker(to_destroy)
            else:
                print("CPU utilization within range")
        elif has_pending_instances():
            print('there are pending instances')
        else:
            print('auto config is off')
def random_destroy_worker(to_destroy):
    """Destroy up to ``to_destroy`` randomly chosen serving workers.

    Args:
        to_destroy: requested number of workers to terminate; clamped to
            the current pool size (and to 0 if negative), because
            ``random.sample`` raises ValueError when the sample size is
            negative or exceeds the population.

    Returns:
        False when there are no serving workers; otherwise None after
        requesting destruction of each sampled worker.
    """
    print("destroying worker!")
    workers_id, num_running_workers = get_serving_instances()
    if num_running_workers == 0:
        return False
    # Clamp so an over-large (or negative) request cannot crash the caller.
    sample_size = max(0, min(to_destroy, num_running_workers))
    for worker_id in random.sample(workers_id, sample_size):
        destroy_a_worker(worker_id)
def index():
    """Render the manual-scaling page.

    Shows the number of running instances, the number of workers
    currently in service, and whether auto-scaling is enabled.
    """
    _, num_instances = get_running_instances()
    _, num_workers = get_serving_instances()
    config = AutoScalingConfig.query.first()
    # Auto-scaling counts as on only when a config row exists and isOn is set.
    autoScaleOn = bool(config and config.isOn)
    return render_template('manualscaling.html',
                           num_workers=num_workers,
                           num_instances=num_instances,
                           autoScaleOn=autoScaleOn)
def get_avg_cpu_utilization_2():
    """Return the mean CPU utilization across all serving instances.

    Averages each in-service instance's mean utilization over the last
    120 seconds. Returns None when no instance is serving or no instance
    has any datapoints yet.
    """
    inservice_instances_id, _ = get_serving_instances()
    if len(inservice_instances_id) == 0:
        return

    per_instance_means = []
    for instance_id in inservice_instances_id:
        cpu_stats = get_single_instance_cpu_util(instance_id, 120)
        print(str(instance_id) + ": " + str(cpu_stats))
        # Instances with no datapoints are excluded from the average.
        if len(cpu_stats) != 0:
            per_instance_means.append(np.mean(cpu_stats))

    if len(per_instance_means) != 0:
        return np.mean(per_instance_means)
    return
def all_instance_has_valid_cpu_util():
    """Check that every serving worker already reports CPU datapoints.

    Returns:
        (True, avg) when each in-service worker has at least one CPU
        datapoint over the last 120 seconds, where ``avg`` is the mean
        of the per-worker means; (False, 0) as soon as any worker has
        no datapoints (i.e. it is not serving traffic yet).
    """
    per_worker_means = []
    worker_ids, _ = get_serving_instances()
    for wid in worker_ids:
        cpu_stats = get_single_instance_cpu_util(wid, 120)
        print(wid)
        print(cpu_stats)
        # No datapoints means this worker is not doing any work yet.
        if len(cpu_stats) == 0:
            print(str(wid) + " has no cpu util yet")
            return False, 0
        per_worker_means.append(np.mean(cpu_stats))
    return True, np.mean(per_worker_means)
def index():
    """Render the dashboard panel with the pool size and average CPU util."""
    _, serving_count = get_serving_instances()
    return render_template('panel.html',
                           num_serving_instance=serving_count,
                           avg_cpu_util=get_avg_cpu_utilization_2())