def update_jobs_created():
    """Promote every job in status CREATED to IN_PROGRESS and email its owner.

    Each job is committed individually before the notification is sent, so
    the email never refers to an uncommitted status change.
    """
    for new_job in job_manager.get_jobs_by_status("CREATED"):
        tasks_log.info("Found new job")
        new_job.status_id = job_manager.get_job_status_id("IN_PROGRESS")
        db.commit()
        email_manager.send_job_created_email(new_job)
def update_fleet():
    """Run the fleet-capacity update up to FLEET_UPDATE_MAX_RUNS times.

    Each attempt is logged; any failure is logged with a full traceback and
    does not stop the remaining attempts.
    """
    count = 0
    while count < FLEET_UPDATE_MAX_RUNS:
        try:
            tasks_log.info("Update Jobs and JobItems Fleets. Attempt " + str(count))
            fleet_manager.update_fleet_capacity()
        except Exception:
            # `except Exception, err:` is Python-2-only syntax and `err` was
            # unused; the traceback carries all the detail we need.
            tasks_log.error(traceback.format_exc())
        finally:
            # NOTE(review): the original finally body was truncated in the
            # source; the counter must advance here (success or failure) or
            # the loop never terminates. Confirm against version control.
            count += 1
def poll_job_items_queue():
    """Poll the job_items queue up to POLL_JOB_ITEMS_MAX_RUNS times.

    Failures while processing one item are logged with a traceback and do
    not abort the remaining polls.
    """
    count = 0
    while count < POLL_JOB_ITEMS_MAX_RUNS:
        try:
            tasks_log.info("Polling job_items queue " + str(count))
            process_next_job_item()
        except Exception:
            # `except Exception, err:` is Python-2-only syntax and `err` was
            # unused; log the traceback and keep polling.
            tasks_log.error(traceback.format_exc())
        finally:
            # NOTE(review): the original finally body was truncated in the
            # source; the counter must advance here (success or failure) or
            # the loop never terminates. Confirm against version control.
            count += 1
def poll_jobs_queue():
    """Poll the jobs queues up to POLL_JOBS_MAX_RUNS times.

    Each pass processes newly created jobs and then in-progress jobs.
    Failures are logged with a traceback and do not abort remaining polls.
    """
    count = 0
    while count < POLL_JOBS_MAX_RUNS:
        try:
            tasks_log.info("Polling jobs created + in-progress queues " + str(count))
            update_jobs_created()
            update_jobs_in_progress()
        except Exception:
            # `except Exception, err:` is Python-2-only syntax and `err` was
            # unused; log the traceback and keep polling.
            tasks_log.error(traceback.format_exc())
        finally:
            # NOTE(review): the original finally body was truncated in the
            # source; the counter must advance here (success or failure) or
            # the loop never terminates. Confirm against version control.
            count += 1
def add_job_item_containers(count):
    """ Add to instance with most containers """
    # Pack containers onto the instance already running the most of them,
    # pausing briefly between placements.
    added = 0
    while added < count:
        target = get_available_instance_w_most_containers(
            ECS_JOB_ITEMS_CLUSTER, JOB_ITEM_CONTAINERS_PER_INSTANCE)
        if not target:
            tasks_log.info("Attempt To Add Container Failed. No available instances found.")
        else:
            tasks_log.info("JobItem - Adding Container To Instance " + target)
            ecs.start_task_on_instance(ECS_JOB_ITEMS_CLUSTER, ECS_JOB_ITEMS_TASK, target)
        time.sleep(2)
        added += 1
def calculate_optimal_job_item_container_capacity(queue, itms_per_cntr):
    """ Checks size of JobItems queue and compares with constant JOB_ITEMS_PER_CONTAINER """
    now = datetime.datetime.utcnow()
    # The real lookup is disabled below, so "last updated" is pinned to now
    # and the idle-seconds figure is always 0 — logged for visibility only.
    last_update = datetime.datetime.utcnow()  # job_manager.get_last_updated_job().last_updated
    idle_seconds = int((now - last_update).total_seconds())
    items_queue = sqs.get_queue(queue)
    queue_size = sqs.get_queue_size(items_queue.url)
    optimal_containers = calculate_job_item_containers(queue_size, itms_per_cntr)
    msg = ("SITUATION:\nCurrentTime: %s\nLast_Updated: %s\nSecondsSinceLastUpdate: " +
           "%s\nJobItemQueueSize: %s\nOptimalContainers: %s")
    tasks_log.info(msg % (now, last_update, idle_seconds, queue_size, optimal_containers))
    return optimal_containers
def add_job_item_containers(count):
    """ Add to instance with most containers """
    # Place `count` new containers one at a time, always on the instance
    # currently running the most containers; sleep between placements.
    for _ in range(count):
        instance = get_available_instance_w_most_containers(
            ECS_JOB_ITEMS_CLUSTER, JOB_ITEM_CONTAINERS_PER_INSTANCE)
        if instance:
            tasks_log.info("JobItem - Adding Container To Instance " + instance)
            ecs.start_task_on_instance(ECS_JOB_ITEMS_CLUSTER, ECS_JOB_ITEMS_TASK, instance)
        else:
            tasks_log.info("Attempt To Add Container Failed. No available instances found.")
        time.sleep(2)
def calculate_optimal_job_item_container_capacity(queue, itms_per_cntr):
    """ Checks size of JobItems queue and compares with constant JOB_ITEMS_PER_CONTAINER """
    current_time = datetime.datetime.utcnow()
    # Real lookup disabled: "last updated" is pinned to now, so the elapsed
    # seconds below always come out 0 — kept only for the SITUATION log line.
    last_seen = datetime.datetime.utcnow()  # job_manager.get_last_updated_job().last_updated
    elapsed = int((current_time - last_seen).total_seconds())
    depth = sqs.get_queue_size(sqs.get_queue(queue).url)
    optimal = calculate_job_item_containers(depth, itms_per_cntr)
    template = ("SITUATION:\nCurrentTime: %s\nLast_Updated: %s\nSecondsSinceLastUpdate: " +
                "%s\nJobItemQueueSize: %s\nOptimalContainers: %s")
    tasks_log.info(template % (current_time, last_seen, elapsed, depth, optimal))
    return optimal
def update_jobs_fleet_containers(cluster, service, max_cnts, min_cnts, cooldown):
    """Nudge the jobs ECS service one container toward its optimal capacity.

    Returns the service's container count after any adjustment.
    """
    current = ecs.get_service_capacity(cluster, service)
    optimal = calculate_optimal_job_container_capacity(cooldown)
    tasks_log.info("JobsContainers - Current:%s Optimal:%s" % (str(current), str(optimal)))
    # Scale by at most one container per call, within [min_cnts, max_cnts].
    if optimal > current and current < max_cnts:
        tasks_log.info("Jobs - Increasing Container Capacity")
        ecs.set_service_capacity(cluster, service, current + 1)
        return current + 1
    if optimal < current and current > min_cnts:
        tasks_log.info("Jobs - Reducing Container Capacity")
        ecs.set_service_capacity(cluster, service, current - 1)
        return current - 1
    tasks_log.info("Jobs - Leaving Container Capacity Unchanged")
    return current
def update_jobs_fleet_instances(optimal_cnts, autoscaling_group, max_inst, min_inst, cnts_per_inst):
    """Step the jobs autoscaling group one instance toward its optimal size.

    Returns the instance count after any adjustment.
    """
    current = autoscaling.get_capacity(autoscaling_group)
    optimal = calculate_optimal_job_instance_capacity(optimal_cnts, cnts_per_inst)
    tasks_log.info("JobsInstances - Current:%s Optimal:%s" % (str(current), str(optimal)))
    # Move by at most one instance per call, within [min_inst, max_inst].
    if optimal > current and current < max_inst:
        tasks_log.info("Jobs - Increasing Instance Capacity")
        autoscaling.increase_capacity(autoscaling_group)
        return current + 1
    if optimal < current and current > min_inst:
        tasks_log.info("Jobs - Reducing Instance Capacity")
        autoscaling.decrease_capacity(autoscaling_group)
        return current - 1
    tasks_log.info("Jobs - Leaving Instance Capacity Unchanged")
    return current
def update_jobs_fleet_containers(cluster, service, max_cnts, min_cnts, cooldown):
    """Adjust the jobs service container count by at most one step.

    Returns the container count after any adjustment.
    """
    running = ecs.get_service_capacity(cluster, service)
    wanted = calculate_optimal_job_container_capacity(cooldown)
    tasks_log.info("JobsContainers - Current:%s Optimal:%s" % (str(running), str(wanted)))
    grow = wanted > running and running < max_cnts
    shrink = wanted < running and running > min_cnts
    if grow:
        tasks_log.info("Jobs - Increasing Container Capacity")
        ecs.set_service_capacity(cluster, service, running + 1)
        return running + 1
    elif shrink:
        tasks_log.info("Jobs - Reducing Container Capacity")
        ecs.set_service_capacity(cluster, service, running - 1)
        return running - 1
    else:
        tasks_log.info("Jobs - Leaving Container Capacity Unchanged")
        return running
def update_jobs_fleet_instances(optimal_cnts, autoscaling_group, max_inst, min_inst, cnts_per_inst):
    """Adjust the jobs autoscaling group by at most one instance per call.

    Returns the instance count after any adjustment.
    """
    have = autoscaling.get_capacity(autoscaling_group)
    want = calculate_optimal_job_instance_capacity(optimal_cnts, cnts_per_inst)
    tasks_log.info("JobsInstances - Current:%s Optimal:%s" % (str(have), str(want)))
    if want > have and have < max_inst:
        tasks_log.info("Jobs - Increasing Instance Capacity")
        autoscaling.increase_capacity(autoscaling_group)
        return have + 1
    elif want < have and have > min_inst:
        tasks_log.info("Jobs - Reducing Instance Capacity")
        autoscaling.decrease_capacity(autoscaling_group)
        return have - 1
    else:
        tasks_log.info("Jobs - Leaving Instance Capacity Unchanged")
        return have
def update_job_items_fleet_containers(cluster, queue, max_cntrs, min_cntrs, itms_per_cntr):
    """ Slow scale up. No scale down (tasks will die after they complete) """
    running = ecs.get_total_tasks_in_cluster(cluster)
    wanted = calculate_optimal_job_item_container_capacity(queue, itms_per_cntr)
    tasks_log.info("JobItemsContainers - Current:%s Optimal:%s" % (str(running), str(wanted)))
    if wanted > running and running < max_cntrs:
        tasks_log.info("JobItems - Increasing Container Capacity")
        # NOTE(review): passing running+1 here starts running+1 NEW tasks,
        # not one — looks aggressive for a "slow scale up"; confirm intent.
        add_job_item_containers(running + 1)
        return wanted
    elif wanted < running and running > min_cntrs:
        tasks_log.info("JobItems - Reducing Container Capacity")
        remove_job_item_containers(running - 1)
        return running - 1
    else:
        tasks_log.info("JobItems - Leaving Container Capacity Unchanged")
        return wanted
def update_job_items_fleet_instances(autoscaling_grp, max_insts, min_insts, cntrs_per_inst, cntrs):
    """ Fast scale up. Slow scale down. """
    have = autoscaling.get_capacity(autoscaling_grp)
    want = calculate_optimal_job_item_instance_capacity(cntrs_per_inst, cntrs)
    tasks_log.info("JobItemsInstances: Current:%s Optimal:%s" % (str(have), str(want)))
    # One instance of movement per call, clamped to [min_insts, max_insts].
    if want > have and have < max_insts:
        tasks_log.info("JobItems - Increasing Instance Capacity")
        autoscaling.increase_capacity(autoscaling_grp)
        return have + 1
    if want < have and have > min_insts:
        tasks_log.info("JobItems - Reducing Instance Capacity")
        autoscaling.decrease_capacity(autoscaling_grp)
        return have - 1
    tasks_log.info("JobItems - Leaving Instance Capacity Unchanged")
    return have
def update_job_items_fleet_instances(autoscaling_grp, max_insts, min_insts, cntrs_per_inst, cntrs):
    """ Fast scale up. Slow scale down. """
    current = autoscaling.get_capacity(autoscaling_grp)
    target = calculate_optimal_job_item_instance_capacity(cntrs_per_inst, cntrs)
    tasks_log.info("JobItemsInstances: Current:%s Optimal:%s" % (str(current), str(target)))
    can_grow = target > current and current < max_insts
    can_shrink = target < current and current > min_insts
    if can_grow:
        tasks_log.info("JobItems - Increasing Instance Capacity")
        autoscaling.increase_capacity(autoscaling_grp)
        return current + 1
    elif can_shrink:
        tasks_log.info("JobItems - Reducing Instance Capacity")
        autoscaling.decrease_capacity(autoscaling_grp)
        return current - 1
    else:
        tasks_log.info("JobItems - Leaving Instance Capacity Unchanged")
        return current
def update_job_items_fleet_containers(cluster, queue, max_cntrs, min_cntrs, itms_per_cntr):
    """ Slow scale up. No scale down (tasks will die after they complete) """
    active = ecs.get_total_tasks_in_cluster(cluster)
    target = calculate_optimal_job_item_container_capacity(queue, itms_per_cntr)
    tasks_log.info("JobItemsContainers - Current:%s Optimal:%s" % (str(active), str(target)))
    if target > active and active < max_cntrs:
        tasks_log.info("JobItems - Increasing Container Capacity")
        # NOTE(review): passing active+1 starts active+1 new tasks, not one —
        # seems aggressive for a "slow scale up"; confirm intent.
        add_job_item_containers(active + 1)
        return target
    if target < active and active > min_cntrs:
        tasks_log.info("JobItems - Reducing Container Capacity")
        remove_job_item_containers(active - 1)
        return active - 1
    tasks_log.info("JobItems - Leaving Container Capacity Unchanged")
    return target
def calculate_optimal_job_container_capacity(cooldown):
    """Return the desired jobs-container count: 1 while work is pending or
    the cooldown has not elapsed, else 0.
    """
    idle_seconds = get_seconds_since_last_job_run(cooldown)
    in_flight = (len(job_manager.get_jobs_by_status("IN_PROGRESS")) +
                 len(job_manager.get_jobs_by_status("CREATED")))
    tasks_log.info("SITUATION:\nSecondsSinceLastJobRun: %s\nJobCount: %s"
                   % (idle_seconds, in_flight))
    if in_flight:
        tasks_log.info("found jobs in flight")
        return 1
    tasks_log.info("no jobs in flight")
    # No pending jobs: keep one container warm until the cooldown expires.
    return 0 if idle_seconds > cooldown else 1
def process_next_job_item():
    """Pull one message from the job_items queue and process its job item.

    Completed items are deleted from the queue; anything else is left in
    place so SQS redelivery handles the retry.
    """
    tasks_log.info("Getting next job_item from queue")
    queue = sqs.get_queue(client_constants.SQS_JOB_ITEMS_QUEUE)
    message = sqs.get_next_message(queue)
    if message is None:
        tasks_log.info("No job items found in Queue")
        return
    item_key = message['MessageAttributes']['job_item_key']['StringValue']
    tasks_log.info("Found new job_item " + item_key)
    item_doc = job_item_manager.get_job_item_doc(item_key)
    item_doc['attempts'] += 1
    if job_item_manager.process_job_item(item_doc) == "COMPLETE":
        items_log.info("Deleting completed job_item from queue")
        sqs.delete_message(queue, message)
    else:
        # We are going to let SQS handle retries
        items_log.info("Leaving job_item in queue")
def calculate_optimal_job_item_instance_capacity(cntrs_per_inst, opt_cntrs):
    """Return how many instances are needed to host opt_cntrs containers,
    at cntrs_per_inst containers apiece (rounded up).
    """
    needed = int(math.ceil(float(opt_cntrs) / cntrs_per_inst))
    tasks_log.info(
        "SITUATION:\nOptimalJobItemContainers: %s\nJobItemContainerPerInstance: %s\nOptimalJobItemInstances: %s"
        % (opt_cntrs, cntrs_per_inst, needed))
    return needed