def update_worker_status():
    """
    Update the worker status to the master as well as container info.
    """
    # Re-arm the timer so this report runs again in 5 seconds.
    threading.Timer(5, update_worker_status).start()

    # Get the machine status by calling a unix command and fetching the load average.
    content = Services.get_machine_status(Setting, CRole.WORKER)
    content[Definition.REST.get_str_docker()] = DockerService.get_containers_status()
    content[Definition.REST.get_str_local_imgs()] = DockerService.get_local_images()

    s_content = bytes(json.dumps(content), 'utf-8')

    html = urllib3.PoolManager()
    try:
        r = html.request('PUT',
                         Definition.Master.get_str_check_master(Setting.get_master_addr(),
                                                                Setting.get_master_port(),
                                                                Setting.get_token()),
                         body=s_content)

        if r.status != 200:
            SysOut.err_string("Cannot update worker status to the master!")
        else:
            SysOut.debug_string("Reporting status to the master node complete.")

    except Exception as e:
        SysOut.err_string("Master is not available!")
        print(e)
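# A minimal, self-contained sketch of the self-rescheduling threading.Timer pattern used
# above, assuming only the standard library. The interval, the daemon flag and the
# function name are illustrative choices, not taken from the project.
import threading
import time

def report_status_periodically(interval=5):
    # Re-arm the timer first so one failed report cannot stop the loop.
    t = threading.Timer(interval, report_status_periodically, args=(interval,))
    t.daemon = True  # do not keep the process alive just for the reporter
    t.start()
    print("collect and PUT the worker status here")

if __name__ == '__main__':
    report_status_periodically()
    time.sleep(12)  # keep the sketch alive long enough for a couple of runs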
def find_available_worker(self, container):
    candidates = []
    workers = LookUpTable.Workers.verbose()
    SysOut.debug_string("Found workers: " + str(workers))
    if not workers:
        return None

    # Loop through the workers and build tuples of (worker address, load, image available locally).
    for worker in workers:
        curr_worker = workers[worker]
        if container in curr_worker[Definition.REST.get_str_local_imgs()]:
            candidates.append(((curr_worker[Definition.get_str_node_addr()],
                                curr_worker[Definition.get_str_node_port()]),
                               curr_worker[Definition.get_str_load5()],
                               True))
        else:
            candidates.append(((curr_worker[Definition.get_str_node_addr()],
                                curr_worker[Definition.get_str_node_port()]),
                               curr_worker[Definition.get_str_load5()],
                               False))

    # Sort candidate workers first on local availability of the image, then on load (5-minute average).
    candidates.sort(key=lambda x: (-x[2], x[1]))

    # Remove candidates whose 5-minute load average is 0.5 or higher.
    for candidate in list(candidates):
        if not float(candidate[1]) < 0.5:
            candidates.remove(candidate)

    return candidates
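# A standalone sketch of the candidate-selection rule above, using plain tuples of
# ((addr, port), load5, has_image). The data values are made up for illustration; the
# sort key and the 0.5 load cut-off mirror the logic in find_available_worker.
def rank_candidates(candidates):
    # Prefer workers that already hold the image, then the lowest 5-minute load.
    ranked = sorted(candidates, key=lambda x: (-x[2], x[1]))
    # Drop workers whose 5-minute load average is 0.5 or higher.
    return [c for c in ranked if float(c[1]) < 0.5]

if __name__ == '__main__':
    sample = [
        (("10.0.0.1", 8081), 0.10, False),
        (("10.0.0.2", 8081), 0.30, True),
        (("10.0.0.3", 8081), 0.80, True),   # filtered out: load too high
    ]
    print(rank_candidates(sample))
    # -> [(('10.0.0.2', 8081), 0.3, True), (('10.0.0.1', 8081), 0.1, False)]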
def on_put(self, req, res):
    """
    PUT: /status?token={None}
    """
    if not Definition.get_str_token() in req.params:
        res.body = "Token is required."
        res.content_type = "String"
        res.status = falcon.HTTP_401
        return

    if req.params[Definition.get_str_token()] == Setting.get_token():
        raw = str(req.stream.read(), 'UTF-8')
        # Parse the JSON payload instead of using eval(), which would execute arbitrary input.
        data = json.loads(raw)
        LookUpTable.update_worker(data)
        SysOut.debug_string("Update worker status ({0})".format(data[Definition.get_str_node_name()]))

        res.body = "Okay"
        res.content_type = "String"
        res.status = falcon.HTTP_200
    else:
        res.body = "Invalid token ID."
        res.content_type = "String"
        res.status = falcon.HTTP_401
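# A hedged client-side sketch of the PUT this handler serves, assuming the master listens
# on /status and accepts the token as a query parameter (as the docstring suggests). The
# host, port, token and payload fields below are placeholders, not taken from the project.
import json
import urllib.request

def put_worker_status(master_addr, master_port, token, status):
    url = "http://{}:{}/status?token={}".format(master_addr, master_port, token)
    req = urllib.request.Request(url,
                                 data=bytes(json.dumps(status), 'utf-8'),
                                 method='PUT')
    with urllib.request.urlopen(req) as resp:
        return resp.status  # 200 on success, 401 on a bad or missing token

# Example call (placeholder values):
# put_worker_status("127.0.0.1", 8080, "None", {"node_name": "worker-1", "load5": 0.2})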
def collect_exited_containers(self):
    while True:
        sleep(self.gc_run_interval)

        # Find exited containers.
        exited_containers = []
        current_containers = DockerService.get_containers_status()
        for cont in current_containers:
            if cont.get(Definition.Container.Status.get_str_status()) == 'exited':
                exited_containers.append(cont.get(Definition.Container.Status.get_str_sid()))

        for sid in exited_containers:
            if not DockerService.delete_container(sid):
                SysOut.debug_string("Could not delete target container: {}".format(sid))
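# A small sketch of the "collect exited containers" filter in isolation, using mocked
# container-status dictionaries instead of DockerService. The 'status' and 'sid' key names
# are illustrative stand-ins for whatever the Definition.Container.Status accessors return.
def find_exited_sids(container_statuses):
    # Keep only the ids of containers whose status is reported as 'exited'.
    return [c['sid'] for c in container_statuses if c.get('status') == 'exited']

if __name__ == '__main__':
    mocked = [
        {'sid': 'abc123', 'status': 'running'},
        {'sid': 'def456', 'status': 'exited'},
    ]
    print(find_exited_sids(mocked))  # -> ['def456']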
def start_job(self, target, job_data):
    # Send the creation request to the worker.
    worker_url = "http://{}:{}/docker?token=None&command=create".format(target[0], target[1])
    req_data = bytes(json.dumps(job_data), 'utf-8')
    # NOTE: might need an increased timeout to allow downloading large container images!
    resp = urlopen(worker_url, req_data)

    if resp.getcode() == 200:
        # The container was created; the worker returns its session id.
        sid = str(resp.read(), 'utf-8')
        SysOut.debug_string("Received sid from container: " + sid)
        return sid
    return False
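# A hedged sketch of the same request with an explicit timeout, addressing the NOTE above:
# urllib.request.urlopen() accepts a timeout in seconds, so pulling a large image can be
# given more time than the platform default. The 300-second value is illustrative only.
import json
from urllib.request import urlopen

def start_job_with_timeout(target, job_data, timeout=300):
    worker_url = "http://{}:{}/docker?token=None&command=create".format(target[0], target[1])
    req_data = bytes(json.dumps(job_data), 'utf-8')
    resp = urlopen(worker_url, req_data, timeout=timeout)  # raises after `timeout` seconds
    if resp.getcode() == 200:
        return str(resp.read(), 'utf-8')  # sid of the created container
    return False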
def on_put(self, req, res):
    """
    PUT: /status?token={None}
    """
    if not Definition.get_str_token() in req.params:
        res.body = "Token is required."
        res.content_type = "String"
        res.status = falcon.HTTP_401
        return

    if Definition.Docker.get_str_finished() in req.params:
        # A container is shutting down; update the container table.
        # TODO: add some kind of safety mechanism to make really sure no new requests have been
        # sent to this container before acknowledging the removal?
        if LookUpTable.remove_container(req.params.get(Definition.Container.get_str_con_image_name()),
                                        req.params.get(Definition.Docker.get_str_finished())):
            # NOTE: the container will terminate as soon as it reads this response!
            format_response_string(res, falcon.HTTP_200, "Container successfully removed")
        else:
            # NOTE: the container will continue as before when it reads this response!
            format_response_string(res, falcon.HTTP_400, "Could not remove container from table!")
        return

    if req.params[Definition.get_str_token()] == Setting.get_token():
        data = json.loads(str(req.stream.read(req.content_length or 0), 'utf-8'))
        LookUpTable.update_worker(data)
        SysOut.debug_string("Update worker status ({0})".format(data[Definition.get_str_node_name()]))

        res.body = "Okay"
        res.content_type = "String"
        res.status = falcon.HTTP_200
    else:
        res.body = "Invalid token ID."
        res.content_type = "String"
        res.status = falcon.HTTP_401
    return
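# A hedged sketch of the container-shutdown notification this handler accepts: a PUT with
# the finished container id and its image name passed as query parameters. The parameter
# names ('token', 'finished', 'image_name') and the /status path are placeholders for
# whatever the Definition helpers actually return.
import urllib.parse
import urllib.request

def notify_container_finished(master_addr, master_port, token, image_name, container_id):
    query = urllib.parse.urlencode({'token': token,
                                    'finished': container_id,
                                    'image_name': image_name})
    req = urllib.request.Request("http://{}:{}/status?{}".format(master_addr, master_port, query),
                                 method='PUT')
    with urllib.request.urlopen(req) as resp:
        # 200: safe to terminate; 400: the master could not remove the table entry.
        return resp.status == 200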
def set_node_addr(addr=None):
    if addr:
        Setting.__node_addr = addr
    else:
        import socket
        from harmonicIO.general.services import Services
        Setting.__node_addr = socket.gethostname()
        SysOut.debug_string(Setting.__node_addr)

        # If the address is valid, keep it.
        if Services.is_valid_ipv4(Setting.__node_addr) or Services.is_valid_ipv6(Setting.__node_addr):
            return None

        # If the address is not valid, try another lookup.
        Setting.__node_addr = Services.get_host_name_i()
        if Services.is_valid_ipv4(Setting.__node_addr) or Services.is_valid_ipv6(Setting.__node_addr):
            return None

        SysOut.terminate_string("Cannot get node ip address!")
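# A minimal sketch of address validation in the spirit of Services.is_valid_ipv4/ipv6,
# assuming only the standard library; this is an illustration, not the project's actual
# implementation, which may differ.
import socket

def is_valid_ipv4(addr):
    try:
        socket.inet_pton(socket.AF_INET, addr)
        return True
    except OSError:
        return False

def is_valid_ipv6(addr):
    try:
        socket.inet_pton(socket.AF_INET6, addr)
        return True
    except OSError:
        return False

# print(is_valid_ipv4("192.168.1.10"))   # True
# print(is_valid_ipv4("worker-node-1"))  # False -> fall back to another lookup, as above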
def job_queuer(self):
    while True:
        job_data = JobQueue.q.get()
        num_of_conts = job_data.get('num')
        job_sids = []
        targets = self.find_available_worker(job_data.get(Definition.Container.get_str_con_image_name()))
        SysOut.debug_string("Candidate workers: " + str(targets))

        n = 0
        while len(job_sids) < num_of_conts:
            target = targets[n][0]
            SysOut.debug_string("Attempting to send request to worker: " + str(target))
            try:
                sid = self.start_job(target, job_data)
                if sid:
                    job_sids.append(sid)
                else:
                    # Not sure how urllib handles a 400 response, but this fallback needs to happen
                    # both when an exception is raised and when sid is False.
                    if n < len(targets) - 1:  # other candidates are available
                        n += 1
                        continue
                    else:
                        job_data['job_status'] = JobStatus.FAILED
                        break

                if len(job_sids) == num_of_conts:
                    job_data['job_status'] = JobStatus.READY
                    job_data[Definition.Container.Status.get_str_sid()] = job_sids  # TODO: add this in the meta table

            except:
                SysOut.debug_string("Response from worker threw exception!")
                if n < len(targets) - 1:  # other candidates are available
                    SysOut.usr_string("Falling back to the next candidate worker.")
                    n += 1
                    continue
                else:
                    job_data['job_status'] = JobStatus.FAILED
                    break
                    # NOTE: breaking here stops trying to create new containers as soon as one fails.
                    # Is this desirable? Probably, as it is now unlikely that there is any hosting
                    # capability left. This can get really ugly: containers that already started may
                    # need cleanup (rollback), or the user should be told how many were started,
                    # or the failed ones should be retried.

        LookUpTable.Jobs.update_job(job_data)
        JobQueue.q.task_done()
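# A stripped-down sketch of the fallback loop above: walk the candidate list in order and
# stop at the first worker that accepts the job, giving up when the list is exhausted.
# `attempt` stands in for start_job and is a placeholder, not a project function.
def dispatch_to_first_available(candidates, attempt):
    for target, _load, _has_image in candidates:
        try:
            sid = attempt(target)
            if sid:
                return sid  # this worker accepted the job
        except Exception:
            pass  # treat a network error like a refusal and move on to the next candidate
    return None  # no candidate could host the job -> the job is marked as failed

# Example with a fake attempt function:
# print(dispatch_to_first_available([(('10.0.0.1', 8081), 0.2, True)], lambda t: 'sid-1'))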
sc = StreamConnector(MASTER_DATA["MASTER_ADDR"],
                     MASTER_DATA["MASTER_PORT"],
                     token=SETTING["TOKEN"],
                     std_idle_time=SETTING["IDLE_TIME"],
                     max_try=SETTING["MAX_TRY"],
                     source_name=SETTING["SOURCE_NAME"])

if sc.is_master_alive():
    SysOut.out_string("Connection to the master ({0}:{1}) is successful.".format(MASTER_DATA["MASTER_ADDR"],
                                                                                 MASTER_DATA["MASTER_PORT"]))
else:
    SysOut.terminate_string("Master at ({0}:{1}) is not alive!".format(MASTER_DATA["MASTER_ADDR"],
                                                                       MASTER_DATA["MASTER_PORT"]))

SysOut.debug_string("Generating random order of data in {0} series.".format(ITEM_NUMBER))
stream_order, d_list = get_random_data()

# Stream according to the random order.
for _, obj_type in stream_order:
    d_container = sc.get_data_container()

    # Assign data to the container.
    d_container += d_list[obj_type]

    sc.send_data(PROCC_DATA[obj_type], PROCC_DATA["OS"], d_container)

SysOut.out_string("Finish!")
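# Illustrative configuration for the streaming script above; every value here is a
# placeholder. The key names match what the script reads, but the real project may build
# these dictionaries differently (e.g. from a settings file).
MASTER_DATA = {"MASTER_ADDR": "127.0.0.1", "MASTER_PORT": 8080}
SETTING = {"TOKEN": "None", "IDLE_TIME": 5, "MAX_TRY": 10, "SOURCE_NAME": "example_source"}
ITEM_NUMBER = 100
PROCC_DATA = {"OS": "ubuntu:16.04",            # placeholder OS image name
              "typeA": "example/processor-a"}  # placeholder processing container per object type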
def __check_for_scale():
    tmp = "MSGs "
    for key, value in MessagesQueue.__msg_queue.items():
        tmp += "({0} -> {1}) ".format(key, len(value))

    SysOut.debug_string(tmp)
def on_post(self, req, res):
    """
    POST: /streamRequest?token=None
    This function is invoked by the driver of the micro-batch in the container. It responds
    by getting a stream from the data source or from the messaging system.
    """
    if not Definition.get_str_token() in req.params:
        res.body = "Token is required."
        res.content_type = "String"
        res.status = falcon.HTTP_401
        return

    # Check whether the PE already exists; if not, insert it and respond.
    if Definition.REST.Batch.get_str_batch_addr() in req.params and \
       Definition.REST.Batch.get_str_batch_port() in req.params and \
       Definition.REST.Batch.get_str_batch_status() in req.params and \
       Definition.Container.get_str_con_image_name() in req.params:

        # Check the data types.
        if req.params[Definition.REST.Batch.get_str_batch_port()].isdigit() and \
           req.params[Definition.REST.Batch.get_str_batch_status()].isdigit():

            ret = LookUpTable.Containers.get_container_object(req)

            # If the queue contains data, skip the update and stream from the queue instead.
            length = MessagesQueue.get_queues_length(ret[Definition.Container.get_str_con_image_name()])

            if not length:
                LookUpTable.Containers.update_container(ret)
                SysOut.debug_string("No item in queue!")
                res.body = "No item in queue"
                res.content_type = "String"
                res.status = falcon.HTTP_200
                return

            if length > 0 and ret[Definition.REST.Batch.get_str_batch_status()] == CStatus.AVAILABLE:
                # ret[Definition.REST.Batch.get_str_batch_status()] = CStatus.BUSY
                # LookUpTable.Containers.update_container(ret)
                res.data = bytes(MessagesQueue.pop_queue(ret[Definition.Container.get_str_con_image_name()]))
                res.content_type = "Bytes"
                res.status = falcon.HTTP_203
                return
            else:
                # Register a new channel.
                LookUpTable.Containers.update_container(ret)
                res.body = "OK"
                res.content_type = "String"
                res.status = falcon.HTTP_200
                return
        else:
            res.body = "Invalid data type!"
            res.content_type = "String"
            res.status = falcon.HTTP_406
            return
    else:
        res.body = "Invalid parameters!"
        res.content_type = "String"
        res.status = falcon.HTTP_406
        return
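# A hedged sketch of the container-side call this handler serves: the micro-batch POSTs
# its address, port, status and image name, then either registers (HTTP 200) or receives a
# queued message as raw bytes (HTTP 203). The /streamRequest path comes from the docstring;
# the parameter names below are placeholders for whatever the Definition helpers return.
import urllib.parse
import urllib.request

def poll_stream(master_addr, master_port, token, image_name, batch_addr, batch_port, status):
    query = urllib.parse.urlencode({'token': token,
                                    'c_name': image_name,     # placeholder parameter names
                                    'batch_addr': batch_addr,
                                    'batch_port': batch_port,
                                    'batch_status': status})
    url = "http://{}:{}/streamRequest?{}".format(master_addr, master_port, query)
    with urllib.request.urlopen(urllib.request.Request(url, data=b'', method='POST')) as resp:
        if resp.status == 203:
            return resp.read()  # a queued message, delivered as bytes
        return None             # 200: registered or queue empty, poll again later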