def start_gc_thread():
    garbage_collector = GarbageCollector(10)
    # Pass the method itself as the thread target; calling it here would run
    # the collector loop in the current thread and block forever.
    gc_thread = threading.Thread(target=garbage_collector.collect_exited_containers)
    gc_thread.daemon = True
    gc_thread.start()
    SysOut.out_string("Garbage collector started")
def on_put(self, req, res):
    """
    PUT: /status?token={None}
    """
    if Definition.get_str_token() not in req.params:
        res.body = "Token is required."
        res.content_type = "String"
        res.status = falcon.HTTP_401
        return

    if req.params[Definition.get_str_token()] == Setting.get_token():
        raw = str(req.stream.read(), 'UTF-8')
        # Parse the body as JSON instead of eval(): eval() executes arbitrary
        # code from the request body and is a security hole.
        data = json.loads(raw)
        LookUpTable.update_worker(data)
        SysOut.debug_string("Update worker status ({0})".format(
            data[Definition.get_str_node_name()]))

        res.body = "Okay"
        res.content_type = "String"
        res.status = falcon.HTTP_200
    else:
        res.body = "Invalid token ID."
        res.content_type = "String"
        res.status = falcon.HTTP_401
def handle(self):
    # Receive and interpret the request data
    data = bytearray()

    """
    Discard heading for now
    data += self.request.recv(16)

    # Interpret the header for file size
    file_size = struct.unpack(">Q", data[8:16])[0]
    """

    try:
        c = True
        while c != b"":
            c = self.request.recv(2048)
            data += c

        # The first 3 bytes are an ASCII-encoded length of the image name.
        image_name_length = int(data[0:3].decode('UTF-8'))
        tcr = image_name_length + 3
        image_name_string = data[3:tcr].decode('UTF-8')

        # Then, push the payload to the messaging system.
        MessagesQueue.push_to_queue(image_name_string, data[tcr:])
    except Exception:
        from harmonicIO.general.services import Services
        SysOut.err_string("Insufficient memory for storing object.")
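# A sketch of the wire format consumed above: a 3-character, zero-padded
# ASCII length, the image name, then the payload. The helper name
# make_frame() is ours for illustration and not part of the codebase.
def make_frame(image_name, payload):
    """Build a frame: 3-digit ASCII name length + image name + payload."""
    name_b = image_name.encode('utf-8')
    if len(name_b) > 999:
        raise ValueError("image name too long for a 3-digit header")
    return str(len(name_b)).zfill(3).encode('utf-8') + name_b + bytes(payload)

# make_frame('daemon_test', b'<payload>') -> b'011daemon_test<payload>';
# the receiver splits it with int(frame[0:3]) as shown in handle() above.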
def on_post(self, req, res):
    # Check that a token and a request type are provided
    req_raw = str(req.stream.read(req.content_length or 0), 'utf-8')
    # Create a dict of body data if any exists (an empty body is not valid JSON)
    req_data = json.loads(req_raw) if req_raw else {}
    if Definition.get_str_token() not in req.params:
        res.body = "Token is required."
        res.content_type = "String"
        res.status = falcon.HTTP_401
        return
    if "type" not in req.params:
        res.body = "No command specified."
        res.content_type = "String"
        res.status = falcon.HTTP_406
        return

    # Request to create a new job: create an ID for the job, add it to the
    # lookup table and queue the creation of the job.
    if req.params['type'] == 'new_job':
        # Attempt to create a new job from the provided parameters
        job = new_job(req_data)
        if not job:
            SysOut.err_string("New job could not be added!")
            format_response_string(res, falcon.HTTP_500, "Could not create job.")
            return

        job_status = job.get('job_status')
        format_response_string(
            res, falcon.HTTP_200,
            "Job request received, container status: {}\nJob ID: {}".format(
                job_status, job.get('job_id')))
        return
    return
def find_available_worker(self, container):
    candidates = []
    workers = LookUpTable.Workers.verbose()
    SysOut.debug_string("Found workers: " + str(workers))
    if not workers:
        return None

    # Loop through the workers and build tuples of (address, load, locality):
    # the worker's IP and port, its 5-minute load average, and whether the
    # requested container image is already available locally.
    for worker in workers:
        curr_worker = workers[worker]
        addr = (curr_worker[Definition.get_str_node_addr()],
                curr_worker[Definition.get_str_node_port()])
        load = curr_worker[Definition.get_str_load5()]
        has_image = container in curr_worker[Definition.REST.get_str_local_imgs()]
        candidates.append((addr, load, has_image))

    # Sort candidate workers first on local availability of the image,
    # then on load (average load over the last 5 minutes).
    candidates.sort(key=lambda x: (-x[2], x[1]))

    # Remove candidates with 50% CPU load or higher.
    for candidate in list(candidates):
        if not float(candidate[1]) < 0.5:
            candidates.remove(candidate)
    return candidates
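# Worked example of the ordering above (values illustrative): tuples sort
# lexicographically and True negates to -1, so image-local workers come
# first, and the lowest load wins within each group.
def _sort_example():
    candidates = [
        (('10.0.0.1', 8080), 0.40, False),
        (('10.0.0.2', 8080), 0.30, True),
        (('10.0.0.3', 8080), 0.10, True),
    ]
    candidates.sort(key=lambda x: (-x[2], x[1]))
    # -> 10.0.0.3 (local, 0.10), 10.0.0.2 (local, 0.30), 10.0.0.1 (remote, 0.40)
    return candidates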
def __get_stream_end_point(self, container_name, container_os, priority, digest):
    """
    Request a stream end point from the master.
    :return: False when the system is busy.
             Tuple (batch_addr, batch_port, tuple_id) if the batch or
             messaging system is available.
    """
    if not priority:
        priority = 0
    elif not isinstance(priority, int):
        LocalError.err_invalid_priority_type()

    try:
        url = self.__str_push_request + Definition.Master.get_str_push_req_container_ext(
            container_name, container_os, priority, self.__source_name, digest)
        print('Sending request..')
        print(url)
        response = self.__connector.request('GET', url)

        if response.status == 406:
            # The message queue is full, resulting in a queue lock.
            SysOut.warn_string("Queue in master is full.")
            return False
        if response.status == 500:
            SysOut.warn_string("System internal error! Please consult documentation.")
            return False
        elif response.status != 200:
            SysOut.warn_string("Unexpected response status: {0}".format(response.status))
            return False
    except Exception as ex:
        print(ex)
        SysOut.err_string("Couldn't connect to the master at {0}:{1}.".format(
            self.__master_addr, self.__master_port))
        return False

    try:
        # Parse the response body as JSON; eval() would execute arbitrary
        # code sent by the master.
        content = json.loads(response.data.decode('utf-8'))
        return content
    except ValueError:
        SysOut.warn_string("JSON content error from the master!")
        return False
def __push_stream_end_point_MS(self, t_addr, t_port, data, image_name):
    """
    Create a client socket and stream data to the messaging system.
    :param t_addr: End point address
    :param t_port: End point port
    :param data: bytearray holding the content to be streamed to the batch.
    :param image_name: Name of the target container image.
    :return: Boolean status
    """
    try:
        s = None
        for res in socket.getaddrinfo(t_addr, t_port, socket.AF_UNSPEC,
                                      socket.SOCK_STREAM):
            af, socktype, proto, canonname, sa = res
            try:
                s = socket.socket(af, socktype, proto)
            except OSError as msg:
                print(msg)
                s = None
                continue
            try:
                s.connect(sa)
            except OSError as msg:
                print(msg)
                s.close()
                s = None
                continue
            break

        if s is None:
            SysOut.warn_string("Cannot connect to " + t_addr + ":" + str(t_port) + "!")
            return False

        # Generate the header: a 3-digit, zero-padded ASCII length followed
        # by the image name (see the framing sketch above).
        image_name_b = bytes(image_name, 'UTF-8')
        image_name_l = str(len(image_name_b)).zfill(3)
        image_name_t = bytes(image_name_l, 'UTF-8') + image_name_b

        with s:
            # Identify the object, then stream the payload.
            s.sendall(image_name_t)
            s.sendall(data)
            s.close()

        return True
    except Exception:
        SysOut.warn_string("Cannot stream data to an end point!")
        return False
def collect_exited_containers(self):
    while True:
        sleep(self.gc_run_interval)
        exited_containers = []
        current_containers = DockerService.get_containers_status()

        # Find exited containers
        for cont in current_containers:
            if cont.get(Definition.Container.Status.get_str_status()) == 'exited':
                exited_containers.append(cont.get(Definition.Container.Status.get_str_sid()))

        for sid in exited_containers:
            if not DockerService.delete_container(sid):
                SysOut.debug_string("Could not delete target container: {}".format(sid))
def start_job(self, target, job_data):
    # Send the creation request to the worker.
    # NOTE: the timeout might need to be increased to allow downloading
    # large container images!
    worker_url = "http://{}:{}/docker?token=None&command=create".format(
        target[0], target[1])
    req_data = bytes(json.dumps(job_data), 'utf-8')
    resp = urlopen(worker_url, req_data)

    if resp.getcode() == 200:
        # The container was created
        sid = str(resp.read(), 'utf-8')
        SysOut.debug_string("Received sid from container: " + sid)
        return sid
    return False
def update_job(request):
    job_id = request.get('job_id')
    if job_id not in LookUpTable.Jobs.__jobs:
        SysOut.warn_string("Couldn't update job, no existing job matching ID!")
        return False

    tkn = request.get(Definition.get_str_token())
    if tkn != LookUpTable.Jobs.__jobs[job_id]['user_token']:
        SysOut.warn_string("Incorrect token, refusing update.")
        return False

    old_job = LookUpTable.Jobs.__jobs[job_id]
    old_job['job_status'] = request.get('job_status')
    return True
def run_queue_manager(manager):
    """
    Run the job queue manager threads. Several managers can run in parallel
    to handle a large number of queued jobs.
    """
    import threading
    for i in range(manager.queuer_threads):
        manager_thread = threading.Thread(target=manager.job_queuer)
        manager_thread.daemon = True
        manager_thread.start()
    SysOut.out_string("Job queue started")

    if Setting.get_autoscaling():
        supervisor_thread = threading.Thread(target=manager.queue_supervisor)
        supervisor_thread.daemon = True
        supervisor_thread.start()
        SysOut.out_string("Autoscaling supervisor started")
def on_put(self, req, res):
    """
    PUT: /status?token={None}
    """
    if Definition.get_str_token() not in req.params:
        res.body = "Token is required."
        res.content_type = "String"
        res.status = falcon.HTTP_401
        return

    if Definition.Docker.get_str_finished() in req.params:
        # A container is shutting down; update the container table.
        # TODO: add some kind of safety mechanism to really make sure no new
        # requests have been sent to this container before acknowledging removal?
        if LookUpTable.remove_container(
                req.params.get(Definition.Container.get_str_con_image_name()),
                req.params.get(Definition.Docker.get_str_finished())):
            # NOTE: the container terminates as soon as it reads this response!
            format_response_string(res, falcon.HTTP_200, "Container successfully removed")
        else:
            # NOTE: the container continues as before when it reads this response!
            format_response_string(res, falcon.HTTP_400, "Could not remove container from table!")
        return

    if req.params[Definition.get_str_token()] == Setting.get_token():
        data = json.loads(str(req.stream.read(req.content_length or 0), 'utf-8'))
        LookUpTable.update_worker(data)
        SysOut.debug_string("Update worker status ({0})".format(
            data[Definition.get_str_node_name()]))

        res.body = "Okay"
        res.content_type = "String"
        res.status = falcon.HTTP_200
    else:
        res.body = "Invalid token ID."
        res.content_type = "String"
        res.status = falcon.HTTP_401
    return
def __init__(self):
    self.__ports = []
    self.__client = docker.from_env()
    SysOut.out_string("Docker master initialization complete.")

    # Define port status
    for port_num in range(Setting.get_data_port_start(), Setting.get_data_port_stop()):
        self.__ports += [ChannelStatus(port_num)]

    # Count the available ports
    available_port = 0
    for item in self.__ports:
        if item.status == CStatus.AVAILABLE:
            available_port += 1

    self.__available_port = available_port
def run_container(self, container_name):
    def get_ports_setting(expose, ports):
        return {str(expose) + '/tcp': ports}

    def get_env_setting(expose, a_port):
        ret = dict()
        ret[Definition.Docker.HDE.get_str_node_name()] = container_name
        ret[Definition.Docker.HDE.get_str_node_addr()] = Setting.get_node_addr()
        ret[Definition.Docker.HDE.get_str_node_data_port()] = expose
        ret[Definition.Docker.HDE.get_str_node_forward_port()] = a_port
        ret[Definition.Docker.HDE.get_str_master_addr()] = Setting.get_master_addr()
        ret[Definition.Docker.HDE.get_str_master_port()] = Setting.get_master_port()
        ret[Definition.Docker.HDE.get_str_std_idle_time()] = Setting.get_std_idle_time()
        ret[Definition.Docker.HDE.get_str_token()] = Setting.get_token()
        return ret

    port = self.__get_available_port()
    expose_port = 80

    if not port:
        SysOut.err_string("No more ports available!")
        return False

    print('starting container ' + container_name)
    res = self.__client.containers.run(
        container_name,
        detach=True,
        stderr=True,
        stdout=True,
        ports=get_ports_setting(expose_port, port),
        environment=get_env_setting(expose_port, port))
    import time
    time.sleep(1)
    print('..created container, logs:')
    print(res.logs(stdout=True, stderr=True))

    if res:
        SysOut.out_string("Container " + container_name + " is created!")
        SysOut.out_string("Container " + container_name + " is " + res.status)
        return True
    else:
        SysOut.out_string("Container " + container_name + " cannot be created!")
        return False
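# For reference, the mapping built by get_ports_setting() is the standard
# docker-py form {container_port/tcp: host_port}. With the values above
# (host port 10001 is an illustrative value from __get_available_port()):
#
#     get_ports_setting(80, 10001)  ->  {'80/tcp': 10001}
#
# which is the docker-py equivalent of: docker run -p 10001:80 <image>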
def run_msg_service():
    """
    Run the messaging service to eliminate back pressure.
    """
    from .configuration import Setting
    from .server_socket import ThreadedTCPServer, ThreadedTCPRequestHandler
    import threading

    server = ThreadedTCPServer(
        (Setting.get_node_addr(), Setting.get_data_port_start()),
        ThreadedTCPRequestHandler,
        bind_and_activate=True)

    # Start a thread with the server -- that thread will then start one
    # more thread for each request.
    server_thread = threading.Thread(target=server.serve_forever)
    # Exit the server thread when the main thread terminates
    server_thread.daemon = True
    SysOut.out_string("Enable Messaging System on port: " + str(Setting.get_data_port_start()))
    server_thread.start()

    # TODO: test for graceful termination.
def update_worker_status():
    """
    Update the worker status to the master, including container info.
    """
    # Re-schedule this function to run again in 5 seconds.
    threading.Timer(5, update_worker_status).start()

    # Get the machine status by calling a unix command and fetching the load average.
    content = Services.get_machine_status(Setting, CRole.WORKER)
    content[Definition.REST.get_str_docker()] = DockerService.get_containers_status()
    content[Definition.REST.get_str_local_imgs()] = DockerService.get_local_images()
    s_content = bytes(json.dumps(content), 'utf-8')

    pool = urllib3.PoolManager()
    try:
        r = pool.request(
            'PUT',
            Definition.Master.get_str_check_master(
                Setting.get_master_addr(), Setting.get_master_port(),
                Setting.get_token()),
            body=s_content)
        if r.status != 200:
            SysOut.err_string("Cannot update worker status to the master!")
        else:
            SysOut.debug_string("Status report to master node complete.")
    except Exception as e:
        SysOut.err_string("Master is not available!")
        print(e)
def new_job(request):
    new_item = {}
    new_id = request.get('job_id')
    if not new_id:
        SysOut.warn_string("Couldn't create job, no ID provided!")
        return False

    if new_id in LookUpTable.Jobs.__jobs:
        SysOut.warn_string("Job already exists in system, can't create!")
        return False

    new_item['job_id'] = new_id
    new_item['job_status'] = request.get('job_status')
    new_item[Definition.Container.get_str_con_image_name()] = request.get(
        Definition.Container.get_str_con_image_name())
    new_item['user_token'] = request.get(Definition.get_str_token())
    new_item['volatile'] = request.get('volatile')

    LookUpTable.Jobs.__jobs[new_id] = new_item
    return True
def set_node_addr(addr=None):
    if addr:
        Setting.__node_addr = addr
        return None

    import socket
    from harmonicIO.general.services import Services
    Setting.__node_addr = socket.gethostname()

    # If the hostname resolves to a valid address, use it
    if Services.is_valid_ipv4(Setting.__node_addr) or \
       Services.is_valid_ipv6(Setting.__node_addr):
        return None

    # Otherwise fall back to the host's reported IP address
    Setting.__node_addr = Services.get_host_name_i()
    if Services.is_valid_ipv4(Setting.__node_addr) or \
       Services.is_valid_ipv6(Setting.__node_addr):
        return None

    SysOut.terminate_string("Cannot get node ip address!")
def job_queuer(self):
    while True:
        job_data = JobQueue.q.get()
        num_of_conts = job_data.get('num')
        job_sids = []
        targets = self.find_available_worker(
            job_data.get(Definition.Container.get_str_con_image_name()))
        SysOut.debug_string("Candidate workers: " + str(targets))

        if not targets:
            # No candidate workers at all; fail the job immediately.
            job_data['job_status'] = JobStatus.FAILED
        else:
            n = 0
            while len(job_sids) < num_of_conts:
                target = targets[n][0]
                SysOut.debug_string("Attempting to send request to worker: " + str(target))
                try:
                    sid = self.start_job(target, job_data)
                    if sid:
                        job_sids.append(sid)
                    else:
                        # The worker refused the job (e.g. a 400 response);
                        # try the next candidate if one is available.
                        if n < len(targets) - 1:
                            n += 1
                            continue
                        job_data['job_status'] = JobStatus.FAILED
                        break

                    if len(job_sids) == num_of_conts:
                        job_data['job_status'] = JobStatus.READY
                        # TODO: add this in the metatable
                        job_data[Definition.Container.Status.get_str_sid()] = job_sids
                except:
                    SysOut.debug_string("Response from worker threw exception!")
                    if n < len(targets) - 1:
                        # Other candidates are available; try the next one.
                        SysOut.debug_string("Trying next available candidate worker.")
                        n += 1
                        continue
                    job_data['job_status'] = JobStatus.FAILED
                    break
                    # Breaking stops trying to create new containers as soon as
                    # one fails. Probably desirable, as it is then unlikely that
                    # any hosting capability remains.
                    # NOTE: this can get ugly; we need to clean up containers
                    # that already started (rollback), OR let the user know how
                    # many were started, or retry the failed ones?

        LookUpTable.Jobs.update_job(job_data)
        JobQueue.q.task_done()
def err_invalid_data_container_type():
    SysOut.terminate_string("Invalid data type! Expected a bytearray, but got something else.")
def err_invalid_priority_type():
    SysOut.terminate_string("Invalid priority type! Expected an int, but got something else.")
def err_invalid_max_try_type():
    SysOut.terminate_string("Invalid max_try type! Expected an int, but got something else.")
def send_data(self, container_name, container_os, data, priority=None):
    # The data must be a bytearray
    if not isinstance(data, bytearray):
        LocalError.err_invalid_data_container_type()

    if len(data) == 0:
        SysOut.err_string("No content in byte array.")
        return None

    digest = hashlib.md5(data).hexdigest()

    # Keep requesting an end point until one is granted or retries run out.
    end_point = None
    counter = self.__max_try
    while not end_point:
        end_point = self.__get_stream_end_point(container_name, container_os,
                                                priority, digest)
        counter -= 1
        if counter == 0:
            SysOut.err_string("Cannot contact server. Exceeded maximum retry {0}!".format(
                self.__max_try))
            return False

    # Send the data directly to a worker for processing
    counter = self.__max_try
    if end_point[Definition.get_str_node_role()] == CRole.WORKER:
        while not self.__push_stream_end_point(
                end_point[Definition.get_str_node_addr()],
                end_point[Definition.get_str_node_port()], data):
            time.sleep(self.__std_idle_time)
            counter -= 1
            if counter == 0:
                SysOut.err_string("Cannot contact server. Exceeded maximum retry {0}!".format(
                    self.__max_try))
                return False

    # Send the data to the messaging system for queuing
    elif end_point[Definition.get_str_node_role()] == CRole.MESSAGING_SYSTEM:
        while not self.__push_stream_end_point_MS(
                end_point[Definition.get_str_node_addr()],
                end_point[Definition.get_str_node_port()], data, container_name):
            time.sleep(self.__std_idle_time)
            counter -= 1
            if counter == 0:
                SysOut.err_string("Cannot contact server. Exceeded maximum retry {0}!".format(
                    self.__max_try))
                return False
    else:
        return False

    if end_point[Definition.get_str_node_role()] == CRole.WORKER:
        SysOut.out_string("Push data to worker ({0}:{1}>{2}) successful.".format(
            end_point[Definition.get_str_node_addr()],
            end_point[Definition.get_str_node_port()], container_name))
    elif end_point[Definition.get_str_node_role()] == CRole.MESSAGING_SYSTEM:
        SysOut.out_string("Push data to messaging system ({0}:{1}>{2}) successful.".format(
            end_point[Definition.get_str_node_addr()],
            end_point[Definition.get_str_node_port()], container_name))
    else:
        SysOut.out_string("Push data to unknown ({0}:{1}>{2}) successful.".format(
            end_point[Definition.get_str_node_addr()],
            end_point[Definition.get_str_node_port()], container_name))
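# Typical client call sequence (sketch; the address, port, token and names
# are placeholders). The constructor signature matches the __main__ driver
# shown later in this section.
#
#     sc = StreamConnector('192.168.1.10', 8080, token='None',
#                          std_idle_time=5, max_try=9, source_name='demo')
#     payload = bytearray(b'example payload')  # send_data() requires a bytearray
#     sc.send_data('daemon_test', 'ubuntu', payload)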
def err_invalid_token_type():
    SysOut.terminate_string("Invalid token data type! Expected a string, but got something else.")
def err_invalid_ip():
    SysOut.terminate_string("Invalid server IP address!")
def err_invalid_port():
    SysOut.terminate_string("Invalid port data type! Expected an int, but got something else.")
def read_cfg_from_file():
    from harmonicIO.general.services import Services
    if not Services.is_file_exist('harmonicIO/worker/configuration.json'):
        SysOut.terminate_string('harmonicIO/worker/configuration.json does not exist')

    with open('harmonicIO/worker/configuration.json', 'rt') as t:
        import json
        cfg = json.loads(t.read())

        try:
            from harmonicIO.general.definition import Definition
            # Check the JSON structure
            if Definition.get_str_node_name() in cfg and \
               Definition.get_str_node_port() in cfg and \
               Definition.get_str_data_port_range() in cfg and \
               Definition.get_str_idle_time() in cfg and \
               Definition.get_str_master_addr() in cfg and \
               Definition.get_str_master_port() in cfg and \
               Definition.get_str_node_external_addr() in cfg and \
               Definition.get_str_node_internal_addr() in cfg:

                # Validate the field types
                if not isinstance(cfg[Definition.get_str_node_port()], int):
                    SysOut.terminate_string("Node port must be an integer.")
                elif not isinstance(cfg[Definition.get_str_data_port_range()], list):
                    SysOut.terminate_string("Port range must be a list.")
                elif not (isinstance(cfg[Definition.get_str_data_port_range()][0], int) and
                          isinstance(cfg[Definition.get_str_data_port_range()][1], int)):
                    SysOut.terminate_string("Port range must be integers.")
                elif not isinstance(cfg[Definition.get_str_master_port()], int):
                    SysOut.terminate_string("Master port must be an integer.")
                elif len(cfg[Definition.get_str_data_port_range()]) != 2:
                    SysOut.terminate_string("Port range must consist of two elements: start, stop.")
                elif not isinstance(cfg[Definition.get_str_idle_time()], int):
                    SysOut.terminate_string("Idle time must be an integer.")
                elif cfg[Definition.get_str_data_port_range()][0] > \
                        cfg[Definition.get_str_data_port_range()][1]:
                    SysOut.terminate_string("Start of port range must not be greater than its end.")
                else:
                    Setting.set_node_addr()

                    Setting.__node_name = cfg[Definition.get_str_node_name()].strip()
                    Setting.__node_port = cfg[Definition.get_str_node_port()]
                    Setting.__node_data_port_start = cfg[Definition.get_str_data_port_range()][0]
                    Setting.__node_data_port_stop = cfg[Definition.get_str_data_port_range()][1]
                    Setting.__std_idle_time = cfg[Definition.get_str_idle_time()]
                    Setting.__master_addr = cfg[Definition.get_str_master_addr()].strip()
                    Setting.__master_port = cfg[Definition.get_str_master_port()]
                    Setting.__node_external_addr = cfg[
                        Definition.get_str_node_external_addr()].strip().lower()

                    # Check for an automatic node name
                    if Setting.__node_name.lower() == "auto":
                        # Get the node name from the host name
                        import socket
                        Setting.__node_name = socket.gethostname()

                    # Check for an overriding internal node address
                    if cfg[Definition.get_str_node_internal_addr()] and \
                       cfg[Definition.get_str_node_internal_addr()] != "auto":
                        if Services.is_valid_ipv4(cfg[Definition.get_str_node_internal_addr()]) or \
                           Services.is_valid_ipv6(cfg[Definition.get_str_node_internal_addr()]):
                            Setting.__node_internal_addr = cfg[
                                Definition.get_str_node_internal_addr()]

                    # Check the external node address for validity
                    if Setting.get_node_external_addr() != "none":
                        if Services.is_valid_ipv4(Setting.get_node_external_addr()) or \
                           Services.is_valid_ipv6(Setting.get_node_external_addr()):
                            SysOut.out_string("Bypassing requests with the external address.")
                        else:
                            SysOut.terminate_string("Invalid external ip address!")
                    else:
                        Setting.__node_external_addr = None

                    SysOut.out_string("Load setting successful.")
            else:
                SysOut.terminate_string("Required parameters are not present.")
        except Exception as e:
            print(e)
            SysOut.terminate_string("Invalid data in configuration file.")
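# For orientation, a worker configuration satisfying the checks above might
# look like the sketch below. The real key strings come from
# Definition.get_str_*() and are not visible in this excerpt, so the names
# here are assumptions:
#
#     {
#         "node_name": "auto",
#         "node_port": 8081,
#         "data_port_range": [10001, 10050],
#         "idle_time": 5,
#         "master_addr": "192.168.1.10",
#         "master_port": 8080,
#         "node_external_addr": "none",
#         "node_internal_addr": "auto"
#     }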
    # Generate a sample stream order
    stream_order = [0] * ITEM_NUMBER
    import random
    for i in range(ITEM_NUMBER):
        # NOTE: both branches currently select the same image name.
        stream_order[i] = (i, 'daemon_test'
                           if (random.randrange(1, 100) % len(d_list)) == 0
                           else 'daemon_test')

    return stream_order, d_list


if __name__ == '__main__':
    # Initialize the connector driver
    SysOut.out_string("Running Harmonic Stream Connector")
    sc = StreamConnector(MASTER_DATA["MASTER_ADDR"],
                         MASTER_DATA["MASTER_PORT"],
                         token=SETTING["TOKEN"],
                         std_idle_time=SETTING["IDLE_TIME"],
                         max_try=SETTING["MAX_TRY"],
                         source_name=SETTING["SOURCE_NAME"])

    if sc.is_master_alive():
        SysOut.out_string("Connection to the master ({0}:{1}) is successful.".format(
            MASTER_DATA["MASTER_ADDR"], MASTER_DATA["MASTER_PORT"]))
    else:
        SysOut.terminate_string("Master at ({0}:{1}) is not alive!".format(
            MASTER_DATA["MASTER_ADDR"], MASTER_DATA["MASTER_PORT"]))
def __check_for_scale():
    tmp = "MSGs "
    for key, value in MessagesQueue.__msg_queue.items():
        tmp += "({0} -> {1}) ".format(key, len(value))
    SysOut.debug_string(tmp)
    server_thread.daemon = True
    SysOut.out_string("Enable Messaging System on port: " + str(Setting.get_data_port_start()))
    server_thread.start()

    # TODO: test for graceful termination.
    # server.shutdown()
    # server.server_close()


if __name__ == '__main__':
    """
    Entry point
    """
    SysOut.out_string("Running Harmonic Master")

    # Load configuration from file
    from .configuration import Setting
    Setting.read_cfg_from_file()

    # Print instance information
    SysOut.out_string("Node name: " + Setting.get_node_name())
    SysOut.out_string("Node address: " + Setting.get_node_addr())
    SysOut.out_string("Node port: " + str(Setting.get_node_port()))

    # Create a thread pool for handling the REST service
    from concurrent.futures import ThreadPoolExecutor
    pool = ThreadPoolExecutor()

    # Run the messaging system service