# NOTE: this excerpt assumes the usual standard-library imports (json, sys,
# threading, logging, logging.handlers, datetime) and the project-level helpers
# (SparkConn, tick, heartbeat_tick, ExecutorUnit, ApplicationUnit, WorkerUnit,
# DriverUnit, ...) defined elsewhere in the codebase.


class Application:
    def __init__(self):
        # initialize logger
        self.logs = logging.getLogger('simSparkLog')
        self.logs.setLevel(logging.DEBUG)
        fh = logging.handlers.RotatingFileHandler('/tmp/simSpark_master.log',
                                                  maxBytes=10000000,
                                                  backupCount=5)
        fh.setLevel(logging.DEBUG)
        formatter = logging.Formatter(u'%(asctime)s [%(levelname)s] %(message)s')
        fh.setFormatter(formatter)
        self.logs.addHandler(fh)
        self.logs.info('simSpark master has been awakened.')
        self.config = self.load_config()
        if self.config['default_core'] < 1 and self.config['default_core'] != -1:
            self.logs.critical('Default core(s) assigned must be positive.')
            sys.exit(1)
        self.apps = []
        self.workers = []
        self.drivers = []
        self.executors = []

    # load configuration
    def load_config(self):
        self.logs.info('<master_config.json> is about to be loaded.')
        config = {
            'master_host': '172.21.0.12',
            'master_port': 11111,
            'webui_port': 8080,
            'worker_timeout': 60,
            'spread_out': True,
            'default_core': -1,
            'reaper_iteration': 15,
            'executor_max_retries': 10
        }
        try:
            with open('master_config.json', 'r') as jsoninput:
                inp = json.load(jsoninput)
                for k in config.keys():
                    if k in inp.keys():
                        config[k] = inp[k]
        except IOError:
            self.logs.warning('Failed to read configuration. Use default instead.')
        return config

    # signal sent
    def periodical_signal(self):
        msg = self.wrap_msg(self.config['master_host'],
                            self.config['master_port'], 'check_worker_TO', None)
        self.listener.sendMessage(msg)
        tick(2.0, self.periodical_signal)

    def register_driver_success(self, driver):
        value = {'id': driver.driver_id}
        self.listener.sendMessage(
            self.wrap_msg(driver.host, driver.port,
                          'register_driver_success', value))

    def feedback_application(self, app):
        el = []
        for e in app.executor_list:
            el.append(e.executor_id)
        value = {'id': app.app_id, 'idle_executor': el, 'busy_executor': []}
        self.logs.info('Feed update to Application %d at address %s through port %d'
                       % (app.app_id, app.host, app.port))
        self.listener.sendMessage(
            self.wrap_msg(app.host, app.port, 'resource_update', value))

    def feedback_worker(self, worker):
        value = {'id': worker.worker_id}
        self.listener.sendMessage(
            self.wrap_msg(worker.host, worker.port,
                          'register_worker_success', value))

    def awake_ghost_worker(self, ghost_heartbeat):
        self.listener.sendMessage(
            self.wrap_msg(ghost_heartbeat['host'], ghost_heartbeat['port'],
                          'register_worker', None))

    def feedback_executor(self, executor, oid):
        value = {'original': oid, 'assigned': executor.executor_id}
        self.listener.sendMessage(
            self.wrap_msg(executor.host, executor.port,
                          'register_executor_success', value))

    def inform_application_ready(self, app):
        value = []
        self.logs.info('executor list: %s' % str(app.executor_list))
        for e in app.executor_list:
            value.append({
                'executor_id': e.executor_id,
                'host': e.host,
                'port': e.port
            })
        self.logs.info(str(value))
        self.listener.sendMessage(
            self.wrap_msg(app.host, app.port, 'resource_ready', value))

    def feedback_ghost_executor(self, host, port, eid):
        self.listener.sendMessage(
            self.wrap_msg(host, port, 'ghost_executor', {'eid': eid}))

    def feedback_executor_elimination(self, executor, e_idx):
        value = {'eid': executor['eid'], 'success': not not e_idx}
        self.listener.sendMessage(
            self.wrap_msg(executor['host'], executor['port'],
                          'elimination_feedback', value))

    def inform_no_resource(self, driver):
        self.listener.sendMessage(
            self.wrap_msg(driver.host, driver.port, 'no_resource', None))

    def request_resource(self, wid, num, aid):
        w_idx = self.search_worker_by_id(wid)
        value = {
            'number': num,
            'app_id': aid,
            'host': self.apps[self.search_application_by_id(aid)].host,
            'port': self.apps[self.search_application_by_id(aid)].port
        }
        self.listener.sendMessage(
            self.wrap_msg(self.workers[w_idx].host, self.workers[w_idx].port,
                          'request_resource', value))

    def inform_app_still_running(self, driver):
        self.listener.sendMessage(
            self.wrap_msg(driver.host, driver.port, 'app_still_running', None))

    def kill_app_feedback(self, app):
        self.listener.sendMessage(
            self.wrap_msg(app.host, app.port, 'app_killed', None))

    # wrap the message
    def wrap_msg(self, address, port, type, value):
        raw = {'type': type, 'value': value}
        wrapped = {'host': address, 'port': port, 'value': json.dumps(raw)}
        return wrapped

    # functional components
    def search_driver_by_id(self, did):
        for d in range(0, len(self.drivers)):
            if self.drivers[d].driver_id == did:
                return d
        return None

    def search_application_by_id(self, aid):
        for a in range(0, len(self.apps)):
            if self.apps[a].app_id == aid:
                return a
        return None

    def search_executor_by_id(self, e_id):
        for ex in range(0, len(self.executors)):
            if self.executors[ex].executor_id == e_id:
                return ex
        return None

    def search_worker_by_id(self, wid):
        for w in range(0, len(self.workers)):
            if self.workers[w].worker_id == wid:
                return w
        return None

    def search_worker_by_address(self, address):
        for w in range(0, len(self.workers)):
            if self.workers[w].host == address:
                return w
        return None

    def kill_executors(self, eliminate_list):
        self.logs.info('About to eliminate %d executors' % len(eliminate_list))
        for e in eliminate_list:
            self.logs.info('Eliminating executor (id:%d)...' % e)
            e_idx = self.search_executor_by_id(e)
            if e_idx != None:
                worker_idx = self.search_worker_by_id(self.executors[e_idx].worker_id)
                app_idx = self.search_application_by_id(self.executors[e_idx].app_id)
                if worker_idx != None:
                    if e in self.workers[worker_idx].executor_list:
                        if not self.workers[worker_idx].alive:
                            self.logs.warning(
                                'The worker [%d] supervising the executor [%d] is not alive.'
                                % (self.executors[e_idx].worker_id, e))
                        self.workers[worker_idx].executor_list.remove(e)
                    else:
                        self.logs.warning(
                            'The worker [%d] does not supervise executor [%d].'
                            % (self.executors[e_idx].worker_id, e))
                else:
                    self.logs.warning('The worker [%d] does not exist.'
                                      % (self.executors[e_idx].worker_id))
                if app_idx != None:
                    if e in self.apps[app_idx].executor_list:
                        if self.apps[app_idx].status != 'ELIMINATED':
                            self.logs.warning(
                                'The application [%d] querying the executor [%d] is still alive.'
                                % (self.executors[e_idx].app_id, e))
                        self.apps[app_idx].executor_list.remove(e)
                    else:
                        self.logs.warning(
                            'The application [%d] does not own executor [%d].'
                            % (self.executors[e_idx].app_id, e))
                else:
                    self.logs.warning('The application [%d] does not exist.'
                                      % (self.executors[e_idx].app_id))
                msg = {'eid': e, 'success': True}
                self.listener.sendMessage(
                    self.wrap_msg(self.executors[e_idx].host,
                                  self.executors[e_idx].port,
                                  'elimination_feedback', msg))
                todel = self.executors[e_idx]
                self.executors.remove(todel)
                del todel
            else:
                self.logs.warning('The executor [%d] does not exist.' % (e))

    def check_application_ready(self, aid):
        app_idx = self.search_application_by_id(aid)
        if app_idx != None:
            if self.apps[app_idx].state == 'WAIT':
                if len(self.apps[app_idx].executor_list) >= self.apps[app_idx].executors_req:
                    self.logs.info('The resource request of App [%d] is met' % aid)
                    self.inform_application_ready(self.apps[app_idx])
                    self.apps[app_idx].executor_list = []
        else:
            self.logs.error('Application %d does not exist.' % (aid))

    def register_executor(self, address, port, wid, eid, aid):
        self.logs.info('Executor from worker [%d] for app [%d] requests registration.'
                       % (wid, aid))
        worker_idx = self.search_worker_by_id(wid)
        app_idx = self.search_application_by_id(aid)
        if worker_idx != None:
            if app_idx != None:
                self.logs.info('New executor for application [%d] on worker [%d] is registered'
                               % (aid, wid))
                new_executor = ExecutorUnit(address, port, wid, aid)
                self.executors.append(new_executor)
                self.workers[worker_idx].executor_list.append(new_executor)
                self.apps[app_idx].executor_list.append(new_executor)
                self.feedback_executor(new_executor, eid)
                self.check_application_ready(aid)
            else:
                self.logs.error('Application %d does not exist.' % (aid))
        else:
            self.logs.error('Worker %d does not exist.' % (wid))

    # reaction to message
    def check_workers_heartbeat(self):
        self.logs.info('Checking a worker list at a length of %d' % len(self.workers))
        for worker in self.workers:
            if worker.alive:
                if worker.heartbeat_expired(self.config['worker_timeout']):
                    self.logs.warning('Worker [%d] is out of contact.'
                                      % (worker.worker_id))
                    worker.alive = False
            else:
                if worker.dead(self.config['worker_timeout'],
                               self.config['reaper_iteration']):
                    self.logs.warning(
                        'Worker [%d] will be buried after being out of contact for several iterations.'
                        % (worker.worker_id))
                    el = []
                    for e in worker.executor_list:
                        el.append(e)
                    self.kill_executors(el)
                    self.workers.remove(worker)

    def register_application(self, app):
        self.logs.info('Request for registration of application [%s] received.'
                       % (app['name']))
        driver_idx = self.search_driver_by_id(app['did'])
        if driver_idx != None:
            if self.drivers[driver_idx].app_id:
                self.logs.critical('An application is already bound to driver [%d].'
                                   % (app['did']))
                self.listener.sendMessage(
                    self.wrap_msg(app['host'], app['port'], 'register_app_fail', None))
                return
            new_app = ApplicationUnit(app['host'], app['port'], app['name'], app['did'])
            self.apps.append(new_app)
            self.drivers[driver_idx].set_app_id(new_app.app_id)
            self.logs.info('Application [%s] is bound to driver [%d] using id %d.'
                           % (app['name'], app['did'], new_app.app_id))
            self.feedback_application(new_app)
        else:
            self.logs.critical('Driver [%d] does not exist.' % (app['did']))
            self.listener.sendMessage(
                self.wrap_msg(app['host'], app['port'], 'register_app_fail', None))

    def kill_application(self, app):
        app_idx = self.search_application_by_id(app['id'])
        if app_idx != None:
            self.apps[app_idx].status = 'ELIMINATED'
            if len(self.apps[app_idx].executor_list) > 0:
                self.logs.warning('There are executors still held by application %d.'
                                  % (app['id']))
                el = []
                for e in self.apps[app_idx].executor_list:
                    el.append(e)
                self.kill_executors(el)
            driver_idx = self.search_driver_by_id(app['driver_id'])
            if driver_idx != None:
                if self.drivers[driver_idx].app_id != app['id']:
                    self.logs.warning('Driver information not matched.')
                else:
                    self.drivers[driver_idx].set_app_id()
            else:
                self.logs.warning('None of the drivers is bound to application [%d].'
                                  % (app['id']))
            self.kill_app_feedback(self.apps[app_idx])
            todel = self.apps[app_idx]
            self.apps.remove(todel)
            del todel
            self.logs.info('There are %d apps still running' % (len(self.apps)))
        else:
            self.logs.warning('Application %d does not exist.' % (app['id']))

    def worker_heartbeat_ack(self, heartbeat):
        heartbeat['time'] = datetime.strptime(heartbeat['time'], '%Y-%m-%d %H:%M:%S %f')
        worker_idx = self.search_worker_by_id(heartbeat['id'])
        if worker_idx != None:
            if self.workers[worker_idx].host == heartbeat['host']:
                if not self.workers[worker_idx].alive:
                    self.logs.info('Worker [%d] is awakened.' % (heartbeat['id']))
                    self.workers[worker_idx].awake()
                self.logs.info('Worker [%d] last heartbeat : %s, latest : %s'
                               % (heartbeat['id'],
                                  str(self.workers[worker_idx].last_heartbeat),
                                  str(heartbeat['time'])))
                self.workers[worker_idx].update_heartbeat(heartbeat['time'])
            else:
                self.logs.error(
                    'Worker [%d] information does not match the latest heartbeat.'
                    % (heartbeat['id']))
        else:
            self.logs.warning('Ghost worker {%s} revives.' % (heartbeat['host']))
            self.awake_ghost_worker(heartbeat)

    def register_worker(self, worker):
        worker_idx = self.search_worker_by_address(worker['host'])
        if worker_idx != None:
            self.logs.critical('Worker {%s} already exists.' % worker['host'])
            self.listener.sendMessage(
                self.wrap_msg(worker['host'], worker['port'],
                              'register_worker_success', {'success': False}))
            return
        else:
            new_worker = WorkerUnit(worker['host'], worker['port'])
            self.logs.info('Worker {%s} registers as worker %d.'
                           % (worker['host'], new_worker.worker_id))
            self.workers.append(new_worker)
            self.feedback_worker(new_worker)

    def update_executors_of_worker(self, worker):
        worker_idx = self.search_worker_by_id(worker['id'])
        if worker_idx != None:
            for executor in worker['list']:
                if executor['id'] < 0:
                    self.register_executor(worker['host'], worker['port'],
                                           worker['id'], executor['id'],
                                           executor['app_id'])
                else:
                    e_idx = self.search_executor_by_id(executor['id'])
                    if e_idx != None:
                        self.executors[e_idx].status = executor['status']
                    else:
                        self.logs.error('Executor [%d] does not exist.'
                                        % (executor['id']))
                        self.feedback_ghost_executor(worker['host'], worker['port'],
                                                     executor['id'])
        else:
            self.logs.error('Worker [%d] does not exist.' % (worker['id']))

    def eliminate_executor(self, value):
        executors = value['eid']
        self.kill_executors(executors)

    def register_driver(self, driver):
        new_driver = DriverUnit(driver['host'], driver['port'])
        self.drivers.append(new_driver)
        self.register_driver_success(new_driver)
        self.logs.info('Driver registered. ID: %d/host: %s/port: %d'
                       % (new_driver.driver_id, new_driver.host, new_driver.port))

    def allocate_resource(self, req):
        if not (req['number'] > 0):
            self.logs.warning('Empty request from driver [%d].' % (req['driver_id']))
            return
        d_idx = self.search_driver_by_id(req['driver_id'])
        if d_idx == None:
            self.logs.error('Unknown driver requests resource.')
            return
        if self.drivers[d_idx].app_id == None:
            self.logs.error('Driver [%d] to which no application is bound requests resource.'
                            % req['driver_id'])
            return
        a_idx = self.search_application_by_id(self.drivers[d_idx].app_id)
        if a_idx == None:
            # log the missing app id (a_idx is None at this point)
            self.logs.error('Application [%d] does not exist.'
                            % self.drivers[d_idx].app_id)
            return
        self.apps[a_idx].executors_req = req['number']
        asstable = {}

        class WorkerHeap():
            # min-heap on 'weight' (current executor count per worker),
            # 1-indexed with a dummy element at position 0
            def __init__(self):
                self.heap = [{}]

            def pop(self, i):
                # sift the node at i up towards the root
                if i == 1:
                    return
                if self.heap[i]['weight'] < self.heap[i // 2]['weight']:
                    tmp = self.heap[i // 2]
                    self.heap[i // 2] = self.heap[i]
                    self.heap[i] = tmp
                    self.pop(i // 2)

            def sink(self, i):
                # sift the node at i down towards the leaves
                if i * 2 >= len(self.heap):
                    return
                if self.heap[i]['weight'] > self.heap[i * 2]['weight']:
                    sink_left = True
                    if i * 2 + 1 < len(self.heap):
                        if self.heap[i * 2]['weight'] > self.heap[i * 2 + 1]['weight']:
                            sink_left = False
                            tmp = self.heap[i]
                            self.heap[i] = self.heap[i * 2 + 1]
                            self.heap[i * 2 + 1] = tmp
                            self.sink(i * 2 + 1)
                    if sink_left:
                        tmp = self.heap[i]
                        self.heap[i] = self.heap[i * 2]
                        self.heap[i * 2] = tmp
                        self.sink(i * 2)
                else:
                    if i * 2 + 1 < len(self.heap):
                        if self.heap[i * 2 + 1]['weight'] < self.heap[i]['weight']:
                            tmp = self.heap[i]
                            self.heap[i] = self.heap[i * 2 + 1]
                            self.heap[i * 2 + 1] = tmp
                            self.sink(i * 2 + 1)

            def insert(self, id, payload):
                node = {'id': str(id), 'weight': payload}
                self.heap.append(node)
                self.pop(len(self.heap) - 1)

            def add_payload(self):
                if len(self.heap) < 2:
                    return
                # bug fix: the original expression discarded the increment
                self.heap[1]['weight'] += 1
                self.sink(1)

        payload_heap = WorkerHeap()
        for w in self.workers:
            asstable[str(w.worker_id)] = 0
            payload_heap.insert(w.worker_id, len(w.executor_list))
        for i in range(0, req['number']):
            asstable[payload_heap.heap[1]['id']] += 1
            payload_heap.add_payload()
        for k in asstable.keys():
            self.request_resource(int(k), asstable[k], self.drivers[d_idx].app_id)

    def kill_driver(self, did):
        d_idx = self.search_driver_by_id(did)
        self.logs.info(str(d_idx))
        if d_idx != None:
            if self.drivers[d_idx].app_id:
                self.logs.error('Application [%d] of driver [%d] is still running.'
                                % (self.drivers[d_idx].app_id, did))
                self.inform_app_still_running(self.drivers[d_idx])
                return
            self.logs.error('Driver [%d] is killed.' % (did))
            todel = self.drivers[d_idx]
            self.drivers.remove(todel)
            del todel
        else:
            self.logs.error('Driver [%d] does not exist.' % (did))

    # message dispensor
    def dispensor(self, msg):
        if msg['type'] == 'check_worker_TO':
            self.check_workers_heartbeat()
        # msg from application
        elif msg['type'] == 'register_app':
            self.register_application(msg['value'])
        elif msg['type'] == 'kill_app':
            self.kill_application(msg['value'])
        # msg from worker
        elif msg['type'] == 'worker_heartbeat':
            self.worker_heartbeat_ack(msg['value'])
        elif msg['type'] == 'register_worker':
            self.register_worker(msg['value'])
        elif msg['type'] == 'update_executors':
            self.update_executors_of_worker(msg['value'])
        elif msg['type'] == 'kill_executor':
            self.eliminate_executor(msg['value'])
        # msg from driver
        elif msg['type'] == 'register_driver':
            self.register_driver(msg['value'])
        elif msg['type'] == 'request_resource':
            self.allocate_resource(msg['value'])
        elif msg['type'] == 'kill_driver':
            self.kill_driver(msg['value'])

    # main body
    def run(self):
        # establish listener
        self.listener = SparkConn(self.config['master_host'],
                                  self.config['master_port'])
        # set up periodical signal
        tick(2.0, self.periodical_signal)
        # main loop
        while True:
            msg = self.listener.accept()
            self.dispensor(msg)

    def __del__(self):
        global timer
        if timer:
            timer.cancel()
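# Side note (not part of the original code): the WorkerHeap used inside
# allocate_resource() implements a least-loaded-first policy. Below is a
# minimal sketch of the same policy written with the standard-library heapq,
# purely for comparison; the function name and the example numbers are
# illustrative only, not part of simSpark.
import heapq

def spread_executors(worker_loads, requested):
    # worker_loads: {worker_id: current executor count}
    # Returns {worker_id: executors to start}, always handing the next
    # executor to the currently least-loaded worker.
    heap = [(load, wid) for wid, load in worker_loads.items()]
    heapq.heapify(heap)
    assigned = {wid: 0 for wid in worker_loads}
    for _ in range(requested):
        load, wid = heapq.heappop(heap)
        assigned[wid] += 1
        heapq.heappush(heap, (load + 1, wid))
    return assigned

# e.g. spread_executors({1: 0, 2: 1}, 3) -> {1: 2, 2: 1}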
class workerBody:
    def __init__(self):
        # initialize logger
        self.logs = logging.getLogger('simSparkLog')
        self.logs.setLevel(logging.DEBUG)
        fh = logging.handlers.RotatingFileHandler('/tmp/simSpark_worker.log',
                                                  maxBytes=10000000,
                                                  backupCount=5)
        fh.setLevel(logging.DEBUG)
        formatter = logging.Formatter(u'%(asctime)s [%(levelname)s] %(message)s')
        fh.setFormatter(formatter)
        self.logs.addHandler(fh)
        self.logs.info('simSpark worker has been awakened.')
        self.config = self.load_config()
        self.executors = []
        self.executors_status = []
        self.exeid = -1
        self.workerid = -1
        self.maxExectuorNum = 10
        self.fetchLock = None
        self.listener = None
        self.driver_listener = None
        self.exeLock = threading.Lock()
        self.appList = []

    def __del__(self):
        global timer
        if timer:
            timer.cancel()
        global heartbeat_timer
        if heartbeat_timer:
            heartbeat_timer.cancel()

    def fetch_info(self, rddid, host, port):
        self.fetchLock.acquire()
        msg = {
            'rid': rddid,
            'host': self.config['worker_host'],
            'port': self.config['fetch_port'],
        }
        wrapMsg = self.wrap_msg(host, port, 'fetch_info', msg)
        self.driver_listener.sendMessage(wrapMsg)
        while True:
            msg = self.driver_listener.accept()
            if msg['type'] == 'fetch_info_ack':
                self.fetchLock.release()
                return msg['value']

    def fetch_data(self, rddid, pid, host, port):
        self.fetchLock.acquire()
        msg = {
            'pidx': pid,
            'rid': rddid,
            'host': self.config['worker_host'],
            'port': self.config['fetch_port'],
        }
        wrapMsg = self.wrap_msg(host, port, 'fetch_data', msg)
        self.driver_listener.sendMessage(wrapMsg)
        msg = None
        while True:
            msg = self.driver_listener.accept()
            if msg['type'] == 'fetch_data_ack':
                self.fetchLock.release()
                return msg['value']

    # todo: still need to confirm the interface
    def send_result(self, rddid, pid, host, port):
        self.fetchLock.acquire()
        msg = {
            'host': self.config['worker_host'],
            'port': self.config['worker_port'],
            'pidx': pid,
            'rid': rddid,
            'method': 0
        }
        wrapMsg = self.wrap_msg(host, port, 'task_finished', msg)
        self.driver_listener.sendMessage(wrapMsg)
        self.fetchLock.release()

    def send_data_to_driver(self, value):
        appid = value['appid']
        rid = value['rid']
        pidx = value['pidx']
        dhost = value['host']
        dport = value['port']
        e = self.search_app_by_id(appid)
        ctx = self.appList[e].context
        result = ctx.get_partition_data(rid, pidx)
        wrapmsg = self.wrap_msg(dhost, dport, 'fetch_data_ack', result)
        self.listener.sendMessage(wrapmsg)

    # default worker configuration; overridden by worker_config.json when present
    def load_config(self):
        config = {
            'master_host': '172.21.0.12',
            'master_port': 11111,
            'worker_host': '172.21.0.14',
            'worker_port': 11111,
            'webui_port': 8080,
            'fetch_port': 11112,
            'worker_timeout': 60,
            'spread_out': True,
            'default_core': -1,
            'reaper_iteration': 15,
            'executor_max_retries': 10
        }
        try:
            with open('worker_config.json', 'r') as jsoninput:
                inp = json.load(jsoninput)
                for k in config.keys():
                    if k in inp.keys():
                        config[k] = inp[k]
        except IOError:
            self.logs.warning('Failed to read configuration. Use default instead.')
        return config

    # todo: the worker still needs to send a host and port
    def reg_succ_worker(self, value):
        self.workerid = value['id']
        # initialize the fetch_port lock and the driver_listener
        self.fetchLock = threading.Lock()
        self.driver_listener = SparkConn(self.config['worker_host'],
                                         self.config['fetch_port'])

    def reg_succ_executor(self, value):
        oid = value['original']
        e = self.search_executor_by_id(oid)
        if e != None:
            self.executors[e].eid = value['assigned']
            self.executors[e].status = 'ALIVE'
            self.logs.info('An executor got the new id %d' % self.executors[e].eid)
        else:
            self.logs.warning('Failed to find the matching executor')

    def send_executor_status(self):
        renew_list = []
        exelen = len(self.executors)
        for e in range(0, exelen):
            exe = self.executors[e]
            if exe.status != self.executors_status[e]:
                renew_list.append({
                    'id': exe.eid,
                    'status': exe.status,
                    'app_id': exe.appid
                })
                self.executors_status[e] = exe.status
        if not (renew_list == []):
            msg = {
                'id': self.workerid,
                'host': self.config['worker_host'],
                'port': self.config['worker_port'],
                'list': renew_list
            }
            wrappedmsg = self.wrap_msg(self.config['master_host'],
                                       self.config['master_port'],
                                       'update_executors', msg)
            self.listener.sendMessage(wrappedmsg)
            # check whether any executor has completed
            self.logs.info('Update executors %s' % str(renew_list))
            eid_list = []
            for nex in renew_list:
                if nex['status'] == 'COMPLETED':
                    eid_list.append(nex['id'])
            if not (eid_list == []):
                delmsg = {
                    'host': self.config['worker_host'],
                    'port': self.config['worker_port'],
                    'eid': eid_list
                }
                wrapmsg = self.wrap_msg(self.config['master_host'],
                                        self.config['master_port'],
                                        'kill_executor', delmsg)
                self.listener.sendMessage(wrapmsg)
                self.logs.info('These executor ids will be killed %s' % str(eid_list))
        tick(5.0, self.send_executor_status)

    # todo
    def del_executor(self, value):
        if value['success']:
            eid = value['eid']
            pos = self.search_executor_by_id(eid)
            self.logs.info('Kill the executor with eid:%d', eid)
            del self.executors[pos]
            del self.executors_status[pos]

    # todo
    def req_executor(self, value):
        num = value['number']
        # host = value['host']
        # port = value['port']
        # self.appId = value['app_id']
        elist = []
        for i in range(0, num):
            ex = executor.executor(self.exeid, value['app_id'], self.exeLock)
            self.executors.append(ex)
            self.executors_status.append(ex.status)
            idmsg = {
                'id': self.exeid,
                'status': ex.status,
                'app_id': value['app_id']
            }
            elist.append(idmsg)
            self.exeid -= 1
        msg = {
            'id': self.workerid,
            'host': self.config['worker_host'],
            'port': self.config['worker_port'],
            'list': elist
        }
        wrapmsg = self.wrap_msg(self.config['master_host'],
                                self.config['master_port'],
                                'update_executors', msg)
        self.listener.sendMessage(wrapmsg)

    def send_heartbeat(self):
        msg = {
            'id': self.workerid,
            'host': self.config['worker_host'],
            'port': self.config['worker_port'],
            'time': datetime.now().strftime('%Y-%m-%d %H:%M:%S %f')
        }
        wrapmsg = self.wrap_msg(self.config['master_host'],
                                self.config['master_port'],
                                'worker_heartbeat', msg)
        self.listener.sendMessage(wrapmsg)
        heartbeat_tick(10.0, self.send_heartbeat)

    '''
    def cleanCatalog(self):
        pass
    '''

    def register_worker(self):
        worker = {
            'host': self.config['worker_host'],
            'port': self.config['worker_port']
        }
        wrapped_msg = self.wrap_msg(self.config['master_host'],
                                    self.config['master_port'],
                                    'register_worker', worker)
        # print wrapped_msg
        self.listener.sendMessage(wrapped_msg)
        tick(5.0, self.register_worker)

    def ghost_executor(self, value):
        pass

    def reregister(self):
        self.register_worker()

    # todo: run the executors in parallel via a thread pool; still need to add port and host
    def pending_task(self, value):
        eid = value['eid']
        rid = value['rid']
        pid = value['pidx']
        appid = value['app_id']
        host = value['host']
        port = value['port']
        app_pos = self.search_app_by_id(appid)
        if app_pos == None:
            self.add_app(appid, host, port)
            app_pos = len(self.appList) - 1
        app = self.appList[app_pos]
        index = self.search_executor_by_id(eid)
        if index != None:
            self.logs.info("appis:%s, ctxis:%s" % (str(app), str(app.context)))
            self.executors[index].setId(rid, pid, app.context)
            self.logs.info("Executor %d begin" % eid)
            self.executors[index].start()
        else:
            self.logs.critical('Missing executor id.')

    # wrap the message
    def wrap_msg(self, address, port, type, value):
        raw = {'type': type, 'value': value}
        wrapped = {'host': address, 'port': port, 'value': json.dumps(raw)}
        return wrapped

    def search_executor_by_id(self, eid):
        for e in range(0, len(self.executors)):
            if self.executors[e].eid == eid:
                return e
        return None

    def search_app_by_id(self, appid):
        for e in range(0, len(self.appList)):
            if self.appList[e].appid == appid:
                return e
        return None

    def add_app(self, appid, host, port):
        napp = appInfo(appid, host, port, self)
        self.appList.append(napp)
        return napp

    def delete_app(self, value):
        appid = value['appid']
        index = self.search_app_by_id(appid)
        # bug fix: 'if index:' skipped the app stored at position 0
        if index is not None:
            del self.appList[index]

    def process(self, msg):
        if msg['type'] == 'request_resource':
            self.req_executor(msg['value'])
        elif msg['type'] == 'register_worker':
            self.reregister()
        elif msg['type'] == 'register_executor_success':
            self.reg_succ_executor(msg['value'])
        elif msg['type'] == 'elimination_feedback':
            self.del_executor(msg['value'])
        elif msg['type'] == 'ghost_executor':
            self.ghost_executor(msg['value'])
        elif msg['type'] == 'pending_task':
            self.pending_task(msg['value'])
        elif msg['type'] == 'delete_app':
            self.delete_app(msg['value'])
        elif msg['type'] == 'fetch_data':
            self.send_data_to_driver(msg['value'])

    def run(self):
        self.listener = SparkConn(self.config['worker_host'],
                                  self.config['worker_port'])
        # a timer to send the initial register request
        tick(5.0, self.register_worker)
        while True:
            msg = self.listener.accept()
            if msg['type'] == 'register_worker_success':
                self.reg_succ_worker(msg['value'])
                self.logs.info('Registration succeeded.')
                break
        self.logs.info('Start the main process')
        global timer
        timer.cancel()
        timer = None
        heartbeat_tick(10.0, self.send_heartbeat)
        tick(5.0, self.send_executor_status)
        while True:
            msg = self.listener.accept()
            self.process(msg)
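# Side note (not part of the original code): every component above exchanges
# messages produced by wrap_msg(), i.e. a routing dict whose 'value' field is a
# JSON-encoded {'type', 'value'} payload. Below is a minimal sketch of the
# reverse operation; the helper name and the example message are assumptions
# for illustration only.
import json

def unwrap_msg(wrapped):
    # Decode the inner JSON payload back into {'type': ..., 'value': ...}
    # so it can be dispatched (cf. workerBody.process / Application.dispensor).
    return json.loads(wrapped['value'])

example = {'host': '172.21.0.14', 'port': 11111,
           'value': json.dumps({'type': 'worker_heartbeat', 'value': {'id': 1}})}
assert unwrap_msg(example)['type'] == 'worker_heartbeat'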
class backendComm(threading.Thread):
    def __init__(self, ctx):
        threading.Thread.__init__(self)
        self.context = ctx
        self.running = threading.Event()
        self.running.set()

    def query_rdd(self, q):
        rdd = self.context.search_rdd_by_id(q['rid'])
        if not rdd:
            self.lis.sendMessage(
                self.context.wrap_msg(q['host'], q['port'], 'Non-exist_rdd', q['rid']))
        else:
            dep = []
            for d in rdd.dependencies:
                dep.append(d.rdd_id)
            value = {
                'rdd_type': rdd.type,
                'part_len': len(rdd.partitions),
                'dependencies': dep,
                'funtype': rdd.funtype,
                'fun': None
            }
            if rdd.fun != None:
                value['fun'] = rdd.fun.__name__
            self.lis.sendMessage(
                self.context.wrap_msg(q['host'], q['port'], 'fetch_info_ack', value))

    def query_partition(self, q):
        rdd = self.context.search_rdd_by_id(q['rid'])
        if not rdd:
            self.lis.sendMessage(
                self.context.wrap_msg(q['host'], q['port'], 'Non-exist_rdd', q['rid']))
        else:
            if q['pidx'] >= len(rdd.partitions):
                self.lis.sendMessage(
                    self.context.wrap_msg(q['host'], q['port'], 'Non-exist_partition',
                                          {'rid': q['rid'], 'pidx': q['pidx']}))
            else:
                if not rdd.partitions[q['pidx']].fetchable:
                    self.lis.sendMessage(
                        self.context.wrap_msg(q['host'], q['port'], 'not_stored_partition',
                                              {'rid': q['rid'], 'pidx': q['pidx']}))
                else:
                    self.lis.sendMessage(
                        self.context.wrap_msg(q['host'], q['port'], 'fetch_data_ack',
                                              rdd.partitions[q['pidx']].records))

    def update_task(self, u):
        rdd = self.context.search_rdd_by_id(u['rid'])
        if not rdd:
            self.lis.sendMessage(
                self.context.wrap_msg(u['host'], u['port'], 'Non-exist_rdd', u['rid']))
        else:
            if u['pidx'] >= len(rdd.partitions):
                self.lis.sendMessage(
                    self.context.wrap_msg(u['host'], u['port'], 'Non-exist_partition',
                                          {'rid': u['rid'], 'pidx': u['pidx']}))
            else:
                rdd.partitions[u['pidx']].update_source((u['host'], u['port']))
                rdd.partitions[u['pidx']].method = u['method']
                stage = self.context.search_stage_by_rdd(rdd.rdd_id)
                if not stage:
                    self.context.logs.critical('Missing stage.')
                    return
                stage.task_done[u['pidx']] = True
                if stage.done:
                    stage.finish()
                self.lis.sendMessage(
                    self.context.wrap_msg(u['host'], u['port'], 'task_finished_ack',
                                          {'rid': u['rid'], 'pidx': u['pidx']}))

    def dispense(self, msg):
        if msg['type'] == 'fetch_info':
            self.query_rdd(msg['value'])
        elif msg['type'] == 'fetch_data':
            self.query_partition(msg['value'])
        elif msg['type'] == 'task_finished':
            self.update_task(msg['value'])

    def run(self):
        self.lis = SparkConn(self.context.config['driver_host'], self.context.bport)
        while self.running.is_set():
            msg = self.lis.accept()
            self.dispense(msg)

    def collapse(self):
        self.running.clear()
class simContext:
    def __init__(self, app):
        self.logs = logging.getLogger('simSparkLog')
        self.logs.setLevel(logging.DEBUG)
        fh = logging.handlers.RotatingFileHandler('/tmp/simSpark_driver.log',
                                                  maxBytes=10000000,
                                                  backupCount=5)
        fh.setLevel(logging.DEBUG)
        formatter = logging.Formatter(u'%(asctime)s [%(levelname)s] %(message)s')
        fh.setFormatter(formatter)
        self.logs.addHandler(fh)
        self.config = self.load_config()
        self.app = app
        self.driver_id = None
        self.port = self.config['driver_port']
        self.bport = self.config['backend_port']
        self.parallel_stage = self.config['parallel_stage']
        self.rdds = []
        self.undone = []
        self.stages = []
        self.listener = SparkConn(self.config['driver_host'],
                                  self.config['driver_port'])
        # register driver
        value = {'host': self.config['driver_host'], 'port': self.port}
        self.listener.sendMessage(
            self.wrap_msg(self.config['master_host'], self.config['master_port'],
                          'register_driver', value))
        self.logs.info('Waiting for registration feedback')
        while True:
            msg = self.listener.accept()
            if msg['type'] == 'register_driver_success':
                self.driver_id = msg['value']['id']
                break
        # register app
        value = {
            'host': self.config['driver_host'],
            'port': self.port,
            'did': self.driver_id,
            'name': self.app.app_name
        }
        self.listener.sendMessage(
            self.wrap_msg(self.config['master_host'], self.config['master_port'],
                          'register_app', value))
        self.logs.info('Waiting for registration feedback')
        while True:
            msg = self.listener.accept()
            if msg['type'] == 'resource_update':
                self.app.app_id = msg['value']['id']
                break
        self.comm = backendComm(self)
        self.comm.start()

    def __del__(self):
        value = {'id': self.app.app_id, 'driver_id': self.driver_id}
        self.listener.sendMessage(
            self.wrap_msg(self.config['master_host'], self.config['master_port'],
                          'kill_app', value))
        while True:
            msg = self.listener.accept()
            if msg['type'] == 'app_killed':
                break
        value = self.driver_id
        self.listener.sendMessage(
            self.wrap_msg(self.config['master_host'], self.config['master_port'],
                          'kill_driver', value))
        self.comm.collapse()

    def wrap_msg(self, address, port, type, value):
        raw = {'type': type, 'value': value}
        wrapped = {'host': address, 'port': port, 'value': json.dumps(raw)}
        return wrapped

    def load_config(self):
        self.logs.info('<driver_config.json> is about to be loaded.')
        config = {
            'master_host': '172.21.0.12',
            'master_port': 11111,
            'driver_host': '172.21.0.3',
            'driver_port': 10001,
            'backend_port': 10002,
            'parallel_stage': 1,
            'timeout': 60
        }
        try:
            with open('driver_config.json', 'r') as jsoninput:
                inp = json.load(jsoninput)
                for k in config.keys():
                    if k in inp.keys():
                        config[k] = inp[k]
        except IOError:
            self.logs.warning('Failed to read configuration. Use default instead.')
        return config

    def parallelize(self, arr, fineness=-1):
        part = []
        if fineness == 0 or fineness < -1:
            self.logs.error('Invalid arguments of parallelism. Failed to create RDD')
            return None
        if fineness == -1:
            fineness = len(arr)
        for i in range(0, fineness):
            l = len(arr) // fineness * i
            r = len(arr) // fineness * (i + 1)
            if i == fineness - 1:
                r = len(arr)
            data = arr[l:r]
            new_par = simPartition(self, i, data)
            part.append(new_par)
        new_rdd = simRDD(self, [], part)
        new_rdd._register()
        return new_rdd

    def search_stage_by_rdd(self, rid):
        for s in self.stages:
            if s.rdd.rdd_id == rid:
                return s
        return None

    def search_rdd_by_id(self, rid):
        for r in self.rdds:
            if r.rdd_id == rid:
                return r
        return None

    def resource_request(self, n=1):
        value = {'driver_id': self.driver_id, 'number': n}
        self.listener.sendMessage(
            self.wrap_msg(self.config['master_host'], self.config['master_port'],
                          'request_resource', value))

    @property
    def ready_stages(self):
        ret = []
        for stage in self.undone:
            ready = True
            for pstage in stage.parent_stage:
                if pstage in self.undone:
                    ready = False
                    break
            if ready:
                ret.append(stage)
                if self.parallel_stage > 0 and len(ret) == self.parallel_stage:
                    return ret
        return ret

    def fetch_partition(self, source, rid, pidx, frommem):
        if rid == None:
            return None
        if frommem:
            value = {
                'appid': self.app.app_id,
                'host': self.config['driver_host'],
                'port': self.config['driver_port'],
                'rid': rid,
                'pidx': pidx
            }
            self.listener.sendMessage(
                self.wrap_msg(source[0], source[1], 'fetch_data', value))
            while True:
                msg = self.listener.accept()
                if msg['type'] == 'fetch_data_ack':
                    return msg['value']
        return None

    def list_clear(self, stages):
        for stage in stages:
            if not stage.done:
                return False
        return True

    def pend_task(self, executor, rid, pidx):
        value = {
            'app_id': self.app.app_id,
            'eid': executor['executor_id'],
            'rid': rid,
            'pidx': pidx,
            'host': self.config['driver_host'],
            'port': self.config['backend_port']
        }
        self.listener.sendMessage(
            self.wrap_msg(executor['host'], executor['port'], 'pending_task', value))
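# Side note (not part of the original code): simContext.parallelize() above
# slices the input into `fineness` partitions, with the last partition
# absorbing the remainder. Below is a standalone sketch of that slicing
# arithmetic, without simPartition objects; the function name is illustrative.
def split_like_parallelize(arr, fineness):
    parts = []
    for i in range(fineness):
        l = len(arr) // fineness * i
        r = len(arr) // fineness * (i + 1)
        if i == fineness - 1:
            r = len(arr)  # last slice takes the remainder
        parts.append(arr[l:r])
    return parts

# e.g. split_like_parallelize(list(range(10)), 3) -> [[0, 1, 2], [3, 4, 5], [6, 7, 8, 9]]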