class NodeScheduler(object):

    def __init__(self):
        interface = config.INTERFACE
        f = os.popen('ifconfig ' + str(interface) +
                     ' | grep "inet\ addr" | cut -d: -f2 | cut -d" " -f1')
        self.identity = f.read().strip()
        self.queue_data_lock = threading.Lock()
        # Make a random hash for the queue.
        self.random_hash = hashlib.sha1(
            b'/tmp/Weasel/bin/local_resourcemanager').hexdigest()
        self.task_data = {}
        self.queue_data = {}
        self.max_tasks_to_run_for_queue = {}
        # Will hold the timestamp when we first received a task.
        self.time_start_running = -1
        # Initially we run two tasks per CPU core. This number might
        # be changed by the monitor thread based on resource contention.
        if config.ADAPT_TASKS:
            self.max_tasks_to_run = multiprocessing.cpu_count() * 2
        else:
            self.max_tasks_to_run = config.NR_COLOCATED_TASKS
        # Create the queue for the tasks.
        self.queue_data[self.random_hash] = {
            'qid': self.random_hash,
            'asked': 0,
            'recv': 0,
            'tpool': ThreadPool(1, self.task_data)}
        tpool = self.queue_data[self.random_hash]['tpool']
        self.max_tasks_to_run_for_queue[self.random_hash] = self.max_tasks_to_run
        tpool.set_size(self.max_tasks_to_run)
        tpool.start()
        # Create the monitoring thread.
        self.monitor_thread = NodeMonitor(parent=self)
        # Create the thread that performs communication with
        # the scheduler.
        self.sched_client_thread = ZmqConnectionThread(
            self.identity,
            zmq.DEALER,
            config.SCHEDULER + ":" + str(config.ZMQ_SCHEDULER_PORT),
            self.callback)
        self.running = True
        logfile = config.LOGDIR + "/local_scheduler.log"
        self.logger = WeaselLogger('local_scheduler', logfile)
        self.ntasks_to_ask = 1
        self.task_id = 1
        self.time_asked_first = time.time()
        self.running_task = 0
        self.nr_received_tasks = 0
        self.nran_tasks = []
        self.queues_asked_for = []
        self.current_ntasks = 1
        self.has_new_task = False
        self.first_task = False
        self.running_task_lock = threading.Lock()
        # Will hold a map of nr_colocated_tasks => [runtimes].
        self.task_runtimes = {}
        self.task_runtime_lock = threading.Lock()
        self.sleep_time = config.WAITTIME
        self.task_time = 1
        self.has_new_queue = False
        self.new_queues = []
        self.logger.info("NodeScheduler started...")
        self.nrunning_past_period = []
        # Will hold the last started executable.
        self.last_started = None

    def run_task(self, arg):
        command_id = arg['id']
        command = arg['exec'] + ' ' + arg['params']
        qid = arg['qid']
        myid = threading.current_thread().ident
        # Tell the monitor thread we have started a task.
        self.monitor_thread.task_started(myid)
        # Increment the number of running tasks.
        self.running_task_lock.acquire()
        self.running_task += 1
        nr_colocated = self.running_task
        # The first time we run a task of a different type, we reset
        # the history on the monitor thread.
        if self.last_started is None:
            self.last_started = arg['exec']
        if self.last_started != arg['exec']:
            print('Started new task type: ' + str(arg['exec']))
            sys.stdout.flush()
            self.monitor_thread.reset_known_points()
            self.last_started = arg['exec']
        self.running_task_lock.release()
        start_time = time.time()
        proc = psutil.Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
        self.task_data[myid]['lock'].acquire()
        self.task_data[myid]['proc'] = proc
        self.task_data[myid]['ctask'] = arg
        self.task_data[myid]['lock'].release()
        out, err = proc.communicate()
        return_code = proc.returncode
        if return_code != 0:
            print('Error when returning: ' + str(return_code))
            sys.stdout.flush()
        end_time = time.time()
        # Record task running times.
        self.task_runtime_lock.acquire()
        running_time = end_time - start_time
        if nr_colocated not in self.task_runtimes:
            self.task_runtimes[nr_colocated] = []
        self.task_runtimes[nr_colocated].append(running_time)
        print("Task %s ran in %s seconds (%s)" %
              (str(command_id), str(running_time), str(arg['exec'])))
        sys.stdout.flush()
        self.task_runtime_lock.release()
        # Tell the monitor thread we have finished a task.
        self.monitor_thread.task_finished(myid)
        self.task_data[myid]['lock'].acquire()
        self.task_data[myid]['ctask'] = None
        if self.task_data[myid]['task'].get(qid) is None:
            self.task_data[myid]['task'][qid] = []
            self.external_change = True
        # Note: the second element always evaluates to 100; it is a
        # placeholder for a per-task utilization figure.
        self.task_data[myid]['task'][qid].append(
            [end_time - start_time,
             100 * (end_time - start_time) / (end_time - start_time)])
        self.task_data[myid]['lock'].release()
        self.running_task_lock.acquire()
        self.running_task -= 1
        self.nran_tasks.append(command_id)
        self.running_task_lock.release()

    def get_total_queue_size(self):
        queue_size = 0
        self.queue_data_lock.acquire()
        for qid in self.queue_data:
            queue_size = queue_size + self.queue_data[qid]['tpool'].tasks.qsize()
        self.queue_data_lock.release()
        return queue_size

    def get_tasks_to_ask(self):
        """
        Returns the number of tasks to ask from the scheduler.
        Tries to keep the queue size at least as long as the maximum
        allowed number of tasks at the moment.
        :return:
        """
        tasks_to_ask = {}
        self.queues_asked_for = []
        queue_size = self.get_total_queue_size()
        self.queue_data_lock.acquire()
        for qid in self.queue_data:
            tasks_to_ask[qid] = 0
            self.queue_data[qid]['asked'] = 0
            self.queue_data[qid]['recv'] = 0
            qsize = self.queue_data[qid]['tpool'].tasks.qsize()
            if qsize > 2 * self.max_tasks_to_run_for_queue[qid] and \
                    self.max_tasks_to_run_for_queue[qid] != -1:
                continue
            if qsize == 0:
                tasks_to_ask[qid] = self.max_tasks_to_run_for_queue[qid]
            else:
                if qsize > self.max_tasks_to_run_for_queue[qid] and \
                        self.max_tasks_to_run_for_queue[qid] != -1:
                    continue
                elif qsize < self.max_tasks_to_run_for_queue[qid]:
                    tasks_to_ask[qid] = self.max_tasks_to_run_for_queue[qid] - qsize
            self.queues_asked_for.append(qid)
            self.queue_data[qid]['asked'] = tasks_to_ask[qid]
        self.queue_data_lock.release()
        return tasks_to_ask, queue_size

    def wait_and_ask(self):
        while self.running:
            # Check every 0.2 seconds.
            time.sleep(0.2)
            self.running_task_lock.acquire()
            nrunning = self.running_task
            self.nrunning_past_period.append(nrunning)
            task_data_to_send = {'ran': self.nran_tasks[:]}
            self.nran_tasks = []
            self.running_task_lock.release()
            (tasks_to_ask, queue_size) = self.get_tasks_to_ask()
            task_data_to_send['qsize'] = queue_size * self.task_time
            pickled_data = pickle.dumps(task_data_to_send)
            if len(tasks_to_ask) > 0:
                self.sched_client_thread.put_request_in_queue(
                    [self.identity, PROTOCOL_HEADERS['WORKER'], 'task',
                     pickle.dumps(tasks_to_ask), pickled_data])

    def process_task(self, task):
        tmp = task.split(';')
        task_name = tmp[-1].split()[0].split('/')[-1]
        new_task = False
        return new_task

    def add_task_to_queues(self, tasks):
        for task in tasks['tasks']:
            self.running_task_lock.acquire()
            self.nr_received_tasks += 1
            self.running_task_lock.release()
            new_task = self.process_task(task['exec'])
            task_hash = hashlib.sha1(task['exec'].encode()).hexdigest()
            self.has_new_task |= new_task
            task['qid'] = task_hash
            self.queue_data[self.random_hash]['tpool'].add_task(self.run_task, task)

    def get_latest_task_type(self):
        """
        Return the latest started task type.
        :return:
        """
        self.running_task_lock.acquire()
        latest = self.last_started
        self.running_task_lock.release()
        return latest

    def running_identical_tasks(self):
        """
        Returns whether or not the worker is currently only running
        tasks of the same type.
        :return:
        """
        current = None
        task_threads = self.task_data.keys()
        try:
            for task_thread in task_threads:
                if (task_thread not in self.task_data) or \
                        ('lock' not in self.task_data[task_thread]):
                    continue
                self.task_data[task_thread]['lock'].acquire()
                if 'ctask' in self.task_data[task_thread] and \
                        self.task_data[task_thread]['ctask'] is not None:
                    if current is None:
                        current = self.task_data[task_thread]['ctask']['exec']
                    elif current != self.task_data[task_thread]['ctask']['exec']:
                        self.task_data[task_thread]['lock'].release()
                        return False
                self.task_data[task_thread]['lock'].release()
            return True
        except Exception as e:
            print('Got exception while trying to determine identical tasks')
            print(e)
            sys.stdout.flush()
            # Conservatively report mixed tasks on error.
            return False
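
# A minimal sketch of the worker -> scheduler request emitted by
# wait_and_ask() above. The concrete qid and numbers are made up for
# illustration; the frame layout and payload keys come from the calls
# in this class.
#
#   tasks_to_ask = {'<sha1-of-exec>': 4}    # qid -> number of tasks wanted
#   stats = {'ran': [17, 18], 'qsize': 6}   # finished task ids, weighted backlog
#   frame = [identity, PROTOCOL_HEADERS['WORKER'], 'task',
#            pickle.dumps(tasks_to_ask), pickle.dumps(stats)]
#   sched_client_thread.put_request_in_queue(frame)
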
class Scheduler(object):

    stdin = "/dev/null"
    stdout = "/dev/null"
    stderr = "/dev/null"

    def __init__(self):
        self.running = True
        self.resource_manager = ResourceManager(self)
        self.resource_queues = {'cpu': [], 'memory': [], 'bw': []}
        ''' this is to have multiple applications/queues;
            application = queue '''
        self.task_queue = []       # ready queues; [queue_id]
        self.task_queue_data = {}  # ready queue data; queue_id: [task...]
        # Queues with pending tasks, which do not have the input ready.
        self.pending_queue_data = {}
        self.task_queue_lock = threading.Lock()
        self.pending_task_queue_lock = threading.Lock()
        logfile = config.LOGDIR + "/scheduler.log"
        taskfile = config.LOGDIR + "/task.log"
        self.logger = WeaselLogger('scheduler', logfile)
        self.task_logger = WeaselLogger('tasklogger', taskfile)
        ''' this is the thread that will perform the communication
            with the local schedulers '''
        self.server_thread = ZmqConnectionThread(
            'resourcemng',
            zmq.ROUTER,
            "*:" + str(config.ZMQ_SCHEDULER_PORT),
            self.msg_process_callback)
        ''' this information is for keeping track of files and
            task dependencies '''
        # taskid: {'nfiles': 0, 'ntotalfiles': x, 'outputs': [files]}
        self.task_to_file = {}
        # file: {'ntids': ntids, 'tids': [tids], 'ctids': executed_tasks}
        self.file_to_task = {}
        self.result_queue = Queue()
        self.result_consumer_thread = threading.Thread(target=self.get_result)
        self.result_consumer_thread.start()
        ''' idle worker information and client notification '''
        self.waiting_clients = []
        self.workers_empty_dict = {}
        self.workers = []
        self.workers_data = {}
        self.workers_lock = threading.Lock()
        self.workers_empty = 0
        self.task_id = 0
        ''' to notify workers about new queues '''
        self.new_queues = []
        self.new_queues_lock = threading.Lock()
        ''' here I have:
            - the thread that listens for messages
            - a queue in which the tasks are put
            - the main thread that applies some reconfiguration (?)
        '''
        self.files_to_delete = []

    def delete_queue(self, qid):
        del self.task_queue_data[qid]
        self.task_queue.remove(qid)
        self.new_queues_lock.acquire()
        try:
            self.new_queues.remove(qid)
        except:
            pass
        self.new_queues_lock.release()

    def get_taskids(self):
        tasks_ids = []
        try:
            tasks_ids = self.result_queue.get(
                block=True, timeout=8 * config.WAITTIME)
        except:
            # Nothing completed recently: re-scan the pending queues and
            # promote every task whose input files have appeared on disk.
            self.pending_task_queue_lock.acquire()
            pending_queue = self.pending_queue_data
            for pqueue in pending_queue:
                to_delete = []
                for tid in pending_queue[pqueue]:
                    current_inputs = 0
                    tinputs = len(pending_queue[pqueue][tid]['inputs'])
                    for inputf in pending_queue[pqueue][tid]['inputs']:
                        if os.path.isfile(inputf):
                            current_inputs = current_inputs + 1
                    if current_inputs == tinputs:
                        task = pending_queue[pqueue][tid]
                        data = {
                            'id': tid,
                            'exec': task['exec'],
                            'params': task['params']}
                        to_delete.append(tid)
                        self.task_queue_lock.acquire()
                        is_new = False
                        try:
                            self.task_queue_data[pqueue].append(data)
                        except:
                            self.task_queue.append(pqueue)
                            self.task_queue_data[pqueue] = deque()
                            self.task_queue_data[pqueue].append(data)
                            is_new = True
                        self.task_queue_lock.release()
                        if is_new:
                            self.new_queues_lock.acquire()
                            self.new_queues.append(pqueue)
                            self.new_queues_lock.release()
                for tid in to_delete:
                    del self.pending_queue_data[pqueue][tid]
            self.pending_task_queue_lock.release()
        return tasks_ids

    def check_dependencies_per_task(self, taskid):
        for fileid in self.task_to_file[taskid]['outputs']:
            dependent_tasks = []
            try:
                dependent_tasks = self.file_to_task[
                    hashlib.sha1(fileid.encode()).hexdigest()]['tids']
            except:
                pass
            for taskid2 in dependent_tasks:
                self.task_to_file[taskid2]['cinputs'] = \
                    self.task_to_file[taskid2]['cinputs'] + 1
                if self.task_to_file[taskid2]['cinputs'] == \
                        self.task_to_file[taskid2]['tinputs']:
                    # Put it in the ready queue.
                    self.pending_task_queue_lock.acquire()
                    try:
                        task = self.pending_queue_data[
                            self.task_to_file[taskid2]['queueid']][taskid2]
                    except:
                        self.pending_task_queue_lock.release()
                        continue
                    self.pending_task_queue_lock.release()
                    data = {
                        'id': taskid2,
                        'exec': task['exec'],
                        'params': task['params']}
                    self.task_queue_lock.acquire()
                    is_new = False
                    try:
                        self.task_queue_data[
                            self.task_to_file[taskid2]['queueid']].append(data)
                    except:
                        self.task_queue.append(
                            self.task_to_file[taskid2]['queueid'])  # FCFS like
                        self.task_queue_data[
                            self.task_to_file[taskid2]['queueid']] = deque()
                        self.task_queue_data[
                            self.task_to_file[taskid2]['queueid']].append(data)
                        is_new = True
                    self.task_queue_lock.release()
                    if is_new:
                        self.new_queues_lock.acquire()
                        self.new_queues.append(
                            self.task_to_file[taskid2]['queueid'])
                        self.new_queues_lock.release()
                    self.pending_task_queue_lock.acquire()
                    del self.pending_queue_data[
                        self.task_to_file[taskid2]['queueid']][taskid2]
                    self.pending_task_queue_lock.release()

    def garbage_collect(self, taskid):
        for fileid in self.task_to_file[taskid]['inputs']:
            try:
                fhash = hashlib.sha1(fileid.encode()).hexdigest()
                self.file_to_task[fhash]['ctids'] = \
                    self.file_to_task[fhash]['ctids'] + 1
                if self.file_to_task[fhash]['ctids'] == \
                        self.file_to_task[fhash]['ntids']:
                    # Now it is safe to delete this file (is it?).
                    try:
                        print "deleting file ", fileid, "ctids=", \
                            self.file_to_task[fhash]['ctids'], \
                            "ntids=", self.file_to_task[fhash]['ntids']
                        # os.remove(fileid)
                        # If it failed, report it back to the user.
                    except OSError as e:
                        print "Error: %s - %s." % (e.filename, e.strerror)
            except:
                print "exception for file ", fileid

    def get_result(self):
        while self.running:
            self.pending_task_queue_lock.acquire()
            to_delete = []
            for pqueue in self.pending_queue_data:
                if len(self.pending_queue_data[pqueue]) == 0:
                    to_delete.append(pqueue)
            for pqueue in to_delete:
                del self.pending_queue_data[pqueue]
            self.pending_task_queue_lock.release()
            tasks_ids = self.get_taskids()
            if len(tasks_ids) == 0:
                continue
            for taskid in tasks_ids:
                # If missing files were generated, put the task in the
                # ready queue.
                try:
                    self.check_dependencies_per_task(taskid)
                except:
                    traceback.print_exc()

    def msg_process_callback(self, message):
        message_type = message[3]
        try:
            if message_type == PROTOCOL_HEADERS['CLIENT']:
                self.process_client_message(message)
            elif message_type == PROTOCOL_HEADERS['WORKER']:
                self.process_worker_message(message)
        except:
            traceback.print_exc()

    def process_queue(self, data, qid):
        task_data = pickle.loads(data)
        # Analyze the given DAG.
        self.resource_manager.initialize_dag(task_data)
        self.task_queue_lock.acquire()
        self.pending_task_queue_lock.acquire()
        for task in task_data:
            self.process_queue_task(task, qid)
        self.pending_task_queue_lock.release()
        self.task_queue_lock.release()

    def process_queue_task(self, task_data, qid):
        self.task_id = self.task_id + 1
        splited_inputs = task_data['inputs'].split()
        total_inputs = len(splited_inputs)
        current_inputs = 0
        for inputf in splited_inputs:
            if os.path.isfile(inputf):
                current_inputs = current_inputs + 1
            fhash = hashlib.sha1(inputf.encode()).hexdigest()
            try:
                self.file_to_task[fhash]['tids'].append(self.task_id)
                self.file_to_task[fhash]['ntids'] = \
                    self.file_to_task[fhash]['ntids'] + 1
            except:
                self.file_to_task[fhash] = {'ntids': 1, 'tids': [], 'ctids': 0}
                self.file_to_task[fhash]['tids'] = [self.task_id]
        self.task_to_file[self.task_id] = {
            'queueid': qid,
            'tinputs': total_inputs,
            'cinputs': current_inputs,
            'inputs': splited_inputs,
            'outputs': task_data['outputs'].split()}
        if current_inputs == total_inputs:
            print "Putting task ", self.task_id, " in active queue", qid
            if qid not in self.task_queue:
                try:
                    self.new_queues_lock.acquire()
                    self.new_queues.append(qid)
                    self.new_queues_lock.release()
                    self.task_queue.append(qid)  # FCFS like
                    self.task_queue_data[qid] = deque()
                except:
                    traceback.print_exc()
            self.task_queue_data[qid].append(
                {'id': self.task_id,
                 'exec': task_data['exec'],
                 'params': task_data['params']})
        else:
            print("Putting task %s in pending queue, total inputs: %s"
                  % (self.task_id, total_inputs))
            task_info = {
                'id': self.task_id,
                'exec': task_data['exec'],
                'params': task_data['params'],
                'inputs': splited_inputs}
            try:
                self.pending_queue_data[qid][self.task_id] = task_info
            except:
                self.pending_queue_data[qid] = {}
                self.pending_queue_data[qid][self.task_id] = task_info

    def process_status(self, qid, message):
        self.task_queue_lock.acquire()
        ntasks = len(self.task_queue_data[qid])
        self.task_queue_lock.release()
        reply = [message[0], PROTOCOL_HEADERS['RSMNG'], 'status', str(ntasks)]
        self.server_thread.put_request_in_queue(reply)

    def process_wait(self, qid, message):
        self.workers_lock.acquire()
        self.workers_empty = 0
        self.workers_empty_dict = {}
        self.workers_lock.release()
        self.waiting_clients.append(message[0])

    def process_delete(self, data):
        worker_id = 'sched-' + data
        with open(os.devnull, 'w') as devnull:
            proc = subprocess.Popen(
                "ssh %s \" pkill -9 local_resourcem \" " % data,
                shell=True, stdout=devnull, stderr=devnull)
            proc.wait()
        # We delete all info about the worker....
        self.workers_lock.acquire()
        self.workers.remove(worker_id)
        del self.workers_data[worker_id]
        self.workers_lock.release()

    def process_client_message(self, message):
        tmp_msg = message[4:]
        qid = message[1]
        action = tmp_msg[0]
        data = None
        if len(tmp_msg) > 1:
            data = tmp_msg[1]
        print "I receive message for queue ", qid, " action: ", action
        if action == 'queue':
            if data is None:
                # print "Missing task information"
                return
            self.process_queue(data, qid)
        elif action == 'status':
            self.process_status(qid, message)
        elif action == 'wait':
            self.process_wait(qid, message)
        elif action == 'clear':
            print "I will empty the worker queues"
            for worker in self.workers:
                self.server_thread.put_request_in_queue(
                    [worker, PROTOCOL_HEADERS['RSMNG'], 'empty'])
        elif action == 'delete':
            self.process_delete(data)
        else:
            print "Not implemented yet"

    def process_worker_nonempty_queue_static(self, tasks_per_queue, answer):
        queue_id = self.task_queue[0]
        ntasks = min(len(self.task_queue_data[queue_id]), int(tasks_per_queue))
        for i in range(0, ntasks):
            task = self.task_queue_data[queue_id].popleft()
            answer['tasks'].append(task)
        if len(self.task_queue_data[queue_id]) == 0:
            self.delete_queue(queue_id)
        return answer

    def process_worker_nonempty_queue_static1(self, tasks_per_queue, answer):
        # Send only from the first queue.
        queue_id = self.task_queue[0]
        if queue_id not in tasks_per_queue:
            ntasks = min(len(self.task_queue_data[queue_id]),
                         int(tasks_per_queue.values()[0]))
        else:
            ntasks = min(len(self.task_queue_data[queue_id]),
                         int(tasks_per_queue[queue_id]))
        for i in range(0, ntasks):
            task = self.task_queue_data[queue_id].popleft()
            answer['tasks'].append(task)
        if len(self.task_queue_data[queue_id]) == 0:
            self.delete_queue(queue_id)
        return answer

    def process_worker_nonempty_queue(self, worker_id, type, data):
        ''' FCFS queue for the static policy '''
        randn = 0
        queue_id = self.task_queue[randn]
        answer = {'queues': {}, 'tasks': []}
        if type == 'task_empty':
            tasks_per_queue = data
            answer = self.process_worker_nonempty_queue_static(
                tasks_per_queue, answer)
        else:
            tasks_per_queue = pickle.loads(data)
            answer = self.process_worker_nonempty_queue_static1(
                tasks_per_queue, answer)
        for qid in self.task_queue_data:
            if qid not in tasks_per_queue:
                answer['queues'][qid] = 0
        # TODO: piggyback the ids of other queues, if no queues > 1
        ''' if we don't have any data left in the queue we delete it '''
        ''' if the worker sent us an empty message reset it '''
        self.workers_lock.acquire()
        if self.workers_empty_dict.get(worker_id):
            del self.workers_empty_dict[worker_id]
        self.workers_lock.release()
        return answer

    def process_worker_message(self, message):
        tmp_msg = message[4:]
        type = tmp_msg[0]
        # If we are dealing with a resource or statistics message, let
        # the resource manager handle it.
        if type == 'resource' or type == 'statistics':
            self.resource_manager.process_worker_message(message)
            return
        else:
            data = None
            first_worker = False
            self.workers_lock.acquire()
            if message[0] not in self.workers:
                self.workers.append(message[0])
                self.workers_data[message[0]] = 0
            if len(tmp_msg) > 1:
                data = tmp_msg[1]
            if len(tmp_msg) > 2:
                data2 = pickle.loads(tmp_msg[2])
                computed_tasks = data2['ran']
                worker_current_size = data2['qsize']
                self.workers_data[message[0]] = float(worker_current_size)
                if len(computed_tasks) > 0:
                    # Tell the resource manager that this worker has
                    # finished these tasks.
                    self.resource_manager.finished_tasks(message[0], computed_tasks)
                    self.result_queue.put(computed_tasks)
            if type == 'task_empty':
                self.workers_data[message[0]] = 0
            self.workers_lock.release()
        if type == 'task' or type == 'task_empty':
            ''' send x tasks to it '''
            ''' this is a hack; it works only with one client '''
            self.task_queue_lock.acquire()
            task_queue_len = len(self.task_queue)
            self.task_queue_lock.release()
            self.pending_task_queue_lock.acquire()
            pending_task_queue_len = len(self.pending_queue_data)
            self.pending_task_queue_lock.release()
            if type == 'task_empty' and task_queue_len == 0 \
                    and pending_task_queue_len == 0 \
                    and len(self.waiting_clients) > 0:
                self.workers_lock.acquire()
                if not self.workers_empty_dict.get(message[0]) and not first_worker:
                    self.workers_empty_dict[message[0]] = 1
                self.workers_lock.release()
            answer = {'queues': {}, 'tasks': []}
            wake_client = False
            self.task_queue_lock.acquire()
            task_queue_len = len(self.task_queue)
            sent_tasks = False
            if task_queue_len > 0:
                sent_tasks = True
                answer = self.process_worker_nonempty_queue(message[0], type, data)
            else:
                self.workers_lock.acquire()
                if pending_task_queue_len == 0 and \
                        len(self.workers_empty_dict) >= len(self.workers):
                    wake_client = True
                self.workers_lock.release()
            self.task_queue_lock.release()
            if len(answer['tasks']) > 0:
                data = pickle.dumps(answer)
                self.server_thread.put_request_in_queue(
                    [message[0], PROTOCOL_HEADERS['RSMNG'], 'task', data])
                # Inform the resource manager we have sent tasks.
                self.resource_manager.sent_tasks_to_worker(message[0], answer['tasks'])
            if wake_client:
                for client in self.waiting_clients:
                    # Calculate the cost.
                    self.resource_manager.calculate_cost()
                    print "Sending wake up message to ", client
                    self.server_thread.put_request_in_queue(
                        [client, PROTOCOL_HEADERS['RSMNG'], 'done'])
                self.waiting_clients = []
                self.workers_empty = 0
                return
            if sent_tasks:
                return
        elif type == 'output':
            ''' un-serialize the output and append it to a log '''
            output = pickle.loads(data)
            self.task_logger.info(output)

    def send_message(self, message):
        """
        Convenience method to let the resource manager send a message.
        :param message:
        :return:
        """
        self.server_thread.put_request_in_queue(message)

    def run(self):
        self.server_thread.start()
        while self.running:
            try:
                ''' compute the number of workers and data nodes
                    for each scheduling period '''
                if self.running:
                    time.sleep(config.WAITTIME)
            except KeyboardInterrupt:
                self.shutdown()
            except:
                traceback.print_exc()
                try:
                    time.sleep(config.WAITTIME)
                except KeyboardInterrupt:
                    self.shutdown()
        print "Stopping communication thread...."
        sys.stdout.flush()
        self.logger.info("Stopping communication thread....")
        self.server_thread.stop()
        print "Joining communication thread...."
        sys.stdout.flush()
        self.logger.info("Joining communication thread....")
        self.server_thread.join()
        if config.START_MEMFS:
            print "Stopping MemFS"
            sys.stdout.flush()
            self.resource_manager.stop_memfs()
        print "DONE"
        self.logger.info("DONE")
        return

    def shutdown(self):
        print "Received signal to shutdown. "
        sys.stdout.flush()
        self.logger.info("Received signal to shutdown. Will wait for the "
                         "end of the scheduling period")
        self.running = False
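
# A minimal standalone sketch (stdlib only; names are illustrative, not part
# of Weasel) of the pending -> ready promotion rule that get_taskids() and
# process_queue_task() above implement: a task becomes runnable once every
# one of its input files exists on disk.
def _promote_ready_sketch(pending, ready, ready_queues):
    # pending: {qid: {tid: {'inputs': [...], 'exec': ..., 'params': ...}}}
    # ready:   {qid: deque of {'id', 'exec', 'params'}}; ready_queues: [qid]
    for qid in list(pending):
        for tid in list(pending[qid]):
            task = pending[qid][tid]
            if all(os.path.isfile(f) for f in task['inputs']):
                if qid not in ready:
                    ready[qid] = deque()
                    ready_queues.append(qid)  # FCFS: new queues go to the back
                ready[qid].append({'id': tid, 'exec': task['exec'],
                                   'params': task['params']})
                del pending[qid][tid]
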
# Second variant of the Scheduler (apparently from a separate module): no
# ResourceManager integration, and with the dynamic distribution policies.
class Scheduler(object):

    stdin = "/dev/null"
    stdout = "/dev/null"
    stderr = "/dev/null"

    def __init__(self):
        self.running = True
        self.resource_queues = {'cpu': [], 'memory': [], 'bw': []}
        ''' this is to have multiple applications/queues;
            application = queue '''
        self.task_queue = []       # ready queues; [queue_id]
        self.task_queue_data = {}  # ready queue data; queue_id: [task...]
        # Queues with pending tasks, which do not have the input ready.
        self.pending_queue_data = {}
        self.task_queue_lock = threading.Lock()
        self.pending_task_queue_lock = threading.Lock()
        logfile = config.LOGDIR + "/scheduler.log"
        taskfile = config.LOGDIR + "/task.log"
        self.logger = WeaselLogger('scheduler', logfile)
        self.task_logger = WeaselLogger('tasklogger', taskfile)
        ''' this is the thread that will perform the communication
            with the local schedulers '''
        self.server_thread = ZmqConnectionThread(
            'resourcemng',
            zmq.ROUTER,
            "*:" + str(config.ZMQ_SCHEDULER_PORT),
            self.msg_process_callback)
        ''' this information is for keeping track of files and
            task dependencies '''
        # taskid: {'nfiles': 0, 'ntotalfiles': x, 'outputs': [files]}
        self.task_to_file = {}
        # file: {'ntids': ntids, 'tids': [tids], 'ctids': executed_tasks}
        self.file_to_task = {}
        self.result_queue = Queue()
        self.result_consumer_thread = threading.Thread(target=self.get_result)
        self.result_consumer_thread.start()
        ''' idle worker information and client notification '''
        self.waiting_clients = []
        self.workers_empty_dict = {}
        self.workers = []
        self.workers_data = {}
        self.workers_lock = threading.Lock()
        self.workers_empty = 0
        self.task_id = 0
        ''' to notify workers about new queues '''
        self.new_queues = []
        self.new_queues_lock = threading.Lock()
        ''' here I have:
            - the thread that listens for messages
            - a queue in which the tasks are put
            - the main thread that applies some reconfiguration (?)
        '''
        self.files_to_delete = []

    def delete_queue(self, qid):
        del self.task_queue_data[qid]
        self.task_queue.remove(qid)
        self.new_queues_lock.acquire()
        try:
            self.new_queues.remove(qid)
        except:
            pass
        self.new_queues_lock.release()

    def get_taskids(self):
        tasks_ids = []
        try:
            tasks_ids = self.result_queue.get(
                block=True, timeout=8 * config.WAITTIME)
        except:
            self.pending_task_queue_lock.acquire()
            pending_queue = self.pending_queue_data
            for pqueue in pending_queue:
                to_delete = []
                for tid in pending_queue[pqueue]:
                    current_inputs = 0
                    tinputs = len(pending_queue[pqueue][tid]['inputs'])
                    for inputf in pending_queue[pqueue][tid]['inputs']:
                        if os.path.isfile(inputf):
                            current_inputs = current_inputs + 1
                        #else:
                        #    print "Missing file ", inputf
                    if current_inputs == tinputs:
                        task = pending_queue[pqueue][tid]
                        data = {
                            'id': tid,
                            'exec': task['exec'],
                            'params': task['params']}
                        to_delete.append(tid)
                        self.task_queue_lock.acquire()
                        is_new = False
                        try:
                            self.task_queue_data[pqueue].append(data)
                        except:
                            self.task_queue.append(pqueue)
                            self.task_queue_data[pqueue] = deque()
                            self.task_queue_data[pqueue].append(data)
                            is_new = True
                        self.task_queue_lock.release()
                        if is_new:
                            self.new_queues_lock.acquire()
                            self.new_queues.append(pqueue)
                            self.new_queues_lock.release()
                for tid in to_delete:
                    del self.pending_queue_data[pqueue][tid]
            self.pending_task_queue_lock.release()
        return tasks_ids

    def check_dependencies_per_task(self, taskid):
        for fileid in self.task_to_file[taskid]['outputs']:
            dependent_tasks = []
            try:
                dependent_tasks = self.file_to_task[
                    hashlib.sha1(fileid.encode()).hexdigest()]['tids']
            except:
                pass
            for taskid2 in dependent_tasks:
                self.task_to_file[taskid2]['cinputs'] = \
                    self.task_to_file[taskid2]['cinputs'] + 1
                if self.task_to_file[taskid2]['cinputs'] == \
                        self.task_to_file[taskid2]['tinputs']:
                    # Put it in the ready queue.
                    self.pending_task_queue_lock.acquire()
                    try:
                        task = self.pending_queue_data[
                            self.task_to_file[taskid2]['queueid']][taskid2]
                    except:
                        self.pending_task_queue_lock.release()
                        continue
                    self.pending_task_queue_lock.release()
                    data = {
                        'id': taskid2,
                        'exec': task['exec'],
                        'params': task['params']}
                    self.task_queue_lock.acquire()
                    is_new = False
                    try:
                        self.task_queue_data[
                            self.task_to_file[taskid2]['queueid']].append(data)
                    except:
                        self.task_queue.append(
                            self.task_to_file[taskid2]['queueid'])  # FCFS like
                        self.task_queue_data[
                            self.task_to_file[taskid2]['queueid']] = deque()
                        self.task_queue_data[
                            self.task_to_file[taskid2]['queueid']].append(data)
                        is_new = True
                    self.task_queue_lock.release()
                    if is_new:
                        self.new_queues_lock.acquire()
                        self.new_queues.append(
                            self.task_to_file[taskid2]['queueid'])
                        self.new_queues_lock.release()
                    self.pending_task_queue_lock.acquire()
                    del self.pending_queue_data[
                        self.task_to_file[taskid2]['queueid']][taskid2]
                    self.pending_task_queue_lock.release()

    def garbage_collect(self, taskid):
        for fileid in self.task_to_file[taskid]['inputs']:
            try:
                fhash = hashlib.sha1(fileid.encode()).hexdigest()
                self.file_to_task[fhash]['ctids'] = \
                    self.file_to_task[fhash]['ctids'] + 1
                if self.file_to_task[fhash]['ctids'] == \
                        self.file_to_task[fhash]['ntids']:
                    # Now it is safe to delete this file (is it?).
                    try:
                        print "deleting file ", fileid, "ctids=", \
                            self.file_to_task[fhash]['ctids'], \
                            "ntids=", self.file_to_task[fhash]['ntids']
                        self.logger.debug(
                            "File: %s dependent_tasks: %s" %
                            (fileid, self.file_to_task[fhash]['ntids']))
                        # os.remove(fileid)
                        # If it failed, report it back to the user.
                    except OSError as e:
                        print "Error: %s - %s." % (e.filename, e.strerror)
            except:
                print "exception for file ", fileid

    def get_result(self):
        while self.running:
            self.pending_task_queue_lock.acquire()
            to_delete = []
            for pqueue in self.pending_queue_data:
                if len(self.pending_queue_data[pqueue]) == 0:
                    to_delete.append(pqueue)
            for pqueue in to_delete:
                del self.pending_queue_data[pqueue]
            self.pending_task_queue_lock.release()
            tasks_ids = self.get_taskids()
            if len(tasks_ids) == 0:
                continue
            for taskid in tasks_ids:
                # If missing files were generated, put the task in the
                # ready queue.
                try:
                    self.check_dependencies_per_task(taskid)
                except:
                    traceback.print_exc()
            '''
            for taskid in tasks_ids:
                try:
                    self.garbage_collect(taskid)
                except:
                    print "I cannot find ", taskid, "in task_to_file"
            '''

    def msg_process_callback(self, message):
        message_type = message[3]
        try:
            if message_type == PROTOCOL_HEADERS['CLIENT']:
                self.process_client_message(message)
            elif message_type == PROTOCOL_HEADERS['WORKER']:
                self.process_worker_message(message)
        except:
            traceback.print_exc()

    def process_queue(self, data, qid):
        task_data = pickle.loads(data)
        self.task_queue_lock.acquire()
        self.pending_task_queue_lock.acquire()
        for task in task_data:
            self.process_queue_task(task, qid)
        self.pending_task_queue_lock.release()
        self.task_queue_lock.release()

    def process_queue_task(self, task_data, qid):
        self.task_id = self.task_id + 1
        splited_inputs = task_data['inputs'].split()
        total_inputs = len(splited_inputs)
        current_inputs = 0
        for inputf in splited_inputs:
            if os.path.isfile(inputf):
                current_inputs = current_inputs + 1
            #else:
            #    print "Missing file: ", inputf
            fhash = hashlib.sha1(inputf.encode()).hexdigest()
            try:
                self.file_to_task[fhash]['tids'].append(self.task_id)
                self.file_to_task[fhash]['ntids'] = \
                    self.file_to_task[fhash]['ntids'] + 1
            except:
                self.file_to_task[fhash] = {'ntids': 1, 'tids': [], 'ctids': 0}
                self.file_to_task[fhash]['tids'] = [self.task_id]
        self.task_to_file[self.task_id] = {
            'queueid': qid,
            'tinputs': total_inputs,
            'cinputs': current_inputs,
            'inputs': splited_inputs,
            'outputs': task_data['outputs'].split()}
        if current_inputs == total_inputs:
            print "Putting task ", self.task_id, " in active queue", qid
            if qid not in self.task_queue:
                try:
                    self.new_queues_lock.acquire()
                    self.new_queues.append(qid)
                    self.new_queues_lock.release()
                    self.task_queue.append(qid)  # FCFS like
                    self.task_queue_data[qid] = deque()
                except:
                    traceback.print_exc()
            self.task_queue_data[qid].append(
                {'id': self.task_id,
                 'exec': task_data['exec'],
                 'params': task_data['params']})
        else:
            self.logger.info(
                "Putting task %s in pending queue, total inputs: %s"
                % (self.task_id, total_inputs))
            task_info = {
                'id': self.task_id,
                'exec': task_data['exec'],
                'params': task_data['params'],
                'inputs': splited_inputs}
            try:
                self.pending_queue_data[qid][self.task_id] = task_info
            except:
                self.pending_queue_data[qid] = {}
                self.pending_queue_data[qid][self.task_id] = task_info

    def process_status(self, qid, message):
        self.task_queue_lock.acquire()
        ntasks = len(self.task_queue_data[qid])
        self.task_queue_lock.release()
        reply = [message[0], PROTOCOL_HEADERS['RSMNG'], 'status', str(ntasks)]
        self.server_thread.put_request_in_queue(reply)

    def process_wait(self, qid, message):
        self.workers_lock.acquire()
        self.workers_empty = 0
        self.workers_empty_dict = {}
        self.workers_lock.release()
        self.waiting_clients.append(message[0])

    def process_delete(self, data):
        worker_id = 'sched-' + data
        with open(os.devnull, 'w') as devnull:
            proc = subprocess.Popen(
                "ssh %s \" pkill -9 local_resourcem \" " % data,
                shell=True, stdout=devnull, stderr=devnull)
            proc.wait()
        # We delete all info about the worker....
        self.workers_lock.acquire()
        self.workers.remove(worker_id)
        del self.workers_data[worker_id]
        self.workers_lock.release()

    def process_client_message(self, message):
        tmp_msg = message[4:]
        qid = message[1]
        action = tmp_msg[0]
        data = None
        if len(tmp_msg) > 1:
            data = tmp_msg[1]
        print "I receive message for queue ", qid, " action: ", action
        if action == 'queue':
            if data is None:
                # print "Missing task information"
                return
            self.process_queue(data, qid)
        elif action == 'status':
            self.process_status(qid, message)
        elif action == 'wait':
            self.process_wait(qid, message)
        elif action == 'clear':
            print "I will empty the worker queues"
            for worker in self.workers:
                self.server_thread.put_request_in_queue(
                    [worker, PROTOCOL_HEADERS['RSMNG'], 'empty'])
        elif action == 'delete':
            self.process_delete(data)
        else:
            print "Not implemented yet"

    def process_worker_nonempty_queue_static(self, tasks_per_queue, answer):
        queue_id = self.task_queue[0]
        ntasks = min(len(self.task_queue_data[queue_id]), int(tasks_per_queue))
        for i in range(0, ntasks):
            task = self.task_queue_data[queue_id].popleft()
            answer['tasks'].append(task)
        if len(self.task_queue_data[queue_id]) == 0:
            self.delete_queue(queue_id)
        return answer

    def process_worker_nonempty_queue_dynamic(self, queue_id, tasks_per_queue, answer):
        # Send from each queue.
        print "[Empty] Asked ", queue_id, "tasks per queue ", tasks_per_queue
        to_delete = []
        for qid in self.task_queue_data:
            # If the active queue len is smaller than what the worker asked
            # and the pending queue len is larger than that, do not send?
            req_tasks = int(tasks_per_queue)
            self.pending_task_queue_lock.acquire()
            pending_qid = self.pending_queue_data.get(qid)
            pending_qid_len = 0
            if pending_qid is not None:
                pending_qid_len = len(pending_qid)
            self.pending_task_queue_lock.release()
            #if pending_qid:
            #    if pending_qid_len > req_tasks and len(
            #            self.task_queue_data[qid]) < req_tasks:
            #        print "I have in queue less than asked and in pending queue more!"
            #        continue
            ntasks = min(len(self.task_queue_data[qid]), req_tasks)
            answer['queues'][qid] = ntasks
            print "Asked for ", tasks_per_queue, "Sending ", ntasks, \
                "from queue ", qid, " from max ntasks ", \
                len(self.task_queue_data[qid])
            for i in range(0, ntasks):
                task = self.task_queue_data[qid].popleft()
                answer['tasks'].append(task)
            if ntasks > 0 and len(self.task_queue_data[qid]) == 0:
                to_delete.append(qid)
        for qid in to_delete:
            self.delete_queue(qid)
        return answer

    def process_worker_nonempty_queue_static1(self, tasks_per_queue, answer):
        # Send only from the first queue.
        queue_id = self.task_queue[0]
        if queue_id not in tasks_per_queue:
            ntasks = min(len(self.task_queue_data[queue_id]),
                         int(tasks_per_queue.values()[0]))
        else:
            ntasks = min(len(self.task_queue_data[queue_id]),
                         int(tasks_per_queue[queue_id]))
        for i in range(0, ntasks):
            task = self.task_queue_data[queue_id].popleft()
            answer['tasks'].append(task)
        if len(self.task_queue_data[queue_id]) == 0:
            self.delete_queue(queue_id)
        return answer

    def process_worker_nonempty_queue_dynamic1(self, tasks_per_queue, answer):
        print "####################### Asked ", tasks_per_queue
        for qid in tasks_per_queue:
            # For each queue from which the worker asks, check whether
            # I have enough tasks to send.
            req_tasks = int(tasks_per_queue[qid])
            self.pending_task_queue_lock.acquire()
            if self.pending_queue_data.get(qid) and self.task_queue_data.get(qid):
                if len(self.task_queue_data) > 1 and \
                        len(self.pending_queue_data[qid]) > req_tasks and \
                        len(self.task_queue_data[qid]) < req_tasks:
                    self.pending_task_queue_lock.release()
                    continue
            self.pending_task_queue_lock.release()
            try:
                ntasks = min(len(self.task_queue_data[qid]), req_tasks)
            except:
                answer['queues'][qid] = -1
                ntasks = -1
                continue
            answer['queues'][qid] = ntasks
            if ntasks > 0:
                print "Sending ", ntasks, " from queue ", qid
                for i in range(0, ntasks):
                    task = self.task_queue_data[qid].popleft()
                    answer['tasks'].append(task)
                if len(self.task_queue_data[qid]) == 0:
                    self.delete_queue(qid)
        return answer

    def process_worker_nonempty_queue(self, worker_id, type, data):
        ''' FCFS queue for the static policy '''
        randn = 0
        queue_id = self.task_queue[randn]
        answer = {'queues': {}, 'tasks': []}
        if type == 'task_empty':
            tasks_per_queue = data
            if config.POLICY == 'static':
                # Send from the first queue.
                answer = self.process_worker_nonempty_queue_static(
                    tasks_per_queue, answer)
            else:
                answer = self.process_worker_nonempty_queue_dynamic(
                    queue_id, tasks_per_queue, answer)
        else:
            tasks_per_queue = pickle.loads(data)
            print tasks_per_queue
            if config.POLICY == "static":
                answer = self.process_worker_nonempty_queue_static1(
                    tasks_per_queue, answer)
            else:
                answer = self.process_worker_nonempty_queue_dynamic1(
                    tasks_per_queue, answer)
        for qid in self.task_queue_data:
            if qid not in tasks_per_queue:
                answer['queues'][qid] = 0
        print "Sending ", len(answer['tasks']), answer['queues'], \
            " to worker ", worker_id
        # TODO: piggyback the ids of other queues, if no queues > 1
        ''' if we don't have any data left in the queue we delete it '''
        ''' if the worker sent us an empty message reset it '''
        self.workers_lock.acquire()
        if self.workers_empty_dict.get(worker_id):
            del self.workers_empty_dict[worker_id]
        self.workers_lock.release()
        return answer

    def process_worker_message(self, message):
        tmp_msg = message[4:]
        type = tmp_msg[0]
        data = None
        first_worker = False
        self.workers_lock.acquire()
        if message[0] not in self.workers:
            self.workers.append(message[0])
            self.workers_data[message[0]] = 0
            #first_worker = True
        if len(tmp_msg) > 1:
            data = tmp_msg[1]
        if len(tmp_msg) > 2:
            data2 = pickle.loads(tmp_msg[2])
            computed_tasks = data2['ran']
            worker_current_size = data2['qsize']
            self.workers_data[message[0]] = float(worker_current_size)
            if len(computed_tasks) > 0:
                self.result_queue.put(computed_tasks)
        if type == 'task_empty':
            self.workers_data[message[0]] = 0
        self.workers_lock.release()
        if type == 'task' or type == 'task_empty':
            ''' send x tasks to it '''
            ''' this is a hack; it works only with one client '''
            self.task_queue_lock.acquire()
            task_queue_len = len(self.task_queue)
            self.task_queue_lock.release()
            self.pending_task_queue_lock.acquire()
            pending_task_queue_len = len(self.pending_queue_data)
            self.pending_task_queue_lock.release()
            if type == 'task_empty' and task_queue_len == 0 \
                    and pending_task_queue_len == 0 \
                    and len(self.waiting_clients) > 0:
                self.workers_lock.acquire()
                if not self.workers_empty_dict.get(message[0]) and not first_worker:
                    self.workers_empty_dict[message[0]] = 1
                self.workers_lock.release()
            answer = {'queues': {}, 'tasks': []}
            wake_client = False
            self.task_queue_lock.acquire()
            task_queue_len = len(self.task_queue)
            sent_tasks = False
            if task_queue_len > 0:
                sent_tasks = True
                answer = self.process_worker_nonempty_queue(message[0], type, data)
            else:
                print self.workers_empty_dict
                self.workers_lock.acquire()
                if pending_task_queue_len == 0 and \
                        len(self.workers_empty_dict) >= len(self.workers):
                    wake_client = True
                self.workers_lock.release()
            self.task_queue_lock.release()
            if len(answer['tasks']) > 0:
                print "Sending ", answer, " to worker ", message[0]
                data = pickle.dumps(answer)
                self.server_thread.put_request_in_queue(
                    [message[0], PROTOCOL_HEADERS['RSMNG'], 'task', data])
            if wake_client:
                for client in self.waiting_clients:
                    print "Sending wake up message to ", client
                    self.server_thread.put_request_in_queue(
                        [client, PROTOCOL_HEADERS['RSMNG'], 'done'])
                self.waiting_clients = []
                self.workers_empty = 0
                return
            if sent_tasks:
                return
            # If my queues are empty, we apply the work stealing algorithm....
        elif type == 'output':
            ''' un-serialize the output and append it to a log '''
            output = pickle.loads(data)
            self.task_logger.info(output)

    ''' Main scheduling LOOP '''
    def run(self):
        self.server_thread.start()
        while self.running:
            try:
                self.logger.debug("Scheduling.......")
                ''' compute the number of workers and data nodes
                    for each scheduling period '''
                self.task_queue_lock.acquire()
                print "*** Task queue is: ", self.task_queue
                for aqueue in self.task_queue:
                    print "Queue ", aqueue, "len ", len(self.task_queue_data[aqueue])
                self.task_queue_lock.release()
                self.pending_task_queue_lock.acquire()
                print "Task pending queue is: ", self.pending_queue_data.keys()
                for pqueue in self.pending_queue_data:
                    print "Queue ", pqueue, "len ", len(self.pending_queue_data[pqueue])
                self.pending_task_queue_lock.release()
                if self.running:
                    time.sleep(config.WAITTIME)
            except KeyboardInterrupt:
                self.shutdown()
            except:
                traceback.print_exc()
                try:
                    time.sleep(config.WAITTIME)
                except KeyboardInterrupt:
                    self.shutdown()
        print "Stopping communication thread...."
        self.logger.info("Stopping communication thread....")
        self.server_thread.stop()
        print "Joining communication thread...."
        self.logger.info("Joining communication thread....")
        self.server_thread.join()
        print "DONE"
        self.logger.info("DONE")
        return

    def shutdown(self):
        print "Received signal to shutdown. "
        self.logger.info("Received signal to shutdown. Will wait for the "
                         "end of the scheduling period")
        self.running = False
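
# Worked example of the dynamic per-queue handout implemented by
# process_worker_nonempty_queue_dynamic1() above (numbers are illustrative):
# a worker asks {'qA': 4, 'qB': 8}; qA holds 10 ready tasks, qB holds 3.
#   qA: ntasks = min(10, 4) = 4 -> send 4 tasks, 6 stay queued
#   qB: ntasks = min(3, 8)  = 3 -> send 3 tasks; qB drains and is deleted
# (assuming the pending-queue guard does not skip the queue). Queues the
# worker did not ask about are reported back with answer['queues'][qid] = 0
# so the worker learns they exist.
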
# Second variant of the NodeScheduler (apparently from a separate module):
# identity resolved via DNS instead of ifconfig, with profiling support.
class NodeScheduler(object):

    def __init__(self):
        self.identity = 'sched-' + socket.gethostbyname(socket.gethostname())
        self.sched_client_thread = ZmqConnectionThread(
            self.identity,
            zmq.DEALER,
            config.SCHEDULER + ":" + str(config.ZMQ_SCHEDULER_PORT),
            self.callback)
        self.monitor_thread = NodeMonitor()
        self.running = True
        logfile = config.LOGDIR + "/local_scheduler.log"
        self.logger = WeaselLogger('local_scheduler', logfile)
        self.capacity = self.monitor_thread.capacity
        self.max_tasks_to_run = {}
        ''' the starting number of tasks is defined based on the slot size '''
        self.ntasks_to_ask = 1
        self.task_id = 1
        self.time_asked_first = time.time()
        self.time_from_last_ask = -1
        ''' this is to keep track of the number of running tasks '''
        self.running_task = 0
        self.nran_tasks = []
        self.time_from_last_ask = time.time()
        self.queues_asked_for = []
        self.current_ntasks = 1
        self.has_new_task = False
        self.is_profiling = False
        self.first_task = False
        self.task_data = {}
        self.t_avg = {}
        self.task_data_lock = threading.Lock()
        self.running_task_lock = threading.Lock()
        self.average_utilization = {'cpu': 0.0, 'memory': 0.0, 'network': 0.0}
        self.average_task_exec_time = 0.0
        self.sleep_time = config.WAITTIME
        self.past_speed_changes = []
        # 'id': id, 'tpool': threadPool, 'rvector': resource_characteristics
        self.queue_data = {}
        self.task_time = 1
        self.queue_data_lock = threading.Lock()
        self.has_new_queue = False
        self.new_queues = []
        self.message_to_send = None
        ''' this is to control how many tasks to run in parallel '''
        self.logger.info("NodeScheduler started...")
        self.nrunning_past_period = []

    def profile(self, nrunning):
        pass

    def change_work_queue(self, nrunning, nrunning_past, avg_time, avg_cpu):
        pass

    def run_task(self, arg):
        command_id = arg['id']
        command = arg['exec'] + ' ' + arg['params']
        qid = arg['qid']
        myid = threading.current_thread().ident
        self.running_task_lock.acquire()
        self.running_task = self.running_task + 1
        self.running_task_lock.release()
        ''' this also marks that at least one task runs on the node ... '''
        ''' here I need to put it in the queue of tasks that the monitor
            will watch over '''
        memory_average = 0.0
        cpu_average = 0.0
        nreads = 0
        nwrites = 0
        nbytesread = 0
        nbyteswritten = 0
        time_intervals = 0
        start_time = time.time()
        proc = psutil.Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
        self.task_data[myid]['lock'].acquire()
        self.task_data[myid]['proc'] = proc
        self.task_data[myid]['ctask'] = arg
        self.task_data[myid]['lock'].release()
        out, err = proc.communicate()
        end_time = time.time()
        self.task_data[myid]['lock'].acquire()
        if self.task_data[myid]['task'].get(qid) is None:
            self.task_data[myid]['task'][qid] = []
            self.external_change = True
        # Note: the second element always evaluates to 100; it is a
        # placeholder for a per-task utilization figure.
        self.task_data[myid]['task'][qid].append(
            [end_time - start_time,
             100 * (end_time - start_time) / (end_time - start_time)])
        self.task_data[myid]['lock'].release()
        self.running_task_lock.acquire()
        self.running_task = self.running_task - 1
        self.nran_tasks.append(command_id)
        self.running_task_lock.release()

    def get_total_queue_size(self):
        queue_size = 0
        self.queue_data_lock.acquire()
        for qid in self.queue_data:
            #print "Queue ", qid, " size ", self.queue_data[qid]['tpool'].tasks.qsize()
            queue_size = queue_size + \
                self.queue_data[qid]['tpool'].tasks.qsize()
        self.queue_data_lock.release()
        return queue_size

    def get_tasks_to_ask(self, nrunning):
        tasks_to_ask = {}
        self.queues_asked_for = []
        queue_size = self.get_total_queue_size()
        if queue_size + nrunning == 0 and not self.is_profiling:
            return (tasks_to_ask, queue_size)
        self.queue_data_lock.acquire()
        for qid in self.queue_data:
            tasks_to_ask[qid] = 0
            self.queue_data[qid]['asked'] = 0
            self.queue_data[qid]['recv'] = 0
            qsize = self.queue_data[qid]['tpool'].tasks.qsize()
            if qsize > 2 * self.max_tasks_to_run[qid] and \
                    self.max_tasks_to_run[qid] != -1:
                continue
            if qsize == 0:
                tasks_to_ask[qid] = max(10, 2 * self.max_tasks_to_run[qid])
            else:
                if qsize > 2 * self.max_tasks_to_run[qid] and \
                        self.max_tasks_to_run[qid] != -1:
                    continue
                elif qsize < 2 * self.max_tasks_to_run[qid]:
                    tasks_to_ask[qid] = 2
            self.queues_asked_for.append(qid)
            self.queue_data[qid]['asked'] = tasks_to_ask[qid]
        self.queue_data_lock.release()
        return (tasks_to_ask, queue_size)

    def wait_and_ask(self):
        while self.running:
            # Check every 0.2 seconds.
            time.sleep(0.2)
            # How much time has passed since we last asked the resource
            # manager for tasks.
            ctime = time.time()
            if ctime - self.time_from_last_ask > 2 * config.WAITTIME:
                # Here we mark the queues as dead.
                for qid in self.queues_asked_for:
                    if self.queue_data[qid]['tpool'].tasks.qsize() == 0:
                        print "@@@@@@@@@@@@@@@@@@@ I mark queue ", qid, \
                            " as dead because I don't have tasks for it"
                        self.max_tasks_to_run[qid] = -1
            self.running_task_lock.acquire()
            nrunning = self.running_task
            self.nrunning_past_period.append(nrunning)
            task_data_to_send = {'ran': self.nran_tasks[:]}
            self.nran_tasks = []
            self.running_task_lock.release()
            resources = {'cpu': 0, 'memory': 0, 'network': 0}
            if self.is_profiling:
                self.profile(nrunning)
            (tasks_to_ask, queue_size) = self.get_tasks_to_ask(nrunning)
            #print "Asking for tasks: ", tasks_to_ask, queue_size
            task_data_to_send['qsize'] = queue_size * self.task_time
            pickled_data = pickle.dumps(task_data_to_send)
            if self.is_profiling and config.POLICY == 'dynamic3':
                if queue_size + nrunning == 0 and not self.first_task:
                    self.sched_client_thread.put_request_in_queue(
                        [self.identity, PROTOCOL_HEADERS['WORKER'], 'task_empty',
                         str(2 * self.current_ntasks), pickled_data])
                    self.first_task = True
                    continue
            elif len(tasks_to_ask) > 0:
                self.sched_client_thread.put_request_in_queue(
                    [self.identity, PROTOCOL_HEADERS['WORKER'], 'task',
                     pickle.dumps(tasks_to_ask), pickled_data])
            self.message_to_send = pickled_data

    def process_task(self, task):
        tmp = task.split(';')
        task_name = tmp[-1].split()[0].split('/')[-1]
        new_task = False
        # I have new tasks!!
        if task_name not in self.monitor_thread.tasks_to_monitor:
            new_task = True
            self.is_profiling = True
            self.monitor_thread.add_task_to_monitor(task_name)
        return new_task

    def add_task_to_queues(self, tasks):
        if len(tasks['queues']) > 0:
            print tasks['queues']
        self.queue_data_lock.acquire()
        for queue in tasks['queues']:
            if not self.queue_data.get(queue):
                self.new_queues.append(queue)
                self.max_tasks_to_run[queue] = 0
                # 'resource' contains the resource vector of a task.
                self.queue_data[queue] = {
                    'qid': queue,
                    'elapsed': 0,
                    'tavg': 0,
                    'throughput': 0,
                    'asked': 0,
                    'recv': 0,
                    'type': "",
                    'tpool': ThreadPool(0, self.task_data),
                    'resource': ""}
                self.has_new_queue = True
            if tasks['queues'][queue] == -1:
                print "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Queue ", \
                    queue, " is empty!"
                self.max_tasks_to_run[queue] = -1
        self.queue_data_lock.release()
        self.queue_data_lock.acquire()
        for task in tasks['tasks']:
            #print "Adding tasks to queues: ", task
            ''' here: if the task does not exist in my history, shrink the
                pool to 1 task and enter profiling mode; profiling mode =
                record resource utilization for the first 10 tasks '''
            qid = hashlib.sha1(task['exec'].encode()).hexdigest()
            self.has_new_task = self.has_new_task | self.process_task(task['exec'])
            task['qid'] = qid
            self.add_task_to_queue(self.queue_data[qid]['tpool'], task)
            self.queue_data[qid]['recv'] = self.queue_data[qid]['recv'] + 1
        self.queue_data_lock.release()

    def callback(self, frames):
        ''' this is a message from the server '''
        command = frames[2]
        data = None
        if len(frames) > 3:
            data = frames[3]
        if command == 'shutdown':
            self.shutdown(None)
        elif command == 'task':
            self.time_from_last_ask = time.time()
            tasks = pickle.loads(data)
            self.add_task_to_queues(tasks)
        elif command == 'empty':
            # Drain the underlying task queue of each pool.
            for qid in self.queue_data:
                self.empty_queue(self.queue_data[qid]['tpool'].tasks)
        else:
            print "No callback for this message!"
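
    # Frame layout handled by callback() above (the leading frames carry the
    # routing/header envelope added by ZmqConnectionThread; their exact
    # contents are assumed, not shown in this file):
    #   frames[2] -> command: 'shutdown' | 'task' | 'empty'
    #   frames[3] -> optional payload; for 'task' it is a pickled dict
    #                {'queues': {qid: ntasks or -1}, 'tasks': [task, ...]}
    #                as consumed by add_task_to_queues() above.
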
    def add_task_to_queue(self, queue, task):
        queue.add_task(self.run_task, task)

    def empty_queue(self, queue):
        # 'queue' is the ThreadPool's underlying task Queue (see callback()).
        while not queue.empty():
            try:
                queue.get(False)
            except Empty:
                continue
            queue.task_done()

    def shutdown(self, data):
        self.running = False

    def log_node_utilization(self):
        median = self.monitor_thread.get_median_utilization()
        histo_util = self.monitor_thread.get_utilization_by_histogram()
        data = self.monitor_thread.get_data()
        cpu_sum = median['cpu'] + median['cpu_idle'] + \
            median['cpu_sys'] + median['cpu_io']
        if cpu_sum == 0:
            real_value = 0
            io_value = 0
        else:
            real_value = 100 * median['cpu'] / cpu_sum
            io_value = 100 * median['cpu_io'] / cpu_sum
        self.logger.info(
            "Median utilization/2secs: %s %s %s %s" %
            (median['cpu'], median['memory'], median['network'], io_value))
        self.logger.info(
            "Histo utilization: %s %s %s" %
            (histo_util['cpu'], histo_util['memory'], histo_util['network']))

    def is_ok_to_ask(self):
        return True

    def empty_task_data(self):
        for tid in self.task_data:
            self.task_data[tid]['lock'].acquire()
            self.task_data[tid]['task'] = {}
            self.task_data[tid]['lock'].release()

    def check_empty_queues(self):
        queues_empty = True
        self.queue_data_lock.acquire()
        for qid in self.queue_data:
            queues_empty = queues_empty & \
                self.queue_data[qid]['tpool'].tasks.empty()
        self.queue_data_lock.release()
        return queues_empty

    def compute_stats(self, task_data, avg_time, avg_cpu):
        total_len = 0
        try:
            for tid in task_data:
                task_data[tid]['lock'].acquire()
                for task in task_data[tid]['task']:
                    if not avg_time.get(task):
                        avg_time[task] = 0
                        avg_cpu[task] = 0
                    for data in task_data[tid]['task'][task]:
                        avg_time[task] = avg_time[task] + data[0]
                        avg_cpu[task] = avg_cpu[task] + data[1]
                    total_len = total_len + len(task_data[tid]['task'][task])
                    if not self.is_profiling:
                        # Leave the last value.
                        while len(task_data[tid]['task'][task]) > 0:
                            task_data[tid]['task'][task].pop(0)
                task_data[tid]['lock'].release()
            # Guard against an empty sample set before averaging.
            if total_len > 0:
                for task in avg_time:
                    avg_time[task] = avg_time[task] / total_len
                    avg_cpu[task] = avg_cpu[task] / total_len
            self.empty_task_data()
        except:
            traceback.print_exc()

    def run(self):
        self.sched_client_thread.start()
        self.monitor_thread.start()
        finishing_tasks_thread = Thread(target=self.wait_and_ask)
        finishing_tasks_thread.start()
        ''' I have:
            - the monitoring thread
            - the communication thread
            - the thread that waits to ask for more tasks '''
        while self.running:
            ''' if the queue is empty and no other tasks are running, ask
                the scheduler for a task; else, if tasks are running, check
                the utilization and ask for more/less '''
            self.log_node_utilization()
            task_data = self.monitor_thread.get_task_data()
            total_util = {'cpu': 0, 'memory': 0}
            for task in task_data:
                for data in task_data[task]:
                    total_util['cpu'] = total_util['cpu'] + \
                        data[0][0] / len(task_data[task])
                    total_util['memory'] = total_util['memory'] + \
                        data[1] / len(task_data[task])
            self.logger.info(
                "Total utilization of the other processes is: %s %s" %
                (total_util['cpu'], total_util['memory']))
            # Count the total number of slots.
            self.running_task_lock.acquire()
            nrunning = self.running_task
            nrunning_past = self.nrunning_past_period[:]
            self.nrunning_past_period = []
            self.running_task_lock.release()
            for task in self.max_tasks_to_run:
                self.logger.info(
                    "%s Running tasks: %s" % (task, self.max_tasks_to_run[task]))
            if self.check_empty_queues() and nrunning == 0:
                if self.is_ok_to_ask():
                    ''' I have finished all my tasks; ask for a random
                        task from the resource manager '''
                    print "Sending task_empty message!"
                    self.sched_client_thread.put_request_in_queue(
                        [self.identity, PROTOCOL_HEADERS['WORKER'],
                         'task_empty', str(2 * self.capacity['cores'])])
            avg_time = {}
            avg_cpu = {}
            task_data = self.task_data
            self.compute_stats(task_data, avg_time, avg_cpu)
            # Now is the time to remove the data from the dead threads.
            self.queue_data_lock.acquire()
            for qid in self.queue_data:
                self.queue_data[qid]['tpool'].dict_lock.acquire()
                for tid in self.queue_data[qid]['tpool'].deleted_workers:
                    if self.task_data.get(tid):
                        del self.task_data[tid]
                self.queue_data[qid]['tpool'].deleted_workers = []
                self.queue_data[qid]['tpool'].dict_lock.release()
            taskid = 0
            max_task_time = 0
            for task in avg_time:
                if config.POLICY != 'static':
                    if avg_time[task] == 0:
                        self.queue_data[task]['elapsed'] = \
                            self.queue_data[task]['elapsed'] + config.WAITTIME
                    else:
                        self.queue_data[task]['elapsed'] = 0
                        self.queue_data[task]['tavg'] = \
                            (self.queue_data[task]['tavg'] + avg_time[task]) / 2
                if avg_time[task] > max_task_time:
                    max_task_time = avg_time[task]
                print task, "Avg_time: ", avg_time[task]
                self.logger.info("%s Avg_time: %s" % (task, avg_time[task]))
                if avg_time[task] > 0:
                    # Guard the division: avg_time can be 0 when no sample
                    # was collected this period.
                    self.logger.info(
                        "%s Task speed: %s" %
                        (task, self.max_tasks_to_run[task] / avg_time[task]))
                self.logger.info("%s Task util: %s" % (task, avg_cpu[task]))
                if avg_time[task] > 0:
                    self.past_speed_changes.append(
                        self.max_tasks_to_run[task] / avg_time[task])
                    if len(self.past_speed_changes) > 4:
                        self.past_speed_changes.pop(0)
                taskid = taskid + 1
                if config.POLICY != 'static' and self.queue_data[task]['type'] == "":
                    if self.queue_data[task]['elapsed'] > config.T_LONG or \
                            self.queue_data[task]['tavg'] > config.T_LONG:
                        self.queue_data[task]['type'] = 'long'
                    if self.queue_data[task]['tavg'] < config.T_LONG:
                        self.queue_data[task]['type'] = 'short'
            self.queue_data_lock.release()
            max_avg_time = 0
            self.task_time = max_task_time
            self.change_work_queue(nrunning, nrunning_past, avg_time, avg_cpu)
            self.logger.info("Ran %s tasks" % self.task_id)
            self.logger.debug("Sleeping: %s" % self.sleep_time)
            self.monitor_thread.max_data_buffer_len = int(
                self.sleep_time / config.MONITOR_PERIOD)
            time.sleep(self.sleep_time)
        finishing_tasks_thread.join()
        for qid in self.queue_data:
            self.queue_data[qid]['tpool'].wait_completion()
        self.sched_client_thread.stop()
        self.monitor_thread.shutdown()
        self.monitor_thread.join()
        self.sched_client_thread.join()
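
# Worked example of get_tasks_to_ask() above, with
# self.max_tasks_to_run[qid] == 4 (numbers are illustrative):
#   qsize == 0     -> ask max(10, 2 * 4) = 10 tasks
#   0 < qsize < 8  -> ask 2 tasks (keep the pipeline fed)
#   qsize == 8     -> ask 0, but still report the queue as asked-for
#   qsize > 8      -> skip the queue entirely this round
# wait_and_ask() later marks a queue dead (max_tasks_to_run[qid] = -1) if,
# 2 * config.WAITTIME after the last ask, its local queue is still empty.
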