Example no. 1
import hashlib
import os
import pickle
import subprocess
import threading
import time
import traceback
from collections import deque
from Queue import Queue, Empty

import zmq

# project-local modules; the exact import paths below are assumptions
import config
from weasel_logger import WeaselLogger
from zmq_connection import ZmqConnectionThread
from protocol import PROTOCOL_HEADERS


class Scheduler(object):
    stdin = "/dev/null"
    stdout = "/dev/null"
    stderr = "/dev/null"

    def __init__(self):
        self.running = True
        self.resource_queues = {'cpu': [],
                                'memory': [],
                                'bw': []}
        ''' this is to have multiple applications/queues; application = queue '''
        self.task_queue = []  # ready queues; [queue_id]
        self.task_queue_data = {}  # ready queue data; queue_id: [task...]
        # queues with pending tasks, which do not have the input ready
        self.pending_queue_data = {}
        self.task_queue_lock = threading.Lock()
        self.pending_task_queue_lock = threading.Lock()
        logfile = config.LOGDIR + "/scheduler.log"
        taskfile = config.LOGDIR + "/task.log"
        self.logger = WeaselLogger('scheduler', logfile)
        self.task_logger = WeaselLogger('tasklogger', taskfile)
        ''' this is the thread that will perform the communication with the local schedulers '''
        self.server_thread = ZmqConnectionThread(
            'resourcemng',
            zmq.ROUTER,
            "*:" + str(
                config.ZMQ_SCHEDULER_PORT),
            self.msg_process_callback)
        ''' this information is for keeping track of files and task dependencies '''
        self.task_to_file = {
        }  # taskid: {'nfiles': 0, 'ntotalfiles':x, 'outputs':[files]}
        # file: {'ntids': ntids, 'tids': [tids], 'ctids': executed_tasks}
        self.file_to_task = {}
        self.result_queue = Queue()
        self.result_consumer_thread = threading.Thread(target=self.get_result)
        self.result_consumer_thread.start()
        ''' idle worker information and client notification '''
        self.waiting_clients = []
        self.workers_empty_dict = {}
        self.workers = []
        self.workers_data = {}
        self.workers_lock = threading.Lock()
        self.workers_empty = 0
        self.task_id = 0
        ''' to notify workers about new queues '''
        self.new_queues = []
        self.new_queues_lock = threading.Lock()
        ''' here I have: the thread that listens for messages
            a queue in which the tasks are put
            the main thread that applies some reconfiguration (?)
        '''
        self.files_to_delete = []

    def delete_queue(self, qid):
        del self.task_queue_data[qid]
        self.task_queue.remove(qid)
        self.new_queues_lock.acquire()
        try:
            self.new_queues.remove(qid)
        except ValueError:
            pass
        self.new_queues_lock.release()
        
        
    def get_taskids(self):
        tasks_ids = []
        try:
            tasks_ids = self.result_queue.get(block=True,
                                              timeout=8 * config.WAITTIME)
        except Empty:
            self.pending_task_queue_lock.acquire()
            pending_queue = self.pending_queue_data
            for pqueue in pending_queue:
                to_delete = []
                for tid in pending_queue[pqueue]:
                    current_inputs = 0
                    tinputs = len(pending_queue[pqueue][tid]['inputs'])
                    for inputf in pending_queue[pqueue][tid]['inputs']:
                        if os.path.isfile(inputf):
                            current_inputs = current_inputs + 1
                        #else:
                        #    print "Missing file ", inputf
                    if current_inputs == tinputs:
                        task = pending_queue[pqueue][tid]
                        data = {
                                'id': tid,
                                'exec': task['exec'],
                                'params': task['params']}
                        to_delete.append(tid)
                        self.task_queue_lock.acquire()
                        is_new = False
                        try:
                            self.task_queue_data[pqueue].append(data)
                        except KeyError:
                            self.task_queue.append(pqueue)
                            self.task_queue_data[pqueue] = deque()
                            self.task_queue_data[pqueue].append(data)
                            is_new = True
                        self.task_queue_lock.release()
                        if is_new:
                            self.new_queues_lock.acquire()
                            self.new_queues.append(pqueue)
                            self.new_queues_lock.release()
                for tid in to_delete:
                    del self.pending_queue_data[pqueue][tid]
            self.pending_task_queue_lock.release()
        return tasks_ids
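    # A hedged sketch of the promotion performed above (names are from this
    # class; the concrete values are illustrative only). Given
    #   pending_queue_data = {'q1': {7: {'exec': 'wc', 'params': 'a.txt',
    #                                    'inputs': ['a.txt']}}}
    # once os.path.isfile('a.txt') becomes true, task 7 is appended to
    # task_queue_data['q1'], 'q1' joins task_queue (and new_queues if the
    # workers have not seen it yet), and the pending entry is deleted.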
    
    def check_dependencies_per_task(self, taskid):
        for fileid in self.task_to_file[taskid]['outputs']:
            dependent_tasks = []
            try:
                dependent_tasks = self.file_to_task[
                    hashlib.sha1(fileid.encode()).hexdigest()]['tids']
            except KeyError:
                pass
            for taskid2 in dependent_tasks:
                self.task_to_file[taskid2]['cinputs'] = self.task_to_file[taskid2]['cinputs'] + 1
                if self.task_to_file[taskid2]['cinputs'] != self.task_to_file[taskid2]['tinputs']:
                    continue
                # all inputs are present: move the task to a ready queue
                qid = self.task_to_file[taskid2]['queueid']
                self.pending_task_queue_lock.acquire()
                try:
                    task = self.pending_queue_data[qid][taskid2]
                except KeyError:
                    self.pending_task_queue_lock.release()
                    continue
                self.pending_task_queue_lock.release()
                data = {'id': taskid2,
                        'exec': task['exec'],
                        'params': task['params']}
                self.task_queue_lock.acquire()
                is_new = False
                try:
                    self.task_queue_data[qid].append(data)
                except KeyError:
                    self.task_queue.append(qid)  # FCFS like
                    self.task_queue_data[qid] = deque()
                    self.task_queue_data[qid].append(data)
                    is_new = True
                self.task_queue_lock.release()
                if is_new:
                    self.new_queues_lock.acquire()
                    self.new_queues.append(qid)
                    self.new_queues_lock.release()
                self.pending_task_queue_lock.acquire()
                del self.pending_queue_data[qid][taskid2]
                self.pending_task_queue_lock.release()

    def garbage_collect(self, taskid):
        for fileid in self.task_to_file[taskid]['inputs']:
            fhash = hashlib.sha1(fileid.encode()).hexdigest()
            try:
                self.file_to_task[fhash]['ctids'] = self.file_to_task[fhash]['ctids'] + 1
                if self.file_to_task[fhash]['ctids'] == self.file_to_task[fhash]['ntids']:
                    # all dependent tasks ran; now it is safe to delete this file (is it?)
                    try:
                        print "deleting file ", fileid, "ctids=", \
                            self.file_to_task[fhash]['ctids'], \
                            "ntids=", self.file_to_task[fhash]['ntids']
                        self.logger.debug("File: %s dependent_tasks: %s" %
                                          (fileid, self.file_to_task[fhash]['ntids']))
                        # os.remove(fileid)
                        # if the removal fails, report it back to the user
                    except OSError as e:
                        print "Error: %s - %s." % (e.filename, e.strerror)
            except KeyError:
                print "exception for file ", fileid

    def get_result(self):
        while self.running:
            self.pending_task_queue_lock.acquire()
            to_delete = []
            for pqueue in self.pending_queue_data:
                if len(self.pending_queue_data[pqueue]) == 0:
                    to_delete.append(pqueue)
            for pqueue in to_delete:
                del self.pending_queue_data[pqueue]
            self.pending_task_queue_lock.release()
            tasks_ids = self.get_taskids()
            if len(tasks_ids) == 0:
                continue
            for taskid in tasks_ids:
                # if missing files were generated, put the task in a ready queue
                try:
                    self.check_dependencies_per_task(taskid)
                except:
                    traceback.print_exc()
            '''
            for taskid in tasks_ids:
                try:
                    self.garbage_collect(taskid)
                except:
                    print "I cannot find ", taskid, "in task_to_file"
            '''

    def msg_process_callback(self, message):
        message_type = message[3]
        try:
            if message_type == PROTOCOL_HEADERS['CLIENT']:
                self.process_client_message(message)
            elif message_type == PROTOCOL_HEADERS['WORKER']:
                self.process_worker_message(message)
        except:
            traceback.print_exc()


    def process_queue(self, data, qid):
        task_data = pickle.loads(data)
        self.task_queue_lock.acquire()
        self.pending_task_queue_lock.acquire()
        for task in task_data:
            self.process_queue_task(task, qid)
        self.pending_task_queue_lock.release()
        self.task_queue_lock.release()


    def process_queue_task(self, task_data, qid):
        self.task_id = self.task_id + 1
        split_inputs = task_data['inputs'].split()
        total_inputs = len(split_inputs)
        current_inputs = 0
        for inputf in split_inputs:
            if os.path.isfile(inputf):
                current_inputs = current_inputs + 1
            #else:
            #    print "Missing file: ", inputf
            fhash = hashlib.sha1(inputf.encode()).hexdigest()
            try:
                self.file_to_task[fhash]['tids'].append(self.task_id)
                self.file_to_task[fhash]['ntids'] = self.file_to_task[fhash]['ntids'] + 1
            except KeyError:
                self.file_to_task[fhash] = {'ntids': 1,
                                            'tids': [self.task_id],
                                            'ctids': 0}
        self.task_to_file[self.task_id] = {
                'queueid': qid,
                'tinputs': total_inputs,
                'cinputs': current_inputs,
                'inputs': split_inputs,
                'outputs': task_data['outputs'].split()}
        if current_inputs == total_inputs:
            print "Putting task ", self.task_id, " in active queue", qid
            if qid not in self.task_queue:
                try:
                    self.new_queues_lock.acquire()
                    self.new_queues.append(qid)
                    self.new_queues_lock.release()
                    self.task_queue.append(qid)  # FCFS like
                    self.task_queue_data[qid] = deque()
                except:
                    traceback.print_exc()
            self.task_queue_data[qid].append(
                    {'id': self.task_id, 'exec': task_data['exec'], 'params': task_data['params']})
        else:
            self.logger.info(
                    "Putting task %s in pending queue, total inputs: %s" %
                    (self.task_id, total_inputs))
            task_info = {
                    'id': self.task_id,
                    'exec': task_data['exec'],
                    'params': task_data['params'],
                    'inputs': split_inputs}
            try:
                self.pending_queue_data[qid][self.task_id] = task_info
            except KeyError:
                self.pending_queue_data[qid] = {self.task_id: task_info}
 
    def process_status(self, qid, message):
        self.task_queue_lock.acquire()
        ntasks = len(self.task_queue_data.get(qid, []))
        self.task_queue_lock.release()
        reply = [
                message[0],
                PROTOCOL_HEADERS['RSMNG'],
                'status',
                str(ntasks)]
        self.server_thread.put_request_in_queue(reply)

        
    def process_wait(self, qid, message):
        self.workers_lock.acquire()
        self.workers_empty = 0
        self.workers_empty_dict = {}
        self.workers_lock.release()
        self.waiting_clients.append(message[0])
        
    def process_delete(self, data):
        worker_id = 'sched-' + data
        with open(os.devnull, 'w') as devnull:
            proc = subprocess.Popen(
                "ssh %s \" pkill -9 local_resourcem \" " % data,
                shell=True,
                stdout=devnull,
                stderr=devnull)
            proc.wait()
        # we delete all info about the worker
        self.workers_lock.acquire()
        self.workers.remove(worker_id)
        del self.workers_data[worker_id]
        self.workers_lock.release()

    def process_client_message(self, message):
        tmp_msg = message[4:]
        qid = message[1]
        action = tmp_msg[0]
        data = None
        if len(tmp_msg) > 1:
            data = tmp_msg[1]
	print "I receive message for queue ", qid, " action: ", action
        if action == 'queue':
            if data is None:
                #print "Missing task information"
                return
            self.process_queue(data, qid)
        elif action == 'status':
            self.process_status(qid, message)
        elif action == 'wait':
            self.process_wait(qid, message)
        elif action == 'clear':
            print "I will empty the worker queues"
            for worker in self.workers:
                self.server_thread.put_request_in_queue(
                    [worker, PROTOCOL_HEADERS['RSMNG'], 'empty'])
        elif action == 'delete':
            self.process_delete(data)
        else:
            print "Not implemented yet"


    def process_worker_nonempty_queue_static(self, tasks_per_queue, answer):
        queue_id = self.task_queue[0]
        ntasks = min(
                len(self.task_queue_data[queue_id]), int(tasks_per_queue))
        for i in range(0, ntasks):
            task = self.task_queue_data[queue_id].popleft()
            answer['tasks'].append(task)
        if len(self.task_queue_data[queue_id]) == 0:
            self.delete_queue(queue_id)
        return answer
                
    def process_worker_nonempty_queue_dynamic(self, queue_id, tasks_per_queue, answer):
        # send from each queue
        print "[Empty] Asked ", queue_id, "tasks per queue ", tasks_per_queue
        to_delete = []
        for qid in self.task_queue_data:
            # if the active queue is shorter than what the worker asked for
            # and the pending queue is longer than that, do not send?
            req_tasks = int(tasks_per_queue)
            self.pending_task_queue_lock.acquire()
            pending_qid = self.pending_queue_data.get(qid)
            pending_qid_len = 0
            if pending_qid is not None:
                pending_qid_len = len(pending_qid)
            self.pending_task_queue_lock.release()
            #if pending_qid:
            #    if pending_qid_len > req_tasks and len(
            #        self.task_queue_data[qid]) < req_tasks:
            #        print "I have in queue less than asked and in pending queue more!"
            #        continue
            ntasks = min(len(self.task_queue_data[qid]), req_tasks)
            answer['queues'][qid] = ntasks
            print "Asked for ", tasks_per_queue, "Sending ", ntasks, "from queue ", qid, \
                " from max ntasks ", len(self.task_queue_data[qid])
            for i in range(0, ntasks):
                task = self.task_queue_data[qid].popleft()
                answer['tasks'].append(task)
            if ntasks > 0 and len(self.task_queue_data[qid]) == 0:
                to_delete.append(qid)

        for qid in to_delete:
            self.delete_queue(qid)
        return answer

    def process_worker_nonempty_queue_static1(self, tasks_per_queue, answer):
        # send only from the first queue
        queue_id = self.task_queue[0]
        if queue_id not in tasks_per_queue:
            ntasks = min(len(self.task_queue_data[queue_id]), int(tasks_per_queue.values()[0]))
        else:
            ntasks = min(len(self.task_queue_data[queue_id]), int(tasks_per_queue[queue_id]))
        for i in range(0, ntasks):
            task = self.task_queue_data[queue_id].popleft()
            answer['tasks'].append(task)
        if len(self.task_queue_data[queue_id]) == 0:
            self.delete_queue(queue_id)
        return answer
                
    def process_worker_nonempty_queue_dynamic1(self, tasks_per_queue, answer):
        print "####################### Asked ", tasks_per_queue
        for qid in tasks_per_queue:
            # for each queue the worker asks from, check
            # whether I have enough tasks to send
            req_tasks = int(tasks_per_queue[qid])
            self.pending_task_queue_lock.acquire()
            if self.pending_queue_data.get(qid) and self.task_queue_data.get(qid):
                if len(self.task_queue_data) > 1 and len(self.pending_queue_data[qid]) > req_tasks and len(
                        self.task_queue_data[qid]) < req_tasks:
                    self.pending_task_queue_lock.release()
                    continue
            self.pending_task_queue_lock.release()
            try:
                ntasks = min(len(self.task_queue_data[qid]), req_tasks)
            except KeyError:
                answer['queues'][qid] = -1
                continue
            answer['queues'][qid] = ntasks
            if ntasks > 0:
                print "Sending ", ntasks, " from queue ", qid
                for i in range(0, ntasks):
                    task = self.task_queue_data[qid].popleft()
                    answer['tasks'].append(task)
                if len(self.task_queue_data[qid]) == 0:
                    self.delete_queue(qid)
        return answer

    def process_worker_nonempty_queue(self, worker_id, msg_type, data):
        ''' FCFS queue for the static policy '''
        randn = 0
        queue_id = self.task_queue[randn]
        answer = {'queues': {}, 'tasks': []}
        if msg_type == 'task_empty':
            tasks_per_queue = data
            if config.POLICY == 'static':
                # send from the first queue
                answer = self.process_worker_nonempty_queue_static(tasks_per_queue, answer)
            else:
                answer = self.process_worker_nonempty_queue_dynamic(queue_id, tasks_per_queue, answer)
        else:
            tasks_per_queue = pickle.loads(data)
            print tasks_per_queue
            if config.POLICY == "static":
                answer = self.process_worker_nonempty_queue_static1(tasks_per_queue, answer)
            else:
                answer = self.process_worker_nonempty_queue_dynamic1(tasks_per_queue, answer)
            for qid in self.task_queue_data:
                if qid not in tasks_per_queue:
                    answer['queues'][qid] = 0
        print "Sending ", len(answer['tasks']), answer['queues'], " to worker ", worker_id
        # TODO: piggyback the ids of other queues, if no queues > 1
        ''' if we don't have any data left in the queue we delete it '''
        ''' if the worker sent us an empty message, reset it '''
        self.workers_lock.acquire()
        if self.workers_empty_dict.get(worker_id):
            del self.workers_empty_dict[worker_id]
        self.workers_lock.release()
        return answer

    def process_worker_message(self, message):
        tmp_msg = message[4:]
        msg_type = tmp_msg[0]
        data = None
        first_worker = False
        self.workers_lock.acquire()
        if message[0] not in self.workers:
            self.workers.append(message[0])
            self.workers_data[message[0]] = 0
            #first_worker = True
        if len(tmp_msg) > 1:
            data = tmp_msg[1]
        if len(tmp_msg) > 2:
            data2 = pickle.loads(tmp_msg[2])
            computed_tasks = data2['ran']
            worker_current_size = data2['qsize']
            self.workers_data[message[0]] = float(worker_current_size)
            if len(computed_tasks) > 0:
                self.result_queue.put(computed_tasks)
        if msg_type == 'task_empty':
            self.workers_data[message[0]] = 0
        self.workers_lock.release()
        if msg_type == 'task' or msg_type == 'task_empty':
            ''' send x tasks to it '''
            ''' this is a hack; it works only with one client '''
            self.task_queue_lock.acquire()
            task_queue_len = len(self.task_queue)
            self.task_queue_lock.release()
            self.pending_task_queue_lock.acquire()
            pending_task_queue_len = len(self.pending_queue_data)
            self.pending_task_queue_lock.release()
            if msg_type == 'task_empty' and task_queue_len == 0 \
                    and pending_task_queue_len == 0 \
                    and len(self.waiting_clients) > 0:
                self.workers_lock.acquire()
                if not self.workers_empty_dict.get(
                        message[0]) and not first_worker:
                    self.workers_empty_dict[message[0]] = 1
                self.workers_lock.release()
            answer = {'queues': {}, 'tasks': []}
            wake_client = False
            self.task_queue_lock.acquire()
            task_queue_len = len(self.task_queue)
            sent_tasks = False
            if task_queue_len > 0:
                sent_tasks = True
                answer = self.process_worker_nonempty_queue(message[0], msg_type, data)
            else:
                print self.workers_empty_dict
                self.workers_lock.acquire()
                if pending_task_queue_len == 0 and len(
                        self.workers_empty_dict) >= len(
                        self.workers):
                    wake_client = True
                self.workers_lock.release()
            self.task_queue_lock.release()
            if len(answer['tasks']) > 0:
                print "Sending ", answer, " to worker ", message[0]
                data = pickle.dumps(answer)
                self.server_thread.put_request_in_queue(
                    [message[0], PROTOCOL_HEADERS['RSMNG'], 'task', data])
            if wake_client:
                for client in self.waiting_clients:
                    print "Sending wake up message to ", client
                    self.server_thread.put_request_in_queue(
                        [client, PROTOCOL_HEADERS['RSMNG'], 'done'])
                self.waiting_clients = []
                self.workers_empty = 0
                return
            if sent_tasks:
                return
            ''' If my queues are empty, we apply the work stealing algorithm.... '''
        elif msg_type == 'output':
            ''' un-serialize the output and append it to a log '''
            output = pickle.loads(data)
            self.task_logger.info(output)
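    # A hedged sketch of the worker frames handled above, reconstructed from
    # the indexing in this class (frame [2] is not inspected here; the
    # concrete values are illustrative only):
    #
    #   message = [worker_id,                    # [0] ZMQ identity
    #              qid, delimiter,               # [1], [2]
    #              PROTOCOL_HEADERS['WORKER'],   # [3] sender type
    #              'task',                       # [4] action: 'task', 'task_empty' or 'output'
    #              pickle.dumps({'q1': 4}),      # [5] tasks wanted per queue
    #              pickle.dumps({'ran': [7, 8],  # [6] finished task ids plus
    #                            'qsize': 2.5})] #     a remaining-work estimate
    #
    # The scheduler replies with a 'task' frame carrying
    # pickle.dumps({'queues': {...}, 'tasks': [...]}).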

    ''' Main scheduling LOOP '''

    def run(self):
        self.server_thread.start()
        while self.running:
            try:
                self.logger.debug("Scheduling.......")
                ''' compute the number of workers and data nodes for each
                    scheduling period '''
                self.task_queue_lock.acquire()
                print "*** Task queue is: ", self.task_queue
                for aqueue in self.task_queue:
                    print "Queue ", aqueue, "len ", len(self.task_queue_data[aqueue])
                self.task_queue_lock.release()
                self.pending_task_queue_lock.acquire()
                print "Task pending queue is: ", self.pending_queue_data.keys()
                for pqueue in self.pending_queue_data:
                    print "Queue ", pqueue, "len ", len(self.pending_queue_data[pqueue])
                self.pending_task_queue_lock.release()
                if self.running:
                    time.sleep(config.WAITTIME)
            except KeyboardInterrupt:
                self.shutdown()
            except:
                traceback.print_exc()
                try:
                    time.sleep(config.WAITTIME)
                except KeyboardInterrupt:
                    self.shutdown()
        print "Stopping communication thread...."
        self.logger.info("Stopping communication thread....")
        self.server_thread.stop()
        print "Joining communication thread...."
        self.logger.info("Joining communication thread....")
        self.server_thread.join()
        print "DONE"
        self.logger.info("DONE")
        return

    def shutdown(self):
        print "Received signal to shutdown. "
        self.logger.info("Received signal to shutdown. Will wait for the end of the \
                    scheduling period")
        self.running = False
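
A minimal launcher is not part of this listing; the sketch below shows how the class is presumably driven (the __main__ guard is an assumption, and config.LOGDIR / config.ZMQ_SCHEDULER_PORT must already be set in the project's config module):

if __name__ == '__main__':
    # instantiating Scheduler already starts the ZMQ server thread and the
    # result-consumer thread; run() blocks in the scheduling loop and
    # handles KeyboardInterrupt by calling shutdown() itself
    scheduler = Scheduler()
    scheduler.run()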
Example no. 2
import hashlib
import pickle
import socket
import threading
import time
import traceback
from Queue import Empty
from subprocess import PIPE
from threading import Thread

import psutil
import zmq

# project-local modules; the exact import paths below are assumptions
import config
from weasel_logger import WeaselLogger
from zmq_connection import ZmqConnectionThread
from protocol import PROTOCOL_HEADERS
from node_monitor import NodeMonitor
from thread_pool import ThreadPool


class NodeScheduler(object):

    def __init__(self):
        self.identity = 'sched-' + socket.gethostbyname(socket.gethostname())
        self.sched_client_thread = ZmqConnectionThread(
            self.identity,
            zmq.DEALER,
            config.SCHEDULER+":" + str(config.ZMQ_SCHEDULER_PORT),
            self.callback)
        self.monitor_thread = NodeMonitor()
        self.running = True
        logfile = config.LOGDIR + "/local_scheduler.log"
        self.logger = WeaselLogger('local_scheduler', logfile)
        self.capacity = self.monitor_thread.capacity
        self.max_tasks_to_run = {}
        ''' the starting number of tasks is defined based on the slot size '''
        self.ntasks_to_ask = 1
        self.task_id = 1
        self.time_asked_first = time.time()
        self.time_from_last_ask = -1
        ''' this is to keep track of number of running tasks ? '''
        self.running_task = 0
        self.nran_tasks = []
        self.time_from_last_ask = time.time()
        self.queues_asked_for = []
        self.current_ntasks = 1
        self.has_new_task = False
        self.is_profiling = False
        self.first_task = False
        self.task_data = {}
        self.t_avg = {}
        self.task_data_lock = threading.Lock()
        self.running_task_lock = threading.Lock()
        self.average_utilization = {'cpu': 0.0, 'memory': 0.0, 'network': 0.0}
        self.average_task_exec_time = 0.0
        self.sleep_time = config.WAITTIME
        self.past_speed_changes = []
        # 'id': id, 'tpool': threadPool, 'rvector': resource_characteristics
        self.queue_data = {}
        self.task_time = 1
        self.queue_data_lock = threading.Lock()
        self.has_new_queue = False
        self.new_queues = []
        self.message_to_send = None
        ''' this is to control how many tasks to run in parallel '''
        self.logger.info("NodeScheduler started...")
        self.nrunning_past_period = []

    def profile(self, nrunning):
        pass

    def change_work_queue(self, nrunning, nrunning_past, avg_time, avg_cpu):
        pass

    def run_task(self, arg):
        command_id = arg['id']
        command = arg['exec'] + ' ' + arg['params']
        qid = arg['qid']
        myid = threading.current_thread().ident
        self.running_task_lock.acquire()
        self.running_task = self.running_task + 1
        self.running_task_lock.release()
        ''' this also marks that at least one task runs on the node ... '''
        ''' here I need to put it in the queue of tasks that the monitor will watch over '''
        memory_average = 0.0
        cpu_average = 0.0
        nreads = 0
        nwrites = 0
        nbytesread = 0
        nbyteswritten = 0
        time_intervals = 0
        start_time = time.time()
        proc = psutil.Popen(command, shell=True,
                            stdout=PIPE, stderr=PIPE)
        self.task_data[myid]['lock'].acquire()
        self.task_data[myid]['proc'] = proc
        self.task_data[myid]['ctask'] = arg
        self.task_data[myid]['lock'].release()
        out, err = proc.communicate()
        end_time = time.time()
        self.task_data[myid]['lock'].acquire()
        if self.task_data[myid]['task'].get(qid) is None:
            self.task_data[myid]['task'][qid] = []
            self.external_change = True
        # note: the second element reduces to the constant 100 here
        self.task_data[myid]['task'][qid].append(
            [end_time - start_time, 100 * (end_time - start_time) / (end_time - start_time)])
        self.task_data[myid]['lock'].release()
        self.running_task_lock.acquire()
        self.running_task = self.running_task - 1
        self.nran_tasks.append(command_id)
        self.running_task_lock.release()

    def get_total_queue_size(self):
        queue_size = 0
        self.queue_data_lock.acquire()
        for qid in self.queue_data:
	    #print "Queue ", qid, " size ", self.queue_data[qid]['tpool'].tasks.qsize()
            queue_size = queue_size + \
                self.queue_data[qid]['tpool'].tasks.qsize()
        self.queue_data_lock.release()
        return queue_size

    def get_tasks_to_ask(self, nrunning):
        tasks_to_ask = {}
        self.queues_asked_for = []
        queue_size = self.get_total_queue_size()
        if queue_size + nrunning == 0 and not self.is_profiling:
            return (tasks_to_ask, queue_size)
        self.queue_data_lock.acquire()
        for qid in self.queue_data:
            tasks_to_ask[qid] = 0
            self.queue_data[qid]['asked'] = 0
            self.queue_data[qid]['recv'] = 0
            qsize = self.queue_data[qid]['tpool'].tasks.qsize()
            # skip queues already holding more than twice the slot count
            if qsize > 2 * self.max_tasks_to_run[qid] and self.max_tasks_to_run[qid] != -1:
                continue
            if qsize == 0:
                tasks_to_ask[qid] = max(10, 2 * self.max_tasks_to_run[qid])
            elif qsize < 2 * self.max_tasks_to_run[qid]:
                tasks_to_ask[qid] = 2
            self.queues_asked_for.append(qid)
            self.queue_data[qid]['asked'] = tasks_to_ask[qid]
        self.queue_data_lock.release()
        return (tasks_to_ask, queue_size)
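    # A worked example of the ask policy above (values are illustrative):
    # with max_tasks_to_run[qid] = 4 the threshold is 2 * 4 = 8 tasks, so
    #   qsize == 0  -> ask max(10, 8) = 10 tasks (queue drained, refill big)
    #   qsize == 5  -> ask 2 tasks (below the threshold, top up slowly)
    #   qsize == 9  -> ask 0 (skip; the queue already holds over 2x the slots)
    # A max_tasks_to_run of -1 marks a dead queue and bypasses the skip.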

    def wait_and_ask(self):
        while self.running:
            # check every 0.2 seconds
            time.sleep(0.2)
            # how much time has passed since we last asked the resource manager
            ctime = time.time()
            if ctime - self.time_from_last_ask > 2 * config.WAITTIME:
                # here we mark the queues as dead
                for qid in self.queues_asked_for:
                    if self.queue_data[qid]['tpool'].tasks.qsize() == 0:
                        print "@@@@@@@@@@@@@@@@@@@  I mark queue ", qid, " as dead because I don't have tasks for it"
                        self.max_tasks_to_run[qid] = -1
            self.running_task_lock.acquire()
            nrunning = self.running_task
            self.nrunning_past_period.append(nrunning)
            task_data_to_send = {'ran': self.nran_tasks[:]}
            self.nran_tasks = []
            self.running_task_lock.release()
            resources = {'cpu': 0, 'memory': 0, 'network': 0}
            if self.is_profiling:
                self.profile(nrunning)
            (tasks_to_ask, queue_size) = self.get_tasks_to_ask(nrunning)
            #print "Asking for tasks: ", tasks_to_ask, queue_size
            task_data_to_send['qsize'] = queue_size * self.task_time
            pickled_data = pickle.dumps(task_data_to_send)
            if self.is_profiling and config.POLICY == 'dynamic3':
                if queue_size + nrunning == 0 and not self.first_task:
                    self.sched_client_thread.put_request_in_queue(
                        [self.identity, PROTOCOL_HEADERS['WORKER'], 'task_empty',
                         str(2 * self.current_ntasks), pickled_data])
                    self.first_task = True
                continue
            elif len(tasks_to_ask) > 0:
                self.sched_client_thread.put_request_in_queue(
                    [self.identity, PROTOCOL_HEADERS['WORKER'], 'task',
                     pickle.dumps(tasks_to_ask), pickled_data])
            self.message_to_send = pickled_data

    def process_task(self, task):
        tmp = task.split(';')
        task_name = tmp[-1].split()[0].split('/')[-1]
        new_task = False
        # I have new tasks!!
        if task_name not in self.monitor_thread.tasks_to_monitor:
            new_task = True
            self.is_profiling = True
            self.monitor_thread.add_task_to_monitor(task_name)
        return new_task

    def add_task_to_queues(self, tasks):
        if len(tasks['queues']) > 0:
            print tasks['queues']
            self.queue_data_lock.acquire()
            for queue in tasks['queues']:
                if not self.queue_data.get(queue):
                    self.new_queues.append(queue)
                    self.max_tasks_to_run[queue] = 0
                    self.queue_data[queue] = {
                        'qid': queue,
                        'elapsed': 0,
                        'tavg': 0,
                        'throughput': 0,
                        'asked': 0,
                        'recv': 0,
                        'type': "",
                        'tpool': ThreadPool(
                            0,
                            self.task_data),
                        'resource': ""}  # this contains the resource vector of a task
                    self.has_new_queue = True
                if tasks['queues'][queue] == -1:
                    print "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Queue ", queue, " is empty!"
                    self.max_tasks_to_run[queue] = -1
            self.queue_data_lock.release()
        self.queue_data_lock.acquire()
        for task in tasks['tasks']:
            #print "Adding tasks to queues: ", task
            ''' here: if the task does not exist in my history,
                shrink the pool to 1 task and enter the profiling mode;
                profiling mode = record resource utilization for the first 10 tasks '''
            qid = hashlib.sha1(task['exec'].encode()).hexdigest()
            self.has_new_task = self.has_new_task | self.process_task(
                task['exec'])
            task['qid'] = qid
            self.add_task_to_queue(self.queue_data[qid]['tpool'], task)
            self.queue_data[qid]['recv'] = self.queue_data[qid]['recv'] + 1
        self.queue_data_lock.release()
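    # Note how tasks are bucketed locally by the SHA-1 of their executable:
    # e.g. every task whose 'exec' is "/bin/wordcount" (an illustrative path)
    # shares one local queue and thread pool, which keeps the per-executable
    # profiling data in one place.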

    def callback(self, frames):
        ''' this is a message from the server '''
        command = frames[2]
        data = None
        if len(frames) > 3:
            data = frames[3]
        if command == 'shutdown':
            self.shutdown(None)
        elif command == 'task':
            self.time_from_last_ask = time.time()
            tasks = pickle.loads(data)
            self.add_task_to_queues(tasks)
        elif command == 'empty':
            for qid in self.queue_data:
                self.empty_queue(self.queue_data[qid]['tpool'])
        else:
            print "No callback for this message!"


    def add_task_to_queue(self, queue, task):
        queue.add_task(self.run_task, task)

    def empty_queue(self, pool):
        # the callers pass a ThreadPool; drain its underlying task queue
        tasks = pool.tasks
        while not tasks.empty():
            try:
                tasks.get(False)
            except Empty:
                continue
            tasks.task_done()

    def shutdown(self, data):
        self.running = False

    def log_node_utilization(self):
        median = self.monitor_thread.get_median_utilization()
        histo_util = self.monitor_thread.get_utilization_by_histogram()
        data = self.monitor_thread.get_data()
        cpu_sum = median['cpu'] + median['cpu_idle'] + \
            median['cpu_sys'] + median['cpu_io']
        if cpu_sum == 0:
            # avoid dividing by zero below when no samples were collected
            cpu_sum = 1
        real_value = 100 * (median['cpu']) / cpu_sum
        self.logger.info(
            "Median utilization/2secs: %s %s %s %s" %
            (median['cpu'],
             median['memory'],
             median['network'],
             100 * median['cpu_io'] / cpu_sum))
        self.logger.info(
            "Histo utilization: %s %s %s" %
            (histo_util['cpu'],
             histo_util['memory'],
             histo_util['network']))

    def is_ok_to_ask(self):
        return True

    def empty_task_data(self):
        for tid in self.task_data:
            self.task_data[tid]['lock'].acquire()
            self.task_data[tid]['task'] = {}
            self.task_data[tid]['lock'].release()

    def check_empty_queues(self):
        queues_empty = True
        self.queue_data_lock.acquire()
        for qid in self.queue_data:
            queues_empty = queues_empty & self.queue_data[
                qid]['tpool'].tasks.empty()
        self.queue_data_lock.release()
        return queues_empty


    def compute_stats(self, task_data, avg_time, avg_cpu):
        total_len = 0
        try:
            for tid in task_data:
                task_data[tid]['lock'].acquire()
                for task in task_data[tid]['task']:
                    if not avg_time.get(task):
                        avg_time[task] = 0
                        avg_cpu[task] = 0
                    for data in task_data[tid]['task'][task]:
                        avg_time[task] = avg_time[task] + data[0]
                        avg_cpu[task] = avg_cpu[task] + data[1]
                    total_len = total_len + \
                            len(task_data[tid]['task'][task])
                    if not self.is_profiling:
                        # leave only the last value
                        while len(task_data[tid]['task'][task]) > 1:
                            task_data[tid]['task'][task].pop(0)
                task_data[tid]['lock'].release()
            if total_len == 0:
                return
            for task in avg_time:
                avg_time[task] = avg_time[task] / total_len
                avg_cpu[task] = avg_cpu[task] / total_len
            self.empty_task_data()
        except:
            traceback.print_exc()

    def run(self):
        self.sched_client_thread.start()
        self.monitor_thread.start()
        finishing_tasks_thread = Thread(target=self.wait_and_ask)
        finishing_tasks_thread.start()
        '''  I have: - the monitoring thread
        - the communication thread
        - the thread that waits to ask for more tasks
        '''
        while self.running:
            ''' if queue is empty and no other tasks are running: ask for task to the scheduler '''
            ''' else if tasks are running check the utilization and ask for more/less '''
            self.log_node_utilization()
            task_data = self.monitor_thread.get_task_data()
            total_util = {'cpu': 0, 'memory': 0}
            for task in task_data:
                for data in task_data[task]:
                    total_util['cpu'] = total_util[
                        'cpu'] + data[0][0] / len(task_data[task])
                    total_util['memory'] = total_util[
                        'memory'] + data[1] / len(task_data[task])
            self.logger.info(
                "Total utilization of the other processes is: %s %s" %
                (total_util['cpu'], total_util['memory']))
            # count the total number of slots
            self.running_task_lock.acquire()
            nrunning = self.running_task
            nrunning_past = self.nrunning_past_period[:]
            self.nrunning_past_period = []
            self.running_task_lock.release()
            for task in self.max_tasks_to_run:
                self.logger.info(
                    "%s Running tasks: %s" %
                    (task, self.max_tasks_to_run[task]))
            if self.check_empty_queues() and nrunning == 0:
                if self.is_ok_to_ask():
                    ''' I have finished all my tasks; ask for a random task from the resource manager '''
                    print "Sending task_empty message!"
                    self.sched_client_thread.put_request_in_queue(
                            [self.identity, PROTOCOL_HEADERS['WORKER'], 'task_empty', str(2 * self.capacity['cores'])])
            avg_time = {}
            avg_cpu = {}
            task_data = self.task_data
            self.compute_stats(task_data, avg_time, avg_cpu)
            # now is the time to remove the data from the dead threads
            self.queue_data_lock.acquire()
            for qid in self.queue_data:
                self.queue_data[qid]['tpool'].dict_lock.acquire()
                for tid in self.queue_data[qid]['tpool'].deleted_workers:
                    if self.task_data.get(tid):
                        del self.task_data[tid]
                self.queue_data[qid]['tpool'].deleted_workers = []
                self.queue_data[qid]['tpool'].dict_lock.release()
            taskid = 0
            max_task_time = 0
            for task in avg_time:
                if config.POLICY != 'static':
                    if avg_time[task] == 0:
                        self.queue_data[task]['elapsed'] = self.queue_data[task]['elapsed'] + config.WAITTIME
                    else:
                        self.queue_data[task]['elapsed'] = 0
                        self.queue_data[task]['tavg'] = (self.queue_data[task]['tavg'] + avg_time[task]) / 2
                if avg_time[task] > max_task_time:
                    max_task_time = avg_time[task]
                print task, "Avg_time: ", avg_time[task]
                self.logger.info(
                        "%s Avg_time: %s" %
                        (task, avg_time[task]))
                if avg_time[task] > 0:
                    # guard against a zero average when no task finished this period
                    self.logger.info(
                            "%s Task speed: %s" %
                            (task, self.max_tasks_to_run[task] / avg_time[task]))
                    self.past_speed_changes.append(
                            self.max_tasks_to_run[task] /
                            avg_time[task])
                    if len(self.past_speed_changes) > 4:
                        self.past_speed_changes.pop(0)
                self.logger.info(
                        "%s Task util: %s" %
                        (task, avg_cpu[task]))
                taskid = taskid + 1
                # classify the queue as long- or short-running once
                if config.POLICY != 'static' and self.queue_data[task]['type'] == "":
                    if self.queue_data[task]['elapsed'] > config.T_LONG or \
                            self.queue_data[task]['tavg'] > config.T_LONG:
                        self.queue_data[task]['type'] = 'long'
                    if self.queue_data[task]['tavg'] < config.T_LONG:
                        self.queue_data[task]['type'] = 'short'
            self.queue_data_lock.release()
            self.task_time = max_task_time
            self.change_work_queue(nrunning, nrunning_past, avg_time, avg_cpu)
            self.logger.info("Ran %s tasks" % self.task_id)
	    self.logger.debug("Sleeping: %s" % self.sleep_time)
            self.monitor_thread.max_data_buffer_len = int(
                self.sleep_time /
                config.MONITOR_PERIOD)
            time.sleep(self.sleep_time)
        finishing_tasks_thread.join()
        for qid in self.queue_data:
            self.queue_data[qid]['tpool'].wait_completion()
        self.sched_client_thread.stop()
        self.monitor_thread.shutdown()
        self.monitor_thread.join()
        self.sched_client_thread.join()
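
As with the global scheduler, no entry point is included in the listing; a hedged sketch of the presumed per-node launcher follows (the __main__ guard is an assumption; config.SCHEDULER and config.ZMQ_SCHEDULER_PORT must point at the host running Scheduler):

if __name__ == '__main__':
    # connects to the global scheduler over ZMQ and starts the monitor,
    # communication, and ask-for-work threads before entering the main loop
    node = NodeScheduler()
    node.run()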