Example #1
	def __init__(self):
		self.running = True
		self.resource_manager = ResourceManager(self)
		self.resource_queues = {'cpu': [],
								'memory': [],
								'bw': []}
		''' this is to have multiple applications/queues; application = queue '''
		self.task_queue = []  # ready queues; [queue_id]
		self.task_queue_data = {}  # ready queue data; queue_id: [task...]
		# queues with pending tasks, which do not have the input ready
		self.pending_queue_data = {}
		self.task_queue_lock = threading.Lock()
		self.pending_task_queue_lock = threading.Lock()
		logfile = config.LOGDIR + "/scheduler.log"
		taskfile = config.LOGDIR + "/task.log"
		self.logger = WeaselLogger('scheduler', logfile)
		self.task_logger = WeaselLogger('tasklogger', taskfile)
		''' this is the thread that will perform the communication with the local schedulers '''
		self.server_thread = ZmqConnectionThread(
			'resourcemng',
			zmq.ROUTER,
			"*:" + str(
				config.ZMQ_SCHEDULER_PORT),
			self.msg_process_callback)
		''' this information is for keeping track of files and task dependencies '''
		self.task_to_file = {
		}  # taskid: {'queueid': qid, 'tinputs': n, 'cinputs': n, 'inputs': [files], 'outputs': [files]}
		# file: {'ntids': ntids, 'tids': [tids], 'ctids': executed_tasks}
		self.file_to_task = {}
		self.result_queue = Queue()
		self.result_consumer_thread = threading.Thread(target=self.get_result)
		self.result_consumer_thread.start()
		''' idle worker information and client notification '''
		self.waiting_clients = []
		self.workers_empty_dict = {}
		self.workers = []
		self.workers_data = {}
		self.workers_lock = threading.Lock()
		self.workers_empty = 0
		self.task_id = 0
		''' to notify workers about new queues '''
		self.new_queues = []
		self.new_queues_lock = threading.Lock()
		''' here I have: the thread that listens for messages
			a queue in which the tasks are put
			the main thread that applies some reconfiguration (?)
		'''
		self.files_to_delete = []
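
For orientation, here is a minimal sketch of the queue bookkeeping this constructor sets up (the shapes are inferred from the comments above; the queue id and task values are illustrative, not taken from the Weasel sources):

from collections import deque

task_queue = ['q1']                          # ready queue ids, FCFS order
task_queue_data = {                          # qid -> runnable tasks
    'q1': deque([
        {'id': 1, 'exec': '/bin/wc', 'params': 'in.txt'},
    ]),
}
pending_queue_data = {                       # qid -> {tid: task}; inputs not ready yet
    'q1': {
        2: {'id': 2, 'exec': '/bin/wc', 'params': 'out.txt', 'inputs': ['out.txt']},
    },
}
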
Example #2
    def __init__(self):
        self.running = True
        self.resource_queues = {'cpu': [],
                                'memory': [],
                                'bw': []}
        ''' this is to have multiple applications/queues; application = queue '''
        self.task_queue = []  # ready queues; [queue_id]
        self.task_queue_data = {}  # ready queue data; queue_id: [task...]
        # queues with pending tasks, which do not have the input ready
        self.pending_queue_data = {}
        self.task_queue_lock = threading.Lock()
        self.pending_task_queue_lock = threading.Lock()
        logfile = config.LOGDIR + "/scheduler.log"
        taskfile = config.LOGDIR + "/task.log"
        self.logger = WeaselLogger('scheduler', logfile)
        self.task_logger = WeaselLogger('tasklogger', taskfile)
        ''' this is the thread that will perform the communication with the local schedulers '''
        self.server_thread = ZmqConnectionThread(
            'resourcemng',
            zmq.ROUTER,
            "*:" + str(
                config.ZMQ_SCHEDULER_PORT),
            self.msg_process_callback)
        ''' this information is for keeping track of files and task dependencies '''
        self.task_to_file = {
        }  # taskid: {'queueid': qid, 'tinputs': n, 'cinputs': n, 'inputs': [files], 'outputs': [files]}
        # file: {'ntids': ntids, 'tids': [tids], 'ctids': executed_tasks}
        self.file_to_task = {}
        self.result_queue = Queue()
        self.result_consumer_thread = threading.Thread(target=self.get_result)
        self.result_consumer_thread.start()
        ''' idle worker information and client notification '''
        self.waiting_clients = []
        self.workers_empty_dict = {}
        self.workers = []
        self.workers_data = {}
        self.workers_lock = threading.Lock()
        self.workers_empty = 0
        self.task_id = 0
        ''' to notify workers about new queues '''
        self.new_queues = []
        self.new_queues_lock = threading.Lock()
        ''' here I have: the thread that listens for messages
            a queue in which the tasks are put
            the main thread that applies some reconfiguration (?)
        '''
        self.files_to_delete = []
Example #3
    def __init__(self):
        self.identity = 'sched-' + socket.gethostbyname(socket.gethostname())
        self.sched_client_thread = ZmqConnectionThread(
            self.identity,
            zmq.DEALER,
            config.SCHEDULER+":" + str(config.ZMQ_SCHEDULER_PORT),
            self.callback)
        self.monitor_thread = NodeMonitor()
        self.running = True
        logfile = config.LOGDIR + "/local_scheduler.log"
        self.logger = WeaselLogger('local_scheduler', logfile)
        self.capacity = self.monitor_thread.capacity
        self.max_tasks_to_run = {}
        ''' the starting number of tasks is defined based on the slot size '''
        self.ntasks_to_ask = 1
        self.task_id = 1
        self.time_asked_first = time.time()
        self.time_from_last_ask = -1
        ''' keeps track of the number of currently running tasks '''
        self.running_task = 0
        self.nran_tasks = []
        self.time_from_last_ask = time.time()
        self.queues_asked_for = []
        self.current_ntasks = 1
        self.has_new_task = False
        self.is_profiling = False
        self.first_task = False
        self.task_data = {}
        self.t_avg = {}
        self.task_data_lock = threading.Lock()
        self.running_task_lock = threading.Lock()
        self.average_utilization = {'cpu': 0.0, 'memory': 0.0, 'network': 0.0}
        self.average_task_exec_time = 0.0
        self.sleep_time = config.WAITTIME
        self.past_speed_changes = []
        # 'id': id, 'tpool': threadPool, 'rvector': resource_characteristics
        self.queue_data = {}
        self.task_time = 1
        self.queue_data_lock = threading.Lock()
        self.has_new_queue = False
        self.new_queues = []
        self.message_to_send = None
        ''' this is to control how many tasks to run in parallel '''
        self.logger.info("NodeScheduler started...")
        self.nrunning_past_period = []
Example #4
	def __init__(self):
		interface = config.INTERFACE
		f = os.popen('ifconfig ' + str(interface) + ' | grep "inet\ addr" | cut -d: -f2 | cut -d" " -f1')
		self.identity = f.read().strip()
		self.queue_data_lock = threading.Lock()

		# Make an opaque hash id for the default queue (sha1 of a fixed path).
		self.random_hash = hashlib.sha1(b'/tmp/Weasel/bin/local_resourcemanager').hexdigest()
		self.task_data = {}
		self.queue_data = {}
		self.max_tasks_to_run_for_queue = {}

		# Will hold the timestamp when we first received a task.
		self.time_start_running = -1

		# Initially we run two tasks per CPU core. This number might
		# be changed by the monitor thread based on resource contention.
		if config.ADAPT_TASKS:
			self.max_tasks_to_run = multiprocessing.cpu_count() * 2
		else:
			self.max_tasks_to_run = config.NR_COLOCATED_TASKS

		# Create the queue for the tasks.
		self.queue_data[self.random_hash] = {'qid': self.random_hash,
											 'asked': 0, 'recv': 0,
											 'tpool': ThreadPool(1, self.task_data)}
		tpool = self.queue_data[self.random_hash]['tpool']
		self.max_tasks_to_run_for_queue[self.random_hash] = self.max_tasks_to_run
		tpool.set_size(self.max_tasks_to_run)
		tpool.start()

		# Create the monitoring thread.
		self.monitor_thread = NodeMonitor(parent=self)

		# Create the thread that performs communication with
		# the scheduler.
		self.sched_client_thread = ZmqConnectionThread(
			self.identity,
			zmq.DEALER,
			config.SCHEDULER + ":" + str(config.ZMQ_SCHEDULER_PORT),
			self.callback)
		self.running = True
		logfile = config.LOGDIR + "/local_scheduler.log"
		self.logger = WeaselLogger('local_scheduler', logfile)
		self.ntasks_to_ask = 1
		self.task_id = 1
		self.time_asked_first = time.time()
		self.running_task = 0
		self.nr_received_tasks = 0
		self.nran_tasks = []
		self.queues_asked_for = []
		self.current_ntasks = 1
		self.has_new_task = False
		self.first_task = False
		self.running_task_lock = threading.Lock()

		# Will hold a map of nr_colocated_tasks => [runtimes]
		self.task_runtimes = {}
		self.task_runtime_lock = threading.Lock()
		self.sleep_time = config.WAITTIME
		self.task_time = 1
		self.has_new_queue = False
		self.new_queues = []
		self.logger.info("NodeScheduler started...")
		self.nrunning_past_period = []

		# Will hold the last started executable.
		self.last_started = None
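
Note that the `ifconfig | grep "inet addr"` pipeline above only matches the legacy net-tools output format. A more portable sketch for discovering the local IP (an assumption-level alternative, not part of the original code):

import socket

def local_ip():
    # Connecting a UDP socket selects the outgoing interface without
    # sending any packet; getsockname() then reveals its address.
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        s.connect(('192.0.2.1', 80))  # TEST-NET-1 address, never contacted
        return s.getsockname()[0]
    finally:
        s.close()
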
Example #5
class NodeScheduler(object):
	def __init__(self):
		interface = config.INTERFACE
		f = os.popen('ifconfig ' + str(interface) + ' | grep "inet\ addr" | cut -d: -f2 | cut -d" " -f1')
		self.identity = f.read().strip()
		self.queue_data_lock = threading.Lock()

		# Make an opaque hash id for the default queue (sha1 of a fixed path).
		self.random_hash = hashlib.sha1(b'/tmp/Weasel/bin/local_resourcemanager').hexdigest()
		self.task_data = {}
		self.queue_data = {}
		self.max_tasks_to_run_for_queue = {}

		# Will hold the timestamp when we first received a task.
		self.time_start_running = -1

		# Initially we run two tasks per CPU core. This number might
		# be changed by the monitor thread based on resource contention.
		if config.ADAPT_TASKS:
			self.max_tasks_to_run = multiprocessing.cpu_count() * 2
		else:
			self.max_tasks_to_run = config.NR_COLOCATED_TASKS

		# Create the queue for the tasks.
		self.queue_data[self.random_hash] = {'qid': self.random_hash,
											 'asked': 0, 'recv': 0,
											 'tpool': ThreadPool(1, self.task_data)}
		tpool = self.queue_data[self.random_hash]['tpool']
		self.max_tasks_to_run_for_queue[self.random_hash] = self.max_tasks_to_run
		tpool.set_size(self.max_tasks_to_run)
		tpool.start()

		# Create the monitoring thread.
		self.monitor_thread = NodeMonitor(parent=self)

		# Create the thread that performs communication with
		# the scheduler.
		self.sched_client_thread = ZmqConnectionThread(
			self.identity,
			zmq.DEALER,
			config.SCHEDULER + ":" + str(config.ZMQ_SCHEDULER_PORT),
			self.callback)
		self.running = True
		logfile = config.LOGDIR + "/local_scheduler.log"
		self.logger = WeaselLogger('local_scheduler', logfile)
		self.ntasks_to_ask = 1
		self.task_id = 1
		self.time_asked_first = time.time()
		self.running_task = 0
		self.nr_received_tasks = 0
		self.nran_tasks = []
		self.queues_asked_for = []
		self.current_ntasks = 1
		self.has_new_task = False
		self.first_task = False
		self.running_task_lock = threading.Lock()

		# Will hold a map of nr_colocated_tasks => [runtimes]
		self.task_runtimes = {}
		self.task_runtime_lock = threading.Lock()
		self.sleep_time = config.WAITTIME
		self.task_time = 1
		self.has_new_queue = False
		self.new_queues = []
		self.logger.info("NodeScheduler started...")
		self.nrunning_past_period = []

		# Will hold the last started executable.
		self.last_started = None

	def run_task(self, arg):
		command_id = arg['id']
		command = arg['exec'] + ' ' + arg['params']
		qid = arg['qid']
		myid = threading.current_thread().ident

		# Tell the monitor thread we have started a task.
		self.monitor_thread.task_started(myid)

		# Increment the number of running tasks.
		self.running_task_lock.acquire()
		self.running_task += 1
		nr_colocated = self.running_task

		# The first time we run a task of a different type, we reset
		# the history on the monitor thread.
		if self.last_started is None:
			self.last_started = arg['exec']
		if self.last_started != arg['exec']:
			print('Started new task type: ' + str(arg['exec']))
			sys.stdout.flush()
			self.monitor_thread.reset_known_points()
			self.last_started = arg['exec']
		self.running_task_lock.release()

		start_time = time.time()
		proc = psutil.Popen(command, shell=True,
							stdout=PIPE, stderr=PIPE)
		self.task_data[myid]['lock'].acquire()
		self.task_data[myid]['proc'] = proc
		self.task_data[myid]['ctask'] = arg
		self.task_data[myid]['lock'].release()
		out, err = proc.communicate()
		return_code = proc.returncode
		if return_code != 0:
			print('Error when returning: ' + str(return_code))
			sys.stdout.flush()
		end_time = time.time()

		# Record task running times.
		self.task_runtime_lock.acquire()
		running_time = end_time - start_time
		if nr_colocated not in self.task_runtimes:
			self.task_runtimes[nr_colocated] = []
		self.task_runtimes[nr_colocated].append(running_time)
		print("Task %s ran in %s seconds (%s)" % (str(command_id), str(running_time), str(arg['exec'])))
		sys.stdout.flush()
		self.task_runtime_lock.release()

		# Tell the monitor thread we have finished a task.
		self.monitor_thread.task_finished(myid)

		self.task_data[myid]['lock'].acquire()
		self.task_data[myid]['ctask'] = None
		if self.task_data[myid]['task'].get(qid) is None:
			self.task_data[myid]['task'][qid] = []
			self.external_change = True
		self.task_data[myid]['task'][qid].append(
			[end_time - start_time, 100 * (end_time - start_time) / (end_time - start_time)])
		self.task_data[myid]['lock'].release()
		self.running_task_lock.acquire()
		self.running_task -= 1
		self.nran_tasks.append(command_id)
		self.running_task_lock.release()

	def get_total_queue_size(self):
		queue_size = 0
		self.queue_data_lock.acquire()
		for qid in self.queue_data:
			queue_size = queue_size + self.queue_data[qid]['tpool'].tasks.qsize()
		self.queue_data_lock.release()
		return queue_size

	def get_tasks_to_ask(self):
		"""
		Returns the number of tasks to ask from the scheduler. Tries to keep the queue size at
		least as long as the maximum allowed number of tasks at the moment.
		:return:
		"""
		tasks_to_ask = {}
		self.queues_asked_for = []
		queue_size = self.get_total_queue_size()
		self.queue_data_lock.acquire()
		for qid in self.queue_data:
			tasks_to_ask[qid] = 0
			self.queue_data[qid]['asked'] = 0
			self.queue_data[qid]['recv'] = 0
			qsize = self.queue_data[qid]['tpool'].tasks.qsize()
			if qsize > 2 * self.max_tasks_to_run_for_queue[qid] and self.max_tasks_to_run_for_queue[qid] != -1:
				continue
			if qsize == 0:
				tasks_to_ask[qid] = self.max_tasks_to_run_for_queue[qid]
			else:
				if qsize > self.max_tasks_to_run_for_queue[qid] and self.max_tasks_to_run_for_queue[qid] != -1:
					continue
				elif qsize < self.max_tasks_to_run_for_queue[qid]:
					tasks_to_ask[qid] = self.max_tasks_to_run_for_queue[qid] - qsize
			self.queues_asked_for.append(qid)
			self.queue_data[qid]['asked'] = tasks_to_ask[qid]
		self.queue_data_lock.release()
		return tasks_to_ask, queue_size

	def wait_and_ask(self):
		while self.running:
			# check every 0.2 seconds
			time.sleep(0.2)

			self.running_task_lock.acquire()
			nrunning = self.running_task
			self.nrunning_past_period.append(nrunning)
			task_data_to_send = {'ran': self.nran_tasks[:]}
			self.nran_tasks = []
			self.running_task_lock.release()
			(tasks_to_ask, queue_size) = self.get_tasks_to_ask()
			task_data_to_send['qsize'] = queue_size * self.task_time
			pickled_data = pickle.dumps(task_data_to_send)
			if len(tasks_to_ask) > 0:
				self.sched_client_thread.put_request_in_queue(
					[self.identity, PROTOCOL_HEADERS['WORKER'], 'task', pickle.dumps(tasks_to_ask), pickled_data])

	def process_task(self, task):
		tmp = task.split(';')
		task_name = tmp[-1].split()[0].split('/')[-1]
		new_task = False
		return new_task

	def add_task_to_queues(self, tasks):
		for task in tasks['tasks']:
			self.running_task_lock.acquire()
			self.nr_received_tasks += 1
			self.running_task_lock.release()
			new_task = self.process_task(task['exec'])
			task_hash = hashlib.sha1(task['exec'].encode()).hexdigest()
			self.has_new_task |= new_task
			task['qid'] = task_hash
			self.queue_data[self.random_hash]['tpool'].add_task(self.run_task, task)

	def get_latest_task_type(self):
		"""
		Return the latest started task type.
		:return:
		"""
		self.running_task_lock.acquire()
		latest = self.last_started
		self.running_task_lock.release()
		return latest

	def running_identical_tasks(self):
		"""
		Returns whether or not the worker is currently only running tasks of the same type.
		:return:
		"""
		current = None
		task_threads = self.task_data.keys()
		try:
			for task_thread in task_threads:
				if (task_thread not in self.task_data) or ('lock' not in self.task_data[task_thread]):
					continue
				# hold the per-thread lock only while inspecting its current task;
				# the with-statement releases it even on early return or exception
				with self.task_data[task_thread]['lock']:
					if 'ctask' in self.task_data[task_thread] and self.task_data[task_thread]['ctask'] is not None:
						if current is None:
							current = self.task_data[task_thread]['ctask']['exec']
						elif current != self.task_data[task_thread]['ctask']['exec']:
							return False
			return True
		except Exception, e:
			print('Got exception while trying to determine identical tasks')
			print(e)
			sys.stdout.flush()
			return False
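
For reference, run_task() above expects its task argument to look roughly like this (reconstructed from the lookups in the method; the values are illustrative):

task = {
    'id': 42,                        # command_id, reported back via nran_tasks
    'exec': '/tmp/Weasel/bin/grep',  # executable; a new value resets the monitor history
    'params': '-r pattern /data',    # appended to 'exec' to form the shell command
    'qid': 'a94a8fe5...',            # queue hash assigned in add_task_to_queues()
}
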
Example #6
def make_request(identity, header, message):
    request = [identity, header, message]
    return request


def send_message(socket, message):
    socket.send_multipart(message)


def get_reply(socket):
    return socket.recv_multipart()


log_name = 'notification'
notification_logger = WeaselLogger(log_name,
                                   config.LOGDIR + '/' + log_name + '.log')


class ZmqConnectionThread(Thread):
    def __init__(self, identity, socket_type, address, msg_process_callback):
        Thread.__init__(self)
        self.running = True
        self.requests = []
        self.lock = Lock()
        self.msg_process_callback = msg_process_callback
        self.context = get_context()
        self.frontend = tmp_socket(self.context, socket_type, identity,
                                   address)
        self.frontend.setsockopt(zmq.LINGER, 120000)
        self.poll = zmq.Poller()
        self.backup_socket = None
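
A minimal round trip with the helper functions above (a sketch: the endpoint, identity, and frame contents are assumptions, not taken from the Weasel sources):

import zmq

ctx = zmq.Context.instance()

router = ctx.socket(zmq.ROUTER)
router.bind("tcp://127.0.0.1:5570")

dealer = ctx.socket(zmq.DEALER)
dealer.setsockopt(zmq.IDENTITY, b'sched-10.0.0.1')
dealer.connect("tcp://127.0.0.1:5570")

# DEALER -> ROUTER: the ROUTER prepends the sender's identity frame.
send_message(dealer, [b'WORKER', b'task'])
frames = get_reply(router)            # [b'sched-10.0.0.1', b'WORKER', b'task']

# ROUTER -> DEALER: make_request() addresses the reply by identity.
send_message(router, make_request(frames[0], b'RSMNG', b'ok'))
print(get_reply(dealer))              # [b'RSMNG', b'ok']
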
Example #7
class Scheduler(object):
    stdin = "/dev/null"
    stdout = "/dev/null"
    stderr = "/dev/null"

    def __init__(self):
        self.running = True
        self.resource_queues = {'cpu': [],
                                'memory': [],
                                'bw': []}
        ''' this is to have multiple applications/queues; application = queue '''
        self.task_queue = []  # ready queues; [queue_id]
        self.task_queue_data = {}  # ready queue data; queue_id: [task...]
        # queues with pending tasks, which do not have the input ready
        self.pending_queue_data = {}
        self.task_queue_lock = threading.Lock()
        self.pending_task_queue_lock = threading.Lock()
        logfile = config.LOGDIR + "/scheduler.log"
        taskfile = config.LOGDIR + "/task.log"
        self.logger = WeaselLogger('scheduler', logfile)
        self.task_logger = WeaselLogger('tasklogger', taskfile)
        ''' this is the thread that will perform the communication with the local schedulers '''
        self.server_thread = ZmqConnectionThread(
            'resourcemng',
            zmq.ROUTER,
            "*:" + str(
                config.ZMQ_SCHEDULER_PORT),
            self.msg_process_callback)
        ''' this information is for keeping track of files and task dependencies '''
        self.task_to_file = {
        }  # taskid: {'queueid': qid, 'tinputs': n, 'cinputs': n, 'inputs': [files], 'outputs': [files]}
        # file: {'ntids': ntids, 'tids': [tids], 'ctids': executed_tasks}
        self.file_to_task = {}
        self.result_queue = Queue()
        self.result_consumer_thread = threading.Thread(target=self.get_result)
        self.result_consumer_thread.start()
        ''' idle worker information and client notification '''
        self.waiting_clients = []
        self.workers_empty_dict = {}
        self.workers = []
        self.workers_data = {}
        self.workers_lock = threading.Lock()
        self.workers_empty = 0
        self.task_id = 0
        ''' to notify workers about new queues '''
        self.new_queues = []
        self.new_queues_lock = threading.Lock()
        ''' here I have: the thread that listens for messages
            a queue in which the tasks are put
            the main thread that applies some reconfiguration (?)
        '''
        self.files_to_delete = []

    def delete_queue(self, qid):
        del self.task_queue_data[qid]
        self.task_queue.remove(qid)
        self.new_queues_lock.acquire()
        try:
            self.new_queues.remove(qid)
        except:
            pass
        self.new_queues_lock.release()
        
        
    def get_taskids(self):
        tasks_ids = []
        try:
            tasks_ids = self.result_queue.get(
                    block=True,
                    timeout=8 *
                    config.WAITTIME)
        except:
            self.pending_task_queue_lock.acquire()
            pending_queue = self.pending_queue_data
            for pqueue in pending_queue:
                to_delete = []
                for tid in pending_queue[pqueue]:
                    current_inputs = 0
                    tinputs = len(pending_queue[pqueue][tid]['inputs'])
                    for inputf in pending_queue[pqueue][tid]['inputs']:
                        if os.path.isfile(inputf):
                            current_inputs = current_inputs + 1
                        #else:
                        #    print "Missing file ", inputf
                    if current_inputs == tinputs:
                        task = pending_queue[pqueue][tid]
                        data = {
                                'id': tid,
                                'exec': task['exec'],
                                'params': task['params']}
                        to_delete.append(tid)
                        self.task_queue_lock.acquire()
                        is_new = False
                        try:
                            self.task_queue_data[pqueue].append(data)
                        except:
                            self.task_queue.append(pqueue)
                            self.task_queue_data[pqueue] = deque()
                            self.task_queue_data[pqueue].append(data)
                            is_new = True
                        self.task_queue_lock.release()
                        if is_new:
                            self.new_queues_lock.acquire()
                            self.new_queues.append(pqueue)
                            self.new_queues_lock.release()
                for tid in to_delete:
                    del self.pending_queue_data[pqueue][tid]
            self.pending_task_queue_lock.release()
        return tasks_ids
    
    def check_dependencies_per_task(self, taskid):
        for fileid in self.task_to_file[taskid]['outputs']:
            dependent_tasks = []
            try:
                dependent_tasks = self.file_to_task[
                        hashlib.sha1(fileid.encode()).hexdigest()]['tids']
            except:
                pass
            for taskid2 in dependent_tasks:
                self.task_to_file[taskid2]['cinputs'] = self.task_to_file[taskid2]['cinputs'] + 1
                if self.task_to_file[taskid2]['cinputs'] == self.task_to_file[taskid2]['tinputs']:
                    # put in ready queue
                    self.pending_task_queue_lock.acquire()
                    try:
                        task = self.pending_queue_data[self.task_to_file[taskid2]['queueid']][taskid2]
                    except:
                        self.pending_task_queue_lock.release()
                        continue
                    self.pending_task_queue_lock.release()
                    data = {
                            'id': taskid2,
                            'exec': task['exec'],
                            'params': task['params']}
                    self.task_queue_lock.acquire()
                    is_new = False
                    try:
                        self.task_queue_data[self.task_to_file[taskid2]['queueid']].append(data)
                    except:
                        self.task_queue.append(self.task_to_file[taskid2]['queueid'])  # FCFS like
                        self.task_queue_data[self.task_to_file[taskid2]['queueid']] = deque()
                        self.task_queue_data[self.task_to_file[taskid2]['queueid']].append(data)
                        is_new = True
                    self.task_queue_lock.release()
                    if is_new:
                        self.new_queues_lock.acquire()
                        self.new_queues.append(self.task_to_file[taskid2]['queueid'])
                        self.new_queues_lock.release()
                    self.pending_task_queue_lock.acquire()
                    del self.pending_queue_data[
                            self.task_to_file[taskid2]['queueid']][taskid2]
                    self.pending_task_queue_lock.release()

    def garbage_collect(self, taskid):
        for fileid in self.task_to_file[taskid]['inputs']:
            fhash = hashlib.sha1(fileid.encode()).hexdigest()
            try:
                self.file_to_task[fhash]['ctids'] = self.file_to_task[fhash]['ctids'] + 1
                if self.file_to_task[fhash]['ctids'] == self.file_to_task[fhash]['ntids']:
                    # now it is safe to delete this file (is it?)
                    try:
                        print "deleting file ", fileid, "ctids=", \
                            self.file_to_task[fhash]['ctids'], \
                            "ntids=", self.file_to_task[fhash]['ntids']
                        self.logger.debug("File: %s dependent_tasks: %s" %
                                          (fileid, self.file_to_task[fhash]['ntids']))
                        # os.remove(fileid)
                        # if it failed, report it back to the user
                    except OSError as e:
                        print "Error: %s - %s." % (e.filename, e.strerror)
            except:
                print "exception for file ", fileid

    def get_result(self):
        while self.running:
            self.pending_task_queue_lock.acquire()
            to_delete = []
            for pqueue in self.pending_queue_data:
                if len(self.pending_queue_data[pqueue]) == 0:
                    to_delete.append(pqueue)
            for pqueue in to_delete:
                del self.pending_queue_data[pqueue]
            self.pending_task_queue_lock.release()
            tasks_ids = self.get_taskids()
            if len(tasks_ids) == 0:
                continue
            for taskid in tasks_ids:
                # if missing files were generated put task in ready queue
                try:
                    self.check_dependencies_per_task(taskid)
                except:
                    traceback.print_exc()
            '''
            for taskid in tasks_ids:
                try:
                    self.garbage_collect(taskid)
                except:
                    print "I cannot find ", taskid, "in task_to_file"
            '''

    def msg_process_callback(self, message):
        message_type = message[3]
        try:
            if message_type == PROTOCOL_HEADERS['CLIENT']:
                self.process_client_message(message)
            elif message_type == PROTOCOL_HEADERS['WORKER']:
                self.process_worker_message(message)
        except:
            traceback.print_exc()


    def process_queue(self, data, qid):
        task_data = pickle.loads(data)
        self.task_queue_lock.acquire()
        self.pending_task_queue_lock.acquire()
        for task in task_data:
            self.process_queue_task(task, qid)
        self.pending_task_queue_lock.release()
        self.task_queue_lock.release()


    def process_queue_task(self, task_data, qid):
        self.task_id = self.task_id + 1
        split_inputs = task_data['inputs'].split()
        total_inputs = len(split_inputs)
        current_inputs = 0
        for inputf in split_inputs:
            if os.path.isfile(inputf):
                current_inputs = current_inputs + 1
            #else:
            #    print "Missing file: ", inputf
            fhash = hashlib.sha1(inputf.encode()).hexdigest()
            try:
                self.file_to_task[fhash]['tids'].append(self.task_id)
                self.file_to_task[fhash]['ntids'] = self.file_to_task[fhash]['ntids'] + 1
            except:
                self.file_to_task[fhash] = {
                        'ntids': 1,
                        'tids': [self.task_id],
                        'ctids': 0}
        self.task_to_file[self.task_id] = {
                'queueid': qid,
                'tinputs': total_inputs,
                'cinputs': current_inputs,
                'inputs': split_inputs,
                'outputs': task_data['outputs'].split()}
        if current_inputs == total_inputs:
            print "Putting task ", self.task_id, " in active queue", qid
            if qid not in self.task_queue:
                try:
                    self.new_queues_lock.acquire()
                    self.new_queues.append(qid)
                    self.new_queues_lock.release()
                    self.task_queue.append(qid)  # FCFS like
                    self.task_queue_data[qid] = deque()
                except:
                    traceback.print_exc()
            self.task_queue_data[qid].append(
                    {'id': self.task_id, 'exec': task_data['exec'], 'params': task_data['params']})
        else:
            self.logger.info(
                    "Putting task %s in pending queue, total inputs: %s" %
                    (self.task_id, total_inputs))
            task_info = {
                    'id': self.task_id,
                    'exec': task_data['exec'],
                    'params': task_data['params'],
                    'inputs': split_inputs}
            try:
                self.pending_queue_data[qid][self.task_id] = task_info
            except:
                self.pending_queue_data[qid] = {}
                self.pending_queue_data[qid][self.task_id] = task_info
 
    def process_status(self, qid, message):
        self.task_queue_lock.acquire()
        ntasks = len(self.task_queue_data[qid])
        self.task_queue_lock.release()
        reply = [
                message[0],
                PROTOCOL_HEADERS['RSMNG'],
                'status',
                str(ntasks)]
        self.server_thread.put_request_in_queue(reply)

        
    def process_wait(self, qid, message):
        self.workers_lock.acquire()
        self.workers_empty = 0
        self.workers_empty_dict = {}
        self.workers_lock.release()
        self.waiting_clients.append(message[0])
        
    def process_delete(self, data):
        worker_id = 'sched-' + data
        with open(os.devnull, 'w') as devnull:
            proc = subprocess.Popen(
                "ssh %s \" pkill -9 local_resourcem \" " % data,
                shell=True,
                stdout=devnull,
                stderr=devnull)
            proc.wait()
        # we delete all info about the worker....
        self.workers_lock.acquire()
        self.workers.remove(worker_id)
        del self.workers_data[worker_id]
        self.workers_lock.release()

    def process_client_message(self, message):
        tmp_msg = message[4:]
        qid = message[1]
        action = tmp_msg[0]
        data = None
        if len(tmp_msg) > 1:
            data = tmp_msg[1]
        print "I received a message for queue ", qid, " action: ", action
        if action == 'queue':
            if data is None:
                #print "Missing task information"
                return
            self.process_queue(data, qid)
        elif action == 'status':
            self.process_status(qid, message)
        elif action == 'wait':
            self.process_wait(qid, message)
        elif action == 'clear':
            print "I will empty the worker queues"
            for worker in self.workers:
                self.server_thread.put_request_in_queue(
                    [worker, PROTOCOL_HEADERS['RSMNG'], 'empty'])
        elif action == 'delete':
            self.process_delete(data)
        else:
            print "Not implemented yet"


    def process_worker_nonempty_queue_static(self, tasks_per_queue, answer):
        queue_id = self.task_queue[0]
        ntasks = min(
                len(self.task_queue_data[queue_id]), int(tasks_per_queue))
        for i in range(0, ntasks):
            task = self.task_queue_data[queue_id].popleft()
            answer['tasks'].append(task)
        if len(self.task_queue_data[queue_id]) == 0:
            self.delete_queue(queue_id)
        return answer
                
    def process_worker_nonempty_queue_dynamic(self, queue_id, tasks_per_queue, answer):
        # send from each queue
        print "[Empty] Asked ", queue_id, "tasks per queue ", tasks_per_queue
        to_delete = []
        for qid in self.task_queue_data:
            # if the active queue len is smaller than what the worker asked
            # and the pending queue len is larger than that, do
            # not send?
            req_tasks = int(tasks_per_queue)
            self.pending_task_queue_lock.acquire()
            pending_qid = self.pending_queue_data.get(qid)
            pending_qid_len = 0
            if pending_qid is not None:
                pending_qid_len = len(pending_qid)
            self.pending_task_queue_lock.release()
            #if pending_qid:
            #    if pending_qid_len > req_tasks and len(
            #        self.task_queue_data[qid]) < req_tasks:
            #        print "I have in queue less than asked and in pending queue more!"
            #        continue
            ntasks = min(
                    len(self.task_queue_data[qid]), req_tasks)
            answer['queues'][qid] = ntasks
            print "Asked for ", tasks_per_queue, "Sending ", ntasks, "from queue ", qid, \
                " from max ntasks ", len(self.task_queue_data[qid])
            for i in range(0, ntasks):
                task = self.task_queue_data[qid].popleft()
                answer['tasks'].append(task)
            if ntasks > 0 and len(self.task_queue_data[qid]) == 0:
                to_delete.append(qid)

        for qid in to_delete:
            self.delete_queue(qid)
        return answer

    def process_worker_nonempty_queue_static1(self, tasks_per_queue, answer):
        # send only from the first queue
        queue_id = self.task_queue[0]
        if queue_id not in tasks_per_queue:
            ntasks = min(len(self.task_queue_data[queue_id]), int(tasks_per_queue.values()[0]))
        else:
            ntasks = min(len(self.task_queue_data[queue_id]), int(tasks_per_queue[queue_id]))
        for i in range(0, ntasks):
            task = self.task_queue_data[queue_id].popleft()
            answer['tasks'].append(task)
        if len(self.task_queue_data[queue_id]) == 0:
            self.delete_queue(queue_id)
        return answer
                
    def process_worker_nonempty_queue_dynamic1(self, tasks_per_queue, answer):
        print "####################### Asked ", tasks_per_queue
        for qid in tasks_per_queue:
            # for each queue from which the worker asks check
            # if I have enough tasks to send
            req_tasks = int(tasks_per_queue[qid])
            self.pending_task_queue_lock.acquire()
            if self.pending_queue_data.get(qid) and self.task_queue_data.get(qid):
                if len(self.task_queue_data) > 1 and len(self.pending_queue_data[qid]) > req_tasks and len(
                        self.task_queue_data[qid]) < req_tasks:
                    self.pending_task_queue_lock.release()
                    continue
            self.pending_task_queue_lock.release()
            try:
                ntasks = min(len(self.task_queue_data[qid]), req_tasks)
            except:
                answer['queues'][qid] = -1
                ntasks = -1
                continue
            answer['queues'][qid] = ntasks
            if ntasks > 0:
                print "Sending ", ntasks, " from queue ", qid
                for i in range(0, ntasks):
                    task = self.task_queue_data[qid].popleft()
                    answer['tasks'].append(task)
                if len(self.task_queue_data[qid]) == 0:
                    self.delete_queue(qid)
        return answer

    def process_worker_nonempty_queue(self, worker_id, type, data):
        ''' FCFS queue for the static policy '''
        randn = 0
        queue_id = self.task_queue[randn]
        answer = {'queues': {}, 'tasks': []}
        if type == 'task_empty':
            tasks_per_queue = data
            if config.POLICY == 'static':
                # send from the first queue
                answer = self.process_worker_nonempty_queue_static(tasks_per_queue, answer)
            else:
                answer = self.process_worker_nonempty_queue_dynamic(queue_id, tasks_per_queue, answer)
        else:
            tasks_per_queue = pickle.loads(data)
            print tasks_per_queue
            if config.POLICY == "static":
                answer = self.process_worker_nonempty_queue_static1(tasks_per_queue, answer)
            else:
                answer = self.process_worker_nonempty_queue_dynamic1(tasks_per_queue, answer)
            for qid in self.task_queue_data:
                if qid not in tasks_per_queue:
                    answer['queues'][qid] = 0
        print "Sending ", len(answer['tasks']), answer['queues'], " to worker ", worker_id
        # TODO : piggyback the ids of other queues, if no queues > 1
        ''' if we don't have any data left in the queue we delete it '''
        ''' if the worker sent us an empty message reset it '''
        self.workers_lock.acquire()
        if self.workers_empty_dict.get(worker_id):
            del self.workers_empty_dict[worker_id]
        self.workers_lock.release()
        return answer

    def process_worker_message(self, message):
        tmp_msg = message[4:]
        type = tmp_msg[0]
        data = None
        first_worker = False
        self.workers_lock.acquire()
        if message[0] not in self.workers:
            self.workers.append(message[0])
            self.workers_data[message[0]] = 0
            #first_worker = True
        if len(tmp_msg) > 1:
            data = tmp_msg[1]
        if len(tmp_msg) > 2:
            data2 = pickle.loads(tmp_msg[2])
            computed_tasks = data2['ran']
            worker_current_size = data2['qsize']
            self.workers_data[message[0]] = float(worker_current_size)
            if len(computed_tasks) > 0:
                self.result_queue.put(computed_tasks)
        if type == 'task_empty':
            self.workers_data[message[0]] = 0
        self.workers_lock.release()
        if type == 'task' or type == 'task_empty':
            ''' send x tasks to it '''
            ''' this is a hack; it works only with one client '''
            self.task_queue_lock.acquire()
            task_queue_len = len(self.task_queue)
            self.task_queue_lock.release()
            self.pending_task_queue_lock.acquire()
            pending_task_queue_len = len(self.pending_queue_data)
            self.pending_task_queue_lock.release()
            if type == 'task_empty' and task_queue_len == 0 \
                    and pending_task_queue_len == 0 \
                    and len(self.waiting_clients) > 0:
                self.workers_lock.acquire()
                if not self.workers_empty_dict.get(
                        message[0]) and not first_worker:
                    self.workers_empty_dict[message[0]] = 1
                self.workers_lock.release()
            answer = {'queues': {}, 'tasks': []}
            wake_client = False
            self.task_queue_lock.acquire()
            task_queue_len = len(self.task_queue)
            sent_tasks = False
            if task_queue_len > 0:
                sent_tasks = True
                answer = self.process_worker_nonempty_queue(message[0], type, data)
            else:
                print self.workers_empty_dict
                self.workers_lock.acquire()
                if pending_task_queue_len == 0 and len(
                        self.workers_empty_dict) >= len(
                        self.workers):
                    wake_client = True
                self.workers_lock.release()
            self.task_queue_lock.release()
            if len(answer['tasks']) > 0:
                print "Sending ", answer, " to worker ", message[0]
                data = pickle.dumps(answer)
                self.server_thread.put_request_in_queue(
                    [message[0], PROTOCOL_HEADERS['RSMNG'], 'task', data])
            if wake_client:
                for client in self.waiting_clients:
                    print "Sending wake up message to ", client
                    self.server_thread.put_request_in_queue(
                        [client, PROTOCOL_HEADERS['RSMNG'], 'done'])
                self.waiting_clients = []
                self.workers_empty = 0
                return
            if sent_tasks:
                return
            ''' If my queues are empty, we apply the work stealing algorithm.... '''
        elif type == 'output':
            ''' un-serialize the output and append it to a log '''
            output = pickle.loads(data)
            self.task_logger.info(output)

    ''' Main scheduling LOOP '''

    def run(self):
        self.server_thread.start()
        while self.running:
            try:
                self.logger.debug("Scheduling.......")
                ''' compute the number of workers and data nodes for each
                    scheduling period '''
                self.task_queue_lock.acquire()
                print "*** Task queue is: ", self.task_queue
                for aqueue in self.task_queue:
                    print "Queue ", aqueue, "len ", len(self.task_queue_data[aqueue])
                self.task_queue_lock.release()
                self.pending_task_queue_lock.acquire()
                print "Task pending queue is: ", self.pending_queue_data.keys()
                for pqueue in self.pending_queue_data:
                    print "Queue ", pqueue, "len ", len(self.pending_queue_data[pqueue])
                self.pending_task_queue_lock.release()
                if self.running:
                    time.sleep(config.WAITTIME)
            except KeyboardInterrupt:
                self.shutdown()
            except:
                traceback.print_exc()
                try:
                    time.sleep(config.WAITTIME)
                except KeyboardInterrupt:
                    self.shutdown()
        print "Stopping communication thread...."
        self.logger.info("Stopping communication thread....")
        self.server_thread.stop()
        print "Joining communication thread...."
        self.logger.info("Joining communication thread....")
        self.server_thread.join()
        print "DONE"
        self.logger.info("DONE")
        return

    def shutdown(self):
        print "Received signal to shutdown. "
        self.logger.info("Received signal to shutdown. Will wait for the end of the "
                         "scheduling period")
        self.running = False
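
The pickled payload that process_queue() and process_queue_task() consume for the 'queue' action has roughly this shape (a hedged reconstruction from the field accesses above; the paths are illustrative):

import pickle

tasks = [
    {'exec': '/tmp/Weasel/bin/wordcount',
     'params': 'part-00000 out-00000',
     'inputs': '/data/part-00000',      # whitespace-separated input files
     'outputs': '/data/out-00000'},     # whitespace-separated output files
]
payload = pickle.dumps(tasks)
# Each task goes to the ready queue if all of its inputs already exist on
# disk, otherwise to the pending queue until its inputs are produced.
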
Example #8
class Scheduler(object):
	stdin = "/dev/null"
	stdout = "/dev/null"
	stderr = "/dev/null"

	def __init__(self):
		self.running = True
		self.resource_manager = ResourceManager(self)
		self.resource_queues = {'cpu': [],
								'memory': [],
								'bw': []}
		''' this is to have multiple applications/queues; application = queue '''
		self.task_queue = []  # ready queues; [queue_id]
		self.task_queue_data = {}  # ready queue data; queue_id: [task...]
		# queues with pending tasks, which do not have the input ready
		self.pending_queue_data = {}
		self.task_queue_lock = threading.Lock()
		self.pending_task_queue_lock = threading.Lock()
		logfile = config.LOGDIR + "/scheduler.log"
		taskfile = config.LOGDIR + "/task.log"
		self.logger = WeaselLogger('scheduler', logfile)
		self.task_logger = WeaselLogger('tasklogger', taskfile)
		''' this is the thread that will perform the communication with the local schedulers '''
		self.server_thread = ZmqConnectionThread(
			'resourcemng',
			zmq.ROUTER,
			"*:" + str(
				config.ZMQ_SCHEDULER_PORT),
			self.msg_process_callback)
		''' this information is for keeping track of files and task dependencies '''
		self.task_to_file = {
		}  # taskid: {'queueid': qid, 'tinputs': n, 'cinputs': n, 'inputs': [files], 'outputs': [files]}
		# file: {'ntids': ntids, 'tids': [tids], 'ctids': executed_tasks}
		self.file_to_task = {}
		self.result_queue = Queue()
		self.result_consumer_thread = threading.Thread(target=self.get_result)
		self.result_consumer_thread.start()
		''' idle worker information and client notification '''
		self.waiting_clients = []
		self.workers_empty_dict = {}
		self.workers = []
		self.workers_data = {}
		self.workers_lock = threading.Lock()
		self.workers_empty = 0
		self.task_id = 0
		''' to notify workers about new queues '''
		self.new_queues = []
		self.new_queues_lock = threading.Lock()
		''' here I have: the thread that listens for messages
			a queue in which the tasks are put
			the main thread that applies some reconfiguration (?)
		'''
		self.files_to_delete = []

	def delete_queue(self, qid):
		del self.task_queue_data[qid]
		self.task_queue.remove(qid)
		self.new_queues_lock.acquire()
		try:
			self.new_queues.remove(qid)
		except:
			pass
		self.new_queues_lock.release()

	def get_taskids(self):
		tasks_ids = []
		try:
			tasks_ids = self.result_queue.get(
				block=True,
				timeout=8 *
						config.WAITTIME)
		except:
			self.pending_task_queue_lock.acquire()
			pending_queue = self.pending_queue_data
			for pqueue in pending_queue:
				to_delete = []
				for tid in pending_queue[pqueue]:
					current_inputs = 0
					tinputs = len(pending_queue[pqueue][tid]['inputs'])
					for inputf in pending_queue[pqueue][tid]['inputs']:
						if os.path.isfile(inputf):
							current_inputs = current_inputs + 1
					if current_inputs == tinputs:
						task = pending_queue[pqueue][tid]
						data = {
							'id': tid,
							'exec': task['exec'],
							'params': task['params']}
						to_delete.append(tid)
						self.task_queue_lock.acquire()
						is_new = False
						try:
							self.task_queue_data[pqueue].append(data)
						except:
							self.task_queue.append(pqueue)
							self.task_queue_data[pqueue] = deque()
							self.task_queue_data[pqueue].append(data)
							is_new = True
						self.task_queue_lock.release()
						if is_new:
							self.new_queues_lock.acquire()
							self.new_queues.append(pqueue)
							self.new_queues_lock.release()
				for tid in to_delete:
					del self.pending_queue_data[pqueue][tid]
			self.pending_task_queue_lock.release()
		return tasks_ids

	def check_dependencies_per_task(self, taskid):
		for fileid in self.task_to_file[taskid]['outputs']:
			dependent_tasks = []
			try:
				dependent_tasks = self.file_to_task[
					hashlib.sha1(fileid.encode()).hexdigest()]['tids']
			except:
				pass
			for taskid2 in dependent_tasks:
				self.task_to_file[taskid2]['cinputs'] = self.task_to_file[taskid2]['cinputs'] + 1
				if self.task_to_file[taskid2]['cinputs'] == self.task_to_file[taskid2]['tinputs']:
					# put in ready queue
					self.pending_task_queue_lock.acquire()
					try:
						task = self.pending_queue_data[self.task_to_file[taskid2]['queueid']][taskid2]
					except:
						self.pending_task_queue_lock.release()
						continue
					self.pending_task_queue_lock.release()
					data = {
						'id': taskid2,
						'exec': task['exec'],
						'params': task['params']}
					self.task_queue_lock.acquire()
					is_new = False
					try:
						self.task_queue_data[self.task_to_file[taskid2]['queueid']].append(data)
					except:
						self.task_queue.append(self.task_to_file[taskid2]['queueid'])  # FCFS like
						self.task_queue_data[self.task_to_file[taskid2]['queueid']] = deque()
						self.task_queue_data[self.task_to_file[taskid2]['queueid']].append(data)
						is_new = True
					self.task_queue_lock.release()
					if is_new:
						self.new_queues_lock.acquire()
						self.new_queues.append(self.task_to_file[taskid2]['queueid'])
						self.new_queues_lock.release()
					self.pending_task_queue_lock.acquire()
					del self.pending_queue_data[
						self.task_to_file[taskid2]['queueid']][taskid2]
					self.pending_task_queue_lock.release()

	def garbage_collect(self, taskid):
		for fileid in self.task_to_file[taskid]['inputs']:
			fhash = hashlib.sha1(fileid.encode()).hexdigest()
			try:
				self.file_to_task[fhash]['ctids'] = self.file_to_task[fhash]['ctids'] + 1
				if self.file_to_task[fhash]['ctids'] == self.file_to_task[fhash]['ntids']:
					# now it is safe to delete this file (is it?)
					try:
						print "deleting file ", fileid, "ctids=", \
							self.file_to_task[fhash]['ctids'], \
							"ntids=", self.file_to_task[fhash]['ntids']
						# os.remove(fileid)
						# if it failed, report it back to the user
					except OSError as e:
						print "Error: %s - %s." % (e.filename, e.strerror)
			except:
				print "exception for file ", fileid

	def get_result(self):
		while self.running:
			self.pending_task_queue_lock.acquire()
			to_delete = []
			for pqueue in self.pending_queue_data:
				if len(self.pending_queue_data[pqueue]) == 0:
					to_delete.append(pqueue)
			for pqueue in to_delete:
				del self.pending_queue_data[pqueue]
			self.pending_task_queue_lock.release()
			tasks_ids = self.get_taskids()
			if len(tasks_ids) == 0:
				continue
			for taskid in tasks_ids:
				# if missing files were generated put task in ready queue
				try:
					self.check_dependencies_per_task(taskid)
				except:
					traceback.print_exc()

	def msg_process_callback(self, message):
		message_type = message[3]
		try:
			if message_type == PROTOCOL_HEADERS['CLIENT']:
				self.process_client_message(message)
			elif message_type == PROTOCOL_HEADERS['WORKER']:
				self.process_worker_message(message)
		except:
			traceback.print_exc()

	def process_queue(self, data, qid):
		task_data = pickle.loads(data)
		# Analyze the given DAG.
		self.resource_manager.initialize_dag(task_data)
		self.task_queue_lock.acquire()
		self.pending_task_queue_lock.acquire()
		for task in task_data:
			self.process_queue_task(task, qid)
		self.pending_task_queue_lock.release()
		self.task_queue_lock.release()

	def process_queue_task(self, task_data, qid):
		self.task_id = self.task_id + 1
		split_inputs = task_data['inputs'].split()
		total_inputs = len(split_inputs)
		current_inputs = 0
		for inputf in split_inputs:
			if os.path.isfile(inputf):
				current_inputs = current_inputs + 1
			fhash = hashlib.sha1(inputf.encode()).hexdigest()
			try:
				self.file_to_task[fhash]['tids'].append(self.task_id)
				self.file_to_task[fhash]['ntids'] = self.file_to_task[fhash]['ntids'] + 1
			except:
				self.file_to_task[fhash] = {
					'ntids': 1,
					'tids': [self.task_id],
					'ctids': 0}
		self.task_to_file[self.task_id] = {
			'queueid': qid,
			'tinputs': total_inputs,
			'cinputs': current_inputs,
			'inputs': split_inputs,
			'outputs': task_data['outputs'].split()}
		if current_inputs == total_inputs:
			print "Putting task ", self.task_id, " in active queue", qid
			if qid not in self.task_queue:
				try:
					self.new_queues_lock.acquire()
					self.new_queues.append(qid)
					self.new_queues_lock.release()
					self.task_queue.append(qid)  # FCFS like
					self.task_queue_data[qid] = deque()
				except:
					traceback.print_exc()
			self.task_queue_data[qid].append(
				{'id': self.task_id, 'exec': task_data['exec'], 'params': task_data['params']})
		else:
			print(
				"Putting task %s in pending queue, total inputs: %s" %
				(self.task_id, total_inputs))
			task_info = {
				'id': self.task_id,
				'exec': task_data['exec'],
				'params': task_data['params'],
				'inputs': split_inputs}
			try:
				self.pending_queue_data[qid][self.task_id] = task_info
			except:
				self.pending_queue_data[qid] = {}
				self.pending_queue_data[qid][self.task_id] = task_info

	def process_status(self, qid, message):
		self.task_queue_lock.acquire()
		ntasks = len(self.task_queue_data[qid])
		self.task_queue_lock.release()
		reply = [
			message[0],
			PROTOCOL_HEADERS['RSMNG'],
			'status',
			str(ntasks)]
		self.server_thread.put_request_in_queue(reply)

	def process_wait(self, qid, message):
		self.workers_lock.acquire()
		self.workers_empty = 0
		self.workers_empty_dict = {}
		self.workers_lock.release()
		self.waiting_clients.append(message[0])

	def process_delete(self, data):
		worker_id = 'sched-' + data
		with open(os.devnull, 'w') as devnull:
			proc = subprocess.Popen(
				"ssh %s \" pkill -9 local_resourcem \" " % data,
				shell=True,
				stdout=devnull,
				stderr=devnull)
			proc.wait()
		# we delete all info about the worker....
		self.workers_lock.acquire()
		self.workers.remove(worker_id)
		del self.workers_data[worker_id]
		self.workers_lock.release()

	def process_client_message(self, message):
		tmp_msg = message[4:]
		qid = message[1]
		action = tmp_msg[0]
		data = None
		if len(tmp_msg) > 1:
			data = tmp_msg[1]
		print "I receive message for queue ", qid, " action: ", action
		if action == 'queue':
			if data is None:
				# print "Missing task information"
				return
			self.process_queue(data, qid)
		elif action == 'status':
			self.process_status(qid, message)
		elif action == 'wait':
			self.process_wait(qid, message)
		elif action == 'clear':
			print "I will empty the worker queues"
			for worker in self.workers:
				self.server_thread.put_request_in_queue(
					[worker, PROTOCOL_HEADERS['RSMNG'], 'empty'])
		elif action == 'delete':
			self.process_delete(data)
		else:
			print "Not implemented yet"

	def process_worker_nonempty_queue_static(self, tasks_per_queue, answer):
		queue_id = self.task_queue[0]
		ntasks = min(
			len(self.task_queue_data[queue_id]), int(tasks_per_queue))
		for i in range(0, ntasks):
			task = self.task_queue_data[queue_id].popleft()
			answer['tasks'].append(task)
		if len(self.task_queue_data[queue_id]) == 0:
			self.delete_queue(queue_id)
		return answer

	def process_worker_nonempty_queue_static1(self, tasks_per_queue, answer):
		# send only from the first queue
		queue_id = self.task_queue[0]
		if queue_id not in tasks_per_queue:
			ntasks = min(len(self.task_queue_data[queue_id]), int(tasks_per_queue.values()[0]))
		else:
			ntasks = min(len(self.task_queue_data[queue_id]), int(tasks_per_queue[queue_id]))
		for i in range(0, ntasks):
			task = self.task_queue_data[queue_id].popleft()
			answer['tasks'].append(task)
		if len(self.task_queue_data[queue_id]) == 0:
			self.delete_queue(queue_id)
		return answer

	def process_worker_nonempty_queue(self, worker_id, type, data):
		''' FCFS queue for the static policy '''
		randn = 0
		queue_id = self.task_queue[randn]
		answer = {'queues': {}, 'tasks': []}
		if type == 'task_empty':
			tasks_per_queue = data
			answer = self.process_worker_nonempty_queue_static(tasks_per_queue, answer)
		else:
			tasks_per_queue = pickle.loads(data)
			answer = self.process_worker_nonempty_queue_static1(tasks_per_queue, answer)
			for qid in self.task_queue_data:
				if qid not in tasks_per_queue:
					answer['queues'][qid] = 0
		# TODO : piggyback the ids of other queues, if no queues > 1
		''' if we don't have any data left in the queue we delete it '''
		''' if the worker sent us an empty message reset it '''
		self.workers_lock.acquire()
		if self.workers_empty_dict.get(worker_id):
			del self.workers_empty_dict[worker_id]
		self.workers_lock.release()
		return answer

	def process_worker_message(self, message):
		tmp_msg = message[4:]
		type = tmp_msg[0]

		# If we are dealing with a resource or statistics message, let the resource manager handle it.
		if type == 'resource' or type == 'statistics':
			self.resource_manager.process_worker_message(message)
			return
		else:
			data = None
			first_worker = False
			self.workers_lock.acquire()
			if message[0] not in self.workers:
				self.workers.append(message[0])
				self.workers_data[message[0]] = 0
			if len(tmp_msg) > 1:
				data = tmp_msg[1]
			if len(tmp_msg) > 2:
				data2 = pickle.loads(tmp_msg[2])
				computed_tasks = data2['ran']
				worker_current_size = data2['qsize']
				self.workers_data[message[0]] = float(worker_current_size)
				if len(computed_tasks) > 0:
					# Tell the resource manager that this worker has finished these tasks.
					self.resource_manager.finished_tasks(message[0], computed_tasks)
					self.result_queue.put(computed_tasks)
			if type == 'task_empty':
				self.workers_data[message[0]] = 0
			self.workers_lock.release()
			if type == 'task' or type == 'task_empty':
				''' send a batch of tasks to the worker '''
				''' note: this is a hack; it only works with a single client '''
				self.task_queue_lock.acquire()
				task_queue_len = len(self.task_queue)
				self.task_queue_lock.release()
				self.pending_task_queue_lock.acquire()
				pending_task_queue_len = len(self.pending_queue_data)
				self.pending_task_queue_lock.release()
				if type == 'task_empty' and task_queue_len == 0 \
						and pending_task_queue_len == 0 \
						and len(self.waiting_clients) > 0:
					self.workers_lock.acquire()
					if not self.workers_empty_dict.get(message[0]):
						self.workers_empty_dict[message[0]] = 1
					self.workers_lock.release()
				answer = {'queues': {}, 'tasks': []}
				wake_client = False
				self.task_queue_lock.acquire()
				task_queue_len = len(self.task_queue)
				sent_tasks = False
				if task_queue_len > 0:
					sent_tasks = True
					answer = self.process_worker_nonempty_queue(message[0], type, data)
				else:
					self.workers_lock.acquire()
					if pending_task_queue_len == 0 and len(
							self.workers_empty_dict) >= len(
						self.workers):
						wake_client = True
					self.workers_lock.release()
				self.task_queue_lock.release()
				if len(answer['tasks']) > 0:
					data = pickle.dumps(answer)
					self.server_thread.put_request_in_queue(
						[message[0], PROTOCOL_HEADERS['RSMNG'], 'task', data])
					# Inform the resource manager we have sent tasks.
					self.resource_manager.sent_tasks_to_worker(message[0], answer['tasks'])
				if wake_client:
					for client in self.waiting_clients:
						# calculate the cost
						self.resource_manager.calculate_cost()
						print "Sending wake up message to ", client
						self.server_thread.put_request_in_queue(
							[client, PROTOCOL_HEADERS['RSMNG'], 'done'])
					self.waiting_clients = []
					self.workers_empty = 0
					return
				if sent_tasks:
					return
			elif type == 'output':
				''' deserialize the output and append it to the task log '''
				output = pickle.loads(data)
				self.task_logger.info(output)
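
The frame layout implied by the parsing above, sketched with invented values; frames 1-3 are protocol fields this method does not touch:

import pickle

# message[0]   -> worker identity, prepended by the zmq.ROUTER socket
# message[4:]  -> [type, payload, pickled stats]
message = ['sched-10.0.0.5', 'f1', 'f2', 'f3', 'task_empty', '4',
           pickle.dumps({'ran': [12, 13], 'qsize': 0.0})]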

	def send_message(self, message):
		"""
		Convenience method that lets the resource manager send a message
		through the scheduler's communication thread.
		:param message: the message frames to enqueue
		:return: None
		"""
		self.server_thread.put_request_in_queue(message)

	def run(self):
		self.server_thread.start()
		while self.running:
			try:
				''' compute the number of workers and data nodes for each
					scheduling period '''
				if self.running:
					time.sleep(config.WAITTIME)
			except KeyboardInterrupt:
				self.shutdown()
			except Exception:
				traceback.print_exc()
				try:
					time.sleep(config.WAITTIME)
				except KeyboardInterrupt:
					self.shutdown()
		print "Stopping communication thread...."
		sys.stdout.flush()
		self.logger.info("Stopping communication thread....")
		self.server_thread.stop()
		print "Joining communication thread...."
		sys.stdout.flush()
		self.logger.info("Joining communication thread....")
		self.server_thread.join()
		if config.START_MEMFS:
			print "Stopping MemFS"
			sys.stdout.flush()
			self.resource_manager.stop_memfs()
		print "DONE"
		self.logger.info("DONE")
		return

	def shutdown(self):
		print "Received signal to shutdown."
		sys.stdout.flush()
		self.logger.info(
			"Received signal to shutdown. Will wait for the end of the scheduling period")
		self.running = False
Exemple #9
0
class NodeScheduler(object):

    def __init__(self):
        self.identity = 'sched-' + socket.gethostbyname(socket.gethostname())
        self.sched_client_thread = ZmqConnectionThread(
            self.identity,
            zmq.DEALER,
            config.SCHEDULER+":" + str(config.ZMQ_SCHEDULER_PORT),
            self.callback)
        self.monitor_thread = NodeMonitor()
        self.running = True
        logfile = config.LOGDIR + "/local_scheduler.log"
        self.logger = WeaselLogger('local_scheduler', logfile)
        self.capacity = self.monitor_thread.capacity
        self.max_tasks_to_run = {}
        ''' the starting number of tasks is defined based on the slot size '''
        self.ntasks_to_ask = 1
        self.task_id = 1
        self.time_asked_first = time.time()
        ''' this keeps track of the number of running tasks '''
        self.running_task = 0
        self.nran_tasks = []
        self.time_from_last_ask = time.time()
        self.queues_asked_for = []
        self.current_ntasks = 1
        self.has_new_task = False
        self.is_profiling = False
        self.first_task = False
        self.task_data = {}
        self.t_avg = {}
        self.task_data_lock = threading.Lock()
        self.running_task_lock = threading.Lock()
        self.average_utilization = {'cpu': 0.0, 'memory': 0.0, 'network': 0.0}
        self.average_task_exec_time = 0.0
        self.sleep_time = config.WAITTIME
        self.past_speed_changes = []
        # 'id': id, 'tpool': threadPool, 'rvector': resource_characteristics
        self.queue_data = {}
        self.task_time = 1
        self.queue_data_lock = threading.Lock()
        self.has_new_queue = False
        self.new_queues = []
        self.message_to_send = None
        # this is to control how many tasks to run in parallel
        self.logger.info("NodeScheduler started...")
        self.nrunning_past_period = []

    def profile(self, nrunning):
        pass

    def change_work_queue(self, nrunning, nrunning_past, avg_time, avg_cpu):
        pass

    def run_task(self, arg):
        command_id = arg['id']
        command = arg['exec'] + ' ' + arg['params']
        qid = arg['qid']
        myid = threading.current_thread().ident
        self.running_task_lock.acquire()
        self.running_task = self.running_task + 1
        self.running_task_lock.release()
        ''' this also marks that at least one task runs on the node ... '''
        ''' the process is registered below so that the monitor can watch over it '''
        start_time = time.time()
        proc = psutil.Popen(command, shell=True,
                            stdout=PIPE, stderr=PIPE)
        self.task_data[myid]['lock'].acquire()
        self.task_data[myid]['proc'] = proc
        self.task_data[myid]['ctask'] = arg
        self.task_data[myid]['lock'].release()
        out, err = proc.communicate()
        end_time = time.time()
        self.task_data[myid]['lock'].acquire()
        if self.task_data[myid]['task'].get(qid) is None:
            self.task_data[myid]['task'][qid] = []
            self.external_change = True
        # record [elapsed time, cpu share]; note the second term is a
        # placeholder that always evaluates to 100 in this version
        self.task_data[myid]['task'][qid].append(
            [end_time - start_time, 100 * (end_time - start_time) / (end_time - start_time)])
        self.task_data[myid]['lock'].release()
        self.running_task_lock.acquire()
        self.running_task = self.running_task - 1
        self.nran_tasks.append(command_id)
        self.running_task_lock.release()

    def get_total_queue_size(self):
        queue_size = 0
        self.queue_data_lock.acquire()
        for qid in self.queue_data:
	    #print "Queue ", qid, " size ", self.queue_data[qid]['tpool'].tasks.qsize()
            queue_size = queue_size + \
                self.queue_data[qid]['tpool'].tasks.qsize()
        self.queue_data_lock.release()
        return queue_size

    def get_tasks_to_ask(self, nrunning):
        tasks_to_ask = {}
        self.queues_asked_for = []
        queue_size = self.get_total_queue_size()
        if queue_size + nrunning == 0 and not self.is_profiling:
            return (tasks_to_ask, queue_size)
        self.queue_data_lock.acquire()
        for qid in self.queue_data:
            tasks_to_ask[qid] = 0
            self.queue_data[qid]['asked'] = 0
            self.queue_data[qid]['recv'] = 0
            qsize = self.queue_data[qid]['tpool'].tasks.qsize()
            # skip queues that already hold enough work (-1 marks a dead queue)
            if qsize > 2 * self.max_tasks_to_run[qid] and self.max_tasks_to_run[qid] != -1:
                continue
            if qsize == 0:
                tasks_to_ask[qid] = max(10, 2 * self.max_tasks_to_run[qid])
            elif qsize < 2 * self.max_tasks_to_run[qid]:
                tasks_to_ask[qid] = 2
            self.queues_asked_for.append(qid)
            self.queue_data[qid]['asked'] = tasks_to_ask[qid]
        self.queue_data_lock.release()
        return (tasks_to_ask, queue_size)
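
A worked example of the ask policy above, with invented queue sizes; a value of -1 in max_tasks_to_run marks a queue considered dead:

# hedged sketch: how many tasks each (max_run, qsize) pair would request
for max_run, qsize in [(5, 0), (5, 4), (5, 11), (-1, 0)]:
    if qsize > 2 * max_run and max_run != -1:
        print max_run, qsize, '-> skip, queue holds enough work'
    elif qsize == 0:
        print max_run, qsize, '-> ask', max(10, 2 * max_run)
    elif qsize < 2 * max_run:
        print max_run, qsize, '-> ask the fixed top-up of', 2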

    def wait_and_ask(self):
        while self.running:
            # check every 0.2 seconds
            time.sleep(0.2)
            # how much time has passed since we last asked the resource manager
            ctime = time.time()
            if ctime - self.time_from_last_ask > 2 * config.WAITTIME:
                # mark the queues we asked for, but never fed, as dead
                for qid in self.queues_asked_for:
                    if self.queue_data[qid]['tpool'].tasks.qsize() == 0:
                        print "@@@@@@@@@@@@@@@@@@@  I mark queue ", qid, " as dead because I don't have tasks for it"
                        self.max_tasks_to_run[qid] = -1
            self.running_task_lock.acquire()
            nrunning = self.running_task
            self.nrunning_past_period.append(nrunning)
            task_data_to_send = {'ran': self.nran_tasks[:]}
            self.nran_tasks = []
            self.running_task_lock.release()
            if self.is_profiling:
                self.profile(nrunning)
            (tasks_to_ask, queue_size) = self.get_tasks_to_ask(nrunning)
            # report the queue size in time units (queued tasks x estimated task time)
            task_data_to_send['qsize'] = queue_size * self.task_time
            pickled_data = pickle.dumps(task_data_to_send)
            if self.is_profiling and config.POLICY == 'dynamic3':
                if queue_size + nrunning == 0 and not self.first_task:
                    self.sched_client_thread.put_request_in_queue(
                        [self.identity, PROTOCOL_HEADERS['WORKER'], 'task_empty',
                         str(2 * self.current_ntasks), pickled_data])
                    self.first_task = True
                continue
            elif len(tasks_to_ask) > 0:
                self.sched_client_thread.put_request_in_queue(
                    [self.identity, PROTOCOL_HEADERS['WORKER'], 'task',
                     pickle.dumps(tasks_to_ask), pickled_data])
            self.message_to_send = pickled_data

    def process_task(self, task):
        tmp = task.split(';')
        task_name = tmp[-1].split()[0].split('/')[-1]
        new_task = False
        # I have new tasks!!
        if task_name not in self.monitor_thread.tasks_to_monitor:
            new_task = True
            self.is_profiling = True
            self.monitor_thread.add_task_to_monitor(task_name)
        return new_task
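
A hedged sketch of the name extraction above; the task string is invented, but the parsing mirrors process_task exactly:

task = 'pre;/opt/apps/wordcount input.txt output.txt'
task_name = task.split(';')[-1].split()[0].split('/')[-1]
print task_name  # 'wordcount' -- the executable's basename keys the monitor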

    def add_task_to_queues(self, tasks):
        if len(tasks['queues']) > 0:
            print tasks['queues']
            self.queue_data_lock.acquire()
            for queue in tasks['queues']:
                if not self.queue_data.get(queue):
                    self.new_queues.append(queue)
                    self.max_tasks_to_run[queue] = 0
                    self.queue_data[queue] = {
                        'qid': queue,
                        'elapsed': 0,
                        'tavg': 0,
                        'throughput': 0,
                        'asked': 0,
                        'recv': 0,
                        'type': "",
                        'tpool': ThreadPool(
                            0,
                            self.task_data),
                        'resource': ""}  # this contains the resource vector of a task
                    self.has_new_queue = True
                if tasks['queues'][queue] == -1:
                    print "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Queue ", queue, " is empty!"
                    self.max_tasks_to_run[queue] = -1
            self.queue_data_lock.release()
        self.queue_data_lock.acquire()
        for task in tasks['tasks']:
            ''' if the task type has not been seen before, enter profiling mode:
                shrink the pool to 1 task and record the resource utilization
                of the first 10 tasks '''
            qid = hashlib.sha1(task['exec'].encode()).hexdigest()
            # '|' (not 'or') so that process_task always runs for its side effects
            self.has_new_task = self.has_new_task | self.process_task(
                task['exec'])
            task['qid'] = qid
            self.add_task_to_queue(self.queue_data[qid]['tpool'], task)
            self.queue_data[qid]['recv'] = self.queue_data[qid]['recv'] + 1
        self.queue_data_lock.release()
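
Tasks are routed to queues by hashing their executable string, so all instances of one application share a queue. A minimal sketch with an invented path:

import hashlib

qid = hashlib.sha1('/opt/apps/wordcount'.encode()).hexdigest()
print qid  # identical executables always map to the same 40-character queue id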

    def callback(self, frames):
        ''' this is a message from the server '''
        command = frames[2]
        data = None
        if len(frames) > 3:
            data = frames[3]
        if command == 'shutdown':
            self.shutdown(None)
        elif command == 'task':
            self.time_from_last_ask = time.time()
            tasks = pickle.loads(data)
            self.add_task_to_queues(tasks)
        elif command == 'empty':
            for qid in self.queue_data:
                self.empty_queue(self.queue_data[qid]['tpool'])
        else:
            print "No callback for this message!"


    def add_task_to_queue(self, queue, task):
        queue.add_task(self.run_task, task)

    def empty_queue(self, queue):
        # drain the pool's underlying task queue (assumes the ThreadPool
        # exposes its Queue as 'tasks', as used elsewhere in this class)
        while not queue.tasks.empty():
            try:
                queue.tasks.get(False)
            except Empty:
                continue
            queue.tasks.task_done()

    def shutdown(self, data):
        self.running = False

    def log_node_utilization(self):
        median = self.monitor_thread.get_median_utilization()
        histo_util = self.monitor_thread.get_utilization_by_histogram()
        cpu_sum = median['cpu'] + median['cpu_idle'] + \
            median['cpu_sys'] + median['cpu_io']
        # guard the io percentage below against a division by zero
        if cpu_sum == 0:
            cpu_io_percent = 0
        else:
            cpu_io_percent = 100 * median['cpu_io'] / cpu_sum
        self.logger.info(
            "Median utilization/2secs: %s %s %s %s" %
            (median['cpu'],
             median['memory'],
             median['network'],
             cpu_io_percent))
        self.logger.info(
            "Histo utilization: %s %s %s" %
            (histo_util['cpu'],
             histo_util['memory'],
             histo_util['network']))

    def is_ok_to_ask(self):
        return True

    def empty_task_data(self):
        for tid in self.task_data:
            self.task_data[tid]['lock'].acquire()
            self.task_data[tid]['task'] = {}
            self.task_data[tid]['lock'].release()

    def check_empty_queues(self):
        queues_empty = True
        self.queue_data_lock.acquire()
        for qid in self.queue_data:
            queues_empty = queues_empty & self.queue_data[
                qid]['tpool'].tasks.empty()
        self.queue_data_lock.release()
        return queues_empty


    def compute_stats(self, task_data, avg_time, avg_cpu):
        total_len = 0
        try:
            for tid in task_data:
                task_data[tid]['lock'].acquire()
                for task in task_data[tid]['task']:
                    if not avg_time.get(task):
                        avg_time[task] = 0
                        avg_cpu[task] = 0
                    for data in task_data[tid]['task'][task]:
                        avg_time[task] = avg_time[task] + data[0]
                        avg_cpu[task] = avg_cpu[task] + data[1]
                    total_len = total_len + \
                        len(task_data[tid]['task'][task])
                    if not self.is_profiling:
                        # the samples have been consumed; drop them
                        while len(task_data[tid]['task'][task]) > 0:
                            task_data[tid]['task'][task].pop(0)
                task_data[tid]['lock'].release()
            # nothing was recorded in this period; avoid a division by zero
            if total_len == 0:
                return
            for task in avg_time:
                avg_time[task] = avg_time[task] / total_len
                avg_cpu[task] = avg_cpu[task] / total_len
            self.empty_task_data()
        except Exception:
            traceback.print_exc()
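
Note that the averages above divide by the global sample count across all queues, not by each task's own count; a small worked example with invented samples:

samples = {'A': [[2.0, 80], [4.0, 90]], 'B': [[1.0, 50], [1.0, 50]]}
total_len = sum(len(v) for v in samples.values())  # 4 samples in total
avg_time = dict((k, sum(d[0] for d in v) / total_len) for k, v in samples.items())
print avg_time  # {'A': 1.5, 'B': 0.5} -- a global, not per-task, average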

    def run(self):
        self.sched_client_thread.start()
        self.monitor_thread.start()
        finishing_tasks_thread = Thread(target=self.wait_and_ask)
        finishing_tasks_thread.start()
        '''  I have: - the monitoring thread
        - the communication thread
        - the thread that waits to ask for more tasks
        '''
        while self.running:
            ''' if queue is empty and no other tasks are running: ask for task to the scheduler '''
            ''' else if tasks are running check the utilization and ask for more/less '''
            self.log_node_utilization()
            task_data = self.monitor_thread.get_task_data()
            total_util = {'cpu': 0, 'memory': 0}
            for task in task_data:
                for data in task_data[task]:
                    total_util['cpu'] = total_util[
                        'cpu'] + data[0][0] / len(task_data[task])
                    total_util['memory'] = total_util[
                        'memory'] + data[1] / len(task_data[task])
            self.logger.info(
                "Total utilization of the other processes is: %s %s" %
                (total_util['cpu'], total_util['memory']))
            # count the total number of slots
            self.running_task_lock.acquire()
            nrunning = self.running_task
            nrunning_past = self.nrunning_past_period[:]
            self.nrunning_past_period = []
            self.running_task_lock.release()
            for task in self.max_tasks_to_run:
                self.logger.info(
                    "%s Running tasks: %s" %
                    (task, self.max_tasks_to_run[task]))
            if self.check_empty_queues() and nrunning == 0:
                if self.is_ok_to_ask():
                    ''' all local tasks have finished; ask the resource manager for more work '''
                    print "Sending task_empty message!"
                    self.sched_client_thread.put_request_in_queue(
                        [self.identity, PROTOCOL_HEADERS['WORKER'], 'task_empty', str(2 * self.capacity['cores'])])
            avg_time = {}
            avg_cpu = {}
            task_data = self.task_data
            self.compute_stats(task_data, avg_time, avg_cpu)
            # now is the time to remove the data from the dead threads
            self.queue_data_lock.acquire()
            for qid in self.queue_data:
                self.queue_data[qid]['tpool'].dict_lock.acquire()
                for tid in self.queue_data[qid]['tpool'].deleted_workers:
                    if self.task_data.get(tid):
                        del self.task_data[tid]
                self.queue_data[qid]['tpool'].deleted_workers = []
                self.queue_data[qid]['tpool'].dict_lock.release()
            taskid = 0
            max_task_time = 0
            for task in avg_time:
                if config.POLICY != 'static':
                    if avg_time[task] == 0:
                        self.queue_data[task]['elapsed'] = self.queue_data[task]['elapsed'] + config.WAITTIME
                    else:
                        self.queue_data[task]['elapsed'] = 0
                        self.queue_data[task]['tavg'] = (self.queue_data[task]['tavg'] + avg_time[task]) / 2
                if avg_time[task] > max_task_time:
                    max_task_time = avg_time[task]
                print task, "Avg_time: ", avg_time[task]
                self.logger.info(
                    "%s Avg_time: %s" %
                    (task, avg_time[task]))
                # guard the speed computation against a zero average time
                if avg_time[task] > 0:
                    self.logger.info(
                        "%s Task speed: %s" %
                        (task, self.max_tasks_to_run[task] / avg_time[task]))
                    self.past_speed_changes.append(
                        self.max_tasks_to_run[task] /
                        avg_time[task])
                    if len(self.past_speed_changes) > 4:
                        self.past_speed_changes.pop(0)
                self.logger.info(
                    "%s Task util: %s" %
                    (task, avg_cpu[task]))
                taskid = taskid + 1

                # classify the queue as long or short running once enough data exists
                if config.POLICY != 'static' and self.queue_data[task]['type'] == "":
                    if self.queue_data[task]['elapsed'] > config.T_LONG or \
                            self.queue_data[task]['tavg'] > config.T_LONG:
                        self.queue_data[task]['type'] = 'long'
                    if self.queue_data[task]['tavg'] < config.T_LONG:
                        self.queue_data[task]['type'] = 'short'
            self.queue_data_lock.release()
            self.task_time = max_task_time
            self.change_work_queue(nrunning, nrunning_past, avg_time, avg_cpu)
            self.logger.info("Ran %s tasks" % self.task_id)
	    self.logger.debug("Sleeping: %s" % self.sleep_time)
            self.monitor_thread.max_data_buffer_len = int(
                self.sleep_time /
                config.MONITOR_PERIOD)
            time.sleep(self.sleep_time)
        finishing_tasks_thread.join()
        for qid in self.queue_data:
            self.queue_data[qid]['tpool'].wait_completion()
        self.sched_client_thread.stop()
        self.monitor_thread.shutdown()
        self.monitor_thread.join()
        self.sched_client_thread.join()